In [11]:
import pandas as pd
import numpy as np
import unicodedata
import re


In [51]:
# Numeric code for each property-type label found in the listings.
# Several spellings/languages of the same type share one code; the key 0
# maps to the code 0.
propertytype = {
    0: 0,
    "appartment": 1,
    "apartment": 1,
    "departamento": 1,
    "house": 2,
    "casa": 2,
    "PH": 3,
    "ph": 3,
    "store": 4,
    "local": 4,
}

def date_to_float(dt64):
    """Convert a timestamp into seconds elapsed since 2013-01-01T00:00:00.

    Parameters
    ----------
    dt64 : numpy.datetime64 or pandas.Timestamp
        Timezone-naive instant to convert.

    Returns
    -------
    float
        Seconds between the reference instant and ``dt64`` (negative when
        ``dt64`` is earlier than the reference).
    """
    # The reference is written without a trailing "Z": NumPy datetimes are
    # timezone-naive and parsing a timezone suffix is deprecated. The naive
    # literal denotes the same instant.
    reference = np.datetime64('2013-01-01T00:00:00')
    return (dt64 - reference) / np.timedelta64(1, 's')


def define_category(d):
    """Score a listing description by the first amenity keyword it contains.

    The keyword groups are checked in priority order and the first group
    with a match decides the score:

    * ``piscina`` / ``picina`` / ``garage`` -> 25
    * ``pileta`` / ``cochera``              -> 20
    * ``gimnasio``                          -> 15
    * ``sum``                               -> 10
    * ``reciclar`` / ``refaccionar``        -> -20

    Parameters
    ----------
    d : str
        Description text. Matching is a plain, case-sensitive substring
        test, so the text is expected to be lower-cased already.

    Returns
    -------
    int
        The score of the first matching group, or 0 when nothing matches.
    """
    # NOTE(review): "sum" is matched as a bare substring, so it also fires on
    # words such as "consumo"; kept as-is to preserve existing scoring.
    scored_keywords = (
        # "picina" is the historical misspelling; "piscina" is added because
        # the misspelling is not a substring of the correct word, so correctly
        # spelled descriptions were never matched.
        (("piscina", "picina", "garage"), 25),
        (("pileta", "cochera"), 20),
        (("gimnasio",), 15),
        (("sum",), 10),
        (("reciclar", "refaccionar"), -20),
    )
    for keywords, score in scored_keywords:
        if any(keyword in d for keyword in keywords):
            return score
    return 0

def big_fill_nan_and_convert_to_float(datos):
    """Build an all-numeric feature frame from raw listings, filling gaps.

    Only the columns listed in ``columns`` are kept. Each one is converted
    as follows:

    * ``surface_total_in_m2``: 0 is treated as missing; missing values take
      the column mean.
    * ``surface_covered_in_m2``: missing values take the row's total surface.
    * ``property_type``: mapped through the module-level ``propertytype``
      table (unknown labels become missing).
    * ``description``: replaced by the ``define_category`` score of its text.
    * ``created_on``: parsed as a date and converted with ``date_to_float``.
    * ``place_name`` / ``state_name``: replaced by pandas categorical codes.
    * ``lat`` / ``lon``: missing values take the mean of the rows that share
      the same ``place_name`` code.

    Any value still missing afterwards is replaced by 0.

    Parameters
    ----------
    datos : pandas.DataFrame
        Raw listings; must contain every column named in ``columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted columns; ``datos`` is not modified.
    """
    columns = ['created_on', 'property_type', 'lat', 'lon', 'place_name', 'state_name',
               'surface_total_in_m2', 'surface_covered_in_m2', 'description']

    # Explicit copy: the assignments below must never write into (or warn
    # about) a frame owned by the caller.
    datos = datos.loc[:, columns].copy()

    # Total surface: 0 means "unknown", then fall back to the column mean.
    # (np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0.)
    total_surface = datos['surface_total_in_m2']
    total_surface = total_surface.mask(total_surface == 0)
    datos['surface_total_in_m2'] = total_surface.fillna(total_surface.mean())

    # Covered surface: fall back to the total surface of the same row.
    datos['surface_covered_in_m2'] = datos['surface_covered_in_m2'].fillna(
        datos['surface_total_in_m2'])

    # Property type label -> numeric code.
    datos['property_type'] = datos['property_type'].map(propertytype.get)

    # Description text -> amenity score.
    datos['description'] = datos['description'].map(lambda d: define_category(str(d)))

    # Creation date -> seconds since the reference date.
    datos['created_on'] = pd.to_datetime(datos['created_on']).map(date_to_float)

    # Neighbourhood and zone names -> integer category codes.
    datos['place_name'] = pd.Categorical(datos['place_name']).codes
    datos['state_name'] = pd.Categorical(datos['state_name']).codes

    # Latitude/longitude: fill with the neighbourhood mean. transform() keeps
    # the frame's own index; groupby.apply() prepends the group key on recent
    # pandas versions, which breaks the assignment back into the column.
    for coordinate in ('lat', 'lon'):
        neighborhood_mean = datos.groupby('place_name')[coordinate].transform('mean')
        datos[coordinate] = datos[coordinate].fillna(neighborhood_mean)

    # Whatever is still missing becomes 0.
    return datos.fillna(0)
   

def _expense_to_float(raw):
    """Turn one raw ``expenses`` cell into a float, or NaN when it has no amount."""
    if isinstance(raw, (int, float, np.number)):
        # Already numeric (NaN included). Going through str() would strip the
        # decimal point and turn 1500.0 into 15000.
        return float(raw)
    text = str(raw)
    # A cell that says "no" means there are no expenses. The text is the
    # subject of the search and "no" the pattern (the original call had them
    # swapped); word boundaries avoid matching inside words such as "abono".
    if re.search(r'\bno\b', text, re.IGNORECASE):
        return 0.0
    digits = re.sub("[^0-9]", "", text)
    return float(digits) if digits else np.nan


def fill_nan_and_convert_to_float(datos):
    """Build an all-numeric feature frame from raw listings, filling gaps.

    Only the columns listed in ``columns`` are kept. Each one is converted
    as follows:

    * ``surface_total_in_m2``: 0 is treated as missing; missing values take
      the column mean.
    * ``surface_covered_in_m2``: missing values take the row's total surface.
    * ``floor``: values above 30 are treated as missing; missing values take
      the most frequent floor.
    * ``rooms``: missing values take the most frequent room count.
    * ``property_type``: mapped through the module-level ``propertytype``
      table (unknown labels become missing).
    * ``description``: replaced by the ``define_category`` score of its text.
    * ``created_on``: parsed as a date and converted with ``date_to_float``.
    * ``place_name`` / ``state_name``: replaced by pandas categorical codes.
    * ``lat`` / ``lon`` / ``expenses``: missing values take the mean of the
      rows that share the same ``place_name`` code. Expenses are first
      parsed from free text: "no" means 0, otherwise the digits are kept.

    Any value still missing afterwards is replaced by 0.

    Parameters
    ----------
    datos : pandas.DataFrame
        Raw listings; must contain every column named in ``columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted columns; ``datos`` is not modified.
    """
    columns = ['created_on', 'property_type', 'lat', 'lon', 'place_name', 'state_name',
               'surface_total_in_m2', 'surface_covered_in_m2', 'description',
               'floor', 'rooms', 'expenses']

    # Explicit copy: the assignments below must never write into (or warn
    # about) a frame owned by the caller.
    datos = datos.loc[:, columns].copy()

    # Total surface: 0 means "unknown", then fall back to the column mean.
    # (np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0.)
    total_surface = datos['surface_total_in_m2']
    total_surface = total_surface.mask(total_surface == 0)
    datos['surface_total_in_m2'] = total_surface.fillna(total_surface.mean())

    # Covered surface: fall back to the total surface of the same row.
    datos['surface_covered_in_m2'] = datos['surface_covered_in_m2'].fillna(
        datos['surface_total_in_m2'])

    # Floors above 30 are considered data-entry errors and treated as missing.
    datos['floor'] = datos['floor'].mask(datos['floor'] > 30)

    # Floor and rooms: fill with the most frequent value. value_counts()
    # ignores NaN; the emptiness guard avoids idxmax() failing on a column
    # with no usable value at all (it is then left for the final fillna).
    for column in ('floor', 'rooms'):
        frequencies = datos[column].value_counts()
        if not frequencies.empty:
            datos[column] = datos[column].fillna(frequencies.idxmax())

    # Property type label -> numeric code.
    datos['property_type'] = datos['property_type'].map(propertytype.get)

    # Description text -> amenity score.
    datos['description'] = datos['description'].map(lambda d: define_category(str(d)))

    # Creation date -> seconds since the reference date.
    datos['created_on'] = pd.to_datetime(datos['created_on']).map(date_to_float)

    # Neighbourhood and zone names -> integer category codes.
    datos['place_name'] = pd.Categorical(datos['place_name']).codes
    datos['state_name'] = pd.Categorical(datos['state_name']).codes

    # Free-text expenses -> float (NaN when no amount can be read).
    datos['expenses'] = datos['expenses'].map(_expense_to_float)

    # Latitude, longitude and expenses: fill with the neighbourhood mean.
    # transform() keeps the frame's own index; groupby.apply() prepends the
    # group key on recent pandas versions, which breaks the assignment back
    # into the column.
    for column in ('lat', 'lon', 'expenses'):
        neighborhood_mean = datos.groupby('place_name')[column].transform('mean')
        datos[column] = datos[column].fillna(neighborhood_mean)

    # Whatever is still missing becomes 0.
    return datos.fillna(0)

In [39]:
def delete_signs(text):
    """Return ``text`` with every non-ASCII character replaced by one space."""
    cleaned = []
    for char in text:
        # Code points below 128 are plain ASCII and are kept untouched.
        cleaned.append(char if ord(char) < 128 else ' ')
    return ''.join(cleaned)
def delete_accent_mark(s):
    """Return ``s`` without accent marks.

    The text is decomposed (NFD) so that each accent becomes a separate
    character, and every character of Unicode category ``Mn`` is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

def unificate_description(d):
    """Normalise a description: no accents, lower case, ASCII only.

    Missing values are returned unchanged.
    """
    if pd.isnull(d):
        return d
    without_accents = delete_accent_mark(d)
    return delete_signs(without_accents.lower())
    

In [40]:
def get_neighborhoods(properati_df, nombre_conj_barios):
    """List every distinct neighbourhood name, without accents and lower-cased.

    Parameters
    ----------
    properati_df : pandas.DataFrame
        Listings with a ``place_name`` column.
    nombre_conj_barios : str
        Name of the umbrella area (for example a whole city). Entries equal
        to it are reported as ``"sin barrio"`` rather than as a
        neighbourhood.

    Returns
    -------
    pandas.Series
        The distinct normalised names; missing place names are skipped.
    """
    place_names = properati_df['place_name'].dropna()
    # The umbrella name is replaced BEFORE normalising. Doing it afterwards
    # never matched, because the names were already lower-cased while the
    # umbrella name was passed in its original capitalisation.
    place_names = place_names.replace(nombre_conj_barios, "sin barrio")
    normalized = place_names.map(lambda b: delete_accent_mark(b).lower())
    # Deduplicate after normalising so names that differ only by accent or
    # case collapse into a single candidate.
    return normalized.drop_duplicates(keep='first')



def get_place_name(data):
    """Pick the final place name of every row of a merged frame.

    ``place_name_y`` wins when it is present and is not the ``"sin barrio"``
    marker; otherwise ``place_name_x`` is used. Names are title-cased and a
    missing ``place_name_x`` is passed through unchanged.

    Returns a list with one entry per row, in row order.
    """
    def choose(original, guessed):
        if pd.notnull(guessed) and guessed != "sin barrio":
            return guessed.title()
        if pd.notnull(original):
            return original.title()
        return original

    return [choose(row['place_name_x'], row['place_name_y'])
            for _, row in data.iterrows()]



def assign_neighborhoods(description, neighborhoods):
    """Return the first neighbourhood whose name appears in ``description``.

    Parameters
    ----------
    description : str or missing value
        Listing description, expected to be normalised the same way as the
        neighbourhood names.
    neighborhoods : iterable of str
        Candidate names, tried in iteration order.

    Returns
    -------
    str
        The first candidate found inside the description, or
        ``"sin barrio"`` when none is found or the description is missing.
    """
    # Missing descriptions arrive as None/NaN, which have no text to search.
    if pd.isnull(description):
        return "sin barrio"
    for neighborhood in neighborhoods:
        # Plain text containment: encoding to bytes and wrapping in str()
        # searches the bytes repr on Python 3, not the description itself.
        if neighborhood in description:
            return neighborhood
    return "sin barrio"




def guess_neighborhoods(df, nombre_conj_barios):
    """Replace an umbrella place name by a neighbourhood found in the description.

    Rows whose ``place_name`` contains ``nombre_conj_barios`` are searched
    for any known neighbourhood name in their ``description``. When one is
    found it becomes the row's place name; all other rows keep theirs.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings; must contain ``place_name`` and every column named in
        ``columnas_no_price``.
    nombre_conj_barios : str
        Umbrella area name to look for in ``place_name``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the ``columnas_no_price`` columns plus
        ``place_name`` (title-cased). ``df`` is not modified.
    """
    # Materialised once: the candidates are scanned again for every description.
    neighborhoods = list(get_neighborhoods(df, nombre_conj_barios))

    # .copy() so the place names can be rewritten without assigning into a
    # slice of the caller's frame.
    is_umbrella = df.place_name.str.contains(nombre_conj_barios, na=False)
    sin_barrio = df.loc[is_umbrella, :].copy()
    sin_barrio['place_name'] = sin_barrio.description.map(
        lambda description: assign_neighborhoods(description, neighborhoods))

    # Keep only the rows for which a neighbourhood was actually found.
    barrio_asignado = sin_barrio.loc[sin_barrio['place_name'] != "sin barrio", :]

    columnas_no_price = ['id', 'created_on', 'property_type', 'operation',
                         'place_with_parent_names', 'country_name', 'state_name',
                         'lat-lon', 'lat', u'lon', 'surface_total_in_m2',
                         'surface_covered_in_m2', 'floor', 'rooms', 'expenses',
                         'description']

    # Every row of barrio_asignado comes from df, so a left merge yields the
    # same rows as an outer one while keeping df's row order (an outer merge
    # may reorder rows by key). place_name is not a key, so it comes out as
    # place_name_x (original) and place_name_y (guessed).
    properati_barrios = pd.merge(df, barrio_asignado, how='left', on=columnas_no_price)

    properati_barrios['place_name'] = get_place_name(properati_barrios)
    return properati_barrios.loc[:, columnas_no_price + ['place_name']]

In [41]:
def get_df_properati_to_predict(df, file_name):
    """Prepare raw listings for prediction (full feature set).

    Descriptions are normalised, umbrella place names ("Capital Federal",
    "Buenos Aires Interior") are replaced by a neighbourhood guessed from
    the description, and the result is converted to numeric features by
    ``fill_nan_and_convert_to_float``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Not modified.
    file_name : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Numeric feature frame, in the same row order as ``df``.
    """
    # Work on a private copy so the caller's frame (often a slice of a
    # larger one) is never written to.
    df = df.copy()
    df['description'] = df['description'].map(unificate_description)

    for umbrella_name in ("Capital Federal", "Buenos Aires Interior"):
        # The result of guess_neighborhoods used to be thrown away, so the
        # guessing had no effect. The guessed names are written back by id,
        # which leaves the row order and index of df untouched.
        guessed = guess_neighborhoods(df, umbrella_name)
        name_by_id = guessed.drop_duplicates(subset='id').set_index('id')['place_name']
        df['place_name'] = df['id'].map(name_by_id).fillna(df['place_name'])

    return fill_nan_and_convert_to_float(df)

def big_get_df_properati_to_predict(df, file_name):
    """Prepare raw listings for prediction (reduced feature set).

    Descriptions are normalised, umbrella place names ("Capital Federal",
    "Buenos Aires Interior") are replaced by a neighbourhood guessed from
    the description, and the result is converted to numeric features by
    ``big_fill_nan_and_convert_to_float``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Not modified.
    file_name : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Numeric feature frame, in the same row order as ``df``.
    """
    # Work on a private copy so the caller's frame (often a slice of a
    # larger one) is never written to.
    df = df.copy()
    df['description'] = df['description'].map(unificate_description)

    for umbrella_name in ("Capital Federal", "Buenos Aires Interior"):
        # The result of guess_neighborhoods used to be thrown away, so the
        # guessing had no effect. The guessed names are written back by id,
        # which leaves the row order and index of df untouched.
        guessed = guess_neighborhoods(df, umbrella_name)
        name_by_id = guessed.drop_duplicates(subset='id').set_index('id')['place_name']
        df['place_name'] = df['id'].map(name_by_id).fillna(df['place_name'])

    return big_fill_nan_and_convert_to_float(df)


In [52]:
# Load the listings that still need a price prediction.
properati_no_price = pd.read_csv('../tp1/properati_dataset_testing_noprice.csv', encoding='UTF-8')

# "Normal" listings are all those that are NOT 5000 m2 or larger. Negating the
# "big" condition (instead of testing `< 5000`) keeps the rows whose surface is
# missing: NaN compares False against any threshold, so those rows would
# otherwise be silently left out. .copy() detaches the selection from the
# source frame before it is handed to code that assigns into it.
normal = properati_no_price.loc[~(properati_no_price.surface_total_in_m2 >= 5000), :].copy()
normal = get_df_properati_to_predict(normal, 'nada')


  


KeyError: 'price_aprox_usd'

In [None]:
# Preview only the first rows instead of dumping the whole frame.
normal.head()

In [None]:
# Listings of 5000 m2 or more are prepared with the reduced feature set.
# .copy() detaches the selection from the source frame before it is handed to
# code that assigns into it.
big = properati_no_price.loc[properati_no_price.surface_total_in_m2 >= 5000, :].copy()
big = big_get_df_properati_to_predict(big, 'nada')

In [None]:
# Preview only the first rows instead of dumping the whole frame.
big.head()

In [17]:
# Persist the prepared "big" listings, without the index column.
big.to_csv(
    'big_data_filled_ready_to_predict.csv',
    index=False,
    encoding='utf-8',
)