In [1]:
import os
import sys
stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
sys.stderr = stderr

import pandas as pd
from sklearn.utils import shuffle
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("sets_de_datos/train.csv")
lat_long = pd.read_csv("sets_de_datos/train_final1.csv").astype('float64')
df["precio"] /= 10000
df[["lat","lng"]] = lat_long[["lat","lng"]]
df_precios_por_tipo = df.groupby('tipodepropiedad').agg({'precio':'mean'}).rename(columns={'precio':'precio_por_tipo'})

In [3]:
test_kaggle = pd.read_csv('sets_de_datos/test.csv', index_col = 0)
lat_long_kaggle = pd.read_csv('sets_de_datos/test_final1.csv', index_col = 0)
test_kaggle[['lat','lng']] = lat_long_kaggle[['lat','lng']]

In [4]:
def limpiar(df):
    df.garages = df.garages.fillna(0)
    df.metroscubiertos = df.metroscubiertos.fillna(df.metroscubiertos.mean())
    df.antiguedad = df.antiguedad.fillna(df.antiguedad.mean())
    df.banos = df.banos.fillna(1)
    df.habitaciones = df.habitaciones.fillna(df.habitaciones.mean())
    df.tipodepropiedad = df.tipodepropiedad.fillna('Casa')
    df.metrostotales = df.metrostotales.fillna(0)
    df['metroscubiertos'] = df['metroscubiertos'].fillna(df['metroscubiertos'].mean())
    df.ciudad = df.ciudad.fillna("")

def nuevas_features(df, precios_tipo,precio_m2,promedios,default_m2):
    df['ratio_cubierto'] = df.apply(lambda x: x['metroscubiertos']/x['metrostotales'] if x['metrostotales'] else 1, axis = 1)
    df['tipodepropiedad'] = df['tipodepropiedad'].apply(lambda x: precios_tipo.loc[x]['precio_por_tipo'])
    df['precio_x_m2'] = df.apply(lambda x: precio_x_m2.get(x['ciudad'],default_m2), axis = 1)
    df['mean_hab'] = df.apply(lambda x: promedios['mean_hab'].get(x['ciudad'],default_m2), axis = 1)
    df['mean_ban'] = df.apply(lambda x: promedios['mean_ban'].get(x['ciudad'],default_m2), axis = 1)
    df['mean_gar'] = df.apply(lambda x: promedios['mean_gar'].get(x['ciudad'],default_m2), axis = 1)

def evaluar_rf(modelo, X_test, y_test):
    y_pred = modelo.predict(X_test)
    errors = abs(y_pred - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Performance del modelo:')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
    
df_precios_por_tipo

Unnamed: 0_level_0,precio_por_tipo
tipodepropiedad,Unnamed: 1_level_1
Apartamento,276.376866
Bodega comercial,269.37577
Casa,239.815771
Casa en condominio,289.892704
Casa uso de suelo,373.246923
Departamento Compartido,256.016189
Duplex,127.175797
Edificio,500.949937
Garage,50.0
Hospedaje,300.0


In [5]:
limpiar(df)
df['precio_x_m2'] = df['precio']/df['metroscubiertos']
precio_x_m2 = df.groupby('ciudad').agg({'precio_x_m2':'mean'}).to_dict()['precio_x_m2']
default = df.groupby('ciudad').agg({'precio_x_m2':'mean'})['precio_x_m2'].mean()
promedios = df.set_index('ciudad')\
            .join(df.groupby('ciudad')\
                  .agg({'habitaciones':'mean', 'garages':'mean', 'banos':'mean'})\
                      .rename(columns={'habitaciones':'mean_hab', 'banos':'mean_ban', 'garages':'mean_gar'}))\
                        [['mean_hab','mean_gar','mean_ban']].to_dict()
nuevas_features(df, df_precios_por_tipo, precio_x_m2,promedios,default)

In [6]:
limpiar(test_kaggle)
nuevas_features(test_kaggle, df_precios_por_tipo, precio_x_m2,promedios,default)

In [7]:
test_kaggle

Unnamed: 0_level_0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto,precio_x_m2,mean_hab,mean_ban,mean_gar
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4941,"casa en venta en miguel hidalgo, distrito federal",<p>excelente casa estilo moderno.</p>,239.815771,Bosque de Cedros,Miguel Hidalgo,Distrito Federal,29.000000,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,3.153119,2.508828,1.866091,1.303192
51775,departamentos en venta en montebello,<p>departamento una recamara:\n</p><p>departam...,276.376866,,Mérida,Yucatán,8.152599,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.962266,2.928739,2.425859,1.357163
115253,departamento nuevo delegación coyoacán de 87 m...,"departamento nuevo de 87.06 m2, 1 cajón de est...",276.376866,"Pueblo de los Reyes, Coyoacán, Mexico D.F.",Coyoacán,Distrito Federal,0.000000,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.870000,2.049670,3.025226,1.979029,1.336671
299321,departamento en venta en acapulco,<p> raíces dv001 precioso departamento tipo k...,276.376866,,Acapulco de Juárez,Guerrero,2.000000,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,1.543191,2.823817,2.198452,1.224531
173570,bonita casa sola equipada de dos niveles en lo...,"<p>casa sola, bonita de dos rec&aacute;maras u...",239.815771,CEDROS,Tultitlán,Edo. de México,10.000000,2.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.052632,0.869196,2.799797,1.340136,1.073858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75094,oportunidad!! se vende amplia casa en col. moc...,oportunidad!! ideal para oficina o casa habita...,239.815771,Oriente 172 # 265,Venustiano Carranza,Distrito Federal,20.000000,4.0,3.0,3.0,...,0.0,0.0,0.0,1.0,1.0,1.000000,1.582998,2.824782,1.497598,0.849005
171847,colinas de ecatepec,"casa, sala comedor, patio de servicio, buenas ...",239.815771,colinas,Ecatepec de Morelos,Edo. de México,10.000000,3.0,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.816092,0.872189,2.951562,1.503011,1.185734
138313,estrene hermosa casa en sierra morena,hermosa casa lista para habitarse ubicada en f...,239.815771,s/calle,Guadalupe,Nuevo León,5.000000,3.0,2.0,2.0,...,0.0,0.0,0.0,1.0,1.0,1.000000,0.877376,2.973335,1.864879,1.074531
271268,zen house i venta de linda casa con acabados ...,hermosa casa con acabados de lujo en fracciona...,239.815771,Zen House l,Querétaro,Querétaro,0.000000,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.902778,1.164389,2.974428,2.269413,1.479203


In [8]:
FEATURES = ['tipodepropiedad', 'lat', 'lng', 'garages', 'habitaciones', 'antiguedad', 'metroscubiertos', 'banos', 'ratio_cubierto', 'mean_hab','mean_ban', 'mean_gar', 'precio_x_m2']
df = shuffle(df)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
target_train = pd.DataFrame(train["precio"])
test = df[~msk]
target_test = pd.DataFrame(test["precio"])
train = train.drop(columns = ["id", "precio"])
test = test.drop(columns = ["id", "precio"])

In [17]:
params = {'max_depth': 51, 'max_features': 9, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 236}

In [38]:
rf = RandomForestRegressor(**params)

In [39]:
rf.fit(train[FEATURES], target_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=51,
                      max_features=9, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=9,
                      min_weight_fraction_leaf=0.0, n_estimators=236,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [40]:
evaluar_rf(rf, test[FEATURES], target_test.values.ravel())

Performance del modelo:
Average Error: 61.0515 degrees.
Accuracy = 70.93%.


70.93208172861048

In [41]:
rf.predict(test[FEATURES])

array([215.4373489 ,  50.00516587, 133.29037434, ..., 101.2635097 ,
       139.23027935, 495.46705466])

In [42]:
for i, x in enumerate(rf.feature_importances_):
  print(FEATURES[i], x)

tipodepropiedad 0.048989918590532235
lat 0.03480366690518197
lng 0.027669798926507727
garages 0.018526461147910477
habitaciones 0.02333193364812081
antiguedad 0.033615566415062484
metroscubiertos 0.38427965815292553
banos 0.05573156484728923
ratio_cubierto 0.0662139867283907
mean_hab 0.0196376263651204
mean_ban 0.013147929259040797
mean_gar 0.01335098740151981
precio_x_m2 0.25322902021752053
gimnasio 0.004389795507421135
piscina 0.0030820858874562713


In [23]:
test_kaggle = pd.read_csv('sets_de_datos/test.csv', index_col = 0)
lat_long_kaggle = pd.read_csv('sets_de_datos/test_final1.csv', index_col = 0)
test_kaggle[['lat','lng']] = lat_long_kaggle[['lat','lng']]

In [27]:
precio_x_m2 = df.groupby('ciudad').agg({'precio_x_m2':'mean'}).to_dict()['precio_x_m2']
default = df.groupby('ciudad').agg({'precio_x_m2':'mean'})['precio_x_m2'].mean()
limpiar(test_kaggle)
nuevas_features(test_kaggle, df_precios_por_tipo, precio_x_m2, promedios, default)

In [28]:
test_kaggle

Unnamed: 0_level_0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto,precio_x_m2,mean_hab,mean_ban,mean_gar
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4941,"casa en venta en miguel hidalgo, distrito federal",<p>excelente casa estilo moderno.</p>,239.815771,Bosque de Cedros,Miguel Hidalgo,Distrito Federal,29.000000,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,3.153119,2.508828,1.866091,1.303192
51775,departamentos en venta en montebello,<p>departamento una recamara:\n</p><p>departam...,276.376866,,Mérida,Yucatán,8.152599,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.962266,2.928739,2.425859,1.357163
115253,departamento nuevo delegación coyoacán de 87 m...,"departamento nuevo de 87.06 m2, 1 cajón de est...",276.376866,"Pueblo de los Reyes, Coyoacán, Mexico D.F.",Coyoacán,Distrito Federal,0.000000,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.870000,2.049670,3.025226,1.979029,1.336671
299321,departamento en venta en acapulco,<p> raíces dv001 precioso departamento tipo k...,276.376866,,Acapulco de Juárez,Guerrero,2.000000,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,1.543191,2.823817,2.198452,1.224531
173570,bonita casa sola equipada de dos niveles en lo...,"<p>casa sola, bonita de dos rec&aacute;maras u...",239.815771,CEDROS,Tultitlán,Edo. de México,10.000000,2.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.052632,0.869196,2.799797,1.340136,1.073858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75094,oportunidad!! se vende amplia casa en col. moc...,oportunidad!! ideal para oficina o casa habita...,239.815771,Oriente 172 # 265,Venustiano Carranza,Distrito Federal,20.000000,4.0,3.0,3.0,...,0.0,0.0,0.0,1.0,1.0,1.000000,1.582998,2.824782,1.497598,0.849005
171847,colinas de ecatepec,"casa, sala comedor, patio de servicio, buenas ...",239.815771,colinas,Ecatepec de Morelos,Edo. de México,10.000000,3.0,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.816092,0.872189,2.951562,1.503011,1.185734
138313,estrene hermosa casa en sierra morena,hermosa casa lista para habitarse ubicada en f...,239.815771,s/calle,Guadalupe,Nuevo León,5.000000,3.0,2.0,2.0,...,0.0,0.0,0.0,1.0,1.0,1.000000,0.877376,2.973335,1.864879,1.074531
271268,zen house i venta de linda casa con acabados ...,hermosa casa con acabados de lujo en fracciona...,239.815771,Zen House l,Querétaro,Querétaro,0.000000,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.902778,1.164389,2.974428,2.269413,1.479203


In [30]:
submit = rf.predict(test_kaggle[FEATURES])
submit_df = pd.DataFrame({"id": test_kaggle.index, "target": submit*10000})
submit_df

Unnamed: 0,id,target
0,4941,7.350948e+06
1,51775,9.178678e+05
2,115253,2.549329e+06
3,299321,1.156507e+06
4,173570,6.712142e+05
...,...,...
59995,75094,3.906498e+06
59996,171847,8.021699e+05
59997,138313,8.320168e+05
59998,271268,1.418399e+06


In [None]:
submit_df.to_csv("submits/submit.csv", index=False)

In [31]:
for i, x in enumerate(rf.feature_importances_):
  try:
    print(FEATURES[i],x)
  except:
    print(i - 9, x)

tipodepropiedad 0.04780524819967794
lat 0.03170084291334149
lng 0.028546492651908985
garages 0.017696308471675148
habitaciones 0.020730437219092096
antiguedad 0.03492451602002491
metroscubiertos 0.3883015819254899
banos 0.054948748899680835
ratio_cubierto 0.0669776171168624
mean_hab 0.01764177530869396
mean_ban 0.011116132011113892
mean_gar 0.011676503832979733
precio_x_m2 0.26793379542945867


In [37]:
FEATURES = ['tipodepropiedad', 'lat', 'lng', 'garages', 'habitaciones', 'antiguedad', 'metroscubiertos', 'banos', 'ratio_cubierto', 'mean_hab','mean_ban', 'mean_gar', 'precio_x_m2','gimnasio','piscina']
