In [1]:
import os
import sys
stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
sys.stderr = stderr

import pandas as pd
from sklearn.utils import shuffle
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("sets_de_datos/train.csv")
lat_long = pd.read_csv("sets_de_datos/train_final1.csv").astype('float64')
df["precio"] /= 10000
df[["lat","lng"]] = lat_long[["lat","lng"]]
df_precios_por_tipo = df.groupby('tipodepropiedad').agg({'precio':'mean'}).rename(columns={'precio':'precio_por_tipo'})

In [3]:
def limpiar(df):
  df.garages = df.garages.fillna(0)
  df.metroscubiertos = df.metroscubiertos.fillna(df.metroscubiertos.mean())
  df.antiguedad = df.antiguedad.fillna(df.antiguedad.mean())
  df.banos = df.banos.fillna(1)
  df.habitaciones = df.habitaciones.fillna(df.habitaciones.mean())
  df.tipodepropiedad = df.tipodepropiedad.fillna('Casa')
  df.metrostotales = df.metrostotales.fillna(0)

def nuevas_features(df, precios_tipo):
  df['ratio_cubierto'] = df.apply(lambda x: x['metroscubiertos']/x['metrostotales'] if x['metrostotales'] else 1, axis = 1)
  df['tipodepropiedad'] = df['tipodepropiedad'].apply(lambda x: precios_tipo.loc[x]['precio_por_tipo'])

def evaluar_rf(modelo, X_test, y_test):
    y_pred = modelo.predict(X_test)
    errors = abs(y_pred - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Performance del modelo:')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
    
df_precios_por_tipo

Unnamed: 0_level_0,precio_por_tipo
tipodepropiedad,Unnamed: 1_level_1
Apartamento,276.376866
Bodega comercial,269.37577
Casa,239.815771
Casa en condominio,289.892704
Casa uso de suelo,373.246923
Departamento Compartido,256.016189
Duplex,127.175797
Edificio,500.949937
Garage,50.0
Hospedaje,300.0


In [4]:
limpiar(df)
nuevas_features(df, df_precios_por_tipo)

In [5]:
df = shuffle(df)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
target_train = pd.DataFrame(train["precio"])
test = df[~msk]
target_test = pd.DataFrame(test["precio"])
train = train.drop(columns = ["id", "precio"])
test = test.drop(columns = ["id", "precio"])
train

Unnamed: 0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto
8923,casa en venta en fraccionamiento bosques del c...,vendo casa nueva cerca del bosque de centinela...,239.815771,Paseo del Cedro 617,Zapopan,Jalisco,1.000000,4.000000,2.0,4.0,...,311170.0,19.770019,-104.369997,2012-07-02 00:00:00,0.0,0.0,0.0,1.0,1.0,1.213636
139705,casa en venta en la col. villa mitras,funcional casa construida en una sola planta c...,239.815771,,Monterrey,Nuevo León,35.000000,3.000000,0.0,2.0,...,68559.0,25.726710,-100.358913,2016-12-26 00:00:00,0.0,0.0,0.0,0.0,0.0,0.700000
50405,naranjos iii,"fraccionamiento cerrado con vigilancia, alberc...",276.376866,MISION DE SAN FRANCISCO,Querétaro,Querétaro,3.000000,2.000000,2.0,2.0,...,84343.0,20.379982,-100.000031,2014-09-30 00:00:00,0.0,0.0,1.0,0.0,1.0,1.000000
224317,departamento en venta,"hermoso departamento seminuevo, para quien gus...",276.376866,,Puebla,Puebla,10.000000,2.000000,2.0,3.0,...,128249.0,19.092200,-98.130400,2016-11-30 00:00:00,0.0,0.0,0.0,0.0,0.0,1.149425
190555,vendo hermoso pent house en calacoaya,<p>vendo impecable pent house en calacoaya con...,276.376866,AV CALACOAYA,Atizapán de Zaragoza,Edo. de México,4.000000,2.902326,2.0,1.0,...,55776.0,19.544865,-99.240317,2016-12-30 00:00:00,0.0,0.0,0.0,1.0,1.0,1.495726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236335,casa nueva,excelentes casas de reciente terminacion lista...,239.815771,CALLE BARCELONA,Uriangato,Guanajuato,0.000000,3.000000,1.0,1.0,...,36992.0,20.670016,-101.499991,2014-09-03 00:00:00,0.0,0.0,0.0,1.0,1.0,0.601852
140788,"casa provincia, lomas de cocoyoc/ morelos, méxico","<p>pergola, bar y asador, cocina integral, var...",239.815771,,,Distrito Federal,8.116114,4.000000,0.0,3.0,...,,19.442442,-99.130988,2014-02-19 00:00:00,0.0,0.0,0.0,0.0,0.0,1.000000
69362,comodidad y lujo en exclusivo departamento en ...,<p>cada una de los departamentos es una soluci...,276.376866,MONTE CAPITOLIO,San Pedro Garza García,Nuevo León,0.000000,3.000000,2.0,2.0,...,70420.0,25.189999,-99.839989,2016-09-27 00:00:00,1.0,1.0,1.0,1.0,1.0,1.000000
74215,ch en rincon de xochimilco,"hermosa casa en condominio de 203m2, cuenta re...",289.892704,NICOLAS BRAVO NO. 2,Xochimilco,Distrito Federal,10.000000,3.000000,2.0,2.0,...,25058.0,19.277548,-99.130064,2015-04-27 00:00:00,0.0,0.0,0.0,0.0,0.0,2.030000


In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
FEATURES = ['tipodepropiedad', 'lat', 'lng', 'garages', 'habitaciones', 'antiguedad', 'metroscubiertos', 'banos', 'ratio_cubierto']
train

Unnamed: 0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,idzona,lat,lng,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto
8923,casa en venta en fraccionamiento bosques del c...,vendo casa nueva cerca del bosque de centinela...,239.815771,Paseo del Cedro 617,Zapopan,Jalisco,1.000000,4.000000,2.0,4.0,...,311170.0,19.770019,-104.369997,2012-07-02 00:00:00,0.0,0.0,0.0,1.0,1.0,1.213636
139705,casa en venta en la col. villa mitras,funcional casa construida en una sola planta c...,239.815771,,Monterrey,Nuevo León,35.000000,3.000000,0.0,2.0,...,68559.0,25.726710,-100.358913,2016-12-26 00:00:00,0.0,0.0,0.0,0.0,0.0,0.700000
50405,naranjos iii,"fraccionamiento cerrado con vigilancia, alberc...",276.376866,MISION DE SAN FRANCISCO,Querétaro,Querétaro,3.000000,2.000000,2.0,2.0,...,84343.0,20.379982,-100.000031,2014-09-30 00:00:00,0.0,0.0,1.0,0.0,1.0,1.000000
224317,departamento en venta,"hermoso departamento seminuevo, para quien gus...",276.376866,,Puebla,Puebla,10.000000,2.000000,2.0,3.0,...,128249.0,19.092200,-98.130400,2016-11-30 00:00:00,0.0,0.0,0.0,0.0,0.0,1.149425
190555,vendo hermoso pent house en calacoaya,<p>vendo impecable pent house en calacoaya con...,276.376866,AV CALACOAYA,Atizapán de Zaragoza,Edo. de México,4.000000,2.902326,2.0,1.0,...,55776.0,19.544865,-99.240317,2016-12-30 00:00:00,0.0,0.0,0.0,1.0,1.0,1.495726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236335,casa nueva,excelentes casas de reciente terminacion lista...,239.815771,CALLE BARCELONA,Uriangato,Guanajuato,0.000000,3.000000,1.0,1.0,...,36992.0,20.670016,-101.499991,2014-09-03 00:00:00,0.0,0.0,0.0,1.0,1.0,0.601852
140788,"casa provincia, lomas de cocoyoc/ morelos, méxico","<p>pergola, bar y asador, cocina integral, var...",239.815771,,,Distrito Federal,8.116114,4.000000,0.0,3.0,...,,19.442442,-99.130988,2014-02-19 00:00:00,0.0,0.0,0.0,0.0,0.0,1.000000
69362,comodidad y lujo en exclusivo departamento en ...,<p>cada una de los departamentos es una soluci...,276.376866,MONTE CAPITOLIO,San Pedro Garza García,Nuevo León,0.000000,3.000000,2.0,2.0,...,70420.0,25.189999,-99.839989,2016-09-27 00:00:00,1.0,1.0,1.0,1.0,1.0,1.000000
74215,ch en rincon de xochimilco,"hermosa casa en condominio de 203m2, cuenta re...",289.892704,NICOLAS BRAVO NO. 2,Xochimilco,Distrito Federal,10.000000,3.000000,2.0,2.0,...,25058.0,19.277548,-99.130064,2015-04-27 00:00:00,0.0,0.0,0.0,0.0,0.0,2.030000


In [8]:
rf = RandomForestRegressor(n_estimators = 200)

In [9]:
rf.fit(train[FEATURES], target_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [10]:
evaluar_rf(rf, test[FEATURES], target_test.values.ravel())

Performance del modelo:
Average Error: 66.0679 degrees.
Accuracy = 67.62%.


67.6200838920814

In [11]:
rf.predict(test[FEATURES])

array([ 63.99018095, 152.253055  , 302.8925    , ..., 358.00411667,
       135.9298775 ,  52.76500714])

In [12]:
for i, x in enumerate(rf.feature_importances_):
  print(FEATURES[i], x)

tipodepropiedad 0.12299963109309581
lat 0.13246925288620393
lng 0.08339227525039358
garages 0.02102104109729058
habitaciones 0.03701147711778876
antiguedad 0.04948468557065044
metroscubiertos 0.40837552217870743
banos 0.04806697317041393
ratio_cubierto 0.0971791416354556


In [43]:
test_kaggle = pd.read_csv('sets_de_datos/test.csv', index_col = 0)
lat_long_kaggle =  pd.read_csv('sets_de_datos/test_final1.csv', index_col = 0)

In [49]:
test_kaggle[['lat','lng']] = lat_long_kaggle[['lat','lng']]
test_kaggle

Unnamed: 0,ciudad,id,titulo,descripcion,tipodepropiedad,direccion,provincia,antiguedad,habitaciones,garages,...,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,mean_hab,mean_gar,mean_ban,ratio_cubierto
0,,271231,venta de casas,"casa en venta circuito luna, san luis potosí, ...",239.815771,,San luis Potosí,1.000000,3.000000,2.0,...,2016-11-08 00:00:00,0.0,0.0,0.0,0.0,0.0,3.040048,1.163978,2.048387,1.142857
1,,283212,casa en venta en chihuahua,"casa en venta en cd. juárez, ubicada en el sec...",239.815771,Portal de Agripa,Chihuahua,8.152599,2.000000,0.0,...,2014-04-09 00:00:00,0.0,0.0,0.0,0.0,0.0,3.040048,1.163978,2.048387,0.264423
2,,283208,casa en venta en chihuahua,"casa en venta en cd. juárez, quinta montecarlo...",239.815771,Portiero,Chihuahua,8.152599,3.000000,2.0,...,2014-05-23 00:00:00,0.0,0.0,0.0,0.0,0.0,3.040048,1.163978,2.048387,0.614679
3,,224157,casas en venta el laurel privada montañez,,239.815771,,,8.152599,2.907287,0.0,...,2014-12-16 00:00:00,0.0,0.0,0.0,0.0,0.0,3.040048,1.163978,2.048387,1.342593
4,,51709,casa de 2 habitaciones y 1 baño en venta,<p>aprovecha y ten tu propia casa a 20 minutos...,239.815771,"HACIENDA EL ROSARIO, COL. LOS HORES 6",,4.000000,2.000000,1.0,...,2014-01-23 00:00:00,0.0,0.0,0.0,1.0,0.0,3.040048,1.163978,2.048387,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,otra,196172,casa como terreno,terreno pueblo san pedro pozohuacan \navenida ...,239.815771,PUEBLO SAN PEDRO POZOHUACAN,Distrito Federal,20.000000,7.000000,0.0,...,2016-08-14 00:00:00,0.0,0.0,0.0,0.0,0.0,2.668499,1.136364,1.909091,1.000000
59996,otra,53375,"departamento en villa del lago, venta","amplio departamento, piso 8, cuidadisimo, ilum...",276.376866,Jesus del Monte #47,Distrito Federal,5.000000,3.000000,2.0,...,2016-12-09 00:00:00,1.0,0.0,1.0,1.0,1.0,2.668499,1.136364,1.909091,1.000000
59997,otra,87381,venta de departamentos en la ciudad de mexico,venta de departamentos de m2. las mejores zon...,276.376866,Agricultura,Distrito Federal,0.000000,2.000000,1.0,...,2014-12-02 00:00:00,0.0,0.0,0.0,1.0,1.0,2.668499,1.136364,1.909091,0.500000
59998,ácatlán de Juárez,90144,pequeño rancho,preciosa casa de descanso a 15 minutos de las ...,403.268151,MIRAVALLE,Jalisco,5.000000,3.000000,0.0,...,2016-07-05 00:00:00,0.0,0.0,0.0,0.0,0.0,2.902326,0.000000,1.000000,1.000000


In [46]:
limpiar(test_kaggle)
nuevas_features(test_kaggle, df_precios_por_tipo)

In [17]:
submit = rf.predict(test_kaggle[FEATURES])

In [18]:
submit_df = pd.DataFrame({"id": test_kaggle.reset_index()['id'], "target": submit*10000})
submit_df

Unnamed: 0,id,target
0,4941,6.235713e+06
1,51775,9.766950e+05
2,115253,2.613740e+06
3,299321,1.097459e+06
4,173570,7.642082e+05
...,...,...
59995,75094,4.429171e+06
59996,171847,8.050000e+05
59997,138313,8.449228e+05
59998,271268,1.386168e+06


In [19]:
submit_df.to_csv("submit.csv", index=False)

In [20]:
from sklearn.feature_extraction import FeatureHasher

In [21]:
h = FeatureHasher(16, input_type='string')
train.descripcion = train.descripcion.fillna("e")

In [22]:
descripciones = pd.DataFrame(train["descripcion"])
descripciones

Unnamed: 0,descripcion
8923,vendo casa nueva cerca del bosque de centinela...
139705,funcional casa construida en una sola planta c...
50405,"fraccionamiento cerrado con vigilancia, alberc..."
224317,"hermoso departamento seminuevo, para quien gus..."
190555,<p>vendo impecable pent house en calacoaya con...
...,...
236335,excelentes casas de reciente terminacion lista...
140788,"<p>pergola, bar y asador, cocina integral, var..."
69362,<p>cada una de los departamentos es una soluci...
74215,"hermosa casa en condominio de 203m2, cuenta re..."


In [23]:
features_descripciones = descripciones.apply(lambda x: pd.Series(h.transform([x['descripcion']]).toarray()[0], index=[i for i in range(16)]), axis = 1)
features_descripciones

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
8923,12.0,-4.0,56.0,39.0,-28.0,-25.0,-3.0,34.0,72.0,-5.0,-3.0,-5.0,22.0,-10.0,0.0,-22.0
139705,14.0,-7.0,57.0,35.0,-32.0,-14.0,-5.0,33.0,75.0,-1.0,-3.0,1.0,22.0,-12.0,-1.0,-18.0
50405,5.0,-5.0,32.0,1.0,-9.0,-13.0,-5.0,16.0,30.0,0.0,-1.0,-3.0,12.0,-8.0,0.0,-8.0
224317,10.0,2.0,29.0,23.0,-16.0,-22.0,-5.0,29.0,46.0,-1.0,-3.0,-2.0,20.0,-10.0,-1.0,-14.0
190555,16.0,-3.0,64.0,39.0,-29.0,-22.0,-11.0,68.0,102.0,-1.0,-6.0,-3.0,44.0,-13.0,0.0,-37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236335,1.0,1.0,13.0,9.0,-8.0,-8.0,0.0,12.0,19.0,0.0,0.0,1.0,7.0,-4.0,0.0,-14.0
140788,0.0,6.0,14.0,-1.0,-1.0,-5.0,-2.0,5.0,12.0,-1.0,-5.0,-2.0,3.0,-1.0,0.0,-3.0
69362,28.0,7.0,198.0,137.0,-9.0,-170.0,-120.0,186.0,264.0,-7.0,-16.0,-19.0,153.0,48.0,-5.0,-63.0
74215,4.0,-5.0,26.0,17.0,-10.0,-11.0,1.0,17.0,33.0,-3.0,0.0,-3.0,10.0,-4.0,0.0,-7.0


In [24]:
rf.fit(train[FEATURES].join(features_descripciones), target_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [25]:
test_descripciones =  pd.DataFrame(test["descripcion"].fillna("e")).apply(lambda x: pd.Series(h.transform([x['descripcion']]).toarray()[0], index=[i for i in range(16)]), axis = 1)

In [26]:
test[FEATURES].join(test_descripciones)

Unnamed: 0,tipodepropiedad,lat,lng,garages,habitaciones,antiguedad,metroscubiertos,banos,ratio_cubierto,0,...,6,7,8,9,10,11,12,13,14,15
68871,276.376866,20.000000,-103.000000,1.0,2.0,10.0,61.0,1.0,1.000000,1.0,...,0.0,16.0,19.0,0.0,0.0,0.0,8.0,-2.0,-2.0,-9.0
126664,239.815771,18.952212,-99.271280,2.0,3.0,10.0,120.0,2.0,0.857143,11.0,...,-10.0,69.0,117.0,0.0,-6.0,3.0,41.0,-23.0,-3.0,-36.0
48487,180.520180,19.280380,-99.117022,0.0,3.0,0.0,45.0,1.0,0.187500,2.0,...,-2.0,7.0,8.0,0.0,0.0,-1.0,1.0,0.0,0.0,-2.0
116922,239.815771,19.517013,-99.246016,1.0,3.0,20.0,250.0,2.0,1.351351,29.0,...,-5.0,116.0,171.0,-4.0,-2.0,-4.0,66.0,-24.0,-3.0,-42.0
95187,239.815771,20.698976,-103.366357,0.0,5.0,20.0,400.0,1.0,1.593625,10.0,...,1.0,37.0,64.0,0.0,-4.0,-4.0,14.0,-11.0,-1.0,-18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204940,276.376866,19.442442,-99.130988,0.0,2.0,10.0,54.0,1.0,1.000000,5.0,...,-3.0,38.0,51.0,-2.0,-3.0,0.0,26.0,-16.0,0.0,-18.0
62509,239.815771,20.379982,-100.000031,0.0,3.0,0.0,256.0,3.0,1.024000,8.0,...,-15.0,28.0,64.0,0.0,-1.0,-6.0,27.0,5.0,-1.0,-18.0
54904,239.815771,20.630214,-100.364227,2.0,3.0,0.0,313.0,3.0,1.106007,12.0,...,-2.0,43.0,73.0,-2.0,-3.0,-4.0,28.0,-9.0,-3.0,-23.0
36792,276.376866,19.477841,-99.182240,1.0,2.0,5.0,71.0,2.0,1.000000,7.0,...,-3.0,27.0,47.0,0.0,-2.0,0.0,25.0,-7.0,0.0,-14.0


In [27]:
evaluar_rf(rf, test[FEATURES].join(test_descripciones), target_test.values.ravel())

Performance del modelo:
Average Error: 66.3398 degrees.
Accuracy = 67.14%.


67.1419432477932

In [28]:
for i, x in enumerate(rf.feature_importances_):
  try:
    print(FEATURES[i],x)
  except:
    print(i - 9, x)

tipodepropiedad 0.11771998330024668
lat 0.10447355423215568
lng 0.05473168900255341
garages 0.009909540664394836
habitaciones 0.02498851648521669
antiguedad 0.023125595546523513
metroscubiertos 0.37119244583070043
banos 0.03830398467428835
ratio_cubierto 0.06808064191876717
0 0.010325267007978216
1 0.01682314375232715
2 0.011355451626459743
3 0.015046664922743187
4 0.011702117257143839
5 0.012177376386634027
6 0.017708054183053434
7 0.010770440200274147
8 0.00935155453404821
9 0.0073302317842448865
10 0.008576705407007586
11 0.013631437840654544
12 0.010197357230949682
13 0.012560020589837738
14 0.008065922523689236
15 0.0118523030981076


# Prueba v2 con mas features

In [29]:
df.ciudad = df.ciudad.fillna("")

In [30]:
df = df.set_index('ciudad').join(df.groupby('ciudad').agg({'habitaciones':'mean', 'garages':'mean', 'banos':'mean'}).rename(columns={'habitaciones':'mean_hab', 'banos':'mean_ban', 'garages':'mean_gar'})).reset_index()

In [33]:
df = shuffle(df)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
target_train = pd.DataFrame(train["precio"])
test = df[~msk]
target_test = pd.DataFrame(test["precio"])
train = train.drop(columns = ["id", "precio"])
test = test.drop(columns = ["id", "precio"])

In [34]:
FEATURES = ['tipodepropiedad', 'lat', 'lng', 'garages', 'habitaciones', 'antiguedad', 'metroscubiertos', 'banos', 'ratio_cubierto', 'mean_hab','mean_ban', 'mean_gar']
train

Unnamed: 0,ciudad,titulo,descripcion,tipodepropiedad,direccion,provincia,antiguedad,habitaciones,garages,banos,...,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto,mean_hab,mean_gar,mean_ban
82811,Guadalajara,"departamento nuevo en santa tere, una habitaci...","departamentos lofts. santa teresita, gdl.- nue...",276.376866,ARISTA,Jalisco,0.000000,1.000000,0.0,1.0,...,2016-12-05 00:00:00,0.0,1.0,0.0,1.0,1.0,1.000000,3.274833,1.181727,2.005492
859,Acapulco de Juárez,"costa azul, acapulco","estrene, vista a la bahia, gran area de alberc...",276.376866,Balcones de Costa Azul,Guerrero,0.000000,3.000000,1.0,2.0,...,2015-06-11 00:00:00,0.0,0.0,1.0,0.0,0.0,1.000000,2.823817,1.224531,2.198452
187877,San Pedro Garza García,departamento en venta en valle,<p>departamento de lujo en planta baja en exce...,276.376866,,Nuevo León,4.000000,3.000000,3.0,4.0,...,2014-12-22 00:00:00,0.0,0.0,0.0,0.0,0.0,1.000000,2.771596,1.417687,2.380272
174929,San Andrés Cholula,departamento en zona angelopolis,3 recamaras \n2 baños y medio \n2 cocheras \ns...,276.376866,ZONA ANGELOPOLIS,Puebla,5.000000,3.000000,2.0,2.0,...,2016-03-29 00:00:00,1.0,1.0,1.0,1.0,1.0,0.841837,3.024133,1.564783,2.391590
112186,León,valverde y tellez 00 - industrial - león,"el terreno cuenta con 408 m2 de superficie, 12...",247.202459,Valverde Y Tellez,Guanajuato,5.000000,2.902326,0.0,1.0,...,2016-11-18 00:00:00,0.0,0.0,0.0,1.0,0.0,0.426512,2.943341,1.395620,2.132847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203593,Tlalnepantla de Baz,casa en los reyes ixtacala 1ra sección.,"<p>casa que consta de 2 plantas. pb: estancia,...",239.815771,,Edo. de México,8.116114,2.000000,0.0,1.0,...,2015-10-10 00:00:00,0.0,0.0,0.0,0.0,0.0,0.821138,3.081609,1.361483,1.884615
224983,Villahermosa,casa en venta en fraccionamiento pomoca,casa en privada dentro del fraccionamiento pom...,289.892704,POMOCA,Tabasco,3.000000,2.000000,2.0,2.0,...,2014-06-28 00:00:00,0.0,0.0,0.0,1.0,1.0,1.000000,2.948219,1.203125,2.192708
194741,Tepic,casa en venta en los sauces,"se vende casa en tepic, col. los sauces. excel...",239.815771,ALMENDRO,Nayarit,8.116114,3.000000,0.0,2.0,...,2015-09-04 00:00:00,0.0,0.0,0.0,0.0,0.0,0.780000,2.846602,0.411290,1.717742
126251,Monterrey,departamento en venta san jemo monterrey,bonito departamento en san jemo cerca de centr...,276.376866,Camino de los Ibis,Nuevo León,10.000000,3.000000,2.0,2.0,...,2016-12-04 00:00:00,0.0,0.0,0.0,1.0,1.0,1.000000,3.043701,1.234955,2.324791


In [35]:
rf = RandomForestRegressor(n_estimators = 200)

In [36]:
rf.fit(train[FEATURES], target_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [37]:
evaluar_rf(rf, test[FEATURES], target_test.values.ravel())

Performance del modelo:
Average Error: 62.0409 degrees.
Accuracy = 69.82%.


69.82312091947293

In [38]:
for i, x in enumerate(rf.feature_importances_):
  print(FEATURES[i], x)

tipodepropiedad 0.10313085670988624
lat 0.07109425072891003
lng 0.045360809195484915
garages 0.01815285415701325
habitaciones 0.03326196751806296
antiguedad 0.04221614702881128
metroscubiertos 0.39985327557825456
banos 0.04337628163006553
ratio_cubierto 0.08017060910405657
mean_hab 0.0860191613789217
mean_ban 0.04055617547765862
mean_gar 0.03680761149287429


In [44]:
test_kaggle.ciudad = test_kaggle.ciudad.fillna("")
test_kaggle=test_kaggle.reset_index()
test_kaggle = test_kaggle.set_index('ciudad').join(df.groupby('ciudad').agg({'habitaciones':'mean', 'garages':'mean', 'banos':'mean'}).rename(columns={'habitaciones':'mean_hab', 'banos':'mean_ban', 'garages':'mean_gar'})).reset_index()

In [45]:
test_kaggle['mean_ban'].fillna(df.banos.mean(), inplace=True)
test_kaggle['mean_gar'].fillna(df.garages.mean(), inplace=True)
test_kaggle['mean_hab'].fillna(df.habitaciones.mean(), inplace=True)

In [50]:
submit = rf.predict(test_kaggle[FEATURES])
submit_df = pd.DataFrame({"id": test_kaggle.reset_index()['id'], "target": submit*10000})
submit_df

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').