In [1]:
import os
import sys
stderr = sys.stderr
sys.stderr = open(os.devnull, 'w')
sys.stderr = stderr

import pandas as pd
from sklearn.utils import shuffle
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [67]:
df = pd.read_csv("sets_de_datos/train.csv")
lat_long = pd.read_csv("sets_de_datos/train_final1.csv").astype('float64')
df["precio"] /= 10000
df[["lat","lng"]] = lat_long[["lat","lng"]]
df_precios_por_tipo = df.groupby('tipodepropiedad').agg({'precio':'mean'}).rename(columns={'precio':'precio_por_tipo'})

In [68]:
test_kaggle = pd.read_csv('sets_de_datos/test.csv', index_col = 0)
lat_long_kaggle = pd.read_csv('sets_de_datos/test_final1.csv', index_col = 0)
test_kaggle[['lat','lng']] = lat_long_kaggle[['lat','lng']]

In [69]:
def limpiar(df):
    df.garages = df.garages.fillna(0)
    df.metroscubiertos = df.metroscubiertos.fillna(df.metroscubiertos.mean())
    df.antiguedad = df.antiguedad.fillna(df.antiguedad.mean())
    df.banos = df.banos.fillna(1)
    df.habitaciones = df.habitaciones.fillna(df.habitaciones.mean())
    df.tipodepropiedad = df.tipodepropiedad.fillna('Casa')
    df.metrostotales = df.metrostotales.fillna(0)
    df['metroscubiertos'] = df['metroscubiertos'].fillna(df['metroscubiertos'].mean())
    df.ciudad = df.ciudad.fillna("")

def nuevas_features(df, precios_tipo,precio_m2,promedios,default_m2):
    df['ratio_cubierto'] = df.apply(lambda x: x['metroscubiertos']/x['metrostotales'] if x['metrostotales'] else 1, axis = 1)
    df['tipodepropiedad'] = df['tipodepropiedad'].apply(lambda x: precios_tipo.loc[x]['precio_por_tipo'])
    df['precio_x_m2'] = df.apply(lambda x: precio_x_m2.get(x['ciudad'],default_m2), axis = 1)
    df['mean_hab'] = df.apply(lambda x: promedios['mean_hab'].get(x['ciudad'],default_m2), axis = 1)
    df['mean_ban'] = df.apply(lambda x: promedios['mean_ban'].get(x['ciudad'],default_m2), axis = 1)
    df['mean_gar'] = df.apply(lambda x: promedios['mean_gar'].get(x['ciudad'],default_m2), axis = 1)

def evaluar_rf(modelo, X_test, y_test):
    y_pred = modelo.predict(X_test)
    errors = abs(y_pred - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Performance del modelo:')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
    
df_precios_por_tipo

Unnamed: 0_level_0,precio_por_tipo
tipodepropiedad,Unnamed: 1_level_1
Apartamento,276.376866
Bodega comercial,269.37577
Casa,239.815771
Casa en condominio,289.892704
Casa uso de suelo,373.246923
Departamento Compartido,256.016189
Duplex,127.175797
Edificio,500.949937
Garage,50.0
Hospedaje,300.0


In [70]:
limpiar(df)
df['precio_x_m2'] = df['precio']/df['metroscubiertos']
precio_x_m2 = df.groupby('ciudad').agg({'precio_x_m2':'mean'}).to_dict()['precio_x_m2']
default = df.groupby('ciudad').agg({'precio_x_m2':'mean'})['precio_x_m2'].mean()
promedios = df.set_index('ciudad')\
            .join(df.groupby('ciudad')\
                  .agg({'habitaciones':'mean', 'garages':'mean', 'banos':'mean'})\
                      .rename(columns={'habitaciones':'mean_hab', 'banos':'mean_ban', 'garages':'mean_gar'}))\
                        [['mean_hab','mean_gar','mean_ban']].to_dict()
nuevas_features(df, df_precios_por_tipo, precio_x_m2,promedios,default)

In [71]:
limpiar(test_kaggle)
nuevas_features(test_kaggle, df_precios_por_tipo, precio_x_m2,promedios,default)

In [72]:
test_kaggle

Unnamed: 0_level_0,titulo,descripcion,tipodepropiedad,direccion,ciudad,provincia,antiguedad,habitaciones,garages,banos,...,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto,precio_x_m2,mean_hab,mean_ban,mean_gar
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4941,"casa en venta en miguel hidalgo, distrito federal",<p>excelente casa estilo moderno.</p>,239.815771,Bosque de Cedros,Miguel Hidalgo,Distrito Federal,29.000000,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,3.153119,2.508828,1.866091,1.303192
51775,departamentos en venta en montebello,<p>departamento una recamara:\n</p><p>departam...,276.376866,,Mérida,Yucatán,8.152599,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.962266,2.928739,2.425859,1.357163
115253,departamento nuevo delegación coyoacán de 87 m...,"departamento nuevo de 87.06 m2, 1 cajón de est...",276.376866,"Pueblo de los Reyes, Coyoacán, Mexico D.F.",Coyoacán,Distrito Federal,0.000000,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.870000,2.049670,3.025226,1.979029,1.336671
299321,departamento en venta en acapulco,<p> raíces dv001 precioso departamento tipo k...,276.376866,,Acapulco de Juárez,Guerrero,2.000000,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,1.543191,2.823817,2.198452,1.224531
173570,bonita casa sola equipada de dos niveles en lo...,"<p>casa sola, bonita de dos rec&aacute;maras u...",239.815771,CEDROS,Tultitlán,Edo. de México,10.000000,2.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.052632,0.869196,2.799797,1.340136,1.073858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75094,oportunidad!! se vende amplia casa en col. moc...,oportunidad!! ideal para oficina o casa habita...,239.815771,Oriente 172 # 265,Venustiano Carranza,Distrito Federal,20.000000,4.0,3.0,3.0,...,0.0,0.0,0.0,1.0,1.0,1.000000,1.582998,2.824782,1.497598,0.849005
171847,colinas de ecatepec,"casa, sala comedor, patio de servicio, buenas ...",239.815771,colinas,Ecatepec de Morelos,Edo. de México,10.000000,3.0,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.816092,0.872189,2.951562,1.503011,1.185734
138313,estrene hermosa casa en sierra morena,hermosa casa lista para habitarse ubicada en f...,239.815771,s/calle,Guadalupe,Nuevo León,5.000000,3.0,2.0,2.0,...,0.0,0.0,0.0,1.0,1.0,1.000000,0.877376,2.973335,1.864879,1.074531
271268,zen house i venta de linda casa con acabados ...,hermosa casa con acabados de lujo en fracciona...,239.815771,Zen House l,Querétaro,Querétaro,0.000000,2.0,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.902778,1.164389,2.974428,2.269413,1.479203


In [5]:
FEATURES = ['tipodepropiedad', 'lat', 'lng', 'garages', 'habitaciones', 'antiguedad', 'metroscubiertos', 'banos', 'ratio_cubierto', 'mean_hab','mean_ban', 'mean_gar', 'precio_x_m2']
df = shuffle(df)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
target_train = pd.DataFrame(train["precio"])
test = df[~msk]
target_test = pd.DataFrame(test["precio"])
train = train.drop(columns = ["id", "precio"])
test = test.drop(columns = ["id", "precio"])

In [6]:
train

Unnamed: 0,ciudad,id,titulo,descripcion,tipodepropiedad,direccion,provincia,antiguedad,habitaciones,garages,...,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,ratio_cubierto,precio_x_m2,mean_hab,mean_gar,mean_ban
146034,Naucalpan de Juárez,250322,"excelente casa en privada en calle conchita, f...",valor comercial: \nprecio de venta: \n\n**un...,239.815771,conchita,Edo. de México,20.000000,4.000000,3.0,...,0.0,0.0,1.0,1.0,410.0000,0.757576,1.831155,3.198494,1.591700,2.355813
50762,Coyoacán,72350,casa en venta en coyoacán,"col. privada, enorme parque enfrente, escuelas...",239.815771,Zinmaba 24,Distrito Federal,15.000000,3.000000,3.0,...,0.0,0.0,1.0,1.0,330.0000,1.437500,2.049670,3.025226,1.336671,1.979029
105796,Iztapalapa,144848,"casa en remate, calle vicente guerrero col. pu...",casa en remate en venta \navaluo comercial $ ...,239.815771,Vicente Guerrero,Distrito Federal,20.000000,2.902326,0.0,...,0.0,0.0,0.0,0.0,104.6666,1.000000,1.317212,2.886065,0.993534,1.531523
171733,San Agustín Tlaxiaca,149526,claveles no. 15,casa en venta con todos los servicios en priva...,239.815771,,Hidalgo,0.000000,3.000000,2.0,...,0.0,0.0,0.0,0.0,115.0000,0.662983,0.997499,2.832186,1.440000,1.800000
203273,Tlalnepantla de Baz,192093,"oferta casa en venta valor de remate, col. val...","ofertacasa, en venta. valor de remate. recuper...",239.815771,ROBLE,Edo. de México,10.000000,3.000000,1.0,...,0.0,0.0,1.0,1.0,155.0000,0.714286,1.448095,3.081609,1.361483,1.884615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197212,Tijuana,80353,oportunidad casa seminueva en urbi quinta mars...,"<p>bonita casa en urbi quinta marsella, el fra...",239.815771,"PRIVADA INDRE 21, FRACCIONAMIENTO URBI QUINTA ...",Baja California Norte,5.000000,3.000000,3.0,...,0.0,0.0,1.0,1.0,78.8000,0.512953,0.780531,2.568428,1.484108,1.327279
6308,Alvaro Obregón,78362,pre-venta departamento 80m2 en san anguel,<p>descripción de departamento\n</p><p>2 recam...,276.376866,san anguel,Distrito Federal,0.000000,2.902326,1.0,...,0.0,1.0,0.0,0.0,490.0500,1.000000,2.544162,2.769321,1.487562,2.186492
232353,Zapopan,110939,terreno en venta en santa maria del pueblito,terreno sobre av. indepencencia zona comercial...,180.520180,INDEPENDENCIA,Jalisco,8.116114,2.902326,0.0,...,0.0,0.0,0.0,0.0,380.0000,0.435042,1.481265,3.002699,1.439479,2.195270
162692,Querétaro,199826,casa sola en excelente zona de queretaro,magnifica oportunidad casa sola en excelente z...,239.815771,Sendero de la Girola 127,Querétaro,5.000000,3.000000,2.0,...,0.0,0.0,1.0,1.0,180.0000,1.000000,1.164389,2.974428,1.479203,2.269413


In [7]:
rf = RandomForestRegressor(n_estimators = 200)

In [8]:
rf.fit(train[FEATURES], target_train.values.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [9]:
evaluar_rf(rf, test[FEATURES], target_test.values.ravel())

Performance del modelo:
Average Error: 61.9943 degrees.
Accuracy = 70.56%.


70.55743913695896

In [10]:
rf.predict(test[FEATURES])

array([205.578118  ,  46.2581175 , 414.93049967, ..., 192.879     ,
       484.906618  , 456.856915  ])

In [11]:
for i, x in enumerate(rf.feature_importances_):
  print(FEATURES[i], x)

tipodepropiedad 0.04951259176336902
lat 0.03615306515574909
lng 0.0361958720208176
garages 0.017502507162726732
habitaciones 0.022499441689431535
antiguedad 0.044387296179961726
metroscubiertos 0.3989341020185248
banos 0.020856622338352285
ratio_cubierto 0.07142826514690181
mean_hab 0.014616454796798642
mean_ban 0.010865306867816511
mean_gar 0.01147810866888661
precio_x_m2 0.2655703661906636


In [38]:
test_kaggle = pd.read_csv('sets_de_datos/test.csv', index_col = 0)
lat_long_kaggle = pd.read_csv('sets_de_datos/test_final1.csv', index_col = 0)
test_kaggle[['lat','lng']] = lat_long_kaggle[['lat','lng']]

In [40]:
precio_x_m2 = df.groupby('ciudad').agg({'precio_x_m2':'mean'}).to_dict()['precio_x_m2']
default = df.groupby('ciudad').agg({'precio_x_m2':'mean'})['precio_x_m2'].mean()
limpiar(test_kaggle)
test_kaggle = nuevas_features(test_kaggle, df_precios_por_tipo, precio_x_m2, default)

In [41]:
test_kaggle

Unnamed: 0,ciudad,titulo,descripcion,tipodepropiedad,direccion,provincia,antiguedad,habitaciones,garages,banos,...,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,ratio_cubierto,precio_x_m2,mean_hab,mean_gar,mean_ban
0,,venta de casas,"casa en venta circuito luna, san luis potosí, ...",239.815771,,San luis Potosí,1.000000,3.000000,2.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.142857,1.428557,2.879279,1.156627,1.86747
1,,casa en venta en chihuahua,"casa en venta en cd. juárez, ubicada en el sec...",239.815771,Portal de Agripa,Chihuahua,8.152599,2.000000,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.264423,1.428557,2.879279,1.156627,1.86747
2,,casa en venta en chihuahua,"casa en venta en cd. juárez, quinta montecarlo...",239.815771,Portiero,Chihuahua,8.152599,3.000000,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.614679,1.428557,2.879279,1.156627,1.86747
3,,casas en venta el laurel privada montañez,,239.815771,,,8.152599,2.907287,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.342593,1.428557,2.879279,1.156627,1.86747
4,,casa de 2 habitaciones y 1 baño en venta,<p>aprovecha y ten tu propia casa a 20 minutos...,239.815771,"HACIENDA EL ROSARIO, COL. LOS HORES 6",,4.000000,2.000000,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.000000,1.428557,2.879279,1.156627,1.86747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,otra,casa como terreno,terreno pueblo san pedro pozohuacan \navenida ...,239.815771,PUEBLO SAN PEDRO POZOHUACAN,Distrito Federal,20.000000,7.000000,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,2.226815,3.200000,1.000000,2.20000
59996,otra,"departamento en villa del lago, venta","amplio departamento, piso 8, cuidadisimo, ilum...",276.376866,Jesus del Monte #47,Distrito Federal,5.000000,3.000000,2.0,3.0,...,1.0,0.0,1.0,1.0,1.0,1.000000,2.226815,3.200000,1.000000,2.20000
59997,otra,venta de departamentos en la ciudad de mexico,venta de departamentos de m2. las mejores zon...,276.376866,Agricultura,Distrito Federal,0.000000,2.000000,1.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.500000,2.226815,3.200000,1.000000,2.20000
59998,ácatlán de Juárez,pequeño rancho,preciosa casa de descanso a 15 minutos de las ...,403.268151,MIRAVALLE,Jalisco,5.000000,3.000000,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,6.120904,2.500000,0.500000,2.00000


In [74]:
submit = rf.predict(test_kaggle[FEATURES])
submit_df = pd.DataFrame({"id": ids, "target": submit*10000})
submit_df

Unnamed: 0,id,target
0,4941,7.393706e+06
1,51775,8.207300e+05
2,115253,2.561266e+06
3,299321,1.190002e+06
4,173570,5.668484e+05
...,...,...
59995,75094,3.835388e+06
59996,171847,7.614844e+05
59997,138313,8.509299e+05
59998,271268,1.302182e+06


In [75]:
submit_df.to_csv("submit.csv", index=False)

In [76]:
for i, x in enumerate(rf.feature_importances_):
  try:
    print(FEATURES[i],x)
  except:
    print(i - 9, x)

tipodepropiedad 0.04951259176336902
lat 0.03615306515574909
lng 0.0361958720208176
garages 0.017502507162726732
habitaciones 0.022499441689431535
antiguedad 0.044387296179961726
metroscubiertos 0.3989341020185248
banos 0.020856622338352285
ratio_cubierto 0.07142826514690181
mean_hab 0.014616454796798642
mean_ban 0.010865306867816511
mean_gar 0.01147810866888661
precio_x_m2 0.2655703661906636
