In [6]:
# Mount Google Drive at /content/drive so files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
# Add the Drive "Colab Notebooks" folder to the import path so modules kept there can be imported.
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks')

In [0]:
# Standard library
import time
# Third-party
import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# Project modules (importable thanks to the sys.path entry added above)
import common_machine_learning as common
import feature_categorica as feature_categorica
import feature_num as feature_num
import feature_string as feature_string
import my_pipeline as my_pipe

#### Ignore FutureWarning messages

In [0]:
# Suppress FutureWarning messages from here on to keep the cell outputs readable.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [0]:
# Notebook-wide configuration.
TARGET = 'precio'  # name of the column to predict (split off into `y` below)
K = 3  # number of cross-validation folds (passed as cv=K)
RANDOM_SEMILLA = 3 # favourite (lucky) number; presumably intended as the random seed
XGB_RE_OBJECTIVE = 'reg:squarederror'  # XGBoost objective name; no cell visible here uses it
N_ITER = 50  # number of hyperopt evaluations (passed as max_evals)

## Train 

In [0]:
# Load the training set from Drive through the project loader
# (name suggests an optimised load, e.g. compact dtypes — confirm in common_machine_learning).
train = common.cargar_set_optimizado('/content/drive/My Drive/Colab Notebooks/sets_de_datos/train.csv')

## Features

In [0]:
# Add the numeric date feature. The return value is discarded, so the helper presumably
# adds the column to `train` in place — TODO confirm in feature_num.
feature_num.agregar_feature_fecha_numerica(train)

Esto toma unos segundos

In [0]:
# Complete lat/lng with three project helpers applied in order (going by their names:
# province/city data, then the idzona mean, then a Mexico-wide average — confirm in feature_num).
# The first two return a frame that is reassigned to `train`; the third's return value is
# discarded, so it presumably mutates `train` in place — TODO confirm.
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [0]:
# Add the `tipodepropiedad_precio_mean` feature (by its name, the mean price per property type).
# NOTE(review): if it is computed from the target on the full training set it may leak
# information into the cross-validation folds — confirm in feature_categorica.
train = feature_categorica.agregar_tipodepropiedad_precio_mean(train)

In [0]:
# Add the reduced one-hot encoding of city/province (the `ciudad_*` / `provincia_*` columns listed below).
train = feature_categorica.train_agregar_feature_provincias_ciudades_ohe_reducido_df(train)

In [0]:
# Add the text-derived features for titulo / descripcion / direccion
# (e.g. the `*_cantidad_palabras_importantes` columns listed below).
train = feature_string.train_agregar_feature_string_todos_df(train)

In [17]:
# Inspect the column names after feature engineering.
train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'fecha_numerica',
       'tipodepropiedad_precio_mean', 'ciudad_San_Luis_Potosí',
       'ciudad_Querétaro', 'ciudad_Zapopan', 'ciudad_Huixquilucan',
       'ciudad_Mérida', 'provincia_Distrito_Federal',
       'provincia_Edo._de_México', 'provincia_San_luis_Potosí',
       'provincia_Yucatán', 'provincia_Querétaro',
       'titulo_cantidad_palabras_importantes',
       'descripcion_cantidad_palabras_importantes',
       'direccion_cantidad_palabras_importantes',
       'titulo_cantidad_caracteres_en_palabras_importantes',
       'descripcion_cantidad_caracteres_en_palabras_importantes',
       'direccion_cantidad_caracteres_en_palabras_importantes',
 

In [0]:
# Columns removed from `train` in the next cell before modelling
# (presumably superseded by the derived features above — confirm).
drop_columns = ['titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia', 'idzona']

In [0]:
# Drop the columns listed in `drop_columns`. Re-running this cell on the already reduced
# frame raises KeyError, because the columns are no longer present.
train = train.drop(drop_columns, axis = 1)

In [20]:
# Check (rows, columns) after dropping the unused columns.
train.shape

(240000, 57)

In [21]:
# Count missing values per column.
train.isnull().sum()

id                                                             0
antiguedad                                                 43555
habitaciones                                               22471
garages                                                    37765
banos                                                      26221
metroscubiertos                                            17400
metrostotales                                              51467
lat                                                            0
lng                                                            0
fecha                                                          0
gimnasio                                                       0
usosmultiples                                                  0
piscina                                                        0
escuelascercanas                                               0
centroscomercialescercanos                                     0
precio                   

In [0]:
# Move `id` into the index so it is not used as a feature.
# In-place: running this cell a second time raises KeyError because `id` is no longer a column.
train.set_index('id', inplace = True)

In [0]:
# Split into feature matrix and target; both are copies, so later changes to them do not touch `train`.
X = train.drop([TARGET], axis = 1).copy()
y = train[TARGET].copy()

In [0]:
# Pre-processing pipeline built once by the project helper and reused by every trial.
pre_procesar_pipe = my_pipe.get_columns_pipeline()

def objective(hyper_parametros):
    """Hyperopt objective: cross-validated error of a random forest.

    Builds a pipeline (shared pre-processing + RandomForestRegressor configured
    with `hyper_parametros`), scores it on the notebook-level `X` / `y` with
    K-fold cross-validation and reports the mean score.

    Parameters
    ----------
    hyper_parametros : dict
        Keyword arguments for RandomForestRegressor, as sampled from the
        hyperopt search space.

    Returns
    -------
    dict
        {'loss': <mean absolute error, positive>, 'status': STATUS_OK};
        hyperopt minimises 'loss'.
    """
    # Seed the forest so that trials differ only by their hyper-parameters and
    # a re-run gives the same scores; n_jobs=-1 only parallelises tree building.
    # Values supplied in `hyper_parametros` take precedence over these defaults.
    rf_params = {'random_state': RANDOM_SEMILLA, 'n_jobs': -1, **hyper_parametros}

    busqueda_pipe = Pipeline(steps = [
        ('pre-procesar', pre_procesar_pipe),
        ('rf_regressor', RandomForestRegressor(**rf_params))
    ])

    # scikit-learn scorers are "higher is better", hence the negated MAE.
    score = cross_val_score(busqueda_pipe, X, y, scoring='neg_mean_absolute_error', cv=K).mean()

    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))

    # Flip the sign back so hyperopt minimises the (positive) MAE.
    return {'loss': -score, 'status': STATUS_OK}

In [25]:
# Number of columns now that `id` is the index (the count still includes the target column).
train.shape[1]

56

In [0]:
# Hyperopt search space for RandomForestRegressor.
# hp.randint(label, n) draws integers from [0, n), so integer parameters are
# shifted by an offset to keep them inside the range scikit-learn accepts.
space = {
    # 1..1000 trees; without the +1 a draw of 0 is an invalid forest size and aborts the search.
    "n_estimators": hp.randint("n_estimators", 1000) + 1,
    "max_depth": hp.randint("max_depth", 15) + 2,  # 2..16
    "min_samples_split": hp.randint("min_samples_split", 10) + 2,  # 2..11
    "min_samples_leaf": hp.choice("min_samples_leaf", [2, 3, 4, 5, 6, 7, 8, 9, 10]),
    "max_features": hp.uniform("max_features", 0.6, 0.8)  # fraction of features tried per split
}

In [29]:
# Run the TPE search: N_ITER evaluations of `objective` over `space`, timing the whole run.
# NOTE(review): no seed is passed to fmin, so the sequence of trials may differ between runs;
# consider fmin's `rstate` argument (the RNG type it expects depends on the hyperopt version).
t0 = time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=N_ITER)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1 - t0)/60))

SCORE: -1195823.793 params {'max_depth': 2, 'max_features': 0.7320420643915713, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 380}
SCORE: -986937.124 params {'max_depth': 9, 'max_features': 0.6696957828982146, 'min_samples_leaf': 9, 'min_samples_split': 8, 'n_estimators': 122}
SCORE: -976050.769 params {'max_depth': 10, 'max_features': 0.7701903251461797, 'min_samples_leaf': 10, 'min_samples_split': 9, 'n_estimators': 94}
SCORE: -976510.703 params {'max_depth': 10, 'max_features': 0.7177776573992107, 'min_samples_leaf': 2, 'min_samples_split': 11, 'n_estimators': 62}
SCORE: -1001378.127 params {'max_depth': 8, 'max_features': 0.7405097915539923, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 999}
SCORE: -1001236.585 params {'max_depth': 8, 'max_features': 0.6844298444352478, 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 704}
SCORE: -1117264.564 params {'max_depth': 4, 'max_features': 0.6047681352516926, 'min_samples_leaf': 3, 'min_samp

In [1]:
# `best` holds the raw values of the hp nodes: the *index* of the option for
# hp.choice, and the un-offset draw for expressions such as hp.randint(...) + 2.
# space_eval maps it back onto `space` to obtain the real hyper-parameter values.
best_params = space_eval(space, best)
print(best_params)

NameError: ignored

Random Forest  
{  
     'max_depth': 16,  
     'max_features': 0.7093997095411249,  
     'min_samples_leaf': 4,  
     'min_samples_split': 7,  
     'n_estimators': 778  
}  
SCORE = -944143.902