In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks')

In [0]:
import feature_string as feature_string
import feature_num as feature_num
import feature_categorica as feature_categorica
import common_machine_learning as common
import my_pipeline as my_pipe
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRFRegressor
import xgboost as xgb
import time

#### Ignore Future Warning 

In [0]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [0]:
TARGET = 'precio'
K = 3
RANDOM_SEMILLA = 3 # numero favorito (de la suerte)
XGB_RE_OBJECTIVE = 'reg:squarederror'
N_ITER = 50

## Train 

In [0]:
train = common.cargar_set_optimizado('/content/drive/My Drive/Colab Notebooks/sets_de_datos/train.csv')

## Features

In [0]:
feature_num.agregar_feature_fecha_numerica(train)

Esto toma unos segundos

In [0]:
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [0]:
train = feature_categorica.agregar_tipodepropiedad_precio_mean(train)

In [0]:
train = feature_categorica.train_agregar_feature_provincias_ciudades_ohe_reducido_df(train)

In [0]:
train = feature_string.train_agregar_feature_string_todos_df(train)

In [20]:
train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'fecha_numerica',
       'tipodepropiedad_precio_mean', 'ciudad_San_Luis_Potosí',
       'ciudad_Querétaro', 'ciudad_Zapopan', 'ciudad_Huixquilucan',
       'ciudad_Mérida', 'provincia_Distrito_Federal',
       'provincia_Edo._de_México', 'provincia_San_luis_Potosí',
       'provincia_Yucatán', 'provincia_Querétaro',
       'titulo_cantidad_palabras_importantes',
       'descripcion_cantidad_palabras_importantes',
       'direccion_cantidad_palabras_importantes',
       'titulo_cantidad_caracteres_en_palabras_importantes',
       'descripcion_cantidad_caracteres_en_palabras_importantes',
       'direccion_cantidad_caracteres_en_palabras_importantes',
 

In [0]:
drop_columns = ['titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia', 'idzona']

In [0]:
train = train.drop(drop_columns, axis = 1)

In [23]:
train.shape

(240000, 57)

In [24]:
train.isnull().sum()

id                                                             0
antiguedad                                                 43555
habitaciones                                               22471
garages                                                    37765
banos                                                      26221
metroscubiertos                                            17400
metrostotales                                              51467
lat                                                            0
lng                                                            0
fecha                                                          0
gimnasio                                                       0
usosmultiples                                                  0
piscina                                                        0
escuelascercanas                                               0
centroscomercialescercanos                                     0
precio                   

In [0]:
train.set_index('id', inplace = True)

In [0]:
X = train.drop([TARGET], axis = 1).copy()
y = train[TARGET].copy()

In [0]:
pre_procesar_pipe = my_pipe.get_columns_pipeline()

def objective(hyper_parametros):
    
    busqueda_pipe = Pipeline(steps = [
        ('pre-procesar', pre_procesar_pipe),
        ('xgb_regressor', XGBRFRegressor(objective = XGB_RE_OBJECTIVE, **hyper_parametros))
    ])  
    
    score = cross_val_score(busqueda_pipe, X, y, scoring='neg_mean_absolute_error', cv=K).mean()
    
    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))
    
    return {'loss': -score, 'status': STATUS_OK}

In [0]:
space = {
    "n_estimators": hp.randint("xgb_re__n_estimators", 1000), 
    "learning_rate": hp.uniform("xgb_re__learning_rate", 0.005, 0.03), 
    "gamma": hp.uniform("xgb_re__gamma", 0.1, 5),
    "lambda": hp.uniform("xgb_re__lambda",0.1, 5),
    "max_depth": hp.randint("xgb_re__max_depth", 11) + 1, 
    "subsample": 0.79,
    "colsample_bytree": 0.8,
    "min_child_weight": hp.randint("xgb_re__min_child_weight", 5) + 1
}

In [29]:
t0 = time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=N_ITER)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1 - t0)/60))

SCORE: -2509840.500 params {'colsample_bytree': 0.8, 'gamma': 2.428368405243723, 'lambda': 3.8646906351852572, 'learning_rate': 0.008340336571426752, 'max_depth': 11, 'min_child_weight': 5, 'n_estimators': 164, 'subsample': 0.79}
SCORE: -2483126.000 params {'colsample_bytree': 0.8, 'gamma': 1.350766692360818, 'lambda': 1.5796173193623455, 'learning_rate': 0.018858712297080263, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 497, 'subsample': 0.79}
SCORE: -2510424.750 params {'colsample_bytree': 0.8, 'gamma': 3.488423567701276, 'lambda': 0.6661071931191829, 'learning_rate': 0.008066170422133824, 'max_depth': 1, 'min_child_weight': 5, 'n_estimators': 194, 'subsample': 0.79}
SCORE: -2481739.167 params {'colsample_bytree': 0.8, 'gamma': 4.49164987933878, 'lambda': 1.5977612192284554, 'learning_rate': 0.019412670805599202, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 876, 'subsample': 0.79}
SCORE: -2515718.583 params {'colsample_bytree': 0.8, 'gamma': 0.484344728851656, 'la

In [30]:
print(best)

{'xgb_re__gamma': 2.404847311811813, 'xgb_re__lambda': 0.22037929992167735, 'xgb_re__learning_rate': 0.02998472878410937, 'xgb_re__max_depth': 3, 'xgb_re__min_child_weight': 0, 'xgb_re__n_estimators': 144}


Xgboost + Random Forest  

{  
    'xgb_re__gamma': 2.404847311811813,  
    'xgb_re__lambda': 0.22037929992167735,  
    'xgb_re__learning_rate': 0.02998472878410937,   
    'xgb_re__max_depth': 3,  
    'xgb_re__min_child_weight': 0,   
    'xgb_re__n_estimators': 144  
}  
Score = 2454966.5833333335