In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks')

In [0]:
import feature_string as feature_string
import feature_num as feature_num
import feature_categorica as feature_categorica
import common_machine_learning as common
import my_pipeline as my_pipe
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import time

#### Ignore Future Warning 

In [0]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [0]:
TARGET = 'precio'
K = 3
RANDOM_SEMILLA = 3 # numero favorito (de la suerte)
N_ITER = 50

## Train 

In [0]:
train = common.cargar_set_optimizado('/content/drive/My Drive/Colab Notebooks/sets_de_datos/train.csv')

## Features

In [0]:
feature_num.agregar_feature_fecha_numerica(train)

Esto toma unos segundos

In [0]:
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [0]:
train = feature_categorica.agregar_tipodepropiedad_precio_mean(train)

In [0]:
train = feature_categorica.train_agregar_feature_provincias_ciudades_ohe_reducido_df(train)

In [0]:
train = feature_string.train_agregar_feature_string_todos_df(train)

In [24]:
train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'fecha_numerica',
       'tipodepropiedad_precio_mean', 'ciudad_San_Luis_Potosí',
       'ciudad_Querétaro', 'ciudad_Zapopan', 'ciudad_Huixquilucan',
       'ciudad_Mérida', 'provincia_Distrito_Federal',
       'provincia_Edo._de_México', 'provincia_San_luis_Potosí',
       'provincia_Yucatán', 'provincia_Querétaro',
       'titulo_cantidad_palabras_importantes',
       'descripcion_cantidad_palabras_importantes',
       'direccion_cantidad_palabras_importantes',
       'titulo_cantidad_caracteres_en_palabras_importantes',
       'descripcion_cantidad_caracteres_en_palabras_importantes',
       'direccion_cantidad_caracteres_en_palabras_importantes',
 

In [0]:
drop_columns = ['titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia', 'idzona']

In [0]:
train = train.drop(drop_columns, axis = 1)

In [27]:
train.shape

(240000, 57)

In [28]:
train.isnull().sum()

id                                                             0
antiguedad                                                 43555
habitaciones                                               22471
garages                                                    37765
banos                                                      26221
metroscubiertos                                            17400
metrostotales                                              51467
lat                                                            0
lng                                                            0
fecha                                                          0
gimnasio                                                       0
usosmultiples                                                  0
piscina                                                        0
escuelascercanas                                               0
centroscomercialescercanos                                     0
precio                   

In [0]:
train.set_index('id', inplace = True)

In [0]:
X = train.drop([TARGET], axis = 1).copy()
y = train[TARGET].copy()

In [0]:
pre_procesar_pipe = my_pipe.get_columns_pipeline()

def objective(hyper_parametros):
    
    rf_regressor = RandomForestRegressor(
        n_estimators = hyper_parametros['n_estimators'],
        max_depth = hyper_parametros['rf_max_depth'],
        min_samples_split = hyper_parametros['rf_min_samples_split'],
        min_samples_leaf = hyper_parametros['rf_min_samples_leaf'],
        max_features = hyper_parametros['rf_max_features']
    )
    
    ada_regressor = AdaBoostRegressor(
                        base_estimator = rf_regressor, 
                        n_estimators = hyper_parametros['n_estimators'],
                        learning_rate = hyper_parametros['ada_learning_rate']
                    )
    
    busqueda_pipe = Pipeline(steps = [
        ('pre-procesar', pre_procesar_pipe),
        ('ada_regressor', ada_regressor)
    ])  
    
    score = cross_val_score(busqueda_pipe, X, y, scoring='neg_mean_absolute_error', cv=K).mean()
    
    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))
    
    return {'loss': -score, 'status': STATUS_OK}

In [0]:
space = {
    "n_estimators": hp.randint("n_estimators", 1000) + 1,
    'ada_learning_rate' : hp.uniform('ada_learning_rate',1,5),
    "rf_max_depth": hp.randint("rf_max_depth", 15) + 1,
    "rf_min_samples_split": hp.uniform('rf_min_samples_split',0.1,0.9),
    "rf_min_samples_leaf": hp.choice("rf_min_samples_leaf", [1 ,2, 3, 4, 5, 6, 7, 8, 9, 10]),
    "rf_max_features": hp.uniform("rf_max_features", 0.6, 0.8)
}

In [37]:
t0 = time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=N_ITER)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1 - t0)/60))

SCORE: -1135903.352 params {'ada_learning_rate': 3.1773118525217985, 'n_estimators': 871, 'rf_max_depth': 15, 'rf_max_features': 0.6431714423258921, 'rf_min_samples_leaf': 7, 'rf_min_samples_split': 0.10805996845539251}
SCORE: -1608228.274 params {'ada_learning_rate': 2.4681083745164365, 'n_estimators': 896, 'rf_max_depth': 9, 'rf_max_features': 0.7880138719060416, 'rf_min_samples_leaf': 2, 'rf_min_samples_split': 0.7029949583162792}
SCORE: -1321070.087 params {'ada_learning_rate': 3.4112179088958836, 'n_estimators': 576, 'rf_max_depth': 6, 'rf_max_features': 0.6490597446800356, 'rf_min_samples_leaf': 3, 'rf_min_samples_split': 0.6302464136215312}
SCORE: -1609433.978 params {'ada_learning_rate': 4.017283432102209, 'n_estimators': 427, 'rf_max_depth': 15, 'rf_max_features': 0.6164658966010832, 'rf_min_samples_leaf': 9, 'rf_min_samples_split': 0.6539979761292918}
SCORE: -1310405.495 params {'ada_learning_rate': 2.6318061729226274, 'n_estimators': 629, 'rf_max_depth': 7, 'rf_max_features'

In [38]:
print(best)

{'ada_learning_rate': 3.1303607894832504, 'n_estimators': 269, 'rf_max_depth': 8, 'rf_max_features': 0.74243479691193, 'rf_min_samples_leaf': 6, 'rf_min_samples_split': 0.10367744935025583}


Adaboost + Random Forest  
{  
  'ada_learning_rate': 3.1303607894832504,  
  'n_estimators': 269,  
  'rf_max_depth': 8,  
  'rf_max_features': 0.74243479691193,  
  'rf_min_samples_leaf': 6,  
  'rf_min_samples_split': 0.10367744935025583  
}  
Score = 1128379.4504845273