## Imports

In [57]:
import time

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

import common.common_machine_learning as common
import common.feature_categorica as feature_categorica
import common.feature_num as feature_num
import common.feature_string as feature_string
import common.my_pipeline as my_pipe

#### Ignore FutureWarning

In [36]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [37]:
# Name of the column to predict; it is split off from the features into `y`.
TARGET = 'precio'
# Number of cross-validation folds (passed as `cv` to cross_val_score).
K = 2
# Presumably the seed for random number generators ("semilla" = seed).
RANDOM_SEMILLA = 3 # favourite (lucky) number
# Number of hyperopt evaluations (passed as `max_evals` to fmin).
N_ITER = 1

## Train 

In [38]:
# Load the training set through the project helper; the path is relative to the
# notebook's working directory.
train = common.cargar_set_optimizado('sets_de_datos/train.csv')

## Features

In [39]:
# Called for its side effect only (the return value is discarded); presumably
# adds the numeric date column to `train` in place -- TODO confirm.
feature_num.agregar_feature_fecha_numerica(train)

Esto toma unos segundos

In [40]:
# Fill in missing lat/lng in three passes; going by the helper names: by
# province/city, then by the idzona mean, then by a Mexico-wide average.
# NOTE(review): the first two helpers return the frame and are reassigned, the
# third is called without reassignment -- presumably it mutates `train` in
# place; confirm, otherwise its result is silently dropped.
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [41]:
# Add the per-property-type mean price feature.
# NOTE(review): if this mean is computed from `precio` over the whole training
# set, the cross-validation scores obtained later are optimistic (target
# leakage into the validation folds) -- confirm how the helper computes it.
train = feature_categorica.agregar_tipodepropiedad_precio_mean(train)

In [42]:
# Add the reduced one-hot encoding of province/city (going by the helper name;
# presumably only a subset of the categories gets its own column).
train = feature_categorica.train_agregar_feature_provincias_ciudades_ohe_reducido_df(train)

In [43]:
# Add all the text-derived features from the project's string-feature helper.
train = feature_string.train_agregar_feature_string_todos_df(train)

In [44]:
# Display the column set after feature engineering.
train.columns

Index(['id', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad',
       'provincia', 'antiguedad', 'habitaciones', 'garages', 'banos',
       'metroscubiertos', 'metrostotales', 'idzona', 'lat', 'lng', 'fecha',
       'gimnasio', 'usosmultiples', 'piscina', 'escuelascercanas',
       'centroscomercialescercanos', 'precio', 'fecha_numerica',
       'tipodepropiedad_precio_mean', 'ciudad_San_Luis_Potosí',
       'ciudad_Querétaro', 'ciudad_Zapopan', 'ciudad_Huixquilucan',
       'ciudad_Mérida', 'provincia_Distrito_Federal',
       'provincia_Edo._de_México', 'provincia_San_luis_Potosí',
       'provincia_Yucatán', 'provincia_Querétaro',
       'titulo_cantidad_palabras_importantes',
       'descripcion_cantidad_palabras_importantes',
       'direccion_cantidad_palabras_importantes',
       'titulo_cantidad_caracteres_en_palabras_importantes',
       'descripcion_cantidad_caracteres_en_palabras_importantes',
       'direccion_cantidad_caracteres_en_palabras_importantes',
 

In [45]:
# Columns removed from the frame before modelling.
drop_columns = [
    'titulo',
    'descripcion',
    'tipodepropiedad',
    'direccion',
    'ciudad',
    'provincia',
    'idzona',
]

In [46]:
# Remove the columns listed in `drop_columns` from the frame.
train = train.drop(columns=drop_columns)

In [47]:
# Display (rows, columns) after dropping the unused columns.
train.shape

(240000, 57)

In [48]:
# Count the missing values in each remaining column.
train.isna().sum()

id                                                             0
antiguedad                                                 43555
habitaciones                                               22471
garages                                                    37765
banos                                                      26221
metroscubiertos                                            17400
metrostotales                                              51467
lat                                                            0
lng                                                            0
fecha                                                          0
gimnasio                                                       0
usosmultiples                                                  0
piscina                                                        0
escuelascercanas                                               0
centroscomercialescercanos                                     0
precio                   

In [49]:
# Use the listing id as the row index so it is not fed to the model as a feature.
train = train.set_index('id')

In [50]:
# Split the frame into the feature matrix and the target vector (independent copies).
y = train.loc[:, TARGET].copy()
X = train.drop(columns=[TARGET]).copy()

In [59]:
# Pre-processing step shared by every trial; cross_val_score clones the whole
# pipeline for each fold, so this instance itself is never fitted.
pre_procesar_pipe = my_pipe.get_columns_pipeline()

def objective(hyper_parametros):
    """Hyperopt objective: cross-validated MAE of AdaBoost over a random forest.

    Parameters
    ----------
    hyper_parametros : dict
        One sampled point of the search space. Keys read here:
        'n_estimators' (used both as the number of trees of the forest and as
        the number of boosting rounds), 'rf_max_depth', 'rf_min_samples_split',
        'rf_min_samples_leaf', 'rf_max_features' and 'ada_learning_rate'.

    Returns
    -------
    dict
        {'loss': <mean absolute error averaged over the K folds>,
         'status': STATUS_OK}. cross_val_score reports the negated MAE, so the
        sign is flipped to give hyperopt a quantity to minimise.
    """
    # Both estimators are seeded so that a given hyper-parameter point always
    # produces the same score; without it trials are not comparable run to run.
    rf_regressor = RandomForestRegressor(
        n_estimators = hyper_parametros['n_estimators'],
        max_depth = hyper_parametros['rf_max_depth'],
        min_samples_split = hyper_parametros['rf_min_samples_split'],
        min_samples_leaf = hyper_parametros['rf_min_samples_leaf'],
        max_features = hyper_parametros['rf_max_features'],
        random_state = RANDOM_SEMILLA
    )

    # NOTE(review): recent scikit-learn releases rename `base_estimator` to
    # `estimator`; the keyword is kept as-is to match the version this notebook
    # was written against -- confirm before upgrading.
    ada_regressor = AdaBoostRegressor(
                        base_estimator = rf_regressor,
                        n_estimators = hyper_parametros['n_estimators'],
                        learning_rate = hyper_parametros['ada_learning_rate'],
                        random_state = RANDOM_SEMILLA
                    )

    busqueda_pipe = Pipeline(steps = [
        ('pre-procesar', pre_procesar_pipe),
        ('ada_regressor', ada_regressor)
    ])

    # Mean over the K folds of the negated mean absolute error (higher is better).
    score = cross_val_score(busqueda_pipe, X, y, scoring='neg_mean_absolute_error', cv=K).mean()

    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))

    return {'loss': -score, 'status': STATUS_OK}

In [53]:
# Hyperopt search space; the keys are the ones read by `objective`.
# hp.randint(label, n) samples integers in [0, n), hence the offsets below.
space = {
    # 1..1000
    "n_estimators": hp.randint("n_estimators", 1000) + 1,
    # NOTE(review): learning rates in [1, 5] are unusually large for AdaBoost --
    # confirm this range is intentional.
    'ada_learning_rate' : hp.uniform('ada_learning_rate',1,5),
    # 1..15
    "rf_max_depth": hp.randint("rf_max_depth", 15) + 1,
    # 2..10. An integer min_samples_split must be >= 2; the previous `+ 1`
    # offset could sample 1, which makes scikit-learn raise and aborts the search.
    "rf_min_samples_split": hp.randint("rf_min_samples_split", 9) + 2,
    "rf_min_samples_leaf": hp.choice("rf_min_samples_leaf", [1 ,2, 3, 4, 5, 6, 7, 8, 9, 10]),
    "rf_max_features": hp.uniform("rf_max_features", 0.6, 0.8)
}

In [60]:
# Run the TPE search for N_ITER evaluations of `objective` and report how long
# it took, in minutes.
t0 = time.time()
best = fmin(objective, space, algo=tpe.suggest, max_evals=N_ITER)
t1 = time.time()

minutos = (t1 - t0) / 60
print('Tiempo = {0:.2f} minutos'.format(minutos))

SCORE: -1251970.509 params {'ada_learning_rate': 3.4254758756396324, 'n_estimators': 156, 'rf_max_depth': 11, 'rf_max_features': 0.7385796133727545, 'rf_min_samples_leaf': 6, 'rf_min_samples_split': 5}
100%|██████████| 1/1 [02:22<00:00, 142.32s/it, best loss: 1251970.5093388588]
Tiempo = 2.37 minutos


In [None]:
# `fmin` returns the raw sampled values: for hp.choice that is the *index* of
# the chosen option, and for `hp.randint(...) + offset` it is the value before
# the offset is added. space_eval maps them back onto the search space, giving
# the hyper-parameters that were actually passed to `objective`.
print(space_eval(space, best))