## Imports 

In [12]:
import time

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

import common.feature_string as feature_string
import common.feature_num as feature_num
import common.feature_categorica as feature_categorica
import common.common_machine_learning as common
import common.my_pipeline as my_pipe

### Ignore Warnings 

In [13]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes

In [14]:
# Name of the column that is predicted (dropped from X, used as y).
TARGET = 'precio'
# Number of cross-validation folds (passed as `cv` to cross_val_score).
K = 3
# Random seed ("lucky" favourite number).
# NOTE(review): not referenced by any of the visible cells - confirm it is used.
RANDOM_SEMILLA = 3
# Learning objective handed to XGBRegressor.
XGB_RE_OBJECTIVE = 'reg:squarederror'
# Number of evaluations hyperopt is allowed to run (passed as `max_evals`).
N_ITER = 15

In [15]:
# Load the training set through the project's loader and report how long the
# load took, in minutes.
t0 = time.time()
train = common.cargar_set_optimizado('sets_de_datos/train.csv', index_col = 0)
t1 = time.time()

minutos_carga = (t1 - t0) / 60
print('Tiempo = {0:.2f} minutos'.format(minutos_carga))

Tiempo = 0.06 minutos


## Features agregables por registro (no necesitan entrenamiento previo)

Esto puede tardar unos minutos.  
En la PC de Dima: 3 minutos (aprox.)

In [16]:
# Add the per-row features (date, one-hot encoded categories, text based) and
# report the elapsed time in minutes.
# NOTE(review): the first and third helpers have their return value ignored,
# so they presumably modify `train` in place - confirm in the common.* modules.
t0 = time.time()

feature_num.agregar_feature_fecha_numerica(train)
train = feature_categorica.agregar_feature_one_hot_encoding(train)
feature_string.agregar_feature_todos_ref(train)

t1 = time.time()
minutos_features = (t1 - t0) / 60
print('Tiempo = {0:.2f} minutos'.format(minutos_features))

Tiempo = 2.77 minutos


Elimino las features que no se van a usar.

In [17]:
# Remove the numeric, categorical and string columns that are not used as
# model features.
# NOTE(review): return values are ignored, so these helpers presumably drop the
# columns from `train` in place - confirm in the common.* modules.
feature_num.eliminar_num_no_feature(train)
feature_categorica.eliminar_categoria_no_feature(train)
feature_string.eliminar_string_no_feature(train)

## Muestra y Target

In [18]:
# Split the frame into the target vector and the feature matrix. Both are
# copies, so later changes to them never touch `train`.
y = train[TARGET].copy()
X = train.drop(columns = [TARGET]).copy()

## Objective, Pipeline & Space 

In [19]:
# Pre-processing pipeline built once by the project helper; `objective` plugs
# it in as the first step of every candidate pipeline.
pre_procesar_pipe = my_pipe.get_columns_pipeline()

In [20]:
def objective(hyper_parametros):
    """Hyperopt objective: cross-validated MAE of one XGBoost candidate.

    Parameters
    ----------
    hyper_parametros : dict
        Keyword arguments forwarded as-is to ``XGBRegressor`` (one sample
        drawn from the search space).

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}``. The loss is the mean of the
        ``K`` fold scores with its sign flipped: the scorer yields the
        *negative* MAE, while hyperopt minimises the loss.
    """
    regresor = XGBRegressor(objective = XGB_RE_OBJECTIVE, **hyper_parametros)
    pasos = [
        ('pre-procesar', pre_procesar_pipe),
        ('xgb_regressor', regresor),
    ]
    candidato = Pipeline(steps = pasos)

    puntajes = cross_val_score(candidato, X, y, scoring='neg_mean_absolute_error', cv=K)
    promedio = puntajes.mean()

    # Progress trace: one line per evaluated candidate.
    print("SCORE: {:.3f} params {}".format(promedio, hyper_parametros))

    return {'loss': -promedio, 'status': STATUS_OK}

"""
LAST BEST:
{'xgb_re__colsample_bytree': 0.6269206294878382, 
 'xgb_re__gamma': 0.3627342789120286, 
 'xgb_re__lambda': 1.1126608037893504, 
 'xgb_re__learning_rate': 0.011511019740908655, 
 'xgb_re__max_depth': 75, 
 'xgb_re__min_child_weight': 8, 
 'xgb_re__n_estimators': 528, 
 'xgb_re__subsample': 0.5914933850528934}
"""

# Search space explored by hyperopt. The dict keys are XGBRegressor keyword
# arguments; the "xgb_re__*" strings are the hyperopt labels under which `fmin`
# reports its result.
#
# hp.randint(label, upper) draws an integer in [0, upper), so the constant that
# is added sets the lower end of the effective range. `fmin` reports the raw
# draw (offset NOT applied); hyperopt.space_eval(space, best) recovers the
# values that were really passed to the model.
#
# hp.uniform takes (label, low, high): the bounds must be in ascending order.
# They used to be reversed for `subsample` and `colsample_bytree`.
space = {
    "n_estimators": hp.randint("xgb_re__n_estimators", 1000) + 400,          # 400 .. 1399
    "learning_rate": hp.uniform("xgb_re__learning_rate", 0.005, 0.03),
    "gamma": hp.uniform("xgb_re__gamma", 0.2, 5),
    "lambda": hp.uniform("xgb_re__lambda", 0.9, 3),
    "max_depth": hp.randint("xgb_re__max_depth", 100) + 70,                  # 70 .. 169
    "subsample": hp.uniform("xgb_re__subsample", 0.5, 0.7),
    "colsample_bytree": hp.uniform("xgb_re__colsample_bytree", 0.5, 0.8),
    "min_child_weight": hp.randint("xgb_re__min_child_weight", 15) + 5       # 5 .. 19
}

## Tuning

In [21]:
# Run the TPE search: `objective` is evaluated N_ITER times over `space`, and
# the elapsed time is reported in minutes.
t0 = time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=N_ITER)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1 - t0)/60))

# `best` maps every hyperopt label to its raw draw, so the constants added to
# the hp.randint terms in `space` are missing from it. space_eval substitutes
# those draws back into `space` and returns the hyper-parameters exactly as
# they were passed to the model. `best` itself is left untouched.
mejores_hiper_parametros = space_eval(space, best)
print(mejores_hiper_parametros)

SCORE: -946958.062 params {'colsample_bytree': 0.6745284479404787, 'gamma': 3.459452183773617, 'lambda': 2.8925100961829124, 'learning_rate': 0.010207833244775434, 'max_depth': 81, 'min_child_weight': 18, 'n_estimators': 1023, 'subsample': 0.6715247786684874}
SCORE: -948601.104 params {'colsample_bytree': 0.6203429356978385, 'gamma': 2.600321260170619, 'lambda': 2.8075934638758437, 'learning_rate': 0.006322422838602877, 'max_depth': 110, 'min_child_weight': 6, 'n_estimators': 1009, 'subsample': 0.597480581067935}
SCORE: -972190.479 params {'colsample_bytree': 0.5797651918737197, 'gamma': 2.4907080624380584, 'lambda': 2.751097738736561, 'learning_rate': 0.015910206793269287, 'max_depth': 111, 'min_child_weight': 9, 'n_estimators': 1252, 'subsample': 0.6465691999472108}
SCORE: -927340.812 params {'colsample_bytree': 0.758825416563804, 'gamma': 1.8586006281475769, 'lambda': 2.7701007509455757, 'learning_rate': 0.007938209899495991, 'max_depth': 134, 'min_child_weight': 12, 'n_estimators':

Mejores resultados:

In [24]:
# Total tuning time: 310 minutes expressed in hours.
310/60

5.166666666666667

5 horas (aprox.)

In [23]:
# Minutes per evaluation: 310 minutes spread over the 15 evaluations (N_ITER).
310/15

20.666666666666668

21 minutos/iteracion (aprox)

## Resultados 

In [25]:
# Raw result of fmin: the keys are hyperopt labels and the hp.randint entries
# do not include the constants added to them in `space`.
print(best)

{'xgb_re__colsample_bytree': 0.758825416563804, 'xgb_re__gamma': 1.8586006281475769, 'xgb_re__lambda': 2.7701007509455757, 'xgb_re__learning_rate': 0.007938209899495991, 'xgb_re__max_depth': 64, 'xgb_re__min_child_weight': 7, 'xgb_re__n_estimators': 70, 'xgb_re__subsample': 0.6819576248705131}


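Estos "mejores resultados" no se corresponden con los resultados anteriores.

Parece ser que `fmin` devuelve los mejores valores obtenidos de los `hp.randint`, pero no sabe que a eso le sumamos constantes. Por ejemplo, devuelve `xgb_re__max_depth` = 64 y en `space` le sumamos 70: 64 + 70 = 134, que es el `max_depth` de la iteración con mejor score (-927340.812):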

Ahora los resultados sí se corresponden.