## Imports 

In [35]:
import common.feature_string as feature_string
import common.feature_num as feature_num
import common.feature_categorica as feature_categorica
import common.common_machine_learning as common
import common.my_pipeline as my_pipe
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import time
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import cross_val_score

### Ignore Warnings 

In [4]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes

In [5]:
TARGET = 'precio'
K = 3
RANDOM_SEMILLA = 3 # numero favorito (de la suerte)
XGB_RE_OBJECTIVE = 'reg:squarederror'
N_ITER = 15

In [6]:
t0 = time.time()
train = common.cargar_set_optimizado('sets_de_datos/train.csv', index_col = 0)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1-t0)/60))

Tiempo = 0.09 minutos


## Features agregables por registro (no necesita entrenamiento previo)

In [7]:
t0 = time.time()
feature_num.agregar_feature_fecha_numerica(train)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1-t0)/60))

Tiempo = 0.00 minutos


In [8]:
t0 = time.time()
train = feature_categorica.agregar_feature_one_hot_encoding(train)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1-t0)/60))

Tiempo = 0.10 minutos


Esto puede tardar unos minutos.  
Dima'pc = 3 minutos (aprox)

In [9]:
t0 = time.time()
feature_string.agregar_feature_todos_ref(train)
t1 = time.time()
print('Tiempo : {0:.2f} minutos'.format((t1-t0)/60))

Tiempo : 2.75 minutos


Elimino features no usar

In [None]:
train.drop(['fecha', 'titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia'], 
           axis = 1, inplace = True)

## Muestra y Target

In [15]:
X = train.drop([TARGET], axis = 1).copy()
y = train[TARGET].copy()

## Objective, Pipeline & Space 

In [38]:
pre_procesar_pipe = my_pipe.get_columns_pipeline()

In [40]:
def objective(hyper_parametros):
    
    busqueda_pipe = Pipeline(steps = [
        ('pre-procesar', pre_procesar_pipe),
        ('xgb_regressor', XGBRegressor(objective = XGB_RE_OBJECTIVE, **hyper_parametros))
    ])  
    
    score = cross_val_score(busqueda_pipe, X, y, scoring='neg_mean_absolute_error', cv=K).mean()
    
    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))
    
    return {'loss': -score, 'status': STATUS_OK}

space = {
    "n_estimators": hp.randint("xgb_re__n_estimators", 1000) + 200, 
    "learning_rate": hp.uniform("xgb_re__learning_rate", 0.01, 0.05), 
    "gamma": hp.uniform("xgb_re__gamma", 0.2, 0.8),
    "lambda": hp.uniform("xgb_re__lambda",0.8, 1.2),
    "max_depth": hp.randint("xgb_re__max_depth", 100) + 0, 
    "subsample": hp.uniform("xgb_re__subsample", 0.7, 0.3),
    "colsample_bytree": hp.uniform("xgb_re__colsample_bytree", 0.7, 0.3),
    "min_child_weight": hp.randint("xgb_re__min_child_weight", 10) + 3
}

## Tunning

In [41]:
t0 = time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1 - t0)/60))

SCORE: -982961.208 params {'colsample_bytree': 0.4799463108251202, 'gamma': 0.5240451034358968, 'lambda': 0.8449436558155858, 'learning_rate': 0.01085809260576967, 'max_depth': 91, 'min_child_weight': 10, 'n_estimators': 928, 'subsample': 0.36254794079645364}
SCORE: -1017062.188 params {'colsample_bytree': 0.6369612663512698, 'gamma': 0.47805721771571624, 'lambda': 1.1622963072085777, 'learning_rate': 0.04801343139585802, 'max_depth': 44, 'min_child_weight': 6, 'n_estimators': 988, 'subsample': 0.3701504843769523}
SCORE: -1010385.354 params {'colsample_bytree': 0.3628948343116692, 'gamma': 0.6282533274568436, 'lambda': 0.8397094506563801, 'learning_rate': 0.011261018364973188, 'max_depth': 24, 'min_child_weight': 5, 'n_estimators': 385, 'subsample': 0.3262675458746168}
SCORE: -996300.396 params {'colsample_bytree': 0.5467219291201606, 'gamma': 0.5945045068026142, 'lambda': 1.0518282382183866, 'learning_rate': 0.027991749432059586, 'max_depth': 50, 'min_child_weight': 7, 'n_estimators':

## Resultados 

In [42]:
print(best)

{'xgb_re__colsample_bytree': 0.6269206294878382, 'xgb_re__gamma': 0.3627342789120286, 'xgb_re__lambda': 1.1126608037893504, 'xgb_re__learning_rate': 0.011511019740908655, 'xgb_re__max_depth': 75, 'xgb_re__min_child_weight': 8, 'xgb_re__n_estimators': 528, 'xgb_re__subsample': 0.5914933850528934}
