# XGBoost - train simple - RandomSearch 1

Continuamos la búsqueda de hiperparámetros para el modelo de train simple (sin columnas complejas), a partir de los resultados del random search 0.

## Imports 

In [4]:
import common.common_machine_learning as common
import common.my_pipeline as my_pipe
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import time

### Ignorar FutureWarnings 

In [2]:
# Install an 'ignore' filter for the FutureWarning category, so warnings of
# that category are suppressed from this point on.
# NOTE(review): this also hides notices about arguments that newer library
# versions deprecate or remove — confirm that is acceptable.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [3]:
TARGET = 'precio'  # column to predict; it is split off from the feature matrix
K = 3  # passed as `cv` to the randomized search (number of folds)
RANDOM_SEMILLA = 3 # favorite (lucky) number; passed as the search's random_state
XGB_RE_OBJECTIVE = 'reg:squarederror'  # passed as `objective` to XGBRegressor

La última búsqueda realizó 10 iteraciones con K = 3, es decir, 30 fits en total.  
Todo eso tardó aproximadamente 20 minutos.

In [3]:
# Minutes per fit in the previous search: 20 minutes / 30 fits.
20/30

0.6666666666666666

Cada fit tarda aproximadamente 0.7 minutos (20 minutos / 30 fits).

In [5]:
# Number of fits that a 30-minute budget allows at 20/30 minutes per fit.
30/(20/30)

45.0

Un presupuesto de 30 minutos alcanza para 45 fits, y cada iteración requiere K = 3 fits (uno por fold).

In [6]:
# Search iterations affordable with 45 fits at 3 fits (one per fold) per iteration.
45/3

15.0

Entonces, con 15 iteraciones la búsqueda debería tardar aproximadamente 30 minutos.

In [8]:
# Number of candidates the randomized search samples (passed as `n_iter`);
# 15 comes from the 45 fits / 3 folds estimate worked out in the cells above.
N_ITER = 15

## Carga del set de datos y pre-pre-procesamiento 

In [5]:
# Read the training set with the project's loader.
train = common.cargar_set_optimizado('sets_de_datos/train.csv', index_col=0)

# Target vector first, then the feature matrix (every column except the target).
y = train[TARGET].copy()
X = train.drop(columns=[TARGET]).copy()

# Project helper; per the notebook intro, this model works without the "complex" columns.
X = common.eliminar_columnas_complejas(X)

# Missing values in these three columns are replaced by the literal string 'nan'.
relleno_categoricas = dict.fromkeys(['tipodepropiedad', 'provincia', 'ciudad'], 'nan')
X = X.fillna(value=relleno_categoricas)

## Pipeline 

In [6]:
# Two-step pipeline: the project's pre-processing pipeline followed by the
# XGBoost regressor. The step name 'xgb_regressor' is the prefix used by the
# hyper-parameter keys of the search.
pre_procesar_pipe = my_pipe.get_train_simple_pre_pipeline()
modelo_xgb = XGBRegressor(objective=XGB_RE_OBJECTIVE)

busqueda_pipe = Pipeline(
    steps=[
        ('pre-procesar', pre_procesar_pipe),
        ('xgb_regressor', modelo_xgb),
    ]
)

## Busqueda 1 

In [9]:
# Candidate values for each hyper-parameter. Every key is prefixed with the
# pipeline step name ('xgb_regressor') so the value is routed to that step.
hyper_params = {
    "xgb_regressor__n_estimators": [650, 850],  # author's note: default 100
    "xgb_regressor__learning_rate": [0.01, 0.05],  # author's note: default 0.1
    "xgb_regressor__gamma": [0.2, 0.8],
    # NOTE(review): the XGBoost scikit-learn wrapper names this parameter
    # `reg_lambda`; confirm that the `lambda` spelling is honoured. The key is
    # left unchanged so the result columns match the previously saved CSV.
    "xgb_regressor__lambda": [0.8, 1.2],
    # author's note: default 3; kept below 15, the number of original feature columns
    "xgb_regressor__max_depth": [4, 7],
    "xgb_regressor__subsample": [0.65, 0.45],
    "xgb_regressor__colsample_bytree": [0.65, 0.45],
    "xgb_regressor__min_child_weight": [3, 6]
}

# Randomized search: N_ITER sampled candidates, each evaluated with K-fold
# cross-validation, seeded for reproducible sampling.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where supplying it raises a TypeError.
busqueda = RandomizedSearchCV(
    busqueda_pipe,
    param_distributions=hyper_params,
    random_state=RANDOM_SEMILLA,
    n_iter=N_ITER,
    cv=K,
    verbose=1,
    n_jobs=1,
    return_train_score=True,
)

In [12]:
# Run the search and report how long it took, in minutes.
# time.perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments during the long-running fit
# (time.time() gives no such guarantee).
tiempo_inicio = time.perf_counter()
busqueda = busqueda.fit(X, y)
tiempo_fin = time.perf_counter()

minutos = (tiempo_fin - tiempo_inicio) / 60
print('Tiempo de busqueda : {0:.2f} minutos'.format(minutos))

Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 33.8min finished


Tiempo de busqueda : 35.28 minutos


Nice timing! La búsqueda tardó 35.28 minutos, cerca de los 30 minutos estimados.

In [None]:
# Persist the search results before displaying them: the following cell reads
# this exact CSV, so without the save a fresh "Restart & Run All" would depend
# on a file this notebook never writes.
common.busqueda_guardar_resultados_df(busqueda.cv_results_, 'resultados_busquedas/dima_xgboost_train_simple_busqueda_1_resultados.csv')

# Kept as the last expression so whatever the helper returns is still rendered.
common.busqueda_mostrar_resultados_df(busqueda.cv_results_)

In [84]:
# Reload the saved search results. The first CSV column holds the saved row
# index; reading it as the index avoids the spurious 'Unnamed: 0' column.
pd.read_csv('resultados_busquedas/dima_xgboost_train_simple_busqueda_1_resultados.csv', index_col=0)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb_regressor__subsample,param_xgb_regressor__n_estimators,param_xgb_regressor__min_child_weight,param_xgb_regressor__max_depth,param_xgb_regressor__learning_rate,param_xgb_regressor__lambda,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,27.72346,0.699899,1.607233,0.076304,0.65,650,3,4,0.05,0.8,...,0.516062,0.515008,0.515396,0.000473,7,0.525111,0.525693,0.525193,0.525332,0.000257
1,56.104165,0.131847,5.050726,0.06057,0.45,850,6,7,0.05,0.8,...,0.531258,0.531151,0.531237,6.3e-05,1,0.577172,0.577093,0.578927,0.57773,0.000847
2,45.909231,0.455328,5.220357,0.019243,0.65,850,6,7,0.05,1.2,...,0.519387,0.518627,0.519214,0.000427,4,0.549751,0.550694,0.5511,0.550515,0.000565
3,28.910614,0.046303,1.579733,0.012599,0.45,650,3,4,0.01,0.8,...,0.4896,0.489077,0.489094,0.000406,13,0.492043,0.491444,0.491753,0.491747,0.000245
4,43.721508,0.104325,3.825348,0.030971,0.45,650,3,7,0.05,1.2,...,0.531611,0.530374,0.531154,0.000554,2,0.574525,0.574501,0.574592,0.574539,3.9e-05
5,23.742591,0.013895,1.723847,0.002533,0.65,650,3,4,0.05,0.8,...,0.507899,0.507176,0.507501,0.000299,8,0.514799,0.514582,0.514519,0.514633,0.00012
6,37.023111,0.023565,2.155373,0.022482,0.65,850,6,4,0.01,0.8,...,0.496987,0.495778,0.496164,0.000583,10,0.49972,0.499564,0.499107,0.499463,0.00026
7,29.229402,0.030524,1.602895,0.011864,0.45,650,6,4,0.01,1.2,...,0.489568,0.488886,0.489019,0.000405,14,0.491986,0.491383,0.491522,0.49163,0.000258
8,59.490831,0.034887,5.414282,0.034787,0.65,850,6,7,0.01,0.8,...,0.526457,0.526051,0.526353,0.000217,3,0.544459,0.544217,0.544809,0.544495,0.000243
9,36.50359,0.015209,3.731583,0.012041,0.65,650,6,7,0.01,0.8,...,0.498778,0.498641,0.498742,7.3e-05,9,0.508404,0.508249,0.508573,0.508409,0.000132
