## XGBoost

In [1]:
import pandas as pd
import numpy as np
from common import metrica
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

In [2]:
from sklearn.model_selection import train_test_split

In [24]:
from category_encoders import TargetEncoder

In [27]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [15]:
df = pd.read_csv("sets_de_datos/train.csv")
df = df.set_index("id")

In [16]:
col_borrar = ["direccion", "lat", "lng", "titulo", "descripcion", "idzona", "fecha"]
df = df.drop(col_borrar, axis=1)

In [18]:
col_categorias = ["ciudad", "provincia", "tipodepropiedad"]
df_categorico = df[col_categorias].dropna()
df_num = df.drop(col_categorias, axis=1)

In [20]:
df2 = df_categorico.join(df_num)

In [22]:
datos = df2.drop(["precio"], axis=1)
precios = df2["precio"]
datos_train, datos_test, precio_train, precio_test = train_test_split(datos, precios, test_size=0.25, random_state=42)

In [28]:
columnas_pipe = ColumnTransformer(transformers = [
    ('nan_to_mean', SimpleImputer(strategy = 'mean'), ['metrostotales', 'metroscubiertos', 'antiguedad']),
    ('nan_to_cero', SimpleImputer(strategy = 'constant', fill_value = 0), ['habitaciones', 'banos', 'garages'])
])

pre_processor_pipe = Pipeline(steps =[
    ('ordinal_encoder', TargetEncoder(cols = col_categorias)),
    ('columns_pipe', columnas_pipe)
])

ppal_pipe = Pipeline(steps = [
    ('preprocessing', pre_processor_pipe), 
    ('xgb_regressor', XGBRegressor(objective = 'reg:squarederror'))
])

In [29]:
ppal_pipe.fit(datos_train, precio_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 Pipeline(memory=None,
                          steps=[('ordinal_encoder',
                                  TargetEncoder(cols=['ciudad', 'provincia',
                                                      'tipodepropiedad'],
                                                drop_invariant=False,
                                                handle_missing='value',
                                                handle_unknown='value',
                                                min_samples_leaf=1,
                                                return_df=True, smoothing=1.0,
                                                verbose=0)),
                                 ('columns_pipe',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='drop',
                                                    sparse_t...
                              

In [30]:
pred = ppal_pipe.predict(datos_test)

In [36]:
metrica.resultados(precio_test, pd.Series(pred))

+-------------------------+
| RMSLE   | MAE           |
+-------------------------+
| 1.06175 | 1943497.72061 |
+-------------------------+


### ===================================GridSearch===================================

In [41]:
import time
from sklearn.model_selection import RandomizedSearchCV

params = {
    "xgb_regressor__n_estimators": [100, 250, 750, 1000], # default 100,
    "xgb_regressor__learning_rate": [0.03, 0.01, 0.1, 0.3], # default 0.1
    "xgb_regressor__gamma": [0, 0.25, 0.75, 1],
    "xgb_regressor__lambda": [0, 0.25, 0.75, 1],
    "xgb_regressor__max_depth": [3, 5, 7, 9], # default 3 # < 15 columnas = cantidad columnas feature originales
    "xgb_regressor__subsample": [0.6, 0.5, 0.4, 0.3],
    "xgb_regressor__colsample_bytree": [0.6, 0.5, 0.4, 0.3],
    "xgb_regressor__min_child_weight": [2, 3, 4, 5]
}

search = RandomizedSearchCV(
            ppal_pipe, 
            param_distributions=params,
            random_state = 42,
            n_iter = 100,
            cv=3, 
            verbose=1, 
            n_jobs=1, 
            return_train_score=True,
            iid = True
        )
inicio = time.time()
search = search.fit(datos_train, precio_train)
fin = time.time()

minutos = (fin-inicio)/60

print("Tiempo de busqueda : {0:.2f}".format(minutos))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed: 190.4min finished


Tiempo de busqueda : 192.76


In [48]:
import common.common_machine_learning as common
search.best_params_

{'xgb_regressor__subsample': 0.6,
 'xgb_regressor__n_estimators': 1000,
 'xgb_regressor__min_child_weight': 3,
 'xgb_regressor__max_depth': 9,
 'xgb_regressor__learning_rate': 0.01,
 'xgb_regressor__lambda': 0.25,
 'xgb_regressor__gamma': 0.75,
 'xgb_regressor__colsample_bytree': 0.6}

In [50]:
search.best_score_

0.534356794512862

In [55]:
search.cv_results_

{'mean_fit_time': array([ 9.17915527,  8.40742262, 55.61169092, 35.59310985, 26.55389865,
         8.57367047, 59.12431105,  9.53552183,  9.01403149, 40.80434982,
         9.96352975, 11.12714847, 35.61245974, 70.1861062 , 11.44604413,
         9.09677037,  4.7646033 ,  8.46280718, 16.46529492,  6.35395638,
        18.37812837,  7.633099  ,  4.01221625,  8.55600651, 30.80019021,
        41.05062914,  7.6459616 ,  8.2362531 , 64.6047314 , 40.03178589,
         8.78555179,  8.0491337 , 30.8790578 , 28.52763335,  9.70626346,
        89.22252369, 30.84200859, 39.0912745 ,  3.2388041 , 53.87482508,
        27.07565729, 91.77586269, 30.84712466, 81.22003539, 30.78177389,
        48.11025484, 41.00429479, 28.35459272, 72.19414862, 40.67986385,
        69.08300416, 59.55401945,  3.21430127, 35.74690008,  7.51537712,
         9.30819662, 32.70963001, 33.19281872,  7.94060055, 24.61779793,
         3.89346957, 15.08970443,  9.5190889 , 26.25188375, 18.82171663,
         4.17286491,  7.51117722, 

#### Voy a dejar como están los parámetros que me dieron valores intermedios y probar de nuevo con parametros que quedaron punteros 

##### Esto es: 
        -Subsample: Aumentar
        -n_estimators: Aumentar
        -min_child_weigth: OK
        -max_depth: Aumentar
        -learning_rate: OK
        -lambda: OK
        -gamma: OK
        -colsample_bytree: Aumentar

OBS: No estoy teniendo en cuenta para nada el overfitting o underfitting, tengo que leer al respecto y ver si afecta en algo todo esto

In [59]:
params_2 = {
    "xgb_regressor__n_estimators": [900, 1100, 1300, 1500], # default 100,
    "xgb_regressor__learning_rate": [0.005, 0.02], # default 0.1
    "xgb_regressor__gamma": [0.6, 0.8],
    "xgb_regressor__lambda": [0.2, 0.3],
    "xgb_regressor__max_depth": [9, 11, 13], # default 3 # < 15 columnas = cantidad columnas feature originales
    "xgb_regressor__subsample": [0.6, 0.8, 0.9], #No se si tiene sentido un subsample tan grande pero despues veré
    "xgb_regressor__colsample_bytree": [0.6, 0.8, 0.9],
    "xgb_regressor__min_child_weight": [3]
}

search_2 = RandomizedSearchCV(
            ppal_pipe, 
            param_distributions=params_2,
            random_state = 13,
            n_iter = 100,
            cv=3, 
            verbose=1, 
            n_jobs=2, 
            return_train_score=True,
            iid = True
        )
inicio = time.time()
search_2 = search_2.fit(datos_train, precio_train)
fin = time.time()

minutos = (fin-inicio)/60

print("Tiempo de busqueda : {0:.2f}".format(minutos))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 109.8min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 385.3min
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed: 621.0min finished


Tiempo de busqueda : 627.14


In [60]:
search_2.best_params_

{'xgb_regressor__subsample': 0.8,
 'xgb_regressor__n_estimators': 1300,
 'xgb_regressor__min_child_weight': 3,
 'xgb_regressor__max_depth': 11,
 'xgb_regressor__learning_rate': 0.005,
 'xgb_regressor__lambda': 0.3,
 'xgb_regressor__gamma': 0.8,
 'xgb_regressor__colsample_bytree': 0.9}

In [61]:
search_2.best_score_

0.5429221293508483

In [62]:
search_2.cv_results_

{'mean_fit_time': array([125.01336686, 188.52930935, 252.36009208, 253.76569263,
        273.8361148 , 196.81946953, 159.09494781, 156.55861235,
        106.30447499, 129.10388764,  91.98181295, 248.18895984,
        212.74370837, 136.05978735, 181.03654909, 143.28366852,
        122.24966089, 164.75452447, 141.10355449, 190.08968091,
        114.20598078, 129.58563423, 190.02671496, 167.18470581,
        140.33271996, 169.40908058, 139.11033909, 205.37191017,
        204.70117243, 128.0105919 , 132.37691228, 136.67968432,
        146.3402168 , 145.8776048 , 229.88324904, 111.65159456,
         85.48922276, 173.84009488, 128.05826577, 122.07431912,
        135.51006929, 134.75718125, 163.41309317, 123.17901738,
        126.8605667 , 126.32570903, 182.51156179, 137.94179432,
        100.32571785, 130.23884145,  99.67260377, 100.98324482,
        107.84316715, 238.48278896, 119.33533835, 116.64307849,
        115.17833217, 263.7869033 , 129.23924899, 133.62007165,
        101.81043728, 1