# XGBoost Regressor - Pipeline - Random Search - Train simple - 0 

Se descartan columnas complejas.

## Imports 

In [1]:
import common.common_machine_learning as common
import pandas as pd
import numpy as np
from category_encoders import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import time

### Ignorar FutureWarnings 

In [2]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [3]:
TARGET = 'precio'
K = 3
N_ITER = 10 # 0.5 minutos * 10 = 5 minutos (aprox.)
RANDOM_SEMILLA = 3 # numero favorito (de la suerte)

## Set de datos 

In [4]:
train = common.cargar_set_optimizado('sets_de_datos/train.csv')

In [5]:
X = train.drop([TARGET], axis = 1).copy()
y = train[TARGET].copy()

In [6]:
X = common.eliminar_columnas_complejas(X)
X = X.fillna(value = {'tipodepropiedad' : 'nan', 'provincia' : 'nan', 'ciudad' : 'nan'})

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

## Pipeline 

In [8]:
columns_pipe = ColumnTransformer(transformers = [
    ('nan_to_mean', SimpleImputer(strategy = 'mean'), ['metrostotales', 'metroscubiertos', 'antiguedad']),
    ('nan_to_cero', SimpleImputer(strategy = 'constant', fill_value = 0), ['habitaciones', 'banos', 'garages'])
])

pre_processor_pipe = Pipeline(steps =[
    ('ordinal_encoder', OrdinalEncoder(cols = ['tipodepropiedad', 'provincia', 'ciudad'])),
    ('columns_pipe', columns_pipe)
])

ppal_pipe = Pipeline(steps = [
    ('preprocessing', pre_processor_pipe), 
    ('xgb_regressor', XGBRegressor(objective = 'reg:squarederror'))
])

In [12]:
ppal_pipe = ppal_pipe.fit(X_train, y_train)

In [13]:
y_pred = ppal_pipe.predict(X_test)

print('RMSE : {0:.2f}'.format(np.sqrt(mean_squared_error(y_pred, y_test))))

RMSE : 1532881.75


## RandomSearch 

In [9]:
params = {
    "xgb_regressor__n_estimators": [100, 250, 750, 1000], # default 100,
    "xgb_regressor__learning_rate": [0.03, 0.01, 0.1, 0.3], # default 0.1
    "xgb_regressor__gamma": [0, 0.25, 0.75, 1],
    "xgb_regressor__lambda": [0, 0.25, 0.75, 1],
    "xgb_regressor__max_depth": [3, 5, 7, 9], # default 3 # < 15 columnas = cantidad columnas feature originales
    "xgb_regressor__subsample": [0.6, 0.5, 0.4, 0.3],
    "xgb_regressor__colsample_bytree": [0.6, 0.5, 0.4, 0.3],
    "xgb_regressor__min_child_weight": [2, 3, 4, 5]
}

search = RandomizedSearchCV(
            ppal_pipe, 
            param_distributions=params,
            random_state = RANDOM_SEMILLA,
            n_iter = N_ITER,
            cv=K, 
            verbose=1, 
            n_jobs=1, 
            return_train_score=True,
            iid = True
        )
inicio = time.time()
search = search.fit(X_train, y_train)
fin = time.time()

minutos = (fin-inicio)/60

print("Tiempo de busqueda : {0:.2f}".format(minutos))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 17.8min finished


Tiempo de busqueda : 19.02


In [10]:
common.busqueda_reportar_mejores_resultados(search.cv_results_, 30)

Modelo con rango: 1
MEAN: 0.531 (STD: 0.001)
Hiper-Parametros: {'xgb_regressor__subsample': 0.5, 'xgb_regressor__n_estimators': 750, 'xgb_regressor__min_child_weight': 5, 'xgb_regressor__max_depth': 9, 'xgb_regressor__learning_rate': 0.03, 'xgb_regressor__lambda': 1, 'xgb_regressor__gamma': 0.75, 'xgb_regressor__colsample_bytree': 0.6}

Modelo con rango: 2
MEAN: 0.522 (STD: 0.001)
Hiper-Parametros: {'xgb_regressor__subsample': 0.6, 'xgb_regressor__n_estimators': 750, 'xgb_regressor__min_child_weight': 3, 'xgb_regressor__max_depth': 7, 'xgb_regressor__learning_rate': 0.1, 'xgb_regressor__lambda': 0.25, 'xgb_regressor__gamma': 0, 'xgb_regressor__colsample_bytree': 0.6}

Modelo con rango: 3
MEAN: 0.517 (STD: 0.002)
Hiper-Parametros: {'xgb_regressor__subsample': 0.3, 'xgb_regressor__n_estimators': 100, 'xgb_regressor__min_child_weight': 3, 'xgb_regressor__max_depth': 7, 'xgb_regressor__learning_rate': 0.1, 'xgb_regressor__lambda': 0.75, 'xgb_regressor__gamma': 0, 'xgb_regressor__colsample_

Modelo con rango: 1  
MEAN: 0.531 (STD: 0.001)  
Hiper-Parametros: {  
    - 'regressor__subsample': 0.5,  
    - 'regressor__n_estimators': 750,  
    - 'regressor__min_child_weight': 5,  
    - 'regressor__max_depth': 9,  
    - 'regressor__learning_rate': 0.03,  
    - 'regressor__lambda': 1,  
    - 'regressor__gamma': 0.75,  
    - 'regressor__colsample_bytree': 0.6
    }

In [21]:
xgb_model = XGBRegressor(
    objective = 'reg:squarederror',
    subsample = 0.5,
    n_estimators = 750,
    min_child_weight = 5,
    max_depth = 9,
    learning_rate = 0.03,
    reg_lambda = 1,
    gamma = 0.75,
    colsample_bytree = 0.6
)

mejor_pipe = Pipeline(steps = [
    ('preprocessing', pre_processor_pipe), 
    ('xgb_regressor', xgb_model)
])


In [None]:
mejor_pipe = mejor_pipe.fit(X_train, y_train)

In [25]:
y_pred = mejor_pipe.predict(X_test)

print("RMSE : {0:.2f}".format(np.sqrt(mean_squared_error(y_pred, y_test))))

RMSE : 1449886.38
