In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [0]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks')

In [0]:
import feature_string as feature_string
import feature_num as feature_num
import feature_categorica as feature_categorica
import common_machine_learning as common
import my_pipeline as my_pipe
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import xgboost as xgb
import time

#### Ignore Future Warning 

In [0]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

## Constantes 

In [0]:
TARGET = 'precio'
K = 3
RANDOM_SEMILLA = 3 # numero favorito (de la suerte)
XGB_RE_OBJECTIVE = 'reg:squarederror'
N_ITER = 50

## Train 

In [0]:
train = common.cargar_set_optimizado('/content/drive/My Drive/Colab Notebooks/sets_de_datos/train.csv')

## Features

In [0]:
feature_num.agregar_feature_fecha_numerica(train)

Esto toma unos segundos

In [0]:
train = feature_num.completar_lat_lng_con_provincias_y_ciudades(train)
train = feature_num.completar_lat_lng_con_idzona_mean(train)
feature_num.completar_lat_lng_con_promedio_Mexico(train)

In [0]:
train = feature_categorica.agregar_tipodepropiedad_precio_mean(train)

In [0]:
train = feature_categorica.train_agregar_feature_provincias_ciudades_ohe_reducido_df(train)

In [0]:
train = feature_string.train_agregar_feature_string_todos_df(train)

In [0]:
drop_columns = ['titulo', 'descripcion', 'tipodepropiedad', 'direccion', 'ciudad', 'provincia', 'idzona']

In [0]:
train = train.drop(drop_columns, axis = 1)

In [0]:
train.set_index('id', inplace = True)

In [0]:
X = train.drop([TARGET], axis = 1).copy()
y = train[TARGET].copy()

In [0]:
pre_procesar_pipe = my_pipe.get_columns_pipeline()

def objective(hyper_parametros):
    
    busqueda_pipe = Pipeline(steps = [
        ('pre-procesar', pre_procesar_pipe),
        ('xgb_regressor', XGBRegressor(objective = XGB_RE_OBJECTIVE, **hyper_parametros))
    ])  
    
    score = cross_val_score(busqueda_pipe, X, y, scoring='neg_mean_absolute_error', cv=K).mean()
    
    print("SCORE: {:.3f} params {}".format(score, hyper_parametros))
    
    return {'loss': -score, 'status': STATUS_OK}

In [0]:
space = {
    "n_estimators": hp.randint("xgb_re__n_estimators", 1000) + 1, 
    "learning_rate": hp.uniform("xgb_re__learning_rate", 0.005, 0.03), 
    "gamma": hp.uniform("xgb_re__gamma", 0.1, 5),
    "lambda": hp.uniform("xgb_re__lambda",0.1, 5),
    "max_depth": hp.randint("xgb_re__max_depth", 11) + 1, 
    "subsample": 0.79,
    "colsample_bytree": 0.8,
    "min_child_weight": hp.randint("xgb_re__min_child_weight", 5) + 1
}

In [19]:
t0 = time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=N_ITER)
t1 = time.time()
print('Tiempo = {0:.2f} minutos'.format((t1 - t0)/60))

SCORE: -992003.062 params {'colsample_bytree': 0.8, 'gamma': 0.8407282068909406, 'lambda': 2.280833908213882, 'learning_rate': 0.02676243315387239, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 568, 'subsample': 0.79}
SCORE: -1161523.500 params {'colsample_bytree': 0.8, 'gamma': 4.1163026641838005, 'lambda': 2.5361838963417966, 'learning_rate': 0.010456510993606772, 'max_depth': 1, 'min_child_weight': 2, 'n_estimators': 298, 'subsample': 0.79}
SCORE: -985112.500 params {'colsample_bytree': 0.8, 'gamma': 4.918264252535506, 'lambda': 0.9714821559724026, 'learning_rate': 0.010934032740481384, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 836, 'subsample': 0.79}
SCORE: -949379.833 params {'colsample_bytree': 0.8, 'gamma': 0.41404800888010296, 'lambda': 4.076493580206379, 'learning_rate': 0.018897550901897812, 'max_depth': 8, 'min_child_weight': 4, 'n_estimators': 824, 'subsample': 0.79}
SCORE: -939483.417 params {'colsample_bytree': 0.8, 'gamma': 1.9946751797362463, 'lamb

In [20]:
print(best)

{'xgb_re__gamma': 3.2491569626462806, 'xgb_re__lambda': 0.7758176217295185, 'xgb_re__learning_rate': 0.019562118856941643, 'xgb_re__max_depth': 10, 'xgb_re__min_child_weight': 1, 'xgb_re__n_estimators': 632}


Solo Xgboost    
{  
  'xgb_re__gamma': 3.2491569626462806,  
  'xgb_re__lambda': 0.7758176217295185,  
  'xgb_re__learning_rate': 0.019562118856941643,  
  'xgb_re__max_depth': 10 + 1,  
  'xgb_re__min_child_weight': 1 + 1,  
  'xgb_re__n_estimators': 632  
}  
Score = 938231.0833333334
