In [197]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from bayes_opt import BayesianOptimization
import category_encoders as ce
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor

In [198]:
train = pd.read_csv(r'train.csv', index_col=0)
test = pd.read_csv(r'test.csv', index_col=0)

In [199]:
#metrica
def MAE(actual, pred):
    return (np.mean(np.absolute(actual - pred)))

## Preprocesamiento

In [200]:
train.isnull().sum()

titulo                          5387
descripcion                     1619
tipodepropiedad                   46
direccion                      53072
ciudad                           372
provincia                        155
antiguedad                     43555
habitaciones                   22471
garages                        37765
banos                          26221
metroscubiertos                17400
metrostotales                  51467
idzona                         28621
lat                           123488
lng                           123488
fecha                              0
gimnasio                           0
usosmultiples                      0
piscina                            0
escuelascercanas                   0
centroscomercialescercanos         0
precio                             0
dtype: int64

In [201]:
test.isnull().sum()

titulo                         1378
descripcion                     401
tipodepropiedad                   7
direccion                     13191
ciudad                           83
provincia                        42
antiguedad                    10714
habitaciones                   5628
garages                        9323
banos                          6554
metroscubiertos                4299
metrostotales                 12655
idzona                         7179
lat                           30695
lng                           30695
fecha                             0
gimnasio                          0
usosmultiples                     0
piscina                           0
escuelascercanas                  0
centroscomercialescercanos        0
dtype: int64

In [202]:
train['provincia'] = train['provincia'].fillna('Distrito Federal') #valor mas frecuente
test['provincia'] = test['provincia'].fillna('Distrito Federal')
train['tipodepropiedad'] = train['tipodepropiedad'].fillna('Casa') #valor mas frecuente
test['tipodepropiedad'] = test['tipodepropiedad'].fillna('Casa')

train['descripcion'] = train['descripcion'].fillna('-')
test['descripcion'] = test['descripcion'].fillna('-')

train = train.fillna(0)
test = test.fillna(0)

In [203]:
print(train.shape)
print(test.shape)

(240000, 22)
(60000, 21)


## Feature Engineering

### One Hot Encoding

In [204]:
one_hot_enc = ce.OneHotEncoder()
one_hot_encoded = one_hot_enc.fit_transform(train['provincia'])
train = train.join(one_hot_encoded)
del train['provincia']

one_hot_encoded = one_hot_enc.transform(test['provincia'])
test = test.join(one_hot_encoded)
del test['provincia']

In [205]:
one_hot_enc = ce.OneHotEncoder()
one_hot_encoded = one_hot_enc.fit_transform(train['tipodepropiedad'])
train = train.join(one_hot_encoded)
del train['tipodepropiedad']

one_hot_encoded = one_hot_enc.transform(test['tipodepropiedad'])
test = test.join(one_hot_encoded)
del test['tipodepropiedad']

### Target Encoding

In [206]:
print(train.shape)
print(test.shape)

(240000, 76)
(60000, 75)


In [207]:
X = train.drop('precio', axis = 1)
y = train['precio']
target_enc = ce.TargetEncoder(cols=['idzona', 'ciudad'])
train = target_enc.fit_transform(X, y)
train['precio'] = y

In [208]:
test = target_enc.transform(test)

In [209]:
print(train.shape)
print(test.shape)

(240000, 76)
(60000, 75)


### Agregando Features

#### Avenida

In [210]:
def avenida(st):
    return ("av." in st) or ("avenida" in st) or  ("av " in st)

In [211]:
test = test.astype({"direccion":"str"})
train = train.astype({"direccion":"str"})

test["direccion"] = test["direccion"].apply(str.lower)
test["avenida"] = test["direccion"].map(avenida)

train["direccion"] = train["direccion"].apply(str.lower)
train["avenida"] = train["direccion"].map(avenida)

test = test.astype({"avenida":"uint8"})
train = train.astype({"avenida":"uint8"})

#### Cantidad de amenities

In [212]:
train['cant_amenities'] = train['usosmultiples'] + train['piscina'] + train['gimnasio']
test['cant_amenities'] = test['usosmultiples'] + test['piscina'] + test['gimnasio']

#### Año de publicación

In [213]:
train['fecha'] = pd.to_datetime(train['fecha'])
train['ano'] = train['fecha'].dt.year

test['fecha'] = pd.to_datetime(test['fecha'])
test['ano'] = test['fecha'].dt.year

#### De la descripción

In [214]:
buena_ubicacion = ['buena ubicacion', 'buena ubicación', 'excelente ubicación', 'excelente ubicacion', 'bien ubicada', 'bien ubicado']
train['buena_ubicacion'] = train['descripcion'].apply(lambda x: 1 if any(word in x for word in buena_ubicacion ) else 0)
test['buena_ubicacion'] = test['descripcion'].apply(lambda x: 1 if any(word in x for word in buena_ubicacion ) else 0)

In [215]:
luminoso = ['luminoso', 'luminosa']
train['luminoso'] = train['descripcion'].apply(lambda x: 1 if any(word in x for word in luminoso ) else 0)
test['luminoso'] = test['descripcion'].apply(lambda x: 1 if any(word in x for word in luminoso ) else 0)

In [216]:
jardin = ['jardin', 'jardín', 'parque', 'patio']
train['jardin'] = train['descripcion'].apply(lambda x: 1 if any(word in x for word in luminoso ) else 0)
test['jardin'] = test['descripcion'].apply(lambda x: 1 if any(word in x for word in luminoso ) else 0)

In [217]:
balcon = ['balcon', 'balcón', 'terraza']
train['balcon'] = train['descripcion'].apply(lambda x: 1 if any(word in x for word in luminoso ) else 0)
test['balcon'] = test['descripcion'].apply(lambda x: 1 if any(word in x for word in luminoso ) else 0)

#### Cercanías

In [218]:
train['centroscomercialescercanos'] = train['centroscomercialescercanos'].map({1:2, 0:0})
test['centroscomercialescercanos'] = test['centroscomercialescercanos'].map({1:2, 0:0})

In [219]:
train['cercanias'] = train['centroscomercialescercanos'] + train['escuelascercanas']
train['cercanias'] = train['cercanias'].astype(str)

test['cercanias'] = test['centroscomercialescercanos'] + test['escuelascercanas']
test['cercanias'] = test['cercanias'].astype(str)

In [220]:
train = train.join(pd.get_dummies(train['cercanias'], prefix = 'cercania'))
del train['cercanias']

test = test.join(pd.get_dummies(test['cercanias'], prefix = 'cercania'))
del test['cercanias']

### Borrando columnas innecesarias

In [221]:
del train['lat']
del train['lng']
del train['descripcion']
del train['titulo']
del train['direccion']
del train['fecha']

del test['lat']
del test['lng']
del test['descripcion']
del test['titulo']
del test['direccion']
del test['fecha']

In [222]:
print(train.shape)
print(test.shape)

(240000, 81)
(60000, 80)


### Dividiendo por año

In [223]:
train_2012 = train[train['ano'] == 2012]
train_2013 = train[train['ano'] == 2013]
train_2014 = train[train['ano'] == 2014]
train_2015 = train[train['ano'] == 2015]
train_2016 = train[train['ano'] == 2016]

test_2012 = test[test['ano'] == 2012]
test_2013 = test[test['ano'] == 2013]
test_2014 = test[test['ano'] == 2014]
test_2015 = test[test['ano'] == 2015]
test_2016 = test[test['ano'] == 2016]

## KNN

In [27]:
def KNN(train, param_grid):
    KNN = KNeighborsRegressor()
    X = train.drop(['precio'], axis=1)
    y = train['precio']
    grid = GridSearchCV(KNN, param_grid, cv=5, scoring='neg_mean_absolute_error')
    grid.fit(X, y)
    KNN = grid.best_estimator_
    print(grid.best_params_)
    print(grid.best_score_)
    return KNN

In [None]:
k_valores = list(range(1, 21))
param_grid = dict(n_neighbors=k_valores)
KNN_2012 = KNN(train_2012, param_grid)
KNN_2013 = KNN(train_2013, param_grid)
KNN_2014 = KNN(train_2014, param_grid)
KNN_2015 = KNN(train_2015, param_grid)
KNN_2016 = KNN(train_2016, param_grid)

## Definiendo tamaños de los sets

In [224]:
def train_set(train,test):
    X = train.drop(['precio'], axis=1)
    y = train['precio']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=1)
    print("Train shapes: X = " + str(X_train.shape) + " y = " + str(y_train.shape))
    print("Validation shapes: X = " + str(X_val.shape) +  " y = " + str(y_val.shape))
    print("Test shape: " + str(test.shape))
    return X, y, X_val, y_val

In [225]:
x_2012, y_2012, x_2012_val, y_2012_val = train_set(train_2012,test_2012)
x_2013, y_2013, x_2013_val, y_2013_val = train_set(train_2013,test_2013)
x_2014, y_2014, x_2014_val, y_2014_val = train_set(train_2014,test_2014)
x_2015, y_2015, x_2015_val, y_2015_val = train_set(train_2015,test_2015)
x_2016, y_2016, x_2016_val, y_2016_val = train_set(train_2016,test_2016)

Train shapes: X = (17650, 80) y = (17650,)
Validation shapes: X = (5884, 80) y = (5884,)
Test shape: (5899, 80)
Train shapes: X = (22789, 80) y = (22789,)
Validation shapes: X = (7597, 80) y = (7597,)
Test shape: (7592, 80)
Train shapes: X = (30429, 80) y = (30429,)
Validation shapes: X = (10143, 80) y = (10143,)
Test shape: (10018, 80)
Train shapes: X = (38602, 80) y = (38602,)
Validation shapes: X = (12868, 80) y = (12868,)
Test shape: (13017, 80)
Train shapes: X = (70528, 80) y = (70528,)
Validation shapes: X = (23510, 80) y = (23510,)
Test shape: (23474, 80)


## LIGHTGBM

### Tuneo de hiperparámetros

In [226]:
import lightgbm
def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight, learning_rate, seed,n_estimators):
    params = {}
    params['metric'] = 'rmse'
    params["seed"] = int(seed)
    params["learning_rate"] = learning_rate
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['max_depth'] = int(round(max_depth))
    params['min_split_gain'] = min_split_gain
    params['min_child_weight'] = min_child_weight
    params["n_estimators"] = int(n_estimators)
    cv_result = lightgbm.cv(params, dtrain, nfold=5, num_boost_round=100)
    # Used around 1000 boosting rounds in the full model
    #cv_result = lightgbm.cv(params, dtrain, num_boost_round=100, nfold=3, seed=20)    
    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return max(cv_result['rmse-mean'])

In [227]:
def optimize_lgbm(sett):
    dtrain = sett
    lightgbm_bo = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                        'feature_fraction': (0.1, 0.9),
                                        'bagging_fraction': (0.8, 1),
                                        'max_depth': (5, 8.99),
                                        'min_split_gain': (0.001, 0.1),
                                        'min_child_weight': (5, 50),
                                        "learning_rate" : (0.01, 0.07),
                                        "seed":(10,50),
                                        "n_estimators":(100,1000)          
                                        }, random_state=0)
    # Se probo con 3 puntos iniciales y 10 iteraciones, deberiamos probar con mas en ambas
    # La prox iteracion en casa probar 100 y 10000
    lightgbm_bo.maximize(init_points=3, n_iter=10, acq='ei')
    params = lightgbm_bo.max['params']
    params['max_depth'] = int(params['max_depth'])
    params["seed"] = int(params["seed"])
    params["n_estimators"] = int(params["n_estimators"])
    return params

In [231]:
feature_set = list(x_2012.columns)
dtrain_2012 = lightgbm.Dataset(x_2012, label=y_2012,feature_name=feature_set)
feature_set = list(x_2013.columns)
dtrain_2013 = lightgbm.Dataset(x_2013, label=y_2013,feature_name=feature_set)
feature_set = list(x_2014.columns)
dtrain_2014 = lightgbm.Dataset(x_2014, label=y_2014,feature_name=feature_set)
feature_set = list(x_2015.columns)
dtrain_2015 = lightgbm.Dataset(x_2015, label=y_2015,feature_name=feature_set)
feature_set = list(x_2016.columns)
dtrain_2016 = lightgbm.Dataset(x_2016, label=y_2016,feature_name=feature_set)


In [232]:
dtrain = dtrain_2012
params_2012 = optimize_lgbm(dtrain_2012)
dtrain = dtrain_2013
params_2013 = optimize_lgbm(dtrain_2013)
dtrain = dtrain_2014
params_2014 = optimize_lgbm(dtrain_2014)
dtrain = dtrain_2015
params_2015 = optimize_lgbm(dtrain_2015)
dtrain = dtrain_2016
params_2016 = optimize_lgbm(dtrain_2016)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... |   seed    |
-------------------------------------------------------------------------------------------------------------------------------------




| [0m 1       [0m | [0m 1.727e+0[0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 0.04617 [0m | [0m 7.174   [0m | [0m 24.06   [0m | [0m 0.06494 [0m | [0m 493.8   [0m | [0m 42.73   [0m | [0m 48.55   [0m |




| [95m 2       [0m | [95m 1.739e+0[0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 0.04173 [0m | [95m 7.266   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 178.4   [0m | [95m 24.42   [0m | [95m 43.3    [0m |




| [0m 3       [0m | [0m 1.702e+0[0m | [0m 0.9556  [0m | [0m 0.796   [0m | [0m 0.06872 [0m | [0m 8.189   [0m | [0m 25.77   [0m | [0m 0.07827 [0m | [0m 206.4   [0m | [0m 37.44   [0m | [0m 15.73   [0m |




| [95m 4       [0m | [95m 1.772e+0[0m | [95m 0.8844  [0m | [95m 0.2402  [0m | [95m 0.01921 [0m | [95m 8.225   [0m | [95m 9.211   [0m | [95m 0.001068[0m | [95m 749.3   [0m | [95m 30.28   [0m | [95m 39.33   [0m |




| [0m 5       [0m | [0m 1.761e+0[0m | [0m 0.8595  [0m | [0m 0.843   [0m | [0m 0.01736 [0m | [0m 5.968   [0m | [0m 23.52   [0m | [0m 0.03308 [0m | [0m 615.2   [0m | [0m 28.82   [0m | [0m 47.88   [0m |




| [0m 6       [0m | [0m 1.755e+0[0m | [0m 0.9116  [0m | [0m 0.6652  [0m | [0m 0.0291  [0m | [0m 6.534   [0m | [0m 18.14   [0m | [0m 0.06617 [0m | [0m 609.8   [0m | [0m 29.32   [0m | [0m 45.58   [0m |




| [0m 7       [0m | [0m 1.711e+0[0m | [0m 0.9396  [0m | [0m 0.2106  [0m | [0m 0.06445 [0m | [0m 7.494   [0m | [0m 34.7    [0m | [0m 0.06542 [0m | [0m 662.8   [0m | [0m 28.8    [0m | [0m 36.39   [0m |




| [0m 8       [0m | [0m 1.722e+0[0m | [0m 0.9603  [0m | [0m 0.865   [0m | [0m 0.05059 [0m | [0m 6.625   [0m | [0m 21.06   [0m | [0m 0.02608 [0m | [0m 141.6   [0m | [0m 36.43   [0m | [0m 18.61   [0m |




| [0m 9       [0m | [0m 1.713e+0[0m | [0m 0.9577  [0m | [0m 0.6611  [0m | [0m 0.05888 [0m | [0m 7.25    [0m | [0m 41.3    [0m | [0m 0.07923 [0m | [0m 836.8   [0m | [0m 32.23   [0m | [0m 37.26   [0m |




| [0m 10      [0m | [0m 1.758e+0[0m | [0m 0.8575  [0m | [0m 0.2975  [0m | [0m 0.03868 [0m | [0m 5.398   [0m | [0m 40.28   [0m | [0m 0.09882 [0m | [0m 179.3   [0m | [0m 25.36   [0m | [0m 38.65   [0m |




| [0m 11      [0m | [0m 1.759e+0[0m | [0m 0.951   [0m | [0m 0.6561  [0m | [0m 0.02132 [0m | [0m 8.973   [0m | [0m 41.21   [0m | [0m 0.04594 [0m | [0m 289.4   [0m | [0m 26.45   [0m | [0m 28.42   [0m |




| [0m 12      [0m | [0m 1.766e+0[0m | [0m 0.8457  [0m | [0m 0.4622  [0m | [0m 0.01581 [0m | [0m 6.111   [0m | [0m 28.36   [0m | [0m 0.05396 [0m | [0m 754.0   [0m | [0m 38.65   [0m | [0m 44.3    [0m |




| [0m 13      [0m | [0m 1.73e+06[0m | [0m 0.853   [0m | [0m 0.4545  [0m | [0m 0.0522  [0m | [0m 5.427   [0m | [0m 38.43   [0m | [0m 0.001316[0m | [0m 103.9   [0m | [0m 33.05   [0m | [0m 13.05   [0m |
|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... |   seed    |
-------------------------------------------------------------------------------------------------------------------------------------




| [0m 1       [0m | [0m 1.787e+0[0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 0.04617 [0m | [0m 7.174   [0m | [0m 24.06   [0m | [0m 0.06494 [0m | [0m 493.8   [0m | [0m 42.73   [0m | [0m 48.55   [0m |




| [95m 2       [0m | [95m 1.801e+0[0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 0.04173 [0m | [95m 7.266   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 178.4   [0m | [95m 24.42   [0m | [95m 43.3    [0m |




| [0m 3       [0m | [0m 1.76e+06[0m | [0m 0.9556  [0m | [0m 0.796   [0m | [0m 0.06872 [0m | [0m 8.189   [0m | [0m 25.77   [0m | [0m 0.07827 [0m | [0m 206.4   [0m | [0m 37.44   [0m | [0m 15.73   [0m |




| [95m 4       [0m | [95m 1.839e+0[0m | [95m 0.8844  [0m | [95m 0.2402  [0m | [95m 0.01921 [0m | [95m 8.225   [0m | [95m 9.211   [0m | [95m 0.001068[0m | [95m 749.3   [0m | [95m 30.28   [0m | [95m 39.33   [0m |




| [0m 5       [0m | [0m 1.826e+0[0m | [0m 0.8595  [0m | [0m 0.843   [0m | [0m 0.01736 [0m | [0m 5.968   [0m | [0m 23.52   [0m | [0m 0.03308 [0m | [0m 615.2   [0m | [0m 28.82   [0m | [0m 47.88   [0m |




| [0m 6       [0m | [0m 1.819e+0[0m | [0m 0.9116  [0m | [0m 0.6652  [0m | [0m 0.0291  [0m | [0m 6.534   [0m | [0m 18.14   [0m | [0m 0.06617 [0m | [0m 609.8   [0m | [0m 29.32   [0m | [0m 45.58   [0m |




| [0m 7       [0m | [0m 1.768e+0[0m | [0m 0.9396  [0m | [0m 0.2106  [0m | [0m 0.06445 [0m | [0m 7.494   [0m | [0m 34.7    [0m | [0m 0.06542 [0m | [0m 662.8   [0m | [0m 28.8    [0m | [0m 36.39   [0m |




| [0m 8       [0m | [0m 1.783e+0[0m | [0m 0.9603  [0m | [0m 0.865   [0m | [0m 0.05059 [0m | [0m 6.625   [0m | [0m 21.06   [0m | [0m 0.02608 [0m | [0m 141.6   [0m | [0m 36.43   [0m | [0m 18.61   [0m |




| [0m 9       [0m | [0m 1.774e+0[0m | [0m 0.9577  [0m | [0m 0.6611  [0m | [0m 0.05888 [0m | [0m 7.25    [0m | [0m 41.3    [0m | [0m 0.07923 [0m | [0m 836.8   [0m | [0m 32.23   [0m | [0m 37.26   [0m |




| [0m 10      [0m | [0m 1.826e+0[0m | [0m 0.8575  [0m | [0m 0.2975  [0m | [0m 0.03868 [0m | [0m 5.398   [0m | [0m 40.28   [0m | [0m 0.09882 [0m | [0m 179.3   [0m | [0m 25.36   [0m | [0m 38.65   [0m |




| [0m 11      [0m | [0m 1.823e+0[0m | [0m 0.951   [0m | [0m 0.6561  [0m | [0m 0.02132 [0m | [0m 8.973   [0m | [0m 41.21   [0m | [0m 0.04594 [0m | [0m 289.4   [0m | [0m 26.45   [0m | [0m 28.42   [0m |




| [0m 12      [0m | [0m 1.83e+06[0m | [0m 0.8457  [0m | [0m 0.4622  [0m | [0m 0.01581 [0m | [0m 6.111   [0m | [0m 28.36   [0m | [0m 0.05396 [0m | [0m 754.0   [0m | [0m 38.65   [0m | [0m 44.3    [0m |




| [0m 13      [0m | [0m 1.791e+0[0m | [0m 0.853   [0m | [0m 0.4545  [0m | [0m 0.0522  [0m | [0m 5.427   [0m | [0m 38.43   [0m | [0m 0.001316[0m | [0m 103.9   [0m | [0m 33.05   [0m | [0m 13.05   [0m |
|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... |   seed    |
-------------------------------------------------------------------------------------------------------------------------------------




| [0m 1       [0m | [0m 1.921e+0[0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 0.04617 [0m | [0m 7.174   [0m | [0m 24.06   [0m | [0m 0.06494 [0m | [0m 493.8   [0m | [0m 42.73   [0m | [0m 48.55   [0m |




| [95m 2       [0m | [95m 1.934e+0[0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 0.04173 [0m | [95m 7.266   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 178.4   [0m | [95m 24.42   [0m | [95m 43.3    [0m |




| [0m 3       [0m | [0m 1.893e+0[0m | [0m 0.9556  [0m | [0m 0.796   [0m | [0m 0.06872 [0m | [0m 8.189   [0m | [0m 25.77   [0m | [0m 0.07827 [0m | [0m 206.4   [0m | [0m 37.44   [0m | [0m 15.73   [0m |




| [95m 4       [0m | [95m 1.98e+06[0m | [95m 0.8844  [0m | [95m 0.2402  [0m | [95m 0.01921 [0m | [95m 8.225   [0m | [95m 9.211   [0m | [95m 0.001068[0m | [95m 749.3   [0m | [95m 30.28   [0m | [95m 39.33   [0m |




| [0m 5       [0m | [0m 1.959e+0[0m | [0m 0.8595  [0m | [0m 0.843   [0m | [0m 0.01736 [0m | [0m 5.968   [0m | [0m 23.52   [0m | [0m 0.03308 [0m | [0m 615.2   [0m | [0m 28.82   [0m | [0m 47.88   [0m |




| [0m 6       [0m | [0m 1.953e+0[0m | [0m 0.9116  [0m | [0m 0.6652  [0m | [0m 0.0291  [0m | [0m 6.534   [0m | [0m 18.14   [0m | [0m 0.06617 [0m | [0m 609.8   [0m | [0m 29.32   [0m | [0m 45.58   [0m |




| [0m 7       [0m | [0m 1.913e+0[0m | [0m 0.9396  [0m | [0m 0.2106  [0m | [0m 0.06445 [0m | [0m 7.494   [0m | [0m 34.7    [0m | [0m 0.06542 [0m | [0m 662.8   [0m | [0m 28.8    [0m | [0m 36.39   [0m |




| [0m 8       [0m | [0m 1.916e+0[0m | [0m 0.9603  [0m | [0m 0.865   [0m | [0m 0.05059 [0m | [0m 6.625   [0m | [0m 21.06   [0m | [0m 0.02608 [0m | [0m 141.6   [0m | [0m 36.43   [0m | [0m 18.61   [0m |




| [0m 9       [0m | [0m 1.907e+0[0m | [0m 0.9577  [0m | [0m 0.6611  [0m | [0m 0.05888 [0m | [0m 7.25    [0m | [0m 41.3    [0m | [0m 0.07923 [0m | [0m 836.8   [0m | [0m 32.23   [0m | [0m 37.26   [0m |




| [0m 10      [0m | [0m 1.956e+0[0m | [0m 0.8575  [0m | [0m 0.2975  [0m | [0m 0.03868 [0m | [0m 5.398   [0m | [0m 40.28   [0m | [0m 0.09882 [0m | [0m 179.3   [0m | [0m 25.36   [0m | [0m 38.65   [0m |




| [0m 11      [0m | [0m 1.956e+0[0m | [0m 0.951   [0m | [0m 0.6561  [0m | [0m 0.02132 [0m | [0m 8.973   [0m | [0m 41.21   [0m | [0m 0.04594 [0m | [0m 289.4   [0m | [0m 26.45   [0m | [0m 28.42   [0m |




| [0m 12      [0m | [0m 1.963e+0[0m | [0m 0.8457  [0m | [0m 0.4622  [0m | [0m 0.01581 [0m | [0m 6.111   [0m | [0m 28.36   [0m | [0m 0.05396 [0m | [0m 754.0   [0m | [0m 38.65   [0m | [0m 44.3    [0m |




| [0m 13      [0m | [0m 1.921e+0[0m | [0m 0.853   [0m | [0m 0.4545  [0m | [0m 0.0522  [0m | [0m 5.427   [0m | [0m 38.43   [0m | [0m 0.001316[0m | [0m 103.9   [0m | [0m 33.05   [0m | [0m 13.05   [0m |
|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... |   seed    |
-------------------------------------------------------------------------------------------------------------------------------------




| [0m 1       [0m | [0m 2.121e+0[0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 0.04617 [0m | [0m 7.174   [0m | [0m 24.06   [0m | [0m 0.06494 [0m | [0m 493.8   [0m | [0m 42.73   [0m | [0m 48.55   [0m |




| [95m 2       [0m | [95m 2.134e+0[0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 0.04173 [0m | [95m 7.266   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 178.4   [0m | [95m 24.42   [0m | [95m 43.3    [0m |




| [0m 3       [0m | [0m 2.089e+0[0m | [0m 0.9556  [0m | [0m 0.796   [0m | [0m 0.06872 [0m | [0m 8.189   [0m | [0m 25.77   [0m | [0m 0.07827 [0m | [0m 206.4   [0m | [0m 37.44   [0m | [0m 15.73   [0m |




| [95m 4       [0m | [95m 2.166e+0[0m | [95m 0.8844  [0m | [95m 0.2402  [0m | [95m 0.01921 [0m | [95m 8.225   [0m | [95m 9.211   [0m | [95m 0.001068[0m | [95m 749.3   [0m | [95m 30.28   [0m | [95m 39.33   [0m |




| [0m 5       [0m | [0m 2.164e+0[0m | [0m 0.8595  [0m | [0m 0.843   [0m | [0m 0.01736 [0m | [0m 5.968   [0m | [0m 23.52   [0m | [0m 0.03308 [0m | [0m 615.2   [0m | [0m 28.82   [0m | [0m 47.88   [0m |




| [0m 6       [0m | [0m 2.154e+0[0m | [0m 0.9116  [0m | [0m 0.6652  [0m | [0m 0.0291  [0m | [0m 6.534   [0m | [0m 18.14   [0m | [0m 0.06617 [0m | [0m 609.8   [0m | [0m 29.32   [0m | [0m 45.58   [0m |




| [0m 7       [0m | [0m 2.146e+0[0m | [0m 0.9396  [0m | [0m 0.2106  [0m | [0m 0.06445 [0m | [0m 7.494   [0m | [0m 34.7    [0m | [0m 0.06542 [0m | [0m 662.8   [0m | [0m 28.8    [0m | [0m 36.39   [0m |




| [0m 8       [0m | [0m 2.116e+0[0m | [0m 0.9603  [0m | [0m 0.865   [0m | [0m 0.05059 [0m | [0m 6.625   [0m | [0m 21.06   [0m | [0m 0.02608 [0m | [0m 141.6   [0m | [0m 36.43   [0m | [0m 18.61   [0m |




| [0m 9       [0m | [0m 2.105e+0[0m | [0m 0.9577  [0m | [0m 0.6611  [0m | [0m 0.05888 [0m | [0m 7.25    [0m | [0m 41.3    [0m | [0m 0.07923 [0m | [0m 836.8   [0m | [0m 32.23   [0m | [0m 37.26   [0m |




| [0m 10      [0m | [0m 2.162e+0[0m | [0m 0.8575  [0m | [0m 0.2975  [0m | [0m 0.03868 [0m | [0m 5.398   [0m | [0m 40.28   [0m | [0m 0.09882 [0m | [0m 179.3   [0m | [0m 25.36   [0m | [0m 38.65   [0m |




| [0m 11      [0m | [0m 2.16e+06[0m | [0m 0.951   [0m | [0m 0.6561  [0m | [0m 0.02132 [0m | [0m 8.973   [0m | [0m 41.21   [0m | [0m 0.04594 [0m | [0m 289.4   [0m | [0m 26.45   [0m | [0m 28.42   [0m |




| [95m 12      [0m | [95m 2.168e+0[0m | [95m 0.8457  [0m | [95m 0.4622  [0m | [95m 0.01581 [0m | [95m 6.111   [0m | [95m 28.36   [0m | [95m 0.05396 [0m | [95m 754.0   [0m | [95m 38.65   [0m | [95m 44.3    [0m |




| [0m 13      [0m | [0m 2.122e+0[0m | [0m 0.853   [0m | [0m 0.4545  [0m | [0m 0.0522  [0m | [0m 5.427   [0m | [0m 38.43   [0m | [0m 0.001316[0m | [0m 103.9   [0m | [0m 33.05   [0m | [0m 13.05   [0m |
|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... |   seed    |
-------------------------------------------------------------------------------------------------------------------------------------




| [0m 1       [0m | [0m 2.254e+0[0m | [0m 0.9098  [0m | [0m 0.6722  [0m | [0m 0.04617 [0m | [0m 7.174   [0m | [0m 24.06   [0m | [0m 0.06494 [0m | [0m 493.8   [0m | [0m 42.73   [0m | [0m 48.55   [0m |




| [95m 2       [0m | [95m 2.267e+0[0m | [95m 0.8767  [0m | [95m 0.7334  [0m | [95m 0.04173 [0m | [95m 7.266   [0m | [95m 46.65   [0m | [95m 0.008033[0m | [95m 178.4   [0m | [95m 24.42   [0m | [95m 43.3    [0m |




| [0m 3       [0m | [0m 2.22e+06[0m | [0m 0.9556  [0m | [0m 0.796   [0m | [0m 0.06872 [0m | [0m 8.189   [0m | [0m 25.77   [0m | [0m 0.07827 [0m | [0m 206.4   [0m | [0m 37.44   [0m | [0m 15.73   [0m |




| [95m 4       [0m | [95m 2.313e+0[0m | [95m 0.8844  [0m | [95m 0.2402  [0m | [95m 0.01921 [0m | [95m 8.225   [0m | [95m 9.211   [0m | [95m 0.001068[0m | [95m 749.3   [0m | [95m 30.28   [0m | [95m 39.33   [0m |




| [0m 5       [0m | [0m 2.299e+0[0m | [0m 0.8595  [0m | [0m 0.843   [0m | [0m 0.01736 [0m | [0m 5.968   [0m | [0m 23.52   [0m | [0m 0.03308 [0m | [0m 615.2   [0m | [0m 28.82   [0m | [0m 47.88   [0m |




| [0m 6       [0m | [0m 2.291e+0[0m | [0m 0.9116  [0m | [0m 0.6652  [0m | [0m 0.0291  [0m | [0m 6.534   [0m | [0m 18.14   [0m | [0m 0.06617 [0m | [0m 609.8   [0m | [0m 29.32   [0m | [0m 45.58   [0m |




| [0m 7       [0m | [0m 2.259e+0[0m | [0m 0.9396  [0m | [0m 0.2106  [0m | [0m 0.06445 [0m | [0m 7.494   [0m | [0m 34.7    [0m | [0m 0.06542 [0m | [0m 662.8   [0m | [0m 28.8    [0m | [0m 36.39   [0m |




| [0m 8       [0m | [0m 2.248e+0[0m | [0m 0.9603  [0m | [0m 0.865   [0m | [0m 0.05059 [0m | [0m 6.625   [0m | [0m 21.06   [0m | [0m 0.02608 [0m | [0m 141.6   [0m | [0m 36.43   [0m | [0m 18.61   [0m |




| [0m 9       [0m | [0m 2.237e+0[0m | [0m 0.9577  [0m | [0m 0.6611  [0m | [0m 0.05888 [0m | [0m 7.25    [0m | [0m 41.3    [0m | [0m 0.07923 [0m | [0m 836.8   [0m | [0m 32.23   [0m | [0m 37.26   [0m |




| [0m 10      [0m | [0m 2.297e+0[0m | [0m 0.8575  [0m | [0m 0.2975  [0m | [0m 0.03868 [0m | [0m 5.398   [0m | [0m 40.28   [0m | [0m 0.09882 [0m | [0m 179.3   [0m | [0m 25.36   [0m | [0m 38.65   [0m |




| [0m 11      [0m | [0m 2.295e+0[0m | [0m 0.951   [0m | [0m 0.6561  [0m | [0m 0.02132 [0m | [0m 8.973   [0m | [0m 41.21   [0m | [0m 0.04594 [0m | [0m 289.4   [0m | [0m 26.45   [0m | [0m 28.42   [0m |




| [0m 12      [0m | [0m 2.303e+0[0m | [0m 0.8457  [0m | [0m 0.4622  [0m | [0m 0.01581 [0m | [0m 6.111   [0m | [0m 28.36   [0m | [0m 0.05396 [0m | [0m 754.0   [0m | [0m 38.65   [0m | [0m 44.3    [0m |




| [0m 13      [0m | [0m 2.253e+0[0m | [0m 0.853   [0m | [0m 0.4545  [0m | [0m 0.0522  [0m | [0m 5.427   [0m | [0m 38.43   [0m | [0m 0.001316[0m | [0m 103.9   [0m | [0m 33.05   [0m | [0m 13.05   [0m |


In [234]:
print(params_2016)

{'bagging_fraction': 0.8844396091027046, 'feature_fraction': 0.2401644436304009, 'learning_rate': 0.019208260134863285, 'num_leaves': 30.282236771792782, 'n_estimators': 749, 'min_split_gain': 0.0010676751915985497, 'max_depth': 8, 'min_child_weight': 9.211388287076087, 'seed': 39}


In [240]:
from lightgbm import LGBMRegressor as lgbm
def fabrica_lgbm(params):
    best_lgbm_model = lgbm(#colsample_bytree=params["colsample_bytree"],
                 bagging_fracion=params["bagging_fraction"],
                 num_leaves=int(params["num_leaves"]),                 
                 learning_rate=params["learning_rate"],
                 max_depth=params["max_depth"],
                 min_child_weight=params["min_child_weight"],
                 n_estimators=params["n_estimators"],                                                                    
                 #reg_alpha=params["reg_alpha"],
                 #reg_lambda=params["reg_lambda"],
                 #subsample=params["subsample"],
                 seed=params["seed"])
    
    return best_lgbm_model

In [241]:
lgbm_2012 = fabrica_lgbm(params_2012)
lgbm_2013 = fabrica_lgbm(params_2013)
lgbm_2014 = fabrica_lgbm(params_2014)
lgbm_2015 = fabrica_lgbm(params_2015)
lgbm_2016 = fabrica_lgbm(params_2016)

In [243]:
lgbm_2012.fit(x_2012,y_2012)
lgbm_pred_val = lgbm_2012.predict(x_2012_val)
lgbm_mae = MAE(y_2012_val, lgbm_pred_val)
print("MAE XGB: " + str(lgbm_mae))

MAE XGB: 406024.91973549256


In [244]:
lgbm_2013.fit(x_2013,y_2013)
lgbm_pred_val = lgbm_2013.predict(x_2013_val)
lgbm_mae = MAE(y_2013_val, lgbm_pred_val)
print("MAE XGB: " + str(lgbm_mae))

MAE XGB: 407606.33905984863


In [245]:
lgbm_2014.fit(x_2014,y_2014)
lgbm_pred_val = lgbm_2014.predict(x_2014_val)
lgbm_mae = MAE(y_2014_val, lgbm_pred_val)
print("MAE XGB: " + str(lgbm_mae))

MAE XGB: 482616.50925858127


In [246]:
lgbm_2015.fit(x_2015,y_2015)
lgbm_pred_val = lgbm_2015.predict(x_2015_val)
lgbm_mae = MAE(y_2015_val, lgbm_pred_val)
print("MAE XGB: " + str(lgbm_mae))

MAE XGB: 534111.9699979337


In [247]:
lgbm_2016.fit(x_2016,y_2016)
lgbm_pred_val = lgbm_2016.predict(x_2016_val)
lgbm_mae = MAE(y_2016_val, lgbm_pred_val)
print("MAE XGB: " + str(lgbm_mae))

MAE XGB: 595856.9186336667


## XGBoost

### Tuneo de hiperparámetros

In [254]:
import xgboost as xgb

dtrain_2012 = xgb.DMatrix(x_2012, label=y_2012)
dtrain_2013 = xgb.DMatrix(x_2013, label=y_2013)
dtrain_2014 = xgb.DMatrix(x_2014, label=y_2014)
dtrain_2015 = xgb.DMatrix(x_2015, label=y_2015)
dtrain_2016 = xgb.DMatrix(x_2016, label=y_2016)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [258]:
def xgb_evaluate(max_depth, gamma, colsample_bytree,seed,min_child_weight,n_estimators,
                  reg_alpha,reg_lambda,subsample,learning_rate):
    params = {'eval_metric': 'rmse',
              'max_depth': int(max_depth),
              'subsample': subsample,
              'eta': 0.1,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree,
              "seed": int(seed),
              "min_child_weight": min_child_weight,
              "n_estimators": n_estimators,
              "reg_alpha": reg_alpha,
              "reg_lambda": reg_lambda,
              "learning_rate": learning_rate
             }
    # Used around 1000 boosting rounds in the full model
    cv_result = xgb.cv(params, dtrain, num_boost_round=100, nfold=3)    
    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [259]:
def optimize(sett):
    dtrain = sett
    xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 7), 
                                             'gamma': (0, 1),
                                             'colsample_bytree': (0.3, 0.9),
                                             "seed": (10,50),
                                             "min_child_weight": (0.4,1.5),
                                             "n_estimators":(100,10000),                                                                    
                                             "reg_alpha":(0.2,0.75),
                                             "reg_lambda": (0.2,0.8),
                                             "subsample" : (0.3, 0.8),
                                             "learning_rate": (0.01,0.07),
                                            })
    # Se probo con 3 puntos iniciales y 10 iteraciones, deberiamos probar con mas en ambas
    # La prox iteracion en casa probar 100 y 10000
    xgb_bo.maximize(init_points=3, n_iter=10, acq='ei')
    params = xgb_bo.max['params']
    params['max_depth'] = int(params['max_depth'])
    params["seed"] = int(params["seed"])
    params["n_estimators"] = int(params["n_estimators"])
    return params

In [260]:
dtrain = dtrain_2012
params_2012 = optimize(dtrain_2012)
dtrain = dtrain_2013
params_2013 = optimize(dtrain_2013)
dtrain = dtrain_2014
params_2014 = optimize(dtrain_2014)
dtrain = dtrain_2015
params_2015 = optimize(dtrain_2015)
dtrain = dtrain_2016
params_2016 = optimize(dtrain_2016)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... |   seed    | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-8.862e+0[0m | [0m 0.4441  [0m | [0m 0.6882  [0m | [0m 0.04941 [0m | [0m 5.864   [0m | [0m 1.301   [0m | [0m 8.159e+0[0m | [0m 0.7123  [0m | [0m 0.3806  [0m | [0m 41.06   [0m | [0m 0.3587  [0m |
| [95m 2       [0m | [95m-8.73e+05[0m | [95m 0.6479  [0m | [95m 0.3529  [0m | [95m 0.04546 [0m | [95m 5.359   [0m | [95m 1.227   [0m | [95m 7.129e+0[0m | [95m 0.2772  [0m | [95m 0.4825  [0m | [95m 25.03   [0m | [95m 0.3723  [0m |
| [0m 3       [0m | [0m-8.795e+0[0m | [0m 0.7166  [0m | [0m 0.1181  [0m | [0m 0.06087 [0m | [0m 4.716   [0m | [0m 1.335   [0m | [0m 2.768e+0[0m | [0m 0.3284  [0m | [0m 0.6204  [0m | [0m 28.24

| [0m 4       [0m | [0m-9.728e+0[0m | [0m 0.4439  [0m | [0m 0.8922  [0m | [0m 0.0427  [0m | [0m 5.509   [0m | [0m 1.03    [0m | [0m 965.0   [0m | [0m 0.749   [0m | [0m 0.7312  [0m | [0m 21.1    [0m | [0m 0.6454  [0m |
| [95m 5       [0m | [95m-9.232e+0[0m | [95m 0.717   [0m | [95m 0.5619  [0m | [95m 0.04889 [0m | [95m 6.527   [0m | [95m 1.203   [0m | [95m 111.3   [0m | [95m 0.6511  [0m | [95m 0.3612  [0m | [95m 19.06   [0m | [95m 0.7976  [0m |
| [0m 6       [0m | [0m-9.649e+0[0m | [0m 0.5067  [0m | [0m 0.2845  [0m | [0m 0.04345 [0m | [0m 5.758   [0m | [0m 1.144   [0m | [0m 5.929e+0[0m | [0m 0.2273  [0m | [0m 0.2331  [0m | [0m 28.78   [0m | [0m 0.5094  [0m |
| [0m 7       [0m | [0m-1.019e+0[0m | [0m 0.4393  [0m | [0m 0.4601  [0m | [0m 0.06556 [0m | [0m 3.353   [0m | [0m 1.277   [0m | [0m 2.859e+0[0m | [0m 0.2258  [0m | [0m 0.3602  [0m | [0m 19.75   [0m | [0m 0.3014  [0m |
| [0m 8       [0m 

| [0m 9       [0m | [0m-1.505e+0[0m | [0m 0.3946  [0m | [0m 0.7737  [0m | [0m 0.01477 [0m | [0m 6.03    [0m | [0m 0.6256  [0m | [0m 7.582e+0[0m | [0m 0.7055  [0m | [0m 0.5981  [0m | [0m 30.86   [0m | [0m 0.584   [0m |
| [0m 10      [0m | [0m-1.144e+0[0m | [0m 0.3348  [0m | [0m 0.9027  [0m | [0m 0.03266 [0m | [0m 6.703   [0m | [0m 1.204   [0m | [0m 8.227e+0[0m | [0m 0.2855  [0m | [0m 0.4947  [0m | [0m 37.05   [0m | [0m 0.4449  [0m |
| [0m 11      [0m | [0m-1.236e+0[0m | [0m 0.6269  [0m | [0m 0.5188  [0m | [0m 0.03825 [0m | [0m 3.038   [0m | [0m 0.5808  [0m | [0m 7.788e+0[0m | [0m 0.4467  [0m | [0m 0.2542  [0m | [0m 12.67   [0m | [0m 0.6322  [0m |
| [0m 12      [0m | [0m-1.154e+0[0m | [0m 0.8981  [0m | [0m 0.297   [0m | [0m 0.06911 [0m | [0m 3.906   [0m | [0m 0.9029  [0m | [0m 4.14e+03[0m | [0m 0.7258  [0m | [0m 0.5883  [0m | [0m 17.33   [0m | [0m 0.6235  [0m |
| [0m 13      [0m | [0m-1.119

### Probando

In [261]:
def fabrica_XGB(params):
    best_xgb_model = XGBRegressor(colsample_bytree=params["colsample_bytree"],
                 gamma=params["gamma"],                 
                 learning_rate=params["learning_rate"],
                 max_depth=params["max_depth"],
                 min_child_weight=params["min_child_weight"],
                 n_estimators=params["n_estimators"],                                                                    
                 reg_alpha=params["reg_alpha"],
                 reg_lambda=params["reg_lambda"],
                 subsample=params["subsample"],
                 seed=params["seed"])
    
    return best_xgb_model

In [262]:
XGB_2012 = fabrica_XGB(params_2012)
XGB_2013 = fabrica_XGB(params_2013)
XGB_2014 = fabrica_XGB(params_2014)
XGB_2015 = fabrica_XGB(params_2015)
XGB_2016 = fabrica_XGB(params_2016)

In [263]:
XGB_2012.fit(x_2012,y_2012)
XGB_pred_val = XGB_2012.predict(x_2012_val)
XGB_mae = MAE(y_2012_val, XGB_pred_val)
print("MAE XGB: " + str(XGB_mae))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


MAE XGB: 176801.79285987423


In [264]:
XGB_2013.fit(x_2013,y_2013)
XGB_pred_val = XGB_2013.predict(x_2013_val)
XGB_mae = MAE(y_2013_val, XGB_pred_val)
print("MAE XGB: " + str(XGB_mae))

MAE XGB: 160516.67489017046


In [265]:
XGB_2014.fit(x_2014,y_2014)
XGB_pred_val = XGB_2014.predict(x_2014_val)
XGB_mae = MAE(y_2014_val, XGB_pred_val)
print("MAE XGB: " + str(XGB_mae))

MAE XGB: 508711.66124728875


In [266]:
XGB_2015.fit(x_2015,y_2015)
XGB_pred_val = XGB_2015.predict(x_2015_val)
XGB_mae = MAE(y_2015_val, XGB_pred_val)
print("MAE XGB: " + str(XGB_mae))

MAE XGB: 350874.9482388483


In [267]:
XGB_2016.fit(x_2016,y_2016)
XGB_pred_val = XGB_2016.predict(x_2016_val)
XGB_mae = MAE(y_2016_val, XGB_pred_val)
print("MAE XGB: " + str(XGB_mae))

MAE XGB: 318933.79442906607


In [272]:
print(params_2012)

{'subsample': 0.48520193050269284, 'learning_rate': 0.04954389117285075, 'colsample_bytree': 0.5542596325423537, 'reg_alpha': 0.21562617860171035, 'reg_lambda': 0.45926838232016043, 'n_estimators': 3462, 'seed': 28, 'max_depth': 6, 'min_child_weight': 0.9778725480963207, 'gamma': 0.7645394638651799}


In [273]:
print(params_2013)

{'subsample': 0.31535915678487986, 'learning_rate': 0.05113657454540983, 'colsample_bytree': 0.5494091757771876, 'reg_alpha': 0.34224452560602836, 'reg_lambda': 0.7186831927674757, 'n_estimators': 6031, 'seed': 14, 'max_depth': 6, 'min_child_weight': 0.5216582305373545, 'gamma': 0.5036817452851475}


## Random Forest

In [268]:
def fabrica_RF(train, param_grid):
    RF = RandomForestRegressor()
    X = train.drop(['precio'], axis=1)
    y = train['precio']
    grid = GridSearchCV(RF, param_grid, cv=4, scoring='neg_mean_absolute_error')
    grid.fit(X, y)
    RF = grid.best_estimator_
    print(grid.best_params_)
    print(grid.best_score_)
    return RF

In [276]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
    #'n_estimators': [200, 400]
    
}

In [277]:
rf_2012 = fabrica_RF(train_2012, param_grid)
rf_2013 = fabrica_RF(train_2013, param_grid)
rf_2014 = fabrica_RF(train_2014, param_grid)
rf_2015 = fabrica_RF(train_2015, param_grid)
rf_2016 = fabrica_RF(train_2016, param_grid)

{'bootstrap': True, 'n_estimators': 200, 'min_samples_leaf': 3, 'min_samples_split': 8, 'max_depth': 110, 'max_features': 3}
-685222.7626369509
{'bootstrap': True, 'n_estimators': 300, 'min_samples_leaf': 3, 'min_samples_split': 8, 'max_depth': 90, 'max_features': 3}
-659846.7173177076
{'bootstrap': True, 'n_estimators': 100, 'min_samples_leaf': 3, 'min_samples_split': 8, 'max_depth': 80, 'max_features': 3}
-742149.026873734
{'bootstrap': True, 'n_estimators': 300, 'min_samples_leaf': 3, 'min_samples_split': 8, 'max_depth': 80, 'max_features': 3}
-808693.6945384506
{'bootstrap': True, 'n_estimators': 200, 'min_samples_leaf': 3, 'min_samples_split': 8, 'max_depth': 100, 'max_features': 3}
-845238.0721589796


In [278]:
rf_2012 = RandomForestRegressor(n_estimators=200)
rf_2012.fit(x_2012,y_2012)
rf_pred_val = rf_2012.predict(x_2012_val)
rf_mae = MAE(y_2012_val, rf_pred_val)
print("MAE rf: " + str(rf_mae))

MAE rf: 169648.08362038207


In [280]:
rf_2013 = RandomForestRegressor(n_estimators=200)
rf_2013.fit(x_2013,y_2013)
rf_pred_val = rf_2013.predict(x_2013_val)
rf_mae = MAE(y_2013_val, rf_pred_val)
print("MAE rf: " + str(rf_mae))

MAE rf: 163430.72219079794


In [281]:
rf_2014 = RandomForestRegressor(n_estimators=200)
rf_2014.fit(x_2014,y_2014)
rf_pred_val = rf_2014.predict(x_2014_val)
rf_mae = MAE(y_2014_val, rf_pred_val)
print("MAE rf: " + str(rf_mae))

MAE rf: 187213.09548688054


In [282]:
rf_2015 = RandomForestRegressor(n_estimators=200)
rf_2015.fit(x_2015,y_2015)
rf_pred_val = rf_2015.predict(x_2015_val)
rf_mae = MAE(y_2015_val, rf_pred_val)
print("MAE rf: " + str(rf_mae))

MAE rf: 197270.41281697043


In [283]:
rf_2016 = RandomForestRegressor(n_estimators=200)
rf_2016.fit(x_2016,y_2016)
rf_pred_val = rf_2016.predict(x_2016_val)
rf_mae = MAE(y_2016_val, rf_pred_val)
print("MAE rf: " + str(rf_mae))

MAE rf: 211741.1321400569


In [105]:
#res_2012 = rf_2012.predict(test_2012)
#res_2013 = rf_2013.predict(test_2013)
#res_2014 = rf_2014.predict(test_2014)
#res_2015 = rf_2015.predict(test_2015)
#res_2016 = rf_2016.predict(test_2016)

## Ensembles

In [298]:
ensemble2012 = VotingRegressor([('rf', rf_2012), ('lgbm', lgbm_2012)])
ensemble2013 = VotingRegressor([('rf', rf_2013), ('lgbm', lgbm_2013)])
ensemble2014 = VotingRegressor([('rf', rf_2014), ('lgbm', lgbm_2014)])
ensemble2015 = VotingRegressor([('rf', rf_2015), ('lgbm', lgbm_2015)])
ensemble2016 = VotingRegressor([('rf', rf_2016), ('lgbm', lgbm_2016)])

In [299]:
ensemble2012.fit(x_2012, y_2012)
en_pred_val = ensemble2012.predict(x_2012_val)
en_mae = MAE(y_2012_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

MAE Ensemble: 281303.0169945019


In [300]:
ensemble2013.fit(x_2013, y_2013)
en_pred_val = ensemble2013.predict(x_2013_val)
en_mae = MAE(y_2013_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

MAE Ensemble: 279642.2977729409


In [301]:
ensemble2014.fit(x_2014, y_2014)
en_pred_val = ensemble2014.predict(x_2014_val)
en_mae = MAE(y_2014_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

MAE Ensemble: 327702.9598104


In [302]:
ensemble2015.fit(x_2015, y_2015)
en_pred_val = ensemble2015.predict(x_2015_val)
en_mae = MAE(y_2015_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

MAE Ensemble: 357964.5959041208


In [303]:
ensemble2016.fit(x_2016, y_2016)
en_pred_val = ensemble2016.predict(x_2016_val)
en_mae = MAE(y_2016_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

MAE Ensemble: 394672.4225389817


In [304]:
res_2012 = ensemble2012.predict(test_2012)
res_2013 = ensemble2013.predict(test_2013)
res_2014 = ensemble2014.predict(test_2014)
res_2015 = ensemble2015.predict(test_2015)
res_2016 = ensemble2016.predict(test_2016)

In [305]:
res_2012 = pd.DataFrame(res_2012, index=test_2012.index, columns=['precio'])
res_2012 = res_2012.rename(columns={'precio':'target'})

res_2013 = pd.DataFrame(res_2013, index=test_2013.index, columns=['precio'])
res_2013 = res_2013.rename(columns={'precio':'target'})

res_2014 = pd.DataFrame(res_2014, index=test_2014.index, columns=['precio'])
res_2014 = res_2014.rename(columns={'precio':'target'})

res_2015 = pd.DataFrame(res_2015, index=test_2015.index, columns=['precio'])
res_2015 = res_2015.rename(columns={'precio':'target'})

res_2016 = pd.DataFrame(res_2016, index=test_2016.index, columns=['precio'])
res_2016 = res_2016.rename(columns={'precio':'target'})

In [306]:
res = res_2012.append(res_2013).append(res_2014).append(res_2015).append(res_2016)
res.to_csv("workshop-submission-rf.csv", header=True)