In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from bayes_opt import BayesianOptimization
import category_encoders as ce
from sklearn.ensemble import VotingRegressor

In [2]:
train = pd.read_csv(r'Orga/train.csv', index_col=0)
test = pd.read_csv(r'Orga/test.csv', index_col=0)

In [3]:
#metrica
def MAE(actual, pred):
    return (np.mean(np.absolute(actual - pred)))

## Preprocesamiento

In [4]:
train = train.fillna(0)
test = test.fillna(0)

In [5]:
concat = pd.concat([train, test], sort=False)

## Feature Engineering

In [6]:
concat['cant_amenities'] = concat['usosmultiples'] + concat['piscina'] + concat['gimnasio']

In [7]:
concat['fecha'] = pd.to_datetime(concat['fecha'])
concat['año'] = concat['fecha'].dt.year

In [8]:
def features_descripcion(lista_features, concat):
    for i in range(len(lista_features)):
        concat[lista_features[i]] = concat['descripcion'].str.contains(lista_features[i], regex=False).astype(bool)
    return concat

In [9]:
lista_features = ['luminoso', 'jardin', 'terraza', 'patio', 'balcón']
concat = features_descripcion(lista_features, concat)

In [10]:
concat['centroscomercialescercanos'] = concat['centroscomercialescercanos'].map({1:2, 0:0})

In [11]:
concat['cercanias'] = concat['centroscomercialescercanos'] + concat['escuelascercanas']
concat['cercanias'] = concat['cercanias'].astype(str)

In [12]:
concat = concat.join(pd.get_dummies(concat['provincia'], prefix = 'prov'))
del concat['provincia']
del concat['prov_0']     #seria la provincia NaN

In [13]:
#concat = concat.join(pd.get_dummies(concat['ciudad'], prefix = 'ciudad'))
del concat['ciudad']

In [14]:
concat = concat.join(pd.get_dummies(concat['tipodepropiedad'], prefix = 'tipo'))
del concat['tipodepropiedad']
del concat['tipo_0']     #seria el tipo NaN

In [15]:
concat = concat.join(pd.get_dummies(concat['cercanias'], prefix = 'cercania'))
del concat['cercanias']

In [16]:
del concat['lat']
del concat['lng']
del concat['descripcion']
del concat['titulo']
del concat['direccion']
del concat['fecha']

In [17]:
concat['idzona'] = concat['idzona'].astype(str)

In [18]:
train = concat.iloc[0:240000]
test = concat.iloc[240000:300000]

In [19]:
del test['precio']

In [20]:
train_2012 = train[train['año'] == 2012]
train_2013 = train[train['año'] == 2013]
train_2014 = train[train['año'] == 2014]
train_2015 = train[train['año'] == 2015]
train_2016 = train[train['año'] == 2016]

test_2012 = test[test['año'] == 2012]
test_2013 = test[test['año'] == 2013]
test_2014 = test[test['año'] == 2014]
test_2015 = test[test['año'] == 2015]
test_2016 = test[test['año'] == 2016]

## KNN

In [21]:
def KNN(train, param_grid):
    KNN = KNeighborsRegressor()
    X = train.drop(['precio'], axis=1)
    y = train['precio']
    grid = GridSearchCV(KNN, param_grid, cv=5, scoring='neg_mean_absolute_error')
    grid.fit(X, y)
    KNN = grid.best_estimator_
    print(grid.best_params_)
    print(grid.best_score_)
    return KNN

In [None]:
k_valores = list(range(1, 21))
param_grid = dict(n_neighbors=k_valores)
KNN_2012 = KNN(train_2012, param_grid)
KNN_2013 = KNN(train_2013, param_grid)
KNN_2014 = KNN(train_2014, param_grid)
KNN_2015 = KNN(train_2015, param_grid)
KNN_2016 = KNN(train_2016, param_grid)

## Bayesian Opt

In [22]:
#vuelo idzona porque xgboost no se lo banca, habria que encodearlo preferentemente
del train_2012['idzona']
del train_2013['idzona']
del train_2014['idzona']
del train_2015['idzona']
del train_2016['idzona']

del test_2012['idzona']
del test_2013['idzona']
del test_2014['idzona']
del test_2015['idzona']
del test_2016['idzona']

In [27]:
def train_set(train,test):
    X = train.drop(['precio'], axis=1)
    y = train['precio']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=1)
    print("Train shapes: X = " + str(X_train.shape) + " y = " + str(y_train.shape))
    print("Validation shapes: X = " + str(X_val.shape) +  " y = " + str(y_val.shape))
    print("Test shape: " + str(test.shape))
    return X, y, X_val, y_val

In [28]:
x_2012, y_2012, x_2012_val, y_2012_val = train_set(train_2012,test_2012)
x_2013, y_2013, x_2013_val, y_2013_val = train_set(train_2013,test_2013)
x_2014, y_2014, x_2014_val, y_2014_val = train_set(train_2014,test_2014)
x_2015, y_2015, x_2015_val, y_2015_val = train_set(train_2015,test_2015)
x_2016, y_2016, x_2016_val, y_2016_val = train_set(train_2016,test_2016)

Train shapes: X = (14120, 78) y = (14120,)
Validation shapes: X = (9414, 78) y = (9414,)
Test shape: (5899, 78)
Train shapes: X = (18231, 78) y = (18231,)
Validation shapes: X = (12155, 78) y = (12155,)
Test shape: (7592, 78)
Train shapes: X = (24343, 78) y = (24343,)
Validation shapes: X = (16229, 78) y = (16229,)
Test shape: (10018, 78)
Train shapes: X = (30882, 78) y = (30882,)
Validation shapes: X = (20588, 78) y = (20588,)
Test shape: (13017, 78)
Train shapes: X = (56422, 78) y = (56422,)
Validation shapes: X = (37616, 78) y = (37616,)
Test shape: (23474, 78)


In [22]:
def xgb_evaluate(max_depth, gamma, colsample_bytree,seed,min_child_weight,n_estimators,
                  reg_alpha,reg_lambda,subsample,learning_rate):
    params = {'eval_metric': 'rmse',
              'max_depth': int(max_depth),
              'subsample': subsample,
              'eta': 0.1,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree,
              "seed": int(seed),
              "min_child_weight": min_child_weight,
              "n_estimators": n_estimators,
              "reg_alpha": reg_alpha,
              "reg_lambda": reg_lambda,
              "learning_rate": learning_rate
             }
    # Used around 1000 boosting rounds in the full model
    cv_result = xgb.cv(params, dtrain, num_boost_round=100, nfold=3)    
    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [23]:
def optimize(sett):
    dtrain = sett
    xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 7), 
                                             'gamma': (0, 1),
                                             'colsample_bytree': (0.3, 0.9),
                                             "seed": (10,50),
                                             "min_child_weight": (0.4,1.5),
                                             "n_estimators":(100,10000),                                                                    
                                             "reg_alpha":(0.2,0.75),
                                             "reg_lambda": (0.2,0.8),
                                             "subsample" : (0.3, 0.8),
                                             "learning_rate": (0.01,0.07),
                                            })
    # Se probo con 3 puntos iniciales y 10 iteraciones, deberiamos probar con mas en ambas
    # La prox iteracion en casa probar 100 y 10000
    xgb_bo.maximize(init_points=3, n_iter=10, acq='ei')
    params = xgb_bo.max['params']
    params['max_depth'] = int(params['max_depth'])
    params["seed"] = int(params["seed"])
    params["n_estimators"] = int(params["n_estimators"])
    return params

In [24]:
import xgboost as xgb

dtrain_2012 = xgb.DMatrix(x_2012, label=y_2012)
dtrain_2013 = xgb.DMatrix(x_2013, label=y_2013)
dtrain_2014 = xgb.DMatrix(x_2014, label=y_2014)
dtrain_2015 = xgb.DMatrix(x_2015, label=y_2015)
dtrain_2016 = xgb.DMatrix(x_2016, label=y_2016)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [25]:
dtrain = dtrain_2012
params_2012 = optimize(dtrain_2012)
dtrain = dtrain_2013
params_2013 = optimize(dtrain_2013)
dtrain = dtrain_2014
params_2014 = optimize(dtrain_2014)
dtrain = dtrain_2015
params_2015 = optimize(dtrain_2015)
dtrain = dtrain_2016
params_2016 = optimize(dtrain_2016)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | reg_alpha | reg_la... |   seed    | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-9.531e+0[0m | [0m 0.8553  [0m | [0m 0.04483 [0m | [0m 0.06557 [0m | [0m 5.725   [0m | [0m 1.309   [0m | [0m 2.553e+0[0m | [0m 0.4146  [0m | [0m 0.3952  [0m | [0m 37.24   [0m | [0m 0.761   [0m |
| [0m 2       [0m | [0m-9.717e+0[0m | [0m 0.8107  [0m | [0m 0.915   [0m | [0m 0.0402  [0m | [0m 5.642   [0m | [0m 0.7921  [0m | [0m 7.338e+0[0m | [0m 0.2416  [0m | [0m 0.6223  [0m | [0m 39.36   [0m | [0m 0.6778  [0m |
| [0m 3       [0m | [0m-1.099e+0[0m | [0m 0.6051  [0m | [0m 0.8463  [0m | [0m 0.02905 [0m | [0m 3.279   [0m | [0m 1.478   [0m | [0m 6.187e+0[0m | [0m 0.227   [0m | [0m 0.7966  [0m | [0m 15.15   [0m | [

| [0m 4       [0m | [0m-1.09e+06[0m | [0m 0.721   [0m | [0m 0.4999  [0m | [0m 0.04977 [0m | [0m 5.323   [0m | [0m 0.8275  [0m | [0m 6.577e+0[0m | [0m 0.7321  [0m | [0m 0.2222  [0m | [0m 28.42   [0m | [0m 0.3211  [0m |
| [0m 5       [0m | [0m-1.223e+0[0m | [0m 0.8692  [0m | [0m 0.03066 [0m | [0m 0.03392 [0m | [0m 3.042   [0m | [0m 0.614   [0m | [0m 5.168e+0[0m | [0m 0.7349  [0m | [0m 0.2995  [0m | [0m 11.48   [0m | [0m 0.3311  [0m |
| [0m 6       [0m | [0m-1.214e+0[0m | [0m 0.533   [0m | [0m 0.04458 [0m | [0m 0.03919 [0m | [0m 3.75    [0m | [0m 0.6772  [0m | [0m 1.854e+0[0m | [0m 0.2079  [0m | [0m 0.7529  [0m | [0m 40.29   [0m | [0m 0.5479  [0m |
| [0m 7       [0m | [0m-1.106e+0[0m | [0m 0.6723  [0m | [0m 0.8756  [0m | [0m 0.06052 [0m | [0m 4.62    [0m | [0m 0.4274  [0m | [0m 9.467e+0[0m | [0m 0.2412  [0m | [0m 0.72    [0m | [0m 12.58   [0m | [0m 0.5859  [0m |
| [0m 8       [0m | [0m-1.597

| [0m 8       [0m | [0m-1.315e+0[0m | [0m 0.8897  [0m | [0m 0.8196  [0m | [0m 0.05851 [0m | [0m 4.478   [0m | [0m 0.6615  [0m | [0m 2.579e+0[0m | [0m 0.4508  [0m | [0m 0.764   [0m | [0m 22.26   [0m | [0m 0.5662  [0m |
| [0m 9       [0m | [0m-1.87e+06[0m | [0m 0.8962  [0m | [0m 0.06821 [0m | [0m 0.01136 [0m | [0m 4.447   [0m | [0m 0.5413  [0m | [0m 9.471e+0[0m | [0m 0.5427  [0m | [0m 0.7309  [0m | [0m 26.56   [0m | [0m 0.5334  [0m |
| [0m 10      [0m | [0m-1.291e+0[0m | [0m 0.8781  [0m | [0m 0.6667  [0m | [0m 0.047   [0m | [0m 5.475   [0m | [0m 0.7522  [0m | [0m 9.219e+0[0m | [0m 0.2723  [0m | [0m 0.3324  [0m | [0m 34.14   [0m | [0m 0.6861  [0m |
| [0m 11      [0m | [0m-1.273e+0[0m | [0m 0.7127  [0m | [0m 0.1328  [0m | [0m 0.03867 [0m | [0m 6.221   [0m | [0m 1.185   [0m | [0m 3.333e+0[0m | [0m 0.2523  [0m | [0m 0.5919  [0m | [0m 35.37   [0m | [0m 0.744   [0m |
| [0m 12      [0m | [0m-1.98e

## XGBoost

In [None]:
def xgboost(x,y,x_val,y_val, test, params):
    best_xgb_model = XGBRegressor(colsample_bytree=params["colsample_bytree"],
                 gamma=params["gamma"],                 
                 learning_rate=params["learning_rate"],
                 max_depth=params["max_depth"],
                 min_child_weight=params["min_child_weight"],
                 n_estimators=params["n_estimators"],                                                                    
                 reg_alpha=params["reg_alpha"],
                 reg_lambda=params["reg_lambda"],
                 subsample=params["subsample"],
                 seed=params["seed"])
    best_xgb_model.fit(x,y)
    XGB_pred_val = best_xgb_model.predict(x_val)
    XGB_mae = MAE(y_val, XGB_pred_val)
    print("MAE XGB: " + str(XGB_mae))
    XGB_pred_test = best_xgb_model.predict(test)
    res = pd.DataFrame(XGB_pred_test, index=test.index, columns=['precio'])
    res = res.rename(columns={'precio':'target'})
    
    return res, XGB_pred_val

In [None]:
res_2012, XGB_2012_pred_val = xgboost(x_2012, y_2012, x_2012_val, y_2012_val, test_2012, params_2012)
res_2013, XGB_2013_pred_val = xgboost(x_2013, y_2013, x_2013_val, y_2013_val, test_2013, params_2013)
res_2014, XGB_2014_pred_val = xgboost(x_2014, y_2014, x_2014_val, y_2014_val, test_2014, params_2014)
res_2015, XGB_2015_pred_val = xgboost(x_2015, y_2015, x_2015_val, y_2015_val, test_2015, params_2015)
res_2016, XGB_2016_pred_val = xgboost(x_2016, y_2016, x_2016_val, y_2016_val, test_2016, params_2016)


In [26]:
def fabrica(params):
    best_xgb_model = XGBRegressor(colsample_bytree=params["colsample_bytree"],
                 gamma=params["gamma"],                 
                 learning_rate=params["learning_rate"],
                 max_depth=params["max_depth"],
                 min_child_weight=params["min_child_weight"],
                 n_estimators=params["n_estimators"],                                                                    
                 reg_alpha=params["reg_alpha"],
                 reg_lambda=params["reg_lambda"],
                 subsample=params["subsample"],
                 seed=params["seed"])
    
    return best_xgb_model

In [27]:
XGB_2012 = fabrica(params_2012)
XGB_2013 = fabrica(params_2013)
XGB_2014 = fabrica(params_2014)
XGB_2015 = fabrica(params_2015)
XGB_2016 = fabrica(params_2016)

In [None]:
#ensamblo todo y comparo
#XGB_mae = MAE(y_val_ensamblado, XGB_pred_val_ensamblado)
#print("MAE XGB: " + str(XGB_mae))

res = res_2012.append(res_2013).append(res_2014).append(res_2015).append(res_2016)
res.to_csv("workshop-submission-XGB_2.csv", header=True)

In [None]:
import matplotlib.pyplot as plt
fscores = pd.DataFrame({'X': list(best_xgb_model.get_fscore().keys()), 'Y': list(best_xgb_model.get_fscore().values())})
fscores.sort_values(by='Y').plot.bar(x='X')

In [34]:
ensemble2012 = VotingRegressor([('knn', KNN_2012), ('xgb', XGB_2012)])
ensemble2013 = VotingRegressor([('knn', KNN_2013), ('xgb', XGB_2013)])
ensemble2014 = VotingRegressor([('knn', KNN_2014), ('xgb', XGB_2014)])
ensemble2015 = VotingRegressor([('knn', KNN_2015), ('xgb', XGB_2015)])
ensemble2016 = VotingRegressor([('knn', KNN_2016), ('xgb', XGB_2016)])

In [33]:
ensemble2012.fit(x_2012, y_2012)
en_pred_val = ensemble2012.predict(x_2012_val)
en_mae = MAE(y_2012_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


MAE XGB: 470149.80431839824


In [35]:
ensemble2013.fit(x_2013, y_2013)
en_pred_val = ensemble2013.predict(x_2013_val)
en_mae = MAE(y_2013_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




KeyboardInterrupt: 

In [None]:
ensemble2014.fit(x_2014, y_2014)
en_pred_val = ensemble2014.predict(x_2014_val)
en_mae = MAE(y_2014_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

In [None]:
ensemble2015.fit(x_2015, y_2015)
en_pred_val = ensemble2015.predict(x_2015_val)
en_mae = MAE(y_2015_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

In [None]:
ensemble2016.fit(x_2016, y_2016)
en_pred_val = ensemble2016.predict(x_2016_val)
en_mae = MAE(y_2016_val, en_pred_val)
print("MAE Ensemble: " + str(en_mae))

In [None]:
res_2012 = ensemble2012.predict(test_2012)
res_2013 = ensemble2013.predict(test_2013)
res_2014 = ensemble2014.predict(test_2014)
res_2015 = ensemble2015.predict(test_2015)
res_2016 = ensemble2016.predict(test_2016)