# Modeling
Comenzaremos con un modelo baseline para luego poder comparar la performance de modelos más complejos.

In [1]:
!pip install xgboost



In [2]:
!pip install lightgbm



## Import libraries

In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as st
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, median_absolute_error

## Load dataset

In [3]:
df = pd.read_csv('../data/processed/kc_house_data_clean_with_outliers.csv')
df.head()

Unnamed: 0,price,sqft_living,grade,sqft_above,sqft_living15,bathrooms,view,sqft_basement,bedrooms,lat,waterfront,floors,renovated,sqft_lot,sqft_lot15,yr_built,condition,long,zipcode,house_age
7129300520,221900,1180,7,1180,1340,1,0,0,3,47.5112,0,1,0,5650,5650,1955,3,-122.257,98178,59
6414100192,538000,2570,7,2170,1690,2,0,400,3,47.721,0,2,1,7242,7639,1951,3,-122.319,98125,63
5631500400,180000,770,6,770,2720,1,0,0,2,47.7379,0,1,0,10000,8062,1933,3,-122.233,98028,82
2487200875,604000,1960,7,1050,1360,3,0,910,4,47.5208,0,1,0,5000,5000,1965,5,-122.393,98136,49
1954400510,510000,1680,8,1680,1800,2,0,0,3,47.6168,0,1,0,8080,7503,1987,3,-122.045,98074,28


## Baseline

In [4]:
y = df['price']
X = df.drop('price', axis=1)

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split (X, y, test_size = 0.20, random_state=42)

In [5]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))
pipelines.append(('ScaledRFR', Pipeline([('Scaler', StandardScaler()),('RFR', RandomForestRegressor())])))
pipelines.append(('ScaledBR', Pipeline([('Scaler', StandardScaler()),('BR', BaggingRegressor())])))
pipelines.append(('ScaledABR', Pipeline([('Scaler', StandardScaler()),('ABR', AdaBoostRegressor())])))
pipelines.append(('ScaledETR', Pipeline([('Scaler', StandardScaler()),('ETR', ExtraTreesRegressor())])))
pipelines.append(('ScaledXGB', Pipeline([('Scaler', StandardScaler()),('XGB', XGBRegressor())])))
pipelines.append(('ScaledLGBM', Pipeline([('Scaler', StandardScaler()),('LGBM', LGBMRegressor())])))

results = []
names = []

for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=21)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='r2')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: 0.698482 (0.013185)
ScaledLASSO: 0.697675 (0.012840)
ScaledEN: 0.670505 (0.013958)
ScaledKNN: 0.785186 (0.027530)
ScaledCART: 0.733370 (0.068837)
ScaledGBM: 0.865226 (0.020624)
ScaledRFR: 0.854956 (0.031478)
ScaledBR: 0.854384 (0.025137)
ScaledABR: 0.091177 (0.092352)
ScaledETR: 0.857879 (0.023101)
ScaledXGB: 0.864449 (0.016561)
ScaledLGBM: 0.878307 (0.018618)


**Notas:**
Podemos observar que `GradientBoostingRegressor`, `XGBRegressor`, y `LGBMRegressor` fueron los que mejor que nos dieron. Seguiremos explotando estos modelos utilizando gridsearch o randomsearch.

## Gridsearch 

Partiendo del siguiente conjunto de parametros comenzaremos a buscar los mejores para poder mejorar los modelos seleccionados.

In [6]:
one_to_left = st.beta(10, 1)

In [22]:
params = {  
    "n_estimators": np.array([100,200,300,400, 500, 600]), # Number of boosted trees to fit.
    "max_depth": st.randint(3, 12),     # Maximum tree depth for base learners.
    "learning_rate": st.uniform(0.05, 0.4), #     Boosting learning rate (xgb’s “eta”)
    "subsample": one_to_left     # Subsample ratio of the training instance.  
}

In [8]:
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### GrandientBoostingRegressor

In [9]:
gbr = GradientBoostingRegressor(random_state=21)
kfold = KFold(n_splits=10, random_state=21)
grid_gbr = RandomizedSearchCV(gbr, params, n_iter=25, verbose= True, scoring='r2', cv=kfold, n_jobs=2)
grid_gbr.fit(rescaledX, Y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 12.5min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 42.8min
[Parallel(n_jobs=2)]: Done 250 out of 250 | elapsed: 54.1min finished


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=21, shuffle=False),
          error_score='raise-deprecating',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...te=21, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=25, n_jobs=2,
          param_distributions={'n_estimators': array([ 50, 100, 200, 300, 400]), 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE76A0>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE7390>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE7DD8>},
          pre_dispatch='2*n_jobs', random_state=None, r

In [10]:
y_predict_gbr = grid_gbr.predict(X_test)

### XGBM

In [23]:
xgb = XGBRegressor()
kfold = KFold(n_splits=10, random_state=21)
grid_xgb = RandomizedSearchCV(xgb, params, n_iter=25, verbose= True, scoring='r2', cv=kfold)
grid_xgb.fit(rescaledX, Y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 73.4min finished


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=21, shuffle=False),
          error_score='raise-deprecating',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=25, n_jobs=None,
          param_distributions={'n_estimators': array([100, 200, 300, 400, 500, 600]), 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9F1B128>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9F1BF60>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE7DD8>},
          pre_dispatch='2*n_jobs', random_state=None, refit

In [12]:
y_predict_xgb = grid_xgb.predict(X_test)

### LGBM

In [13]:
lgbm = LGBMRegressor()
kfold = KFold(n_splits=10, random_state=21)
grid_lgbm = RandomizedSearchCV(lgbm, params, n_iter=25, verbose= True, scoring='r2', cv=kfold)
grid_lgbm.fit(rescaledX, Y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  4.9min finished


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=21, shuffle=False),
          error_score='raise-deprecating',
          estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
          fit_params=None, iid='warn', n_iter=25, n_jobs=None,
          param_distributions={'n_estimators': array([ 50, 100, 200, 300, 400]), 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE76A0>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE7390>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001D3D9DE7DD8>},
          pre_dispatch='2*n_

In [19]:
y_predict_lgbm = grid_lgbm.predict(X_test)

## Final results

In [24]:
print("Best GradientBoosting: %f using %s" % (grid_gbr.best_score_, grid_gbr.best_params_))
print("GradientBoosting Regressor R2-test_score: {}".format(round(r2_score(y_predict_gbr, Y_test),4)))
print("MSE_test of GradientBoosting Regressor: {} ".format(median_absolute_error(y_predict_gbr, Y_test)))
print('-'*100)
print("Best XGB: %f using %s" % (grid_xgb.best_score_, grid_xgb.best_params_))
print("XGB Regressor R2-test_score: {}".format(round(r2_score(y_predict_xgb, Y_test),4)))
print("MSE_test of XGB Regressor: {} ".format(median_absolute_error(y_predict_xgb, Y_test)))
print('-'*100)
print("Best LGBM: %f using %s" % (grid_lgbm.best_score_, grid_lgbm.best_params_))
print("LGBM Regressor R2-test_score: {}".format(round(r2_score(y_predict_lgbm, Y_test),4)))
print("MSE_test of LGBM Regressor: {} ".format(median_absolute_error(y_predict_lgbm, Y_test)))
print('-'*100)

Best GradientBoosting: 0.892048 using {'learning_rate': 0.07373478151180804, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.8827027814384301}
GradientBoosting Regressor R2-test_score: 0.8771
MSE_test of GradientBoosting Regressor: 42175.71550105224 
----------------------------------------------------------------------------------------------------
Best XGB: 0.895153 using {'learning_rate': 0.20499170570069686, 'max_depth': 4, 'n_estimators': 400, 'subsample': 0.8856643553463572}
XGB Regressor R2-test_score: 0.8884
MSE_test of XGB Regressor: 39396.109375 
----------------------------------------------------------------------------------------------------
Best LGBM: 0.886797 using {'learning_rate': 0.2028562929527566, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.7892862827539857}
LGBM Regressor R2-test_score: 0.893
MSE_test of LGBM Regressor: 40860.18255985374 
----------------------------------------------------------------------------------------------------
