# Modeling
Comenzaremos con un modelo baseline para luego poder comparar la performance de modelos más complejos.

In [1]:
!pip install xgboost



In [2]:
!pip install lightgbm



## Import libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as st
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

  from numpy.core.umath_tests import inner1d


## Load dataset

In [4]:
df = pd.read_csv('../data/processed/kc_house_data_clean_with_outliers.csv')
df.head()

Unnamed: 0,price,sqft_living,grade,sqft_above,sqft_living15,bathrooms,view,sqft_basement,bedrooms,lat,waterfront,floors,renovated,sqft_lot,sqft_lot15,yr_built,condition,long,zipcode,house_age
7129300520,221900,1180,7,1180,1340,1,0,0,3,47.5112,0,1,0,5650,5650,1955,3,-122.257,98178,59
6414100192,538000,2570,7,2170,1690,2,0,400,3,47.721,0,2,1,7242,7639,1951,3,-122.319,98125,63
5631500400,180000,770,6,770,2720,1,0,0,2,47.7379,0,1,0,10000,8062,1933,3,-122.233,98028,82
2487200875,604000,1960,7,1050,1360,3,0,910,4,47.5208,0,1,0,5000,5000,1965,5,-122.393,98136,49
1954400510,510000,1680,8,1680,1800,2,0,0,3,47.6168,0,1,0,8080,7503,1987,3,-122.045,98074,28


## Baseline

In [5]:
y = df['price']
X = df.drop('price', axis=1)

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split (X, y, test_size = 0.20, random_state=42)

In [7]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))
pipelines.append(('ScaledRFR', Pipeline([('Scaler', StandardScaler()),('RFR', RandomForestRegressor())])))
pipelines.append(('ScaledBR', Pipeline([('Scaler', StandardScaler()),('BR', BaggingRegressor())])))
pipelines.append(('ScaledABR', Pipeline([('Scaler', StandardScaler()),('ABR', AdaBoostRegressor())])))
pipelines.append(('ScaledETR', Pipeline([('Scaler', StandardScaler()),('ETR', ExtraTreesRegressor())])))
pipelines.append(('ScaledXGB', Pipeline([('Scaler', StandardScaler()),('XGB', XGBRegressor())])))
pipelines.append(('ScaledLGBM', Pipeline([('Scaler', StandardScaler()),('LGBM', LGBMRegressor())])))

results = []
names = []

for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=21)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='r2')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledLR: 0.698483 (0.013186)
ScaledLASSO: 0.697675 (0.012840)
ScaledEN: 0.670505 (0.013958)
ScaledKNN: 0.785186 (0.027530)
ScaledCART: 0.721227 (0.069299)
ScaledGBM: 0.865519 (0.019903)
ScaledRFR: 0.856866 (0.033184)
ScaledBR: 0.855607 (0.018974)
ScaledABR: 0.079721 (0.170755)
ScaledETR: 0.857031 (0.025294)
ScaledXGB: 0.864449 (0.016561)
ScaledLGBM: 0.878307 (0.018618)


**Notas:**
Podemos observar que `GradientBoostingRegressor`, `XGBRegressor`, y `LGBMRegressor` fueron los que mejor que nos dieron. Seguiremos explotando estos modelos utilizando gridsearch o randomsearch.

## Gridsearch 

Partiendo del siguiente conjunto de parametros comenzaremos a buscar los mejores para mejorar los modelos seleccionados.

In [8]:
one_to_left = st.beta(10, 1)

In [12]:
params = {  
    "n_estimators": np.array([50,100,200,300,400]), # Number of boosted trees to fit.
    "max_depth": st.randint(3, 12),     # Maximum tree depth for base learners.
    "learning_rate": st.uniform(0.05, 0.4), #     Boosting learning rate (xgb’s “eta”)
    "subsample": one_to_left     # Subsample ratio of the training instance.  
}

In [10]:
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

In [13]:
gbr = GradientBoostingRegressor(random_state=21)
kfold = KFold(n_splits=10, random_state=21)
grid_gbr = RandomizedSearchCV(gbr, params, n_iter=25, verbose= True, scoring='r2', cv=kfold)
grid_gbr.fit(rescaledX, Y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 105.1min finished


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=21, shuffle=False),
          error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=21,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'n_estimators': array([ 50, 100, 200, 300, 400]), 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F3BF7C13C8>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F3BF7C1518>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F3BE910550>},
          

In [14]:
y_predict_gbr = grid_gbr.predict(X_test)

In [15]:
xgb = XGBRegressor()
kfold = KFold(n_splits=10, random_state=21)
grid_xgb = RandomizedSearchCV(xgb, params, n_iter=25, verbose= True, scoring='r2', cv=kfold)
grid_xgb.fit(rescaledX, Y_train)

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 45.0min finished


RandomizedSearchCV(cv=KFold(n_splits=10, random_state=21, shuffle=False),
          error_score='raise',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'n_estimators': array([ 50, 100, 200, 300, 400]), 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F3BF7C13C8>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F3BF7C1518>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F3BE910550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          retur

In [16]:
y_predict_xgb = grid_xgb.predict(X_test)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18'] ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'lat', 'waterfront', 'floors', 'renovated', 'sqft_lot', 'sqft_lot15', 'yr_built', 'condition', 'long', 'zipcode', 'house_age']
expected f15, f3, f17, f11, f18, f12, f9, f14, f4, f0, f8, f5, f7, f13, f10, f16, f2, f1, f6 in input data
training data did not have the following fields: lat, bedrooms, long, sqft_lot15, floors, condition, sqft_lot, bathrooms, renovated, house_age, grade, zipcode, sqft_living, yr_built, view, sqft_above, sqft_basement, sqft_living15, waterfront

In [None]:
lgbm = LGBMRegressor()
kfold = KFold(n_splits=10, random_state=21)
grid_lgbm = RandomizedSearchCV(lgbm, params, n_iter=25, verbose= True, scoring='r2', cv=kfold)
grid_lgbm.fit(rescaledX, Y_train)

In [None]:
y_predict_lgbm = grid_lgbm.predict(X_test)

In [17]:
print("Best GradientBoosting: %f using %s" % (grid_gbr.best_score_, grid_gbr.best_params_))
print("GradientBoosting Regressor R2-test_score: {}".format(round(r2_score(y_predict_gbr, y_test),4)))
print("MSE_test of GradientBoosting Regressor: {} ".format(median_absolute_error(y_predict_gbr, y_test)))
print('-'*100)
print("Best XGB: %f using %s" % (grid_xgb.best_score_, grid_xgb.best_params_))
print("XGB Regressor R2-test_score: {}".format(round(r2_score(y_predict_xgb, y_test),4)))
print("MSE_test of XGB Regressor: {} ".format(median_absolute_error(y_predict_xgb, y_test)))
print('-'*100)
print("Best LGBM: %f using %s" % (grid_lgbm.best_score_, grid_lgbm.best_params_))
print("LGBM Regressor R2-test_score: {}".format(round(r2_score(y_predict_lgbm, y_test),4)))
print("MSE_test of LGBM Regressor: {} ".format(median_absolute_error(y_predict_lgbm, y_test)))
print('-'*100)

Best GradientBoosting: 0.891642 using {'learning_rate': 0.18976628076812796, 'max_depth': 3, 'n_estimators': 400, 'subsample': 0.9075495138695958}


NameError: name 'r2_score' is not defined