In [613]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import Data for V1.1 Feature Selection

In [614]:
X_train= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/x_Train.csv')
X_test= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/x_Test.csv')
y_train= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/y_Train.csv')
y_test= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/y_Test.csv')

# Fit an ElasticNet Model With GridSearch

In [615]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


In [616]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
parametersGrid = {"max_iter": [1, 5, 10],
                "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                "l1_ratio": np.arange(0.0, 1.0, 0.1)}
eNet = ElasticNet(normalize=True)
grid = GridSearchCV(eNet, parametersGrid, scoring='r2', cv=10)
grid.fit(X_train, y_train)
pred = grid.predict(X_test)
    

In [617]:
PredGrid = pd.concat([pd.Series(pred), y_test], axis = 1)
PredGrid.columns = ['Predicted', 'Actual']
PredGrid

Unnamed: 0,Predicted,Actual
0,199864.057607,173000
1,174770.236129,135000
2,261964.567241,267000
3,152159.704511,161000
4,200346.286091,136000
5,114073.493146,133000
6,135863.965548,142000
7,392631.611934,386250
8,210182.317391,235000
9,125590.996622,133900


## Lets score ourselves

In [618]:
PredGrid['Difference']=(PredGrid['Predicted'] - PredGrid['Actual'])**2
PredGrid.Difference.sum()


580062315034.8544

In [619]:
PredGrid.Difference.mean()**.5


36601.035934646345

In [620]:
grid.score(X_test, y_test)

0.8009833774525363

In [621]:
grid.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__alpha': 1.0,
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__l1_ratio': 0.5,
 'estimator__max_iter': 1000,
 'estimator__normalize': True,
 'estimator__positive': False,
 'estimator__precompute': False,
 'estimator__random_state': None,
 'estimator__selection': 'cyclic',
 'estimator__tol': 0.0001,
 'estimator__warm_start': False,
 'estimator': ElasticNet(normalize=True),
 'n_jobs': None,
 'param_grid': {'max_iter': [1, 5, 10],
  'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
  'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': 'r2',
 'verbose': 0}

## Lets try a Lasso Model

In [622]:
from sklearn.linear_model import Lasso

In [623]:
logit = Lasso()

In [624]:
logit.fit(X_train, y_train)

Lasso()

In [625]:
LassoDF = pd.DataFrame(logit.coef_)
G = pd.concat([pd.Series(X_train.columns), LassoDF], axis = 1)
G.columns = ['Feature', 'Lasso Coefficient']
G.sort_values('Lasso Coefficient')

Unnamed: 0,Feature,Lasso Coefficient
69,Condition2_PosN,-417763.1
92,Exterior1st_CemntBd,-35959.34
140,Functional_Mod,-29927.9
34,LandSlope_Sev,-28315.26
98,Exterior1st_VinylSd,-24449.45
128,Heating_OthW,-19409.2
72,BldgType_Duplex,-18481.54
38,Neighborhood_ClearCr,-16526.01
41,Neighborhood_Edwards,-15713.67
130,HeatingQC_Fa,-15030.53


In [626]:
logit.score(X_test, y_test)

0.7563490038842543

In [627]:
LassoPred = logit.predict(X_test)

## How about a Random Forest

In [628]:
from sklearn.ensemble import RandomForestRegressor

In [629]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.8937111912033382

### Let's do some parameter tuning

In [630]:
params = {
    'n_estimators': [400, 500, 600],
    'max_depth': [8, 10, 12],
    'min_samples_leaf': [4, 5, 6],
    'min_samples_split': [3, 5, 7],
    'max_leaf_nodes': [300, 400, 500],
    'max_samples': [None]
}
grid = GridSearchCV(rfr, params, refit=True, verbose=3)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV 1/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=400;, score=0.882 total time=   4.5s
[CV 2/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=400;, score=0.835 total time=   5.0s
[CV 3/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=400;, score=0.848 total time=   4.2s
[CV 4/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=400;, score=0.786 total time=   4.3s
[CV 5/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=400;, score=0.848 total time=   3.8s
[CV 1/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=500;, score=0.880 total time=   5.5s
[CV 2/

[CV 4/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500;, score=0.793 total time=   4.8s
[CV 5/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500;, score=0.849 total time=   4.7s
[CV 1/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=600;, score=0.880 total time=   6.0s
[CV 2/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=600;, score=0.829 total time=   5.7s
[CV 3/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=600;, score=0.843 total time=   6.1s
[CV 4/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=600;, score=0.793 total time=   6.2s
[CV 5/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_sa

[CV 2/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=400;, score=0.823 total time=   3.6s
[CV 3/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=400;, score=0.836 total time=   4.3s
[CV 4/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=400;, score=0.800 total time=   3.7s
[CV 5/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=400;, score=0.838 total time=   4.3s
[CV 1/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=500;, score=0.875 total time=   4.7s
[CV 2/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=500;, score=0.826 total time=   4.4s
[CV 3/5] END max_depth=8, max_leaf_nodes=300, max_samples=None, min_sa

[CV 5/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.850 total time=   5.0s
[CV 1/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=600;, score=0.882 total time=   5.7s
[CV 2/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=600;, score=0.836 total time=   5.4s
[CV 3/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=600;, score=0.846 total time=   5.3s
[CV 4/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=600;, score=0.786 total time=   5.7s
[CV 5/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=600;, score=0.850 total time=   5.5s
[CV 1/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_sa

[CV 3/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=400;, score=0.846 total time=   4.3s
[CV 4/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=400;, score=0.796 total time=   4.3s
[CV 5/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=400;, score=0.844 total time=   4.6s
[CV 1/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=500;, score=0.879 total time=   5.3s
[CV 2/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=500;, score=0.831 total time=   5.6s
[CV 3/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=500;, score=0.847 total time=   5.2s
[CV 4/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_sa

[CV 1/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=600;, score=0.878 total time=   5.7s
[CV 2/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=600;, score=0.823 total time=   5.2s
[CV 3/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=600;, score=0.837 total time=   5.7s
[CV 4/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=600;, score=0.801 total time=   6.0s
[CV 5/5] END max_depth=8, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=600;, score=0.842 total time=   5.2s
[CV 1/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=400;, score=0.881 total time=   4.3s
[CV 2/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_sa

[CV 4/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=400;, score=0.793 total time=   7.2s
[CV 5/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=400;, score=0.848 total time=   6.1s
[CV 1/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500;, score=0.879 total time=   8.9s
[CV 2/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500;, score=0.830 total time=   8.0s
[CV 3/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500;, score=0.843 total time=  13.3s
[CV 4/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=500;, score=0.792 total time=  16.9s
[CV 5/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_sa

[CV 2/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=600;, score=0.824 total time=   8.6s
[CV 3/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=600;, score=0.832 total time=   8.3s
[CV 4/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=600;, score=0.799 total time=   9.7s
[CV 5/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=600;, score=0.846 total time=   8.1s
[CV 1/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=400;, score=0.877 total time=   5.2s
[CV 2/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=400;, score=0.825 total time=   4.3s
[CV 3/5] END max_depth=8, max_leaf_nodes=500, max_samples=None, min_sa

[CV 5/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.851 total time=   4.4s
[CV 1/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.883 total time=   5.0s
[CV 2/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.840 total time=   4.9s
[CV 3/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.851 total time=   5.3s
[CV 4/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.784 total time=   5.4s
[CV 5/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=500;, score=0.856 total time=   5.9s
[CV 1/5] END max_depth=10, max_leaf_nodes=300, max_samples=None,

[CV 3/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=600;, score=0.845 total time=   5.3s
[CV 4/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=600;, score=0.790 total time=   5.0s
[CV 5/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=600;, score=0.847 total time=   5.9s
[CV 1/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=400;, score=0.881 total time=   4.6s
[CV 2/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=400;, score=0.829 total time=   3.3s
[CV 3/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=400;, score=0.844 total time=   3.3s
[CV 4/5] END max_depth=10, max_leaf_nodes=300, max_samples=None,

[CV 1/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=500;, score=0.875 total time=   4.2s
[CV 2/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=500;, score=0.823 total time=   4.0s
[CV 3/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=500;, score=0.837 total time=   3.9s
[CV 4/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=500;, score=0.798 total time=   4.2s
[CV 5/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=500;, score=0.841 total time=   4.4s
[CV 1/5] END max_depth=10, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=600;, score=0.876 total time=   4.8s
[CV 2/5] END max_depth=10, max_leaf_nodes=300, max_samples=None,

[CV 4/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=600;, score=0.785 total time=   5.5s
[CV 5/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=600;, score=0.855 total time=   5.3s
[CV 1/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=400;, score=0.879 total time=   3.6s
[CV 2/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=400;, score=0.829 total time=   3.4s
[CV 3/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=400;, score=0.842 total time=   3.4s
[CV 4/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=3, n_estimators=400;, score=0.793 total time=   3.3s
[CV 5/5] END max_depth=10, max_leaf_nodes=400, max_samples=None,

[CV 2/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=500;, score=0.827 total time=   4.0s
[CV 3/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=500;, score=0.841 total time=   4.5s
[CV 4/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=500;, score=0.800 total time=   4.0s
[CV 5/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=500;, score=0.846 total time=   3.9s
[CV 1/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=600;, score=0.874 total time=   4.8s
[CV 2/5] END max_depth=10, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=600;, score=0.825 total time=   5.0s
[CV 3/5] END max_depth=10, max_leaf_nodes=400, max_samples=None,

[CV 5/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=600;, score=0.858 total time=   5.4s
[CV 1/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.881 total time=   3.6s
[CV 2/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.835 total time=   3.5s
[CV 3/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.852 total time=   3.8s
[CV 4/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.788 total time=   3.8s
[CV 5/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=5, n_estimators=400;, score=0.851 total time=   3.5s
[CV 1/5] END max_depth=10, max_leaf_nodes=500, max_samples=None,

[CV 3/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.846 total time=   4.2s
[CV 4/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.794 total time=   4.2s
[CV 5/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.850 total time=   4.2s
[CV 1/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=600;, score=0.878 total time=   5.3s
[CV 2/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=600;, score=0.829 total time=   5.1s
[CV 3/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=600;, score=0.846 total time=   5.0s
[CV 4/5] END max_depth=10, max_leaf_nodes=500, max_samples=None,

[CV 1/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=400;, score=0.874 total time=   3.2s
[CV 2/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=400;, score=0.822 total time=   3.2s
[CV 3/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=400;, score=0.843 total time=   3.8s
[CV 4/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=400;, score=0.801 total time=   3.6s
[CV 5/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=400;, score=0.843 total time=   3.2s
[CV 1/5] END max_depth=10, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=500;, score=0.876 total time=   4.0s
[CV 2/5] END max_depth=10, max_leaf_nodes=500, max_samples=None,

[CV 4/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=500;, score=0.787 total time=   4.8s
[CV 5/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=500;, score=0.854 total time=   4.6s
[CV 1/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=600;, score=0.883 total time=   5.4s
[CV 2/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=600;, score=0.836 total time=   5.8s
[CV 3/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=600;, score=0.850 total time=   5.5s
[CV 4/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=600;, score=0.787 total time=   5.6s
[CV 5/5] END max_depth=12, max_leaf_nodes=300, max_samples=None,

[CV 2/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=400;, score=0.823 total time=   3.1s
[CV 3/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=400;, score=0.834 total time=   3.1s
[CV 4/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=400;, score=0.801 total time=   3.2s
[CV 5/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=400;, score=0.841 total time=   3.5s
[CV 1/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=500;, score=0.876 total time=   4.3s
[CV 2/5] END max_depth=12, max_leaf_nodes=300, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=500;, score=0.825 total time=   4.2s
[CV 3/5] END max_depth=12, max_leaf_nodes=300, max_samples=None,

[CV 5/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=500;, score=0.852 total time=   4.5s
[CV 1/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=600;, score=0.883 total time=   5.5s
[CV 2/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=600;, score=0.835 total time=   5.8s
[CV 3/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=600;, score=0.853 total time=   5.6s
[CV 4/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=600;, score=0.787 total time=   6.0s
[CV 5/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=4, min_samples_split=3, n_estimators=600;, score=0.856 total time=   6.0s
[CV 1/5] END max_depth=12, max_leaf_nodes=400, max_samples=None,

[CV 3/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=400;, score=0.842 total time=   4.1s
[CV 4/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=400;, score=0.792 total time=   3.8s
[CV 5/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=400;, score=0.853 total time=   5.4s
[CV 1/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.879 total time=   4.9s
[CV 2/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.831 total time=   4.5s
[CV 3/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=5, min_samples_split=5, n_estimators=500;, score=0.847 total time=   4.6s
[CV 4/5] END max_depth=12, max_leaf_nodes=400, max_samples=None,

[CV 1/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=600;, score=0.875 total time=   6.1s
[CV 2/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=600;, score=0.823 total time=   4.9s
[CV 3/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=600;, score=0.839 total time=   5.2s
[CV 4/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=600;, score=0.801 total time=   5.1s
[CV 5/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=5, n_estimators=600;, score=0.847 total time=   6.1s
[CV 1/5] END max_depth=12, max_leaf_nodes=400, max_samples=None, min_samples_leaf=6, min_samples_split=7, n_estimators=400;, score=0.876 total time=   5.4s
[CV 2/5] END max_depth=12, max_leaf_nodes=400, max_samples=None,

[CV 4/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=400;, score=0.786 total time=   3.6s
[CV 5/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=400;, score=0.856 total time=   3.7s
[CV 1/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=500;, score=0.882 total time=   4.9s
[CV 2/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=500;, score=0.837 total time=   4.8s
[CV 3/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=500;, score=0.853 total time=   5.2s
[CV 4/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=4, min_samples_split=7, n_estimators=500;, score=0.785 total time=   4.9s
[CV 5/5] END max_depth=12, max_leaf_nodes=500, max_samples=None,

[CV 2/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=600;, score=0.829 total time=   5.0s
[CV 3/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=600;, score=0.846 total time=   5.1s
[CV 4/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=600;, score=0.795 total time=   5.6s
[CV 5/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=5, min_samples_split=7, n_estimators=600;, score=0.847 total time=   7.3s
[CV 1/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=400;, score=0.876 total time=   4.4s
[CV 2/5] END max_depth=12, max_leaf_nodes=500, max_samples=None, min_samples_leaf=6, min_samples_split=3, n_estimators=400;, score=0.825 total time=   3.3s
[CV 3/5] END max_depth=12, max_leaf_nodes=500, max_samples=None,

0.8930729797028375

In [631]:
grid.best_params_

{'max_depth': 12,
 'max_leaf_nodes': 400,
 'max_samples': None,
 'min_samples_leaf': 4,
 'min_samples_split': 7,
 'n_estimators': 400}

In [632]:
rfrPred = pd.Series(rfr.predict(X_test))
rfrPred

0      164670.08
1      176012.50
2      255642.93
3      168805.09
4      209534.05
5      130546.00
6      141232.75
7      429003.39
8      213685.06
9      119997.00
10     144730.50
11     139396.07
12     174987.74
13     119378.83
14     296368.32
15     131789.79
16     276747.52
17     179047.83
18     103049.54
19     107354.22
20     337698.26
21     298725.04
22     251613.19
23     132502.15
24     197215.40
25     164501.14
26     200302.68
27     141694.75
28     161456.45
29     117457.50
30     155228.50
31     113697.06
32     151824.00
33     133099.00
34     100332.33
35     115454.25
36     159932.50
37     218384.67
38     156192.65
39     233022.64
40     252877.08
41     191252.12
42     355334.33
43     174494.00
44      93673.00
45     232537.14
46     120287.42
47     244114.68
48     154350.53
49     174719.37
50     123551.00
51     153133.10
52     168723.90
53     113145.76
54     244091.56
55     142098.03
56     116338.66
57     236063.51
58     125720.

## XG Boost?

In [633]:
import xgboost as xgb

In [634]:
logitXB = xgb.XGBRegressor()
logitXB.fit(X_train, y_train)
logitXB.score(X_test, y_test)

0.8602377864710888

In [635]:
from sklearn.metrics import mean_absolute_error

In [636]:
mean_train = np.mean(y_train)
mean_train[0]
mean_test = np.mean(y_test)

In [637]:
baseline_predictions = np.ones(y_test.shape) * mean_train[0]
baseline_predictions

array([[181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [181590.47175421],
       [1815

In [638]:
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print(f'Baseline MAE is {round(mae_baseline,2)}')

Baseline MAE is 59188.76


In [639]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

In [640]:
params['eval_metric'] = "mae"

In [641]:
num_boost_round = 999

In [642]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [643]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:126362.89844
[1]	Test-mae:88810.49219
[2]	Test-mae:63281.26562
[3]	Test-mae:45538.38672
[4]	Test-mae:34305.86719
[5]	Test-mae:27215.09961
[6]	Test-mae:22734.66992
[7]	Test-mae:20757.03906
[8]	Test-mae:19574.47266
[9]	Test-mae:19132.33984
[10]	Test-mae:18890.82617
[11]	Test-mae:18630.82617
[12]	Test-mae:18478.38281
[13]	Test-mae:18382.46484
[14]	Test-mae:18354.89062
[15]	Test-mae:18338.12891
[16]	Test-mae:18256.62305
[17]	Test-mae:18209.03516
[18]	Test-mae:18162.46875
[19]	Test-mae:18108.83398
[20]	Test-mae:18073.66797
[21]	Test-mae:18026.87695
[22]	Test-mae:18001.76367
[23]	Test-mae:17942.35938
[24]	Test-mae:17950.08203
[25]	Test-mae:17892.48633
[26]	Test-mae:17874.79688
[27]	Test-mae:17883.60742
[28]	Test-mae:17928.08594
[29]	Test-mae:17969.50195
[30]	Test-mae:17954.52148
[31]	Test-mae:17951.45703
[32]	Test-mae:17958.06641
[33]	Test-mae:17919.17383
[34]	Test-mae:17924.59180
[35]	Test-mae:17955.65430
[36]	Test-mae:17959.68750


In [644]:
print(f'Best MAE is {round(model.best_score, 2)} with {round(model.best_iteration+1, 2)} rounds')

Best MAE is 17874.8 with 27 rounds


In [645]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results



Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,127993.698438,1893.328785,128201.796875,8753.464015
1,90416.246875,1406.146423,90920.834375,7245.874495
2,64108.15,1065.131176,65204.296875,6108.06568
3,45614.616406,788.629929,47829.260156,5054.703012
4,32714.31836,614.621098,36427.982422,4313.313242
5,23949.757031,489.180789,29426.337891,3623.926252
6,18234.841797,245.08238,25623.551562,3180.86807
7,14629.563086,139.503287,23427.126563,2989.872105
8,12279.820117,107.786803,22252.141406,2857.504449
9,10827.986328,108.252589,21582.310938,2823.133551


In [646]:
cv_results['test-mae-mean'].min()

20123.360937600002

In [647]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(8,13)
    for min_child_weight in range(5,10)
]

In [648]:
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=8, min_child_weight=5
	MAE 19460.1183594 for 15 rounds
CV with max_depth=8, min_child_weight=6
	MAE 19287.395508 for 14 rounds
CV with max_depth=8, min_child_weight=7
	MAE 19416.285547 for 16 rounds
CV with max_depth=8, min_child_weight=8
	MAE 19233.2957032 for 20 rounds
CV with max_depth=8, min_child_weight=9
	MAE 19565.3425782 for 19 rounds
CV with max_depth=9, min_child_weight=5
	MAE 19340.280663999998 for 44 rounds
CV with max_depth=9, min_child_weight=6
	MAE 19167.054297000002 for 31 rounds
CV with max_depth=9, min_child_weight=7
	MAE 19243.244921800004 for 21 rounds
CV with max_depth=9, min_child_weight=8
	MAE 19059.640038999998 for 28 rounds
CV with max_depth=9, min_child_weight=9


	MAE 19687.0304688 for 26 rounds
CV with max_depth=10, min_child_weight=5
	MAE 19602.5431642 for 25 rounds
CV with max_depth=10, min_child_weight=6
	MAE 19367.2482424 for 18 rounds
CV with max_depth=10, min_child_weight=7
	MAE 18968.65625 for 29 rounds
CV with max_depth=10, min_child_weight=8
	MAE 19444.9171874 for 35 rounds
CV with max_depth=10, min_child_weight=9
	MAE 19707.341797 for 20 rounds
CV with max_depth=11, min_child_weight=5
	MAE 19761.498828 for 19 rounds
CV with max_depth=11, min_child_weight=6
	MAE 19416.166796999998 for 15 rounds
CV with max_depth=11, min_child_weight=7
	MAE 19397.955273400003 for 15 rounds
CV with max_depth=11, min_child_weight=8
	MAE 19699.3007816 for 20 rounds
CV with max_depth=11, min_child_weight=9


	MAE 19953.0820314 for 21 rounds
CV with max_depth=12, min_child_weight=5
	MAE 19485.690624799998 for 22 rounds
CV with max_depth=12, min_child_weight=6
	MAE 19325.138086 for 15 rounds
CV with max_depth=12, min_child_weight=7
	MAE 19374.9326172 for 17 rounds
CV with max_depth=12, min_child_weight=8
	MAE 19386.696875200003 for 12 rounds
CV with max_depth=12, min_child_weight=9
	MAE 19785.8812502 for 16 rounds
Best params: 10, 7, MAE: 18968.65625


In [649]:
params['max_depth'] = 10
params['min_child_weight'] = 8

In [650]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [651]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 19444.9171874 for 35 rounds
CV with subsample=1.0, colsample=0.9
	MAE 19350.133398600003 for 15 rounds
CV with subsample=1.0, colsample=0.8
	MAE 19854.3339846 for 15 rounds
CV with subsample=1.0, colsample=0.7
	MAE 19109.7365234 for 18 rounds
CV with subsample=0.9, colsample=1.0
	MAE 19648.493555 for 18 rounds
CV with subsample=0.9, colsample=0.9
	MAE 19744.123828 for 18 rounds
CV with subsample=0.9, colsample=0.8
	MAE 19654.697656199998 for 22 rounds
CV with subsample=0.9, colsample=0.7
	MAE 19820.5917968 for 19 rounds
CV with subsample=0.8, colsample=1.0
	MAE 19975.421484600003 for 32 rounds
CV with subsample=0.8, colsample=0.9


	MAE 19529.5703126 for 26 rounds
CV with subsample=0.8, colsample=0.8
	MAE 19256.121289 for 23 rounds
CV with subsample=0.8, colsample=0.7
	MAE 19529.541797 for 29 rounds
CV with subsample=0.7, colsample=1.0
	MAE 19278.5812502 for 15 rounds
CV with subsample=0.7, colsample=0.9
	MAE 19609.3464844 for 11 rounds
CV with subsample=0.7, colsample=0.8
	MAE 19231.4707032 for 14 rounds
CV with subsample=0.7, colsample=0.7
	MAE 19517.175 for 17 rounds
Best params: 1.0, 0.7, MAE: 19109.7365234


In [679]:
params['subsample'] = 1
params['colsample_bytree'] = 0.7

In [680]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params, dtrain,num_boost_round=num_boost_round,seed=42,nfold=5, metrics=['mae'], early_stopping_rounds=10)    
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

Wall time: 0 ns
CV with eta=0.3
Wall time: 1.76 s
	MAE 19109.7365234 for 18 rounds

CV with eta=0.2
Wall time: 1.88 s
	MAE 18763.865039 for 29 rounds

CV with eta=0.1
Wall time: 3.03 s
	MAE 18210.9292972 for 50 rounds

CV with eta=0.05
Wall time: 4.94 s
	MAE 17781.7041016 for 100 rounds

CV with eta=0.01
Wall time: 20.1 s
	MAE 17826.4544924 for 493 rounds

CV with eta=0.005
Wall time: 20.9 s
	MAE 23177.1429688 for 495 rounds

Best params: 0.05, MAE: 17781.7041016


In [702]:
params['eta'] = .05

In [703]:
params

{'max_depth': 10,
 'min_child_weight': 8,
 'eta': 0.05,
 'subsample': 1,
 'colsample_bytree': 0.7,
 'objective': 'reg:linear',
 'eval_metric': 'mae'}

In [704]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-mae:171004.23438
[1]	Test-mae:162504.17188
[2]	Test-mae:154357.29688
[3]	Test-mae:146682.15625
[4]	Test-mae:139478.04688
[5]	Test-mae:132482.00000
[6]	Test-mae:126051.29688
[7]	Test-mae:119745.46094
[8]	Test-mae:113830.57812
[9]	Test-mae:108215.18750
[10]	Test-mae:102817.47656
[11]	Test-mae:97707.60156
[12]	Test-mae:92881.14844
[13]	Test-mae:88317.27344
[14]	Test-mae:84001.47656
[15]	Test-mae:79948.21094
[16]	Test-mae:76066.95312
[17]	Test-mae:72309.96094
[18]	Test-mae:68858.60938
[19]	Test-mae:65528.91016
[20]	Test-mae:62412.01562
[21]	Test-mae:59439.95703
[22]	Test-mae:56598.21875
[23]	Test-mae:53998.64062
[24]	Test-mae:51443.74219
[25]	Test-mae:49137.04297
[26]	Test-mae:46926.83594
[27]	Test-mae:44791.79297
[28]	Test-mae:42864.59766
[29]	Test-mae:41044.21094
[30]	Test-mae:39332.66406
[31]	Test-mae:37743.07812
[32]	Test-mae:36224.94531
[33]	Test-mae:34775.08984
[34]	Test-mae:33453.30859
[35]	Test-mae:32136.84766
[36]	Test-mae:30929.83203
[37]	Test-mae:29848.94531
[38]	Test-m

In [705]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:171004.23438
[1]	Test-mae:162504.17188
[2]	Test-mae:154357.29688
[3]	Test-mae:146682.15625
[4]	Test-mae:139478.04688
[5]	Test-mae:132482.00000
[6]	Test-mae:126051.29688
[7]	Test-mae:119745.46094
[8]	Test-mae:113830.57812
[9]	Test-mae:108215.18750
[10]	Test-mae:102817.47656
[11]	Test-mae:97707.60156
[12]	Test-mae:92881.14844
[13]	Test-mae:88317.27344
[14]	Test-mae:84001.47656
[15]	Test-mae:79948.21094
[16]	Test-mae:76066.95312
[17]	Test-mae:72309.96094
[18]	Test-mae:68858.60938
[19]	Test-mae:65528.91016
[20]	Test-mae:62412.01562
[21]	Test-mae:59439.95703
[22]	Test-mae:56598.21875
[23]	Test-mae:53998.64062
[24]	Test-mae:51443.74219
[25]	Test-mae:49137.04297
[26]	Test-mae:46926.83594
[27]	Test-mae:44791.79297
[28]	Test-mae:42864.59766
[29]	Test-mae:41044.21094
[30]	Test-mae:39332.66406
[31]	Test-mae:37743.07812
[32]	Test-mae:36224.94531
[33]	Test-mae:34775.08984
[34]	Test-mae:33453.30859
[35]	Test-mae:32136.84766
[36]	Test-mae:30929.83203
[37]	Test-mae:29848.94531
[38]	Test-m

In [706]:
mean_absolute_error(best_model.predict(dtest), y_test)

16891.926042869516

In [707]:
best_model.save_model("my_model.model")

In [708]:
loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")
loaded_model.predict(dtest)



array([164293.34 , 175441.53 , 249272.33 , 148069.2  , 165790.11 ,
       133921.62 , 139577.22 , 434989.75 , 188877.61 , 124136.24 ,
       149824.03 , 131485.1  , 174856.62 , 121755.695, 285850.9  ,
       133651.7  , 283011.9  , 181495.14 , 105441.984, 101588.33 ,
       340707.28 , 278660.06 , 266939.56 , 124232.94 , 200387.2  ,
       167188.05 , 195595.19 , 149277.84 , 172168.69 , 114352.54 ,
       169464.7  , 115711.23 , 141546.22 , 140319.52 ,  94571.92 ,
       104038.305, 159181.66 , 205177.6  , 150427.16 , 227132.69 ,
       245506.31 , 190946.72 , 355047.6  , 163206.16 ,  88442.81 ,
       224314.38 , 116603.37 , 232846.28 , 161383.42 , 168554.08 ,
       121396.375, 156482.94 , 177189.03 , 108820.84 , 256273.61 ,
       146234.89 , 121203.81 , 229251.11 , 134460.95 ,  80029.84 ,
        85347.   , 100062.76 , 180549.38 , 176126.61 , 187315.03 ,
       110806.266, 240716.34 , 180231.06 , 365603.06 , 200583.23 ,
       421221.66 , 210130.11 , 170976.78 , 214107.73 , 323891.

In [709]:
RSquared = 1 - (mean_absolute_error(best_model.predict(dtest), y_test)/mean_test[0])

In [710]:
RSquared

0.906161089402131

# Let's Put all our Models together for some Ensembling 

In [711]:
FullPredicitions = pd.concat([pd.Series(loaded_model.predict(dtest)), y_test.SalePrice], axis = 1)
FullPredicitions.columns = ['XGBoost', 'Actual']

In [712]:
FullPredictions = pd.concat([pd.Series(LassoPred), FullPredicitions], axis = 1)
FullPredictions.columns = ['Lasso','XGBoost', 'Actual']
FullPredictions

Unnamed: 0,Lasso,XGBoost,Actual
0,190910.047047,164293.34375,173000
1,168450.362884,175441.53125,135000
2,268971.731807,249272.328125,267000
3,146761.428259,148069.203125,161000
4,204416.36652,165790.109375,136000
5,112703.286695,133921.625,133000
6,131706.896154,139577.21875,142000
7,403739.363011,434989.75,386250
8,216355.217235,188877.609375,235000
9,123463.245965,124136.242188,133900


In [713]:
FullPredictions = pd.concat([PredGrid.Predicted, FullPredictions], axis = 1)
FullPredictions.columns = ['ElasticNet','Lasso','XGBoost', 'Actual']
FullPredictions

Unnamed: 0,ElasticNet,Lasso,XGBoost,Actual
0,199864.057607,190910.047047,164293.34375,173000
1,174770.236129,168450.362884,175441.53125,135000
2,261964.567241,268971.731807,249272.328125,267000
3,152159.704511,146761.428259,148069.203125,161000
4,200346.286091,204416.36652,165790.109375,136000
5,114073.493146,112703.286695,133921.625,133000
6,135863.965548,131706.896154,139577.21875,142000
7,392631.611934,403739.363011,434989.75,386250
8,210182.317391,216355.217235,188877.609375,235000
9,125590.996622,123463.245965,124136.242188,133900


In [714]:
FullPredictions = pd.concat([rfrPred, FullPredictions], axis = 1)
FullPredictions.columns = ['RandomForest','ElasticNet','Lasso','XGBoost', 'Actual']
FullPredictions

Unnamed: 0,RandomForest,ElasticNet,Lasso,XGBoost,Actual
0,164670.08,199864.057607,190910.047047,164293.34375,173000
1,176012.5,174770.236129,168450.362884,175441.53125,135000
2,255642.93,261964.567241,268971.731807,249272.328125,267000
3,168805.09,152159.704511,146761.428259,148069.203125,161000
4,209534.05,200346.286091,204416.36652,165790.109375,136000
5,130546.0,114073.493146,112703.286695,133921.625,133000
6,141232.75,135863.965548,131706.896154,139577.21875,142000
7,429003.39,392631.611934,403739.363011,434989.75,386250
8,213685.06,210182.317391,216355.217235,188877.609375,235000
9,119997.0,125590.996622,123463.245965,124136.242188,133900


In [715]:
meltTemp = pd.melt(FullPredictions, id_vars='Actual')
meltTemp
#fig = px.scatter(meltTemp.loc[meltTemp['variable'].isin(['RandomForest', 'Lasso', 'ElasticNet','XGBoost'])], x = 'Actual', y = 'value', color = 'variable')
#fig


Unnamed: 0,Actual,variable,value
0,173000,RandomForest,164670.080000
1,135000,RandomForest,176012.500000
2,267000,RandomForest,255642.930000
3,161000,RandomForest,168805.090000
4,136000,RandomForest,209534.050000
...,...,...,...
1727,135750,XGBoost,109772.828125
1728,192500,XGBoost,199630.390625
1729,146500,XGBoost,153945.828125
1730,118500,XGBoost,127085.296875


In [716]:
FullPredictions['RandomForest_Variance'] = np.sqrt((FullPredictions['RandomForest'] - FullPredictions['Actual'])**2)
FullPredictions['ElasticNet_Variance'] = np.sqrt((FullPredictions['ElasticNet'] - FullPredictions['Actual'])**2)
FullPredictions['Lasso_Variance'] = np.sqrt((FullPredictions['Lasso'] - FullPredictions['Actual'])**2)
FullPredictions['XGBoost_Variance'] = np.sqrt((FullPredictions['XGBoost'] - FullPredictions['Actual'])**2)
FullPredictions

Unnamed: 0,RandomForest,ElasticNet,Lasso,XGBoost,Actual,RandomForest_Variance,ElasticNet_Variance,Lasso_Variance,XGBoost_Variance
0,164670.08,199864.057607,190910.047047,164293.34375,173000,8329.92,26864.057607,17910.047047,8706.65625
1,176012.5,174770.236129,168450.362884,175441.53125,135000,41012.5,39770.236129,33450.362884,40441.53125
2,255642.93,261964.567241,268971.731807,249272.328125,267000,11357.07,5035.432759,1971.731807,17727.671875
3,168805.09,152159.704511,146761.428259,148069.203125,161000,7805.09,8840.295489,14238.571741,12930.796875
4,209534.05,200346.286091,204416.36652,165790.109375,136000,73534.05,64346.286091,68416.36652,29790.109375
5,130546.0,114073.493146,112703.286695,133921.625,133000,2454.0,18926.506854,20296.713305,921.625
6,141232.75,135863.965548,131706.896154,139577.21875,142000,767.25,6136.034452,10293.103846,2422.78125
7,429003.39,392631.611934,403739.363011,434989.75,386250,42753.39,6381.611934,17489.363011,48739.75
8,213685.06,210182.317391,216355.217235,188877.609375,235000,21314.94,24817.682609,18644.782765,46122.390625
9,119997.0,125590.996622,123463.245965,124136.242188,133900,13903.0,8309.003378,10436.754035,9763.757812


In [717]:
ScoreDF = pd.DataFrame({
    'RandomForestMAE': [FullPredictions.RandomForest_Variance.sum()/(len(FullPredictions.index))],
    'ElasticNetMAE': [FullPredictions.ElasticNet_Variance.sum()/(len(FullPredictions.index))],
    'LassoMAE': [FullPredictions.Lasso_Variance.sum()/(len(FullPredictions.index))],
    'XGBoostMAE': [FullPredictions.XGBoost_Variance.sum()/(len(FullPredictions.index))]
})
ScoreDF

    

Unnamed: 0,RandomForestMAE,ElasticNetMAE,LassoMAE,XGBoostMAE
0,17823.290855,20336.553839,21929.572447,16891.926043


In [718]:
ScoreDF['RandomForestRSquared'] = 1-(ScoreDF.RandomForestMAE/mean_test[0])
ScoreDF['ElasticNetRSquared'] = 1-(ScoreDF.ElasticNetMAE/mean_test[0])
ScoreDF['LassoRSquared'] = 1-(ScoreDF.LassoMAE/mean_test[0])
ScoreDF['XGBoostRSquared'] = 1-(ScoreDF.XGBoostMAE/mean_test[0])



In [719]:
ScoreDF

Unnamed: 0,RandomForestMAE,ElasticNetMAE,LassoMAE,XGBoostMAE,RandomForestRSquared,ElasticNetRSquared,LassoRSquared,XGBoostRSquared
0,17823.290855,20336.553839,21929.572447,16891.926043,0.900987,0.887025,0.878176,0.906161


In [720]:
EnsembleDF = FullPredictions.loc[:,['RandomForest', 'Lasso', 'XGBoost', 'ElasticNet', 'Actual']]

loopDF = FullPredictions.loc[:,['Actual']]
minMAE = float('Inf')
pLst = []
for i in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
    for j in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
        for k in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
            for m in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
                if (i + j + k + m) == 1:
                    loopDF['RandomForest'] = FullPredictions.RandomForest * i
                    loopDF['Lasso'] = FullPredictions.Lasso * j
                    loopDF['XGBoost'] = FullPredictions.XGBoost * k
                    loopDF['ElasticNet'] = FullPredictions.ElasticNet * m
                    loopDF['Prediction'] = loopDF['ElasticNet'] + loopDF['XGBoost'] + loopDF['Lasso'] + loopDF['RandomForest']
                    loopDF['Difference'] = loopDF['Prediction'] - loopDF['Actual']
                    loopDF['Diffsq'] = np.sqrt(loopDF['Difference']**2)
                    mae = loopDF.Diffsq.sum()/len(loopDF.index)
                    if mae < minMAE:
                        minMAE = mae
                        pLst = [i,j,k,m]
                        print(mae)


                        
        
    
    

17823.290854503462
17117.03874496432
16763.332843475157
16653.200615797436
16385.074168358653
16258.28479058959
16173.398243536842
16120.525810599349
16116.94743627113


In [721]:
pLst

[0.2, 0.2, 0.6, 0]

In [722]:
minMAE

16116.94743627113

# Ensembling Complete:
### RandomForest = 0.2
### Lasso = 0.5
### XGBoost = 0.3

RandomState 0 gives MAE of 15,147m