In [428]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import Data for V1.1 Feature Selection

In [429]:
X_train= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/x_Train.csv')
X_test= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/x_Test.csv')
y_train= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/y_Train.csv')
y_test= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.3/y_Test.csv')

# Fit an ElasticNet Model With GridSearch

In [430]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn


In [431]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
parametersGrid = {"max_iter": [1, 5, 10],
                "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                "l1_ratio": np.arange(0.0, 1.0, 0.1)}
eNet = ElasticNet(normalize=True)
grid = GridSearchCV(eNet, parametersGrid, scoring='r2', cv=10)
grid.fit(X_train, y_train)
pred = grid.predict(X_test)
    

In [432]:
PredGrid = pd.concat([pd.Series(pred), y_test], axis = 1)
PredGrid.columns = ['Predicted', 'Actual']
PredGrid

Unnamed: 0,Predicted,Actual
0,123099.620459,136000
1,222571.744604,194201
2,262885.7825,294000
3,143940.12372,132000
4,106608.549399,119000
5,86323.606171,110000
6,177799.241999,162500
7,136537.33486,153000
8,139980.440954,140000
9,274650.782519,220000


## Lets score ourselves

In [433]:
PredGrid['Difference']=(PredGrid['Predicted'] - PredGrid['Actual'])**2
PredGrid.Difference.sum()


275622631842.8461

In [434]:
PredGrid.Difference.mean()**.5


25229.781507689306

In [435]:
grid.score(X_test, y_test)

0.8822982519751587

In [436]:
grid.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__alpha': 1.0,
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__l1_ratio': 0.5,
 'estimator__max_iter': 1000,
 'estimator__normalize': True,
 'estimator__positive': False,
 'estimator__precompute': False,
 'estimator__random_state': None,
 'estimator__selection': 'cyclic',
 'estimator__tol': 0.0001,
 'estimator__warm_start': False,
 'estimator': ElasticNet(normalize=True),
 'n_jobs': None,
 'param_grid': {'max_iter': [1, 5, 10],
  'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
  'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': 'r2',
 'verbose': 0}

## Lets try a Lasso Model

In [444]:
from sklearn.linear_model import Lasso

In [446]:
logit = Lasso()

In [447]:
logit.fit(X_train, y_train)

Lasso()

In [456]:
LassoDF = pd.DataFrame(logit.coef_)
G = pd.concat([pd.Series(X_train.columns), LassoDF], axis = 1)
G.columns = ['Feature', 'Lasso Coefficient']
G.sort_values('Lasso Coefficient')

Unnamed: 0,Feature,Lasso Coefficient
69,Condition2_PosN,-201217.228124
109,Exterior2nd_Stone,-31486.12532
128,Heating_OthW,-23439.01816
45,Neighborhood_Mitchel,-20272.835967
41,Neighborhood_Edwards,-20147.737667
87,RoofMatl_Tar&Grv,-20054.513158
172,SaleCondition_Partial,-18272.950648
57,Neighborhood_Timber,-17734.259655
72,BldgType_Duplex,-17276.898384
31,LotConfig_FR3,-17222.399777


In [514]:
logit.score(X_test, y_test)

0.881841567486735

In [519]:
LassoPred = logit.predict(X_test)

## How about a Random Forest

In [458]:
from sklearn.ensemble import RandomForestRegressor

In [459]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr.score(X_test, y_test)

0.8736868807167796

### Let's do some parameter tuning

In [466]:
params = {
    'n_estimators': [500, 1000, 1500],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_leaf': [5, 10, 20],
    'min_samples_split': [5, 10, 15],
    'max_leaf_nodes': [100, 200, 300, 400],
    'max_samples': [None, 0.3, 0.5, 0.7, 1]
}
grid = GridSearchCV(rfr, params, refit=True, verbose=3)
grid.fit(X_train, y_train)
grid.score(X_test, y_test)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


ValueError: Invalid parameter feature_fraction for estimator RandomForestRegressor(). Check the list of available parameters with `estimator.get_params().keys()`.

In [465]:
grid.best_params_

{'max_depth': 10,
 'max_leaf_nodes': 400,
 'max_samples': None,
 'min_samples_leaf': 5,
 'min_samples_split': 5,
 'n_estimators': 500}

In [534]:
rfrPred = pd.Series(rfr.predict(X_test))
rfrPred

0      127907.37
1      209822.66
2      265582.27
3      161427.40
4      114894.00
5      104674.01
6      156042.00
7      129142.99
8      152401.00
9      263296.59
10     189146.25
11     490156.04
12     225979.86
13     141900.25
14     213545.17
15     104592.27
16     288479.69
17     335033.99
18     278902.11
19     189474.28
20     120783.26
21     114120.76
22     255332.63
23     133223.80
24     217103.69
25     203011.13
26     285674.67
27     107973.00
28     147972.20
29     183201.42
30      85635.65
31     121380.53
32     124640.83
33     128472.61
34     190072.50
35     195833.50
36     113204.43
37     133033.55
38     178505.89
39     125856.00
40     154582.10
41     262310.05
42     126810.27
43     233870.35
44     319592.16
45     136953.34
46     115249.83
47      86532.00
48     121617.50
49     148177.09
50     153632.85
51     119079.69
52     205195.86
53     192460.82
54     156828.25
55     115148.50
56     391703.20
57     207681.20
58     134202.

## XG Boost?

In [461]:
import xgboost as xgb

In [462]:
logitXB = xgb.XGBRegressor()
logitXB.fit(X_train, y_train)
logitXB.score(X_test, y_test)

0.8507051744459793

In [467]:
from sklearn.metrics import mean_absolute_error

In [507]:
mean_train = np.mean(y_train)
mean_train[0]
mean_test = np.mean(y_test)

In [473]:
baseline_predictions = np.ones(y_test.shape) * mean_train[0]
baseline_predictions

array([[182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [182490.66600595],
       [1824

In [475]:
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print(f'Baseline MAE is {round(mae_baseline,2)}')

Baseline MAE is 56587.06


In [476]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

In [477]:
params['eval_metric'] = "mae"

In [478]:
num_boost_round = 999

In [479]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [480]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:124914.98438
[1]	Test-mae:88133.72656
[2]	Test-mae:61915.81250
[3]	Test-mae:45280.01953
[4]	Test-mae:34416.69922
[5]	Test-mae:27328.77734
[6]	Test-mae:23620.75977
[7]	Test-mae:21420.30469
[8]	Test-mae:20286.05273
[9]	Test-mae:19610.23047
[10]	Test-mae:19151.88281
[11]	Test-mae:18977.49219
[12]	Test-mae:18896.92969
[13]	Test-mae:18797.78320
[14]	Test-mae:18673.69922
[15]	Test-mae:18644.85156
[16]	Test-mae:18653.88086
[17]	Test-mae:18661.14453
[18]	Test-mae:18576.75195
[19]	Test-mae:18531.89844
[20]	Test-mae:18450.91992
[21]	Test-mae:18456.48633
[22]	Test-mae:18310.67188
[23]	Test-mae:18275.11914
[24]	Test-mae:18290.11328
[25]	Test-mae:18263.08789
[26]	Test-mae:18223.20898
[27]	Test-mae:18186.26172
[28]	Test-mae:18166.96680
[29]	Test-mae:18113.17969
[30]	Test-mae:18140.31250
[31]	Test-mae:18140.74805
[32]	Test-mae:18167.43359
[33]	Test-mae:18155.37109
[34]	Test-mae:18114.61914
[35]	Test-mae:18097.06055
[36]	Test-mae:18093.13867
[37]	Test-mae:18062.25000
[38]	Test-mae:18009.9

In [481]:
print(f'Best MAE is {round(model.best_score, 2)} with {round(model.best_iteration+1, 2)} rounds')

Best MAE is 18010.0 with 39 rounds


In [482]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results



Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,128585.221875,1096.714929,128865.565625,5469.406238
1,90863.9125,800.821466,91491.998438,4739.265229
2,64392.239063,545.311729,65645.645313,4118.491501
3,45835.311719,417.004711,48113.064063,3796.403011
4,32875.820703,306.74419,36674.978125,3010.011039
5,24049.746484,246.199549,29589.446094,2490.690826
6,18294.007031,278.855401,25391.757422,2397.127496
7,14641.946875,264.457877,22926.558203,2064.359371
8,12326.870703,232.372994,21612.926172,1853.065839
9,10728.064258,312.831599,20810.923047,1732.982223


In [483]:
cv_results['test-mae-mean'].min()

19292.1457032

In [485]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(8,13)
    for min_child_weight in range(5,10)
]

In [487]:
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=8, min_child_weight=5
	MAE 19879.4988282 for 40 rounds
CV with max_depth=8, min_child_weight=6
	MAE 19487.088672 for 37 rounds
CV with max_depth=8, min_child_weight=7
	MAE 19395.966016 for 36 rounds
CV with max_depth=8, min_child_weight=8
	MAE 18843.5 for 35 rounds
CV with max_depth=8, min_child_weight=9
	MAE 18810.7058594 for 19 rounds
CV with max_depth=9, min_child_weight=5
	MAE 19870.3722656 for 21 rounds
CV with max_depth=9, min_child_weight=6
	MAE 19443.1183596 for 124 rounds
CV with max_depth=9, min_child_weight=7
	MAE 19138.5429686 for 72 rounds
CV with max_depth=9, min_child_weight=8
	MAE 18784.697656199998 for 45 rounds
CV with max_depth=9, min_child_weight=9


	MAE 18856.2617186 for 18 rounds
CV with max_depth=10, min_child_weight=5
	MAE 19762.144922 for 19 rounds
CV with max_depth=10, min_child_weight=6
	MAE 19658.7671874 for 27 rounds
CV with max_depth=10, min_child_weight=7
	MAE 19181.511719000002 for 20 rounds
CV with max_depth=10, min_child_weight=8
	MAE 18659.444140800002 for 19 rounds
CV with max_depth=10, min_child_weight=9
	MAE 19153.423047 for 18 rounds
CV with max_depth=11, min_child_weight=5
	MAE 20015.884375 for 22 rounds
CV with max_depth=11, min_child_weight=6
	MAE 19573.725780999997 for 17 rounds
CV with max_depth=11, min_child_weight=7
	MAE 19386.434375 for 13 rounds
CV with max_depth=11, min_child_weight=8
	MAE 18772.354687799998 for 22 rounds
CV with max_depth=11, min_child_weight=9


	MAE 18879.3375 for 17 rounds
CV with max_depth=12, min_child_weight=5
	MAE 19761.453125 for 14 rounds
CV with max_depth=12, min_child_weight=6
	MAE 19668.5820312 for 25 rounds
CV with max_depth=12, min_child_weight=7
	MAE 19515.4015628 for 37 rounds
CV with max_depth=12, min_child_weight=8
	MAE 18682.2566406 for 32 rounds
CV with max_depth=12, min_child_weight=9
	MAE 19221.6007812 for 24 rounds
Best params: 10, 8, MAE: 18659.444140800002


In [488]:
params['max_depth'] = 10
params['min_child_weight'] = 8

In [489]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [490]:
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0
	MAE 18659.444140800002 for 19 rounds
CV with subsample=1.0, colsample=0.9
	MAE 19336.0316406 for 27 rounds
CV with subsample=1.0, colsample=0.8
	MAE 19588.312890399997 for 23 rounds
CV with subsample=1.0, colsample=0.7
	MAE 19499.636328200002 for 35 rounds
CV with subsample=0.9, colsample=1.0
	MAE 19821.994531199998 for 26 rounds
CV with subsample=0.9, colsample=0.9
	MAE 18966.4757814 for 26 rounds
CV with subsample=0.9, colsample=0.8
	MAE 19449.9375 for 24 rounds
CV with subsample=0.9, colsample=0.7
	MAE 19860.3164064 for 13 rounds
CV with subsample=0.8, colsample=1.0
	MAE 19716.4816406 for 13 rounds
CV with subsample=0.8, colsample=0.9


	MAE 19953.150781199998 for 17 rounds
CV with subsample=0.8, colsample=0.8
	MAE 19755.0703126 for 23 rounds
CV with subsample=0.8, colsample=0.7
	MAE 19509.0671876 for 14 rounds
CV with subsample=0.7, colsample=1.0
	MAE 19550.4785158 for 22 rounds
CV with subsample=0.7, colsample=0.9
	MAE 20079.4148436 for 22 rounds
CV with subsample=0.7, colsample=0.8
	MAE 20507.938281400002 for 22 rounds
CV with subsample=0.7, colsample=0.7
	MAE 19822.5074218 for 13 rounds
Best params: 1.0, 1.0, MAE: 18659.444140800002


In [491]:
params['subsample'] = 1
params['colsample_bytree'] = 1

In [499]:
%time
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params, dtrain,num_boost_round=num_boost_round,seed=42,nfold=5, metrics=['mae'], early_stopping_rounds=10)    
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))

Wall time: 0 ns
CV with eta=0.3
Wall time: 1.72 s
	MAE 18659.444140800002 for 19 rounds

CV with eta=0.2
Wall time: 2.4 s
	MAE 18530.571484599997 for 33 rounds

CV with eta=0.1
Wall time: 3.8 s
	MAE 18257.260156199998 for 70 rounds

CV with eta=0.05
Wall time: 5.43 s
	MAE 18187.8111328 for 109 rounds

CV with eta=0.01
Wall time: 31.2 s
	MAE 18044.5554688 for 645 rounds

CV with eta=0.005
Wall time: 1min
	MAE 18107.9628906 for 998 rounds

Best params: 0.01, MAE: 18044.5554688


In [500]:
params['eta'] = .01

In [501]:
params

{'max_depth': 10,
 'min_child_weight': 8,
 'eta': 0.01,
 'subsample': 1,
 'colsample_bytree': 1,
 'objective': 'reg:linear',
 'eval_metric': 'mae'}

In [502]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)
print("Best MAE: {:.2f} in {} rounds".format(model.best_score, model.best_iteration+1))

[0]	Test-mae:176149.09375
[1]	Test-mae:174403.87500
[2]	Test-mae:172677.43750
[3]	Test-mae:170969.81250
[4]	Test-mae:169277.75000
[5]	Test-mae:167603.73438
[6]	Test-mae:165944.98438
[7]	Test-mae:164311.21875
[8]	Test-mae:162683.67188
[9]	Test-mae:161082.03125
[10]	Test-mae:159485.40625
[11]	Test-mae:157915.98438
[12]	Test-mae:156348.62500
[13]	Test-mae:154791.50000
[14]	Test-mae:153273.07812
[15]	Test-mae:151773.15625
[16]	Test-mae:150262.62500
[17]	Test-mae:148791.15625
[18]	Test-mae:147306.20312
[19]	Test-mae:145871.82812
[20]	Test-mae:144443.40625
[21]	Test-mae:143013.03125
[22]	Test-mae:141593.17188
[23]	Test-mae:140180.90625
[24]	Test-mae:138791.04688
[25]	Test-mae:137412.28125
[26]	Test-mae:136049.76562
[27]	Test-mae:134701.81250
[28]	Test-mae:133363.78125
[29]	Test-mae:132064.62500
[30]	Test-mae:130757.42969
[31]	Test-mae:129464.71875
[32]	Test-mae:128179.70312
[33]	Test-mae:126902.43750
[34]	Test-mae:125649.05469
[35]	Test-mae:124412.04688
[36]	Test-mae:123179.03125
[37]	Test-m

[299]	Test-mae:19280.29297
[300]	Test-mae:19233.25391
[301]	Test-mae:19192.42578
[302]	Test-mae:19152.58984
[303]	Test-mae:19111.72461
[304]	Test-mae:19074.85547
[305]	Test-mae:19042.33984
[306]	Test-mae:19009.20508
[307]	Test-mae:18971.79883
[308]	Test-mae:18939.00391
[309]	Test-mae:18906.42773
[310]	Test-mae:18874.21680
[311]	Test-mae:18841.89062
[312]	Test-mae:18806.82422
[313]	Test-mae:18776.62305
[314]	Test-mae:18742.51953
[315]	Test-mae:18713.04297
[316]	Test-mae:18679.69141
[317]	Test-mae:18651.20508
[318]	Test-mae:18618.72461
[319]	Test-mae:18588.53516
[320]	Test-mae:18555.57812
[321]	Test-mae:18524.62500
[322]	Test-mae:18494.34375
[323]	Test-mae:18464.28516
[324]	Test-mae:18432.54102
[325]	Test-mae:18402.56055
[326]	Test-mae:18374.12695
[327]	Test-mae:18345.62305
[328]	Test-mae:18317.80273
[329]	Test-mae:18292.85938
[330]	Test-mae:18267.44336
[331]	Test-mae:18243.80664
[332]	Test-mae:18222.04688
[333]	Test-mae:18198.15430
[334]	Test-mae:18176.56641
[335]	Test-mae:18152.98242
[

In [503]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:176149.09375
[1]	Test-mae:174403.87500
[2]	Test-mae:172677.43750
[3]	Test-mae:170969.81250
[4]	Test-mae:169277.75000
[5]	Test-mae:167603.73438
[6]	Test-mae:165944.98438
[7]	Test-mae:164311.21875
[8]	Test-mae:162683.67188
[9]	Test-mae:161082.03125
[10]	Test-mae:159485.40625
[11]	Test-mae:157915.98438
[12]	Test-mae:156348.62500
[13]	Test-mae:154791.50000
[14]	Test-mae:153273.07812
[15]	Test-mae:151773.15625
[16]	Test-mae:150262.62500
[17]	Test-mae:148791.15625
[18]	Test-mae:147306.20312
[19]	Test-mae:145871.82812
[20]	Test-mae:144443.40625
[21]	Test-mae:143013.03125
[22]	Test-mae:141593.17188
[23]	Test-mae:140180.90625
[24]	Test-mae:138791.04688
[25]	Test-mae:137412.28125
[26]	Test-mae:136049.76562
[27]	Test-mae:134701.81250
[28]	Test-mae:133363.78125
[29]	Test-mae:132064.62500
[30]	Test-mae:130757.42969
[31]	Test-mae:129464.71875
[32]	Test-mae:128179.70312
[33]	Test-mae:126902.43750
[34]	Test-mae:125649.05469
[35]	Test-mae:124412.04688
[36]	Test-mae:123179.03125
[37]	Test-m

[300]	Test-mae:19233.25391
[301]	Test-mae:19192.42578
[302]	Test-mae:19152.58984
[303]	Test-mae:19111.72461
[304]	Test-mae:19074.85547
[305]	Test-mae:19042.33984
[306]	Test-mae:19009.20508
[307]	Test-mae:18971.79883
[308]	Test-mae:18939.00391
[309]	Test-mae:18906.42773
[310]	Test-mae:18874.21680
[311]	Test-mae:18841.89062
[312]	Test-mae:18806.82422
[313]	Test-mae:18776.62305
[314]	Test-mae:18742.51953
[315]	Test-mae:18713.04297
[316]	Test-mae:18679.69141
[317]	Test-mae:18651.20508
[318]	Test-mae:18618.72461
[319]	Test-mae:18588.53516
[320]	Test-mae:18555.57812
[321]	Test-mae:18524.62500
[322]	Test-mae:18494.34375
[323]	Test-mae:18464.28516
[324]	Test-mae:18432.54102
[325]	Test-mae:18402.56055
[326]	Test-mae:18374.12695
[327]	Test-mae:18345.62305
[328]	Test-mae:18317.80273
[329]	Test-mae:18292.85938
[330]	Test-mae:18267.44336
[331]	Test-mae:18243.80664
[332]	Test-mae:18222.04688
[333]	Test-mae:18198.15430
[334]	Test-mae:18176.56641
[335]	Test-mae:18152.98242
[336]	Test-mae:18131.80078
[

In [504]:
mean_absolute_error(best_model.predict(dtest), y_test)

17006.258696593533

In [505]:
best_model.save_model("my_model.model")

In [506]:
loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")
loaded_model.predict(dtest)



array([132273.12 , 200348.27 , 253904.55 , 152565.56 , 114184.7  ,
       111806.26 , 155183.02 , 127030.09 , 146457.9  , 259467.02 ,
       192710.84 , 476397.9  , 232579.48 , 133063.84 , 221008.83 ,
       101410.82 , 267444.6  , 314669.28 , 274735.75 , 206601.9  ,
       118189.1  , 115015.4  , 255551.84 , 136591.73 , 214791.03 ,
       215154.55 , 299012.06 ,  99885.49 , 132058.25 , 188223.2  ,
        93751.14 , 122700.78 , 120553.59 , 127797.04 , 187694.52 ,
       215747.06 , 110933.055, 127598.625, 176742.25 , 124959.74 ,
       151941.03 , 273931.3  , 122875.82 , 231908.14 , 321088.7  ,
       134208.69 , 112861.62 ,  81508.82 , 117513.26 , 143122.84 ,
       155096.3  , 116502.35 , 189201.69 , 189727.75 , 160283.   ,
       107566.125, 397633.66 , 211302.31 , 132168.89 , 220561.9  ,
       195035.84 , 142584.47 , 134635.11 , 213460.11 , 103711.7  ,
       195321.6  , 233505.69 , 135879.11 , 149876.78 , 129878.484,
        98813.52 , 313840.16 , 129558.7  , 104338.586, 194205.

In [508]:
RSquared = 1 - (mean_absolute_error(best_model.predict(dtest), y_test)/mean_test[0])

In [509]:
RSquared

0.9044120419708801

# Let's Put all our Models together for some Ensembling 

In [540]:
FullPredicitions = pd.concat([pd.Series(loaded_model.predict(dtest)), y_test.SalePrice], axis = 1)
FullPredicitions.columns = ['XGBoost', 'Actual']

In [541]:
FullPredictions = pd.concat([pd.Series(LassoPred), FullPredicitions], axis = 1)
FullPredictions.columns = ['Lasso','XGBoost', 'Actual']
FullPredictions

Unnamed: 0,Lasso,XGBoost,Actual
0,119290.995664,132273.125,136000
1,212190.369735,200348.265625,194201
2,275875.900414,253904.546875,294000
3,147633.276472,152565.5625,132000
4,122229.836557,114184.703125,119000
5,87830.018587,111806.257812,110000
6,182449.913781,155183.015625,162500
7,141429.876937,127030.09375,153000
8,138589.047488,146457.90625,140000
9,286125.066609,259467.015625,220000


In [542]:
FullPredictions = pd.concat([PredGrid.Predicted, FullPredictions], axis = 1)
FullPredictions.columns = ['ElasticNet','Lasso','XGBoost', 'Actual']
FullPredictions

Unnamed: 0,ElasticNet,Lasso,XGBoost,Actual
0,123099.620459,119290.995664,132273.125,136000
1,222571.744604,212190.369735,200348.265625,194201
2,262885.7825,275875.900414,253904.546875,294000
3,143940.12372,147633.276472,152565.5625,132000
4,106608.549399,122229.836557,114184.703125,119000
5,86323.606171,87830.018587,111806.257812,110000
6,177799.241999,182449.913781,155183.015625,162500
7,136537.33486,141429.876937,127030.09375,153000
8,139980.440954,138589.047488,146457.90625,140000
9,274650.782519,286125.066609,259467.015625,220000


In [543]:
FullPredictions = pd.concat([rfrPred, FullPredictions], axis = 1)
FullPredictions.columns = ['RandomForest','ElasticNet','Lasso','XGBoost', 'Actual']
FullPredictions

Unnamed: 0,RandomForest,ElasticNet,Lasso,XGBoost,Actual
0,127907.37,123099.620459,119290.995664,132273.125,136000
1,209822.66,222571.744604,212190.369735,200348.265625,194201
2,265582.27,262885.7825,275875.900414,253904.546875,294000
3,161427.4,143940.12372,147633.276472,152565.5625,132000
4,114894.0,106608.549399,122229.836557,114184.703125,119000
5,104674.01,86323.606171,87830.018587,111806.257812,110000
6,156042.0,177799.241999,182449.913781,155183.015625,162500
7,129142.99,136537.33486,141429.876937,127030.09375,153000
8,152401.0,139980.440954,138589.047488,146457.90625,140000
9,263296.59,274650.782519,286125.066609,259467.015625,220000


In [582]:
meltTemp = pd.melt(FullPredictions, id_vars='Actual')
meltTemp
#fig = px.scatter(meltTemp.loc[meltTemp['variable'].isin(['RandomForest', 'Lasso', 'ElasticNet','XGBoost'])], x = 'Actual', y = 'value', color = 'variable')
#fig


Unnamed: 0,Actual,variable,value
0,136000,RandomForest,127907.370000
1,194201,RandomForest,209822.660000
2,294000,RandomForest,265582.270000
3,132000,RandomForest,161427.400000
4,119000,RandomForest,114894.000000
...,...,...,...
3459,274725,XGBoost_Variance,30681.125000
3460,84000,XGBoost_Variance,519.507812
3461,485000,XGBoost_Variance,41466.625000
3462,215000,XGBoost_Variance,4698.703125


In [545]:
FullPredictions['RandomForest_Variance'] = np.sqrt((FullPredictions['RandomForest'] - FullPredictions['Actual'])**2)
FullPredictions['ElasticNet_Variance'] = np.sqrt((FullPredictions['ElasticNet'] - FullPredictions['Actual'])**2)
FullPredictions['Lasso_Variance'] = np.sqrt((FullPredictions['Lasso'] - FullPredictions['Actual'])**2)
FullPredictions['XGBoost_Variance'] = np.sqrt((FullPredictions['XGBoost'] - FullPredictions['Actual'])**2)
FullPredictions

Unnamed: 0,RandomForest,ElasticNet,Lasso,XGBoost,Actual,RandomForest_Variance,ElasticNet_Variance,Lasso_Variance,XGBoost_Variance
0,127907.37,123099.620459,119290.995664,132273.125,136000,8092.63,12900.379541,16709.004336,3726.875
1,209822.66,222571.744604,212190.369735,200348.265625,194201,15621.66,28370.744604,17989.369735,6147.265625
2,265582.27,262885.7825,275875.900414,253904.546875,294000,28417.73,31114.2175,18124.099586,40095.453125
3,161427.4,143940.12372,147633.276472,152565.5625,132000,29427.4,11940.12372,15633.276472,20565.5625
4,114894.0,106608.549399,122229.836557,114184.703125,119000,4106.0,12391.450601,3229.836557,4815.296875
5,104674.01,86323.606171,87830.018587,111806.257812,110000,5325.99,23676.393829,22169.981413,1806.257812
6,156042.0,177799.241999,182449.913781,155183.015625,162500,6458.0,15299.241999,19949.913781,7316.984375
7,129142.99,136537.33486,141429.876937,127030.09375,153000,23857.01,16462.66514,11570.123063,25969.90625
8,152401.0,139980.440954,138589.047488,146457.90625,140000,12401.0,19.559046,1410.952512,6457.90625
9,263296.59,274650.782519,286125.066609,259467.015625,220000,43296.59,54650.782519,66125.066609,39467.015625


In [572]:
ScoreDF = pd.DataFrame({
    'RandomForestMAE': [FullPredictions.RandomForest_Variance.sum()/(len(FullPredictions.index))],
    'ElasticNetMAE': [FullPredictions.ElasticNet_Variance.sum()/(len(FullPredictions.index))],
    'LassoMAE': [FullPredictions.Lasso_Variance.sum()/(len(FullPredictions.index))],
    'XGBoostMAE': [FullPredictions.XGBoost_Variance.sum()/(len(FullPredictions.index))]
})
ScoreDF

    

Unnamed: 0,RandomForestMAE,ElasticNetMAE,LassoMAE,XGBoostMAE
0,17318.913487,18152.003542,17641.972239,17006.258697


In [585]:
ScoreDF['RandomForestRSquared'] = 1-(ScoreDF.RandomForestMAE/mean_test[0])
ScoreDF['ElasticNetRSquared'] = 1-(ScoreDF.ElasticNetMAE/mean_test[0])
ScoreDF['LassoRSquared'] = 1-(ScoreDF.LassoMAE/mean_test[0])
ScoreDF['XGBoostRSquared'] = 1-(ScoreDF.XGBoostMAE/mean_test[0])
ScoreDF.drop(['RandomForest'], axis =1, inplace = True)


In [586]:
ScoreDF

Unnamed: 0,RandomForestMAE,ElasticNetMAE,LassoMAE,XGBoostMAE,RandomForestRSquared,ElasticNetRSquared,LassoRSquared,XGBoostRSquared
0,17318.913487,18152.003542,17641.972239,17006.258697,0.902655,0.897972,0.900839,0.904412


In [587]:
import itertools

In [591]:
list(itertools.combinations(['RandomForest', 'Lasso', 'XGBoost', 'ElasticNet'], 4))

[('RandomForest', 'Lasso', 'XGBoost', 'ElasticNet')]

In [608]:
EnsembleDF = FullPredictions.loc[:,['RandomForest', 'Lasso', 'XGBoost', 'ElasticNet', 'Actual']]

loopDF = FullPredictions.loc[:,['Actual']]
minMAE = float('Inf')
pLst = []
for i in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
    for j in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
        for k in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
            for m in [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0]:
                if (i + j + k + m) == 1:
                    loopDF['RandomForest'] = FullPredictions.RandomForest * i
                    loopDF['Lasso'] = FullPredictions.Lasso * j
                    loopDF['XGBoost'] = FullPredictions.XGBoost * k
                    loopDF['ElasticNet'] = FullPredictions.ElasticNet * m
                    loopDF['Prediction'] = loopDF['ElasticNet'] + loopDF['XGBoost'] + loopDF['Lasso'] + loopDF['RandomForest']
                    loopDF['Difference'] = loopDF['Prediction'] - loopDF['Actual']
                    loopDF['Diffsq'] = np.sqrt(loopDF['Difference']**2)
                    mae = loopDF.Diffsq.sum()/len(loopDF.index)
                    if mae < minMAE:
                        minMAE = mae
                        pLst = [i,j,k,m]
                        print(mae)


                        
        
    
    

17318.91348729792
16574.940506319082
16015.133322247513
15598.672123834345
15373.447717961752
15258.137734200809
15193.143121363948
15156.800342956805
15147.643506076836


In [609]:
pLst

[0.2, 0.5, 0.3, 0]

In [612]:
minMAE

15147.643506076836

# Ensembling Complete:
### RandomForest = 0.2
### Lasso = 0.5
### XGBoost = 0.3