In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
#need this list of imports later on
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
import warnings
warnings.filterwarnings('ignore')
from sklearn.svm import LinearSVR
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
df = pd.read_excel('./datasets/combined.xlsx', index_col=0)

In [3]:
df.head()

Unnamed: 0,Dwelling Type,Year,Month,Region,Towns,Avg kWh,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,1-room / 2-room,2005,1,Central Region,Bishan,104.9,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
1,1-room / 2-room,2005,1,Central Region,Bukit Merah,140.7,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
2,1-room / 2-room,2005,1,Central Region,Central Region,136.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
3,1-room / 2-room,2005,1,Central Region,Geylang,148.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
4,1-room / 2-room,2005,1,Central Region,Kallang,115.6,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3


In [4]:
df_one_hot_encoded = df.copy()

In [5]:
# # Using 120 min data as identified in the notebook with K-NN
features_df = pd.get_dummies(df_one_hot_encoded, columns=['Dwelling Type', 'Month', 'Towns', 'Region'])
del features_df['Avg kWh']
del features_df['Highest 30 min Rainfall (mm)']
del features_df['Highest 60 min Rainfall (mm)']
features_df

Unnamed: 0,Year,Daily Rainfall Total (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Dwelling Type_1-room / 2-room,Dwelling Type_3-room,...,Towns_Tanglin,Towns_Toa Payoh,Towns_West Region,Towns_Woodlands,Towns_Yishun,Region_Central Region,Region_East Region,Region_North East Region,Region_North Region,Region_West Region
0,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
1,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
2,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
3,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
4,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60563,2021,9.8,9.4,28.1,32.7,24.5,8.6,31.9,0,0,...,0,0,0,0,0,0,0,0,0,1
60564,2021,7.5,6.7,28.2,32.1,25.2,4.9,26.5,0,0,...,0,0,0,0,0,0,0,0,0,1
60565,2021,8.5,7.9,28.2,32.4,24.9,6.7,29.2,0,0,...,0,0,0,0,0,0,0,0,0,1
60566,2021,8.5,7.9,28.2,32.4,24.9,6.7,29.2,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# apply normalization techniques
for column in features_df.columns:
    features_df[column] = (features_df[column] - features_df[column].min()) / (features_df[column].max() - features_df[column].min())

In [7]:
features_df

Unnamed: 0,Year,Daily Rainfall Total (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Dwelling Type_1-room / 2-room,Dwelling Type_3-room,...,Towns_Tanglin,Towns_Toa Payoh,Towns_West Region,Towns_Woodlands,Towns_Yishun,Region_Central Region,Region_East Region,Region_North East Region,Region_North Region,Region_West Region
0,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60563,1.0,0.151703,0.388430,0.634615,0.650000,0.446809,0.321429,0.375000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60564,1.0,0.116099,0.276860,0.653846,0.550000,0.595745,0.101190,0.252273,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60565,1.0,0.131579,0.326446,0.653846,0.600000,0.531915,0.208333,0.313636,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60566,1.0,0.131579,0.326446,0.653846,0.600000,0.531915,0.208333,0.313636,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
X = features_df
y = df_one_hot_encoded['Avg kWh']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
eval_results = {}
def perform_eval(model, model_name):
    #Mean squared error 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    #Train
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
    #Test
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

    #R Squared Score
    #Train
    r2train = model.score(X_train, y_train)
    adjr2_train = 1 - (1-r2train) * (len(y)-1)/(len(y)-X.shape[1]-1)
    #Test
    r2test = model.score(X_test, y_test)
    adjr2_test = 1 - (1-r2test) * (len(y)-1)/(len(y)-X.shape[1]-1)
    
    eval_results[model_name] = {
        "MSE (Train)" : mse_train,
        "MSE (Test)" : mse_test,
        "RMSE (Train)" : rmse_train,
        "RMSE (Test)" : rmse_test,
        "R2 (Train)" : r2train,
        "R2 (Test)" : r2test,
        "Adj R2 (Train)" : adjr2_train,
        "Adj R2 (Test)" : adjr2_test
    }
    
    return eval_results[model_name]

In [11]:
def get_best_model(model, best_model):
    if best_model == "":
        best_model = model

    #test adj r2
    best_adjr2_test = 1 - (1-best_model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)
    model_adjr2_test = 1 - (1-model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)

    if model_adjr2_test > best_adjr2_test:
        best_model = model

    return best_model

In [12]:
def compare_results(desired_model):
    metric_data = []
    col_names = ["Models"]
    col_done = False

    for model in eval_results:
        eval_list = []
        #Append Model names
        if (desired_model.lower() in str(model).lower()) or (desired_model.lower() == "all"):
            eval_list.append(model)
            for metric in eval_results[model]:
                if not col_done:
                    col_names.append(metric)
                eval_list.append(eval_results[model][metric])
            col_done = True
            metric_data.append(eval_list)
        
    df = pd.DataFrame(metric_data, columns=col_names)
    df = df.sort_values(by=['Adj R2 (Test)'], ascending=False)
    df = df.style.set_table_attributes("style='display:inline'").set_caption(f'{str(desired_model).capitalize()} Models (Sort by Adj R2 (Test))')
    
    return df

# Ensemble Methods

# 1) Extra Tree Regressor

In [13]:
#Reset best model for each section
best_model = ""

In [14]:
initial_etr=ExtraTreesRegressor(max_features=20, n_estimators=3, random_state=7, max_depth=3)
initial_etr.fit(X_train,y_train)

ExtraTreesRegressor(max_depth=3, max_features=20, n_estimators=3,
                    random_state=7)

In [15]:
best_model = get_best_model(initial_etr, best_model)
res_val = perform_eval(initial_etr, "Extra Tree Regressor (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 96529.22128013207
MSE (Test): 90950.85234820581
RMSE (Train): 310.6915210946898
RMSE (Test): 301.5805901383672
R2 (Train): 0.48002450733907687
R2 (Test): 0.4914311385841901
Adj R2 (Train): 0.47933678866543006
Adj R2 (Test): 0.4907585063010008


# Grid Search CV for best n_estimators and max_features, max_depth

In [16]:
param_grid=[
{'n_estimators':[50,100,200],'max_features':[1,2,5,10,20], 'max_depth':[3,5,7]}]
forest_reg_extra=ExtraTreesRegressor(random_state=7)
rnd_grid_search_extra=GridSearchCV(forest_reg_extra,param_grid, cv=10)
rnd_grid_search_extra.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=ExtraTreesRegressor(random_state=7),
             param_grid=[{'max_depth': [3, 5, 7],
                          'max_features': [1, 2, 5, 10, 20],
                          'n_estimators': [50, 100, 200]}])

In [17]:
rnd_grid_search_extra.best_params_

{'max_depth': 7, 'max_features': 20, 'n_estimators': 200}

In [18]:
best_params_etr=ExtraTreesRegressor(max_features=20, n_estimators=200, max_depth=7, random_state=7)
best_params_etr.fit(X_train,y_train)

ExtraTreesRegressor(max_depth=7, max_features=20, n_estimators=200,
                    random_state=7)

In [19]:
best_model = get_best_model(best_params_etr, best_model)
res_val = perform_eval(best_params_etr, "Extra Tree Regressor (Best Params)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 27006.37983034979
MSE (Test): 26566.010655491766
RMSE (Train): 164.33617931042997
RMSE (Test): 162.99082997362694
R2 (Train): 0.8545243039253188
R2 (Test): 0.8514511360520499
Adj R2 (Train): 0.8543318980251092
Adj R2 (Test): 0.8512546655854069


In [22]:
etr_df = compare_results("extra tree")
etr_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,Extra Tree Regressor (Best Params),27006.37983,26566.010655,164.336179,162.99083,0.854524,0.851451,0.854332,0.851255
0,Extra Tree Regressor (Initial),96529.22128,90950.852348,310.691521,301.58059,0.480025,0.491431,0.479337,0.490759


In [23]:
import pickle
filename = 'ensemble_etr.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

# 2) Random Forest Regressor

In [24]:
#Reset best model for each section
best_model = ""

In [25]:
from sklearn.ensemble import RandomForestRegressor
#default criterion is MSE
# fitting RandomForestRegressor with training dataset
# criterion = MSE
initial_rfr = RandomForestRegressor(
    random_state=7,
    n_estimators = 50, max_depth=3, max_features=20)
initial_rfr.fit(X_train, y_train.values.ravel())

RandomForestRegressor(max_depth=3, max_features=20, n_estimators=50,
                      random_state=7)

In [26]:
best_model = get_best_model(initial_rfr, best_model)
res_val = perform_eval(initial_rfr, "Random Forest Regressor (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 74470.46354116454
MSE (Test): 71533.56391412587
RMSE (Train): 272.89276930905396
RMSE (Test): 267.4575927397199
R2 (Train): 0.5988487687461084
R2 (Test): 0.6000065726317547
Adj R2 (Train): 0.598318206831973
Adj R2 (Test): 0.5994775420270055


# Gridsearch CV for best n_estimators, max_features, max_depth

In [27]:
param_grid=[
{'n_estimators':[50,100,200],'max_features':[1,2,5,10,20], 'max_depth':[3,5,7]}]
forest_reg=RandomForestRegressor(random_state=7)
rnd_grid_search=GridSearchCV(forest_reg,param_grid)
rnd_grid_search.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=7),
             param_grid=[{'max_depth': [3, 5, 7],
                          'max_features': [1, 2, 5, 10, 20],
                          'n_estimators': [50, 100, 200]}])

In [28]:
rnd_grid_search.best_params_

{'max_depth': 7, 'max_features': 20, 'n_estimators': 50}

In [29]:
best_params_rfr = RandomForestRegressor(
    random_state=7,
    n_estimators = 50, max_features=20, max_depth=7)
best_params_rfr.fit(X_train, y_train.values.ravel())

RandomForestRegressor(max_depth=7, max_features=20, n_estimators=50,
                      random_state=7)

In [30]:
best_model = get_best_model(best_params_rfr, best_model)
res_val = perform_eval(best_params_rfr, "Random Forest Regressor (Best Params)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 27447.049300168026
MSE (Test): 26937.765106422536
RMSE (Train): 165.67151022480607
RMSE (Test): 164.12728324816243
R2 (Train): 0.8521505426784068
R2 (Test): 0.8493724008565589
Adj R2 (Train): 0.8519549972457399
Adj R2 (Test): 0.849173181058396


Try to improve somemore

In [31]:
param_grid=[
{'n_estimators':[50,100],'max_features':[10,20,50]}]
forest_reg=RandomForestRegressor(random_state=7, max_depth=7)
rnd_grid_search=GridSearchCV(forest_reg,param_grid)
rnd_grid_search.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestRegressor(max_depth=7, random_state=7),
             param_grid=[{'max_features': [10, 20, 50],
                          'n_estimators': [50, 100]}])

In [32]:
rnd_grid_search.best_params_

{'max_features': 50, 'n_estimators': 100}

In [33]:
best_params_rfr_v2 = RandomForestRegressor(
    random_state=7,
    n_estimators = 100, max_features=50, max_depth=7)
best_params_rfr_v2.fit(X_train, y_train.values.ravel())

RandomForestRegressor(max_depth=7, max_features=50, random_state=7)

In [34]:
best_model = get_best_model(best_params_rfr_v2, best_model)
res_val = perform_eval(best_params_rfr_v2, "Random Forest Regressor (Best Params v2)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 17555.93643692776
MSE (Test): 17487.6574728108
RMSE (Train): 132.4988167378402
RMSE (Test): 132.2409069570033
R2 (Train): 0.9054311577690698
R2 (Test): 0.9022144617652667
Adj R2 (Train): 0.9053060811843743
Adj R2 (Test): 0.9020851307840843


In [35]:
rfr_df = compare_results("random forest")
rfr_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
2,Random Forest Regressor (Best Params v2),17555.936437,17487.657473,132.498817,132.240907,0.905431,0.902214,0.905306,0.902085
1,Random Forest Regressor (Best Params),27447.0493,26937.765106,165.67151,164.127283,0.852151,0.849372,0.851955,0.849173
0,Random Forest Regressor (Initial),74470.463541,71533.563914,272.892769,267.457593,0.598849,0.600007,0.598318,0.599478


In [36]:
import pickle
filename = 'ensemble_rfr.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

# 3) Voting Regressor

In [37]:
#Reset best model for each section
best_model = ""

Here we try to put in all the models we have did so far (fine tuned), but we exclude K-NN given its long prediction time

In [38]:
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import SGDRegressor
from sklearn import ensemble
from sklearn.linear_model import ElasticNet
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge
from sklearn import linear_model

In [39]:
#Non-ensemble methods
lr = LinearRegression()
r = Ridge(alpha = 1.0, solver='sparse_cg', random_state=7)
ls = linear_model.Lasso(alpha = 0.09797670560760961, max_iter = 500,random_state=7)
en = ElasticNet(alpha = 0.01, l1_ratio = 0.5, random_state=7)
sgd = SGDRegressor(max_iter=100, alpha=0, eta0=0.02 ,random_state=7, penalty='l2')
svr = LinearSVR(random_state=7, max_iter=30000, loss="squared_epsilon_insensitive", C=0.1)

#Ensemble methods
#results from gridsearchcv file
gbr = ensemble.GradientBoostingRegressor(
    n_estimators=3000, #how many decision trees to build
    learning_rate=0.1, #how much decision trees influence overall prediction
    max_depth=9,
    min_samples_leaf=15,
    max_features=1.0,
    loss='huber',
    random_state=7
)

etr = ExtraTreesRegressor(max_features=20, n_estimators=200, max_depth=7, random_state=7)

rfr = RandomForestRegressor(random_state=7,n_estimators = 100, max_features=50, max_depth=7)

In [40]:
first_voting_reg = VotingRegressor(
    estimators=[('lr', lr), ('r', r), ('ls', ls),
               ('en',en),('svr',svr),('sgd',sgd),('etr',etr),('rfr',rfr),('gbr',gbr)])

In [41]:
first_voting_reg.fit(X_train,y_train.ravel())

VotingRegressor(estimators=[('lr', LinearRegression()),
                            ('r', Ridge(random_state=7, solver='sparse_cg')),
                            ('ls',
                             Lasso(alpha=0.09797670560760961, max_iter=500,
                                   random_state=7)),
                            ('en', ElasticNet(alpha=0.01, random_state=7)),
                            ('svr',
                             LinearSVR(C=0.1,
                                       loss='squared_epsilon_insensitive',
                                       max_iter=30000, random_state=7)),
                            ('sgd',
                             SGDRegressor(alpha=0, eta0=0.02, max_iter=100,
                                          random_state=7)),
                            ('etr',
                             ExtraTreesRegressor(max_depth=7, max_features=20,
                                                 n_estimators=200,
                                          

In [42]:
best_model = get_best_model(first_voting_reg, best_model)
res_val = perform_eval(first_voting_reg, "Voting Regressor (v1)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 29510.29104722374
MSE (Test): 29036.556117990272
RMSE (Train): 171.78559615760497
RMSE (Test): 170.40116231408246
R2 (Train): 0.841036445520299
R2 (Test): 0.8376366146869443
Adj R2 (Train): 0.8408262006022442
Adj R2 (Test): 0.8374218731585986


Save model

In [43]:
import pickle
filename = 'voting_reg_v1.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(first_voting_reg, open(save_location, 'wb'))

# Retrain with better models (above 0.8)

In [44]:
second_voting_reg = VotingRegressor(
    estimators=[('etr',etr),('rf',rfr),('gbr',gbr)])
second_voting_reg.fit(X_train,y_train.ravel())

VotingRegressor(estimators=[('etr',
                             ExtraTreesRegressor(max_depth=7, max_features=20,
                                                 n_estimators=200,
                                                 random_state=7)),
                            ('rf',
                             RandomForestRegressor(max_depth=7, max_features=50,
                                                   random_state=7)),
                            ('gbr',
                             GradientBoostingRegressor(loss='huber',
                                                       max_depth=9,
                                                       max_features=1.0,
                                                       min_samples_leaf=15,
                                                       n_estimators=3000,
                                                       random_state=7))])

In [45]:
best_model = get_best_model(second_voting_reg, best_model)
res_val = perform_eval(second_voting_reg, "Voting Regressor (v2)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 10557.985917672777
MSE (Test): 10906.038975039939
RMSE (Train): 102.75206040597325
RMSE (Test): 104.43198252949112
R2 (Train): 0.9431271292128518
R2 (Test): 0.9390168241320289
Adj R2 (Train): 0.9430519092538032
Adj R2 (Test): 0.9389361678906971


In [46]:
import pickle
filename = 'voting_reg_v2.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(second_voting_reg, open(save_location, 'wb'))

In [47]:
vr_df = compare_results("voting")
vr_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,Voting Regressor (v2),10557.985918,10906.038975,102.75206,104.431983,0.943127,0.939017,0.943052,0.938936
0,Voting Regressor (v1),29510.291047,29036.556118,171.785596,170.401162,0.841036,0.837637,0.840826,0.837422


# 4) Stacking Regressor

In [48]:
#Reset best model for each section
best_model = ""

Here we try to put in all the models we have did so far (fine tuned), excluding KNN

In [49]:
#Non-ensemble methods
lr = LinearRegression()
r = Ridge(alpha = 1.0, solver='sparse_cg', random_state=7)
ls = linear_model.Lasso(alpha = 0.09797670560760961, max_iter = 500,random_state=7)
en = ElasticNet(alpha = 0.01, l1_ratio = 0.5, random_state=7)
sgd = SGDRegressor(max_iter=100, alpha=0, eta0=0.02 ,random_state=7, penalty='l2')
svr = LinearSVR(random_state=7, max_iter=30000, loss="squared_epsilon_insensitive", C=0.1)

#Ensemble methods
#results from gridsearchcv file
gbr = ensemble.GradientBoostingRegressor(
    n_estimators=3000, #how many decision trees to build
    learning_rate=0.1, #how much decision trees influence overall prediction
    max_depth=9,
    min_samples_leaf=15,
    max_features=1.0,
    loss='huber',
    random_state=7
)

etr = ExtraTreesRegressor(max_features=20, n_estimators=200, max_depth=7, random_state=7)

rfr = RandomForestRegressor(random_state=7,n_estimators = 100, max_features=50, max_depth=7)

In [50]:
from sklearn.ensemble import StackingRegressor
first_sr = StackingRegressor(
    estimators=[('lr', lr), ('r', r), ('ls', ls),
               ('en',en),('svr',svr),('sgd',sgd),('etr',etr),('rf',rfr),('gbr',gbr)])
first_sr.fit(X_train,y_train.ravel())

StackingRegressor(estimators=[('lr', LinearRegression()),
                              ('r', Ridge(random_state=7, solver='sparse_cg')),
                              ('ls',
                               Lasso(alpha=0.09797670560760961, max_iter=500,
                                     random_state=7)),
                              ('en', ElasticNet(alpha=0.01, random_state=7)),
                              ('svr',
                               LinearSVR(C=0.1,
                                         loss='squared_epsilon_insensitive',
                                         max_iter=30000, random_state=7)),
                              ('sgd',
                               SGDRegressor(alpha=0, eta0=0.02, max_iter=100,
                                            random_state=7)),
                              ('etr',
                               ExtraTreesRegressor(max_depth=7, max_features=20,
                                                   n_estimators=200,
          

In [51]:
best_model = get_best_model(first_sr, best_model)
res_val = perform_eval(first_sr, "Stacking Regressor (v1)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 2156.9541649295443
MSE (Test): 3219.3317903076213
RMSE (Train): 46.443020626672684
RMSE (Test): 56.73915570668655
R2 (Train): 0.9883811006689731
R2 (Test): 0.981998498520407
Adj R2 (Train): 0.9883657335331177
Adj R2 (Test): 0.9819746897661562


In [52]:
import pickle
filename = 'first_sr.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(first_sr, open(save_location, 'wb'))

# Retrain with better models (above 0.8)
# As identified in voting regressor section, predicting with K-NN takes a long time, hence we will exclude it here as well.

In [53]:
from sklearn.ensemble import StackingRegressor
second_sr = StackingRegressor(
    estimators=[('etr',etr),('rf',rfr),('gbr',gbr)])
second_sr.fit(X_train,y_train.ravel())

StackingRegressor(estimators=[('etr',
                               ExtraTreesRegressor(max_depth=7, max_features=20,
                                                   n_estimators=200,
                                                   random_state=7)),
                              ('rf',
                               RandomForestRegressor(max_depth=7,
                                                     max_features=50,
                                                     random_state=7)),
                              ('gbr',
                               GradientBoostingRegressor(loss='huber',
                                                         max_depth=9,
                                                         max_features=1.0,
                                                         min_samples_leaf=15,
                                                         n_estimators=3000,
                                                         random_state=7))])

In [54]:
best_model = get_best_model(second_sr, best_model)
res_val = perform_eval(second_sr, "Stacking Regressor (v2)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 2150.686299423104
MSE (Test): 3208.2634438293053
RMSE (Train): 46.37549244399572
RMSE (Test): 56.64153461753403
R2 (Train): 0.9884148638798581
R2 (Test): 0.9820603892693214
Adj R2 (Train): 0.9883995413991662
Adj R2 (Test): 0.9820366623716664


In [55]:
import pickle
filename = 'second_sr.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(second_sr, open(save_location, 'wb'))

In [56]:
sr_df = compare_results("stacking")
sr_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,Stacking Regressor (v2),2150.686299,3208.263444,46.375492,56.641535,0.988415,0.98206,0.9884,0.982037
0,Stacking Regressor (v1),2156.954165,3219.33179,46.443021,56.739156,0.988381,0.981998,0.988366,0.981975


In [57]:
all_df = compare_results("all")
all_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
8,Stacking Regressor (v2),2150.686299,3208.263444,46.375492,56.641535,0.988415,0.98206,0.9884,0.982037
7,Stacking Regressor (v1),2156.954165,3219.33179,46.443021,56.739156,0.988381,0.981998,0.988366,0.981975
6,Voting Regressor (v2),10557.985918,10906.038975,102.75206,104.431983,0.943127,0.939017,0.943052,0.938936
4,Random Forest Regressor (Best Params v2),17555.936437,17487.657473,132.498817,132.240907,0.905431,0.902214,0.905306,0.902085
1,Extra Tree Regressor (Best Params),27006.37983,26566.010655,164.336179,162.99083,0.854524,0.851451,0.854332,0.851255
3,Random Forest Regressor (Best Params),27447.0493,26937.765106,165.67151,164.127283,0.852151,0.849372,0.851955,0.849173
5,Voting Regressor (v1),29510.291047,29036.556118,171.785596,170.401162,0.841036,0.837637,0.840826,0.837422
2,Random Forest Regressor (Initial),74470.463541,71533.563914,272.892769,267.457593,0.598849,0.600007,0.598318,0.599478
0,Extra Tree Regressor (Initial),96529.22128,90950.852348,310.691521,301.58059,0.480025,0.491431,0.479337,0.490759
