In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# to remove warning calls/emitted
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle

In [2]:
df = pd.read_excel('./datasets/combined.xlsx', index_col=0)

In [3]:
df.head()

Unnamed: 0,Dwelling Type,Year,Month,Region,Towns,Avg kWh,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,1-room / 2-room,2005,1,Central Region,Bishan,104.9,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
1,1-room / 2-room,2005,1,Central Region,Bukit Merah,140.7,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
2,1-room / 2-room,2005,1,Central Region,Central Region,136.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
3,1-room / 2-room,2005,1,Central Region,Geylang,148.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
4,1-room / 2-room,2005,1,Central Region,Kallang,115.6,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3


In [4]:
df_one_hot_encoded = df.copy()

In [5]:
features_df = pd.get_dummies(df_one_hot_encoded, columns=['Dwelling Type', 'Month', 'Towns', 'Region'])
del features_df['Avg kWh']
del features_df['Highest 30 min Rainfall (mm)']
del features_df['Highest 60 min Rainfall (mm)']
features_df

Unnamed: 0,Year,Daily Rainfall Total (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Dwelling Type_1-room / 2-room,Dwelling Type_3-room,...,Towns_Tanglin,Towns_Toa Payoh,Towns_West Region,Towns_Woodlands,Towns_Yishun,Region_Central Region,Region_East Region,Region_North East Region,Region_North Region,Region_West Region
0,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
1,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
2,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
3,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
4,2005,2.9,5.0,27.5,31.3,25.1,7.2,35.3,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60563,2021,9.8,9.4,28.1,32.7,24.5,8.6,31.9,0,0,...,0,0,0,0,0,0,0,0,0,1
60564,2021,7.5,6.7,28.2,32.1,25.2,4.9,26.5,0,0,...,0,0,0,0,0,0,0,0,0,1
60565,2021,8.5,7.9,28.2,32.4,24.9,6.7,29.2,0,0,...,0,0,0,0,0,0,0,0,0,1
60566,2021,8.5,7.9,28.2,32.4,24.9,6.7,29.2,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# apply normalization techniques
for column in features_df.columns:
    features_df[column] = (features_df[column] - features_df[column].min()) / (features_df[column].max() - features_df[column].min())

In [7]:
features_df

Unnamed: 0,Year,Daily Rainfall Total (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Dwelling Type_1-room / 2-room,Dwelling Type_3-room,...,Towns_Tanglin,Towns_Toa Payoh,Towns_West Region,Towns_Woodlands,Towns_Yishun,Region_Central Region,Region_East Region,Region_North East Region,Region_North Region,Region_West Region
0,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60563,1.0,0.151703,0.388430,0.634615,0.650000,0.446809,0.321429,0.375000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60564,1.0,0.116099,0.276860,0.653846,0.550000,0.595745,0.101190,0.252273,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60565,1.0,0.131579,0.326446,0.653846,0.600000,0.531915,0.208333,0.313636,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60566,1.0,0.131579,0.326446,0.653846,0.600000,0.531915,0.208333,0.313636,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
X = features_df
y = df_one_hot_encoded['Avg kWh']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
eval_results = {}
def perform_eval(model, model_name):
    #Mean squared error 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    #Train
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
    #Test
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

    #R Squared Score
    #Train
    r2train = model.score(X_train, y_train)
    adjr2_train = 1 - (1-r2train) * (len(y)-1)/(len(y)-X.shape[1]-1)
    #Test
    r2test = model.score(X_test, y_test)
    adjr2_test = 1 - (1-r2test) * (len(y)-1)/(len(y)-X.shape[1]-1)
    
    eval_results[model_name] = {
        "MSE (Train)" : mse_train,
        "MSE (Test)" : mse_test,
        "RMSE (Train)" : rmse_train,
        "RMSE (Test)" : rmse_test,
        "R2 (Train)" : r2train,
        "R2 (Test)" : r2test,
        "Adj R2 (Train)" : adjr2_train,
        "Adj R2 (Test)" : adjr2_test
    }
    
    return eval_results[model_name]

In [11]:
def get_best_model(model, best_model):
    if best_model == "":
        best_model = model

    #test adj r2
    best_adjr2_test = 1 - (1-best_model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)
    model_adjr2_test = 1 - (1-model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)

    if model_adjr2_test > best_adjr2_test:
        best_model = model

    return best_model

In [12]:
def compare_results(desired_model):
    metric_data = []
    col_names = ["Models"]
    col_done = False

    for model in eval_results:
        eval_list = []
        #Append Model names
        if (desired_model.lower() in str(model).lower()) or (desired_model.lower() == "all"):
            eval_list.append(model)
            for metric in eval_results[model]:
                if not col_done:
                    col_names.append(metric)
                eval_list.append(eval_results[model][metric])
            col_done = True
            metric_data.append(eval_list)
        
    df = pd.DataFrame(metric_data, columns=col_names)
    df = df.sort_values(by=['Adj R2 (Test)'], ascending=False)
    df = df.style.set_table_attributes("style='display:inline'").set_caption(f'{str(desired_model).capitalize()} Models (Sort by Adj R2 (Test))')
    
    return df

# 1) Ridge Regression

Linear least squares with l2 regularization

In [13]:
#Reset best model for each section
best_model = ""

In [14]:
from sklearn.linear_model import Ridge
# to remove warning calls/emitted
initial_ridge_reg = Ridge(alpha=0.001,solver="cholesky", random_state=7)
initial_ridge_reg.fit(X_train,y_train)

Ridge(alpha=0.001, random_state=7, solver='cholesky')

In [15]:
best_model = get_best_model(initial_ridge_reg, best_model)
res_val = perform_eval(initial_ridge_reg, "Ridge Regression (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48215.49201472936
MSE (Test): 47387.174160038165
RMSE (Train): 219.58026326318438
RMSE (Test): 217.68595306091333
R2 (Train): 0.740276841750428
R2 (Test): 0.7350256695119504
Adj R2 (Train): 0.7399333323573358
Adj R2 (Test): 0.7346752149276754


# Lets use Ridge CV to find the best alpha for our model

In [16]:
#0.0 to 1.0
alphas =[]
for x in range(0, 101, 1):
    alphas.append((x/100))

In [17]:
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas = alphas, cv = 10)
ridge_cv.fit(X_train, y_train)

RidgeCV(alphas=array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ]),
        cv=10)

In [18]:
alpha = ridge_cv.alpha_
alpha

1.0

In [19]:
best_alpha_ridge_reg = Ridge(alpha = ridge_cv.alpha_, random_state=7)
best_alpha_ridge_reg.fit(X_train, y_train)

Ridge(random_state=7)

In [20]:
best_model = get_best_model(best_alpha_ridge_reg, best_model)
res_val = perform_eval(best_alpha_ridge_reg, "Ridge Regression (Best Alpha)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48215.5846319574
MSE (Test): 47377.80044219072
RMSE (Train): 219.58047415915058
RMSE (Test): 217.66442162694094
R2 (Train): 0.7402763428477315
R2 (Test): 0.7350780844249482
Adj R2 (Train): 0.7399328327947915
Adj R2 (Test): 0.7347276991645451


# Lets use gridsearchCV to find the best solver

In [21]:
solver = ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]

we use the same alpha as above

In [22]:
params = {'solver':solver}
ridge_reg = Ridge(random_state=7, alpha = ridge_cv.alpha_)
model = GridSearchCV(ridge_reg, params, scoring='r2', cv=10)
model.fit(X_train,y_train)
model.best_params_

{'solver': 'sparse_cg'}

In [23]:
best_params_ridge_reg = Ridge(alpha = ridge_cv.alpha_, solver='sparse_cg', random_state=7)
best_params_ridge_reg.fit(X_train, y_train)

Ridge(random_state=7, solver='sparse_cg')

In [24]:
best_model = get_best_model(best_params_ridge_reg, best_model)
res_val = perform_eval(best_params_ridge_reg, "Ridge Regression (Best Params)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48215.88491049413
MSE (Test): 47375.823381363545
RMSE (Train): 219.58115791318284
RMSE (Test): 217.65988004536698
R2 (Train): 0.7402747253325583
R2 (Test): 0.7350891395338104
Adj R2 (Train): 0.7399312131402955
Adj R2 (Test): 0.7347387688948748


In [25]:
ridge_df = compare_results("ridge")
ridge_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
2,Ridge Regression (Best Params),48215.88491,47375.823381,219.581158,217.65988,0.740275,0.735089,0.739931,0.734739
1,Ridge Regression (Best Alpha),48215.584632,47377.800442,219.580474,217.664422,0.740276,0.735078,0.739933,0.734728
0,Ridge Regression (Initial),48215.492015,47387.17416,219.580263,217.685953,0.740277,0.735026,0.739933,0.734675


In [26]:
import pickle
filename = 'ridge.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

# 2) Lasso Regression

Linear Model trained with L1 prior as regularizer (aka the Lasso).

In [27]:
#Reset best model for each section
best_model = ""

In [28]:
from sklearn import linear_model
#use default alpha
initial_lasso = linear_model.Lasso(alpha=0.01, random_state=7)
initial_lasso.fit(X_train,y_train)

Lasso(alpha=0.01, random_state=7)

In [29]:
best_model = get_best_model(initial_lasso, best_model)
res_val = perform_eval(initial_lasso, "Lasso Regression (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48215.91074966479
MSE (Test): 47375.76583028326
RMSE (Train): 219.58121675057907
RMSE (Test): 217.65974784117356
R2 (Train): 0.7402745861442865
R2 (Test): 0.7350894613415409
Adj R2 (Train): 0.7399310737679337
Adj R2 (Test): 0.7347390911282277


# Lets use Lasso CV to find the best alpha for our model

In [30]:
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(alphas = None, cv = 10, max_iter = 500, random_state=7)
lasso_cv.fit(X_train, y_train)

LassoCV(cv=10, max_iter=500, random_state=7)

In [31]:
best_alpha = lasso_cv.alpha_
best_alpha

0.09797670560760961

In [32]:
best_alpha_lasso = linear_model.Lasso(alpha = best_alpha, max_iter = 500,random_state=7)
best_alpha_lasso.fit(X_train, y_train)

Lasso(alpha=0.09797670560760961, max_iter=500, random_state=7)

In [33]:
best_model = get_best_model(best_alpha_lasso, best_model)
res_val = perform_eval(best_alpha_lasso, "Lasso Regression (Best Alpha)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48254.39574773834
MSE (Test): 47313.88074736757
RMSE (Train): 219.66883198974392
RMSE (Test): 217.51754124062631
R2 (Train): 0.7400672783926237
R2 (Test): 0.7354355034658802
Adj R2 (Train): 0.7397234918314024
Adj R2 (Test): 0.7350855909272731


In [34]:
laaso_df = compare_results("lasso")
laaso_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,Lasso Regression (Best Alpha),48254.395748,47313.880747,219.668832,217.517541,0.740067,0.735436,0.739723,0.735086
0,Lasso Regression (Initial),48215.91075,47375.76583,219.581217,217.659748,0.740275,0.735089,0.739931,0.734739


In [35]:
import pickle
filename = 'lasso.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

# 3) ElasticNet Regression

Linear regression with combined L1 and L2 priors as regularizer.

In [36]:
#Reset best model for each section
best_model = ""

In [37]:
from sklearn.linear_model import ElasticNet
initial_elastic_net= linear_model.ElasticNet(random_state=7, l1_ratio = 0.5,alpha=1)
initial_elastic_net.fit(X_train,y_train)

ElasticNet(alpha=1, random_state=7)

In [38]:
best_model = get_best_model(initial_elastic_net, best_model)
res_val = perform_eval(initial_elastic_net, "ElasticNet Regression (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 141002.42953963391
MSE (Test): 134699.34532205082
RMSE (Train): 375.50290217205236
RMSE (Test): 367.01409417357644
R2 (Train): 0.2404599685572235
R2 (Test): 0.2468031808913359
Adj R2 (Train): 0.23945540224519912
Adj R2 (Test): 0.24580700410080747


# Gridsearch CV to find best alpha

In [39]:
params = {'alpha':[0.01, 0.02, 0.1, 0.2, 0.5, 1.0]}
model = GridSearchCV(linear_model.ElasticNet(random_state=7, l1_ratio = 0.5), params, scoring='r2', cv=10)
model.fit(X_train,y_train)
model.best_params_

{'alpha': 0.01}

In [40]:
best_alpha_elastic_net= linear_model.ElasticNet(random_state=7, alpha = 0.01,l1_ratio = 0.5)
best_alpha_elastic_net.fit(X_train,y_train)

ElasticNet(alpha=0.01, random_state=7)

In [41]:
best_model = get_best_model(best_alpha_elastic_net, best_model)
res_val = perform_eval(best_alpha_elastic_net, "ElasticNet Regression (Best Alpha)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 50651.990813572615
MSE (Test): 48326.32865854733
RMSE (Train): 225.05997159329024
RMSE (Test): 219.83250136990057
R2 (Train): 0.7271521148905726
R2 (Test): 0.7297742098315974
Adj R2 (Train): 0.7267912467567794
Adj R2 (Test): 0.7294168096759693


In [42]:
elastic_net_df = compare_results("elastic")
elastic_net_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,ElasticNet Regression (Best Alpha),50651.990814,48326.328659,225.059972,219.832501,0.727152,0.729774,0.726791,0.729417
0,ElasticNet Regression (Initial),141002.42954,134699.345322,375.502902,367.014094,0.24046,0.246803,0.239455,0.245807


In [43]:
import pickle
filename = 'elasticnet.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

# 4) Stochastic Gradient Descent Regressor (SGD Regressor)

Linear model fitted by minimizing a regularized empirical loss with SGD.

In [44]:
#Reset best model for each section
best_model = ""

In [45]:
from sklearn.linear_model import SGDRegressor
initial_sgd_reg=SGDRegressor(max_iter=50, penalty='l2', eta0=0.1, alpha=1.0, random_state=7)
initial_sgd_reg.fit(X_train,y_train.ravel())

SGDRegressor(alpha=1.0, eta0=0.1, max_iter=50, random_state=7)

In [46]:
best_model = get_best_model(initial_sgd_reg, best_model)
res_val = perform_eval(initial_sgd_reg, "SGD Regressor (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 160876.16393668714
MSE (Test): 154291.50388554245
RMSE (Train): 401.09370966980663
RMSE (Test): 392.7995721554982
R2 (Train): 0.13340580716364114
R2 (Test): 0.13724992750162757
Adj R2 (Train): 0.1322596512057178
Adj R2 (Test): 0.1361088557705139


# Grid Search CV for best alpha and eta0

In [47]:
#from 0, 0.1,0.2...0.9 and default
alpha =[]
alpha.append(0.001) #default
for x in range(0, 10, 1):
    alpha.append((x/10))

#from 0.01, 0.02,.. 0.1
eta0 =[]
for x in range(1, 11, 1):
    eta0.append((x/100))

In [48]:
params = {'alpha':alpha, 'eta0':eta0}
sgd_reg = SGDRegressor(random_state=7, penalty='l2',max_iter=100)
model = GridSearchCV(sgd_reg, params, scoring='r2')
model.fit(X_train,y_train)
model.best_params_

{'alpha': 0.0, 'eta0': 0.02}

In [49]:
best_params_sgd_reg = SGDRegressor(max_iter=100, alpha=0, eta0=0.02, random_state=7, penalty='l2')
best_params_sgd_reg.fit(X_train,y_train.ravel())

SGDRegressor(alpha=0, eta0=0.02, max_iter=100, random_state=7)

In [50]:
best_model = get_best_model(best_params_sgd_reg, best_model)
res_val = perform_eval(best_params_sgd_reg, "SGD Regressor (Best Params)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48260.132984138436
MSE (Test): 47455.874786849236
RMSE (Train): 219.68189043282206
RMSE (Test): 217.84369347504472
R2 (Train): 0.7400363735299915
R2 (Test): 0.7346415169872209
Adj R2 (Train): 0.7396925460940532
Adj R2 (Test): 0.7342905543234912


In [51]:
sgd_df = compare_results("sgd")
sgd_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,SGD Regressor (Best Params),48260.132984,47455.874787,219.68189,217.843693,0.740036,0.734642,0.739693,0.734291
0,SGD Regressor (Initial),160876.163937,154291.503886,401.09371,392.799572,0.133406,0.13725,0.13226,0.136109


In [52]:
import pickle
filename = 'sgdRegressor.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

# 5) Support Vector Regression

In [53]:
#Reset best model for each section
best_model = ""

In [54]:
from sklearn.svm import LinearSVR
initial_svr = LinearSVR(random_state=7, max_iter=30000)
initial_svr.fit(X_train,y_train.ravel())

LinearSVR(max_iter=30000, random_state=7)

In [55]:
best_model = get_best_model(best_params_sgd_reg, best_model)
res_val = perform_eval(initial_svr, "Support Vector Regression (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 67086.34405439474
MSE (Test): 62904.090230065965
RMSE (Train): 259.010316501862
RMSE (Test): 250.80687835477312
R2 (Train): 0.6386249227135963
R2 (Test): 0.6482599038849677
Adj R2 (Train): 0.6381469686708613
Adj R2 (Test): 0.6477946930514134


# Grid Search CV for c and loss

In [56]:
param_grid = {'C':[0.01,0.1,0.2,0.3,0.5,1.0], 'loss':["epsilon_insensitive", "squared_epsilon_insensitive"]}

In [57]:
svr_search = LinearSVR(random_state=7, max_iter=30000)
rnd_grid_search_extra = GridSearchCV(svr_search, param_grid, cv=10)
rnd_grid_search_extra.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=LinearSVR(max_iter=30000, random_state=7),
             param_grid={'C': [0.01, 0.1, 0.2, 0.3, 0.5, 1.0],
                         'loss': ['epsilon_insensitive',
                                  'squared_epsilon_insensitive']})

In [58]:
rnd_grid_search_extra.best_params_

{'C': 0.1, 'loss': 'squared_epsilon_insensitive'}

In [59]:
best_params_svr=LinearSVR(random_state=7, max_iter=30000, loss="squared_epsilon_insensitive", C=0.1)
best_params_svr.fit(X_train,y_train.ravel())

LinearSVR(C=0.1, loss='squared_epsilon_insensitive', max_iter=30000,
          random_state=7)

In [60]:
best_model = get_best_model(best_params_svr, best_model)
res_val = perform_eval(best_params_svr, "Support Vector Regression (Best Params)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48217.83406760637
MSE (Test): 47344.7562916438
RMSE (Train): 219.58559622071382
RMSE (Test): 217.58850220460593
R2 (Train): 0.7402642257769185
R2 (Test): 0.7352628570310998
Adj R2 (Train): 0.7399206996979619
Adj R2 (Test): 0.7349127161506213


In [61]:
svr_df = compare_results("Support")
svr_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,Support Vector Regression (Best Params),48217.834068,47344.756292,219.585596,217.588502,0.740264,0.735263,0.739921,0.734913
0,Support Vector Regression (Initial),67086.344054,62904.09023,259.010317,250.806878,0.638625,0.64826,0.638147,0.647795


In [62]:
import pickle
filename = 'svr.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))

In [63]:
all_df = compare_results("all")
all_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
4,Lasso Regression (Best Alpha),48254.395748,47313.880747,219.668832,217.517541,0.740067,0.735436,0.739723,0.735086
10,Support Vector Regression (Best Params),48217.834068,47344.756292,219.585596,217.588502,0.740264,0.735263,0.739921,0.734913
3,Lasso Regression (Initial),48215.91075,47375.76583,219.581217,217.659748,0.740275,0.735089,0.739931,0.734739
2,Ridge Regression (Best Params),48215.88491,47375.823381,219.581158,217.65988,0.740275,0.735089,0.739931,0.734739
1,Ridge Regression (Best Alpha),48215.584632,47377.800442,219.580474,217.664422,0.740276,0.735078,0.739933,0.734728
0,Ridge Regression (Initial),48215.492015,47387.17416,219.580263,217.685953,0.740277,0.735026,0.739933,0.734675
8,SGD Regressor (Best Params),48260.132984,47455.874787,219.68189,217.843693,0.740036,0.734642,0.739693,0.734291
6,ElasticNet Regression (Best Alpha),50651.990814,48326.328659,225.059972,219.832501,0.727152,0.729774,0.726791,0.729417
9,Support Vector Regression (Initial),67086.344054,62904.09023,259.010317,250.806878,0.638625,0.64826,0.638147,0.647795
5,ElasticNet Regression (Initial),141002.42954,134699.345322,375.502902,367.014094,0.24046,0.246803,0.239455,0.245807
