In [1]:
import pandas as pd
import numpy as np
import openpyxl
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import joblib
%matplotlib inline

# Reading Combined Dataset

In [2]:
df = pd.read_excel('./datasets/combined.xlsx', index_col=0)
df.head()

Unnamed: 0,Dwelling Type,Year,Month,Region,Towns,Avg kWh,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,1-room / 2-room,2005,1,Central Region,Bishan,104.9,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
1,1-room / 2-room,2005,1,Central Region,Bukit Merah,140.7,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
2,1-room / 2-room,2005,1,Central Region,Central Region,136.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
3,1-room / 2-room,2005,1,Central Region,Geylang,148.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
4,1-room / 2-room,2005,1,Central Region,Kallang,115.6,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3


In [3]:
eval_results = {}
def perform_eval(model, model_name):
    #Mean squared error 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    #Train
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
    #Test
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

    #R Squared Score
    #Train
    r2train = model.score(X_train, y_train)
    adjr2_train = 1 - (1-r2train) * (len(y)-1)/(len(y)-X.shape[1]-1)
    #Test
    r2test = model.score(X_test, y_test)
    adjr2_test = 1 - (1-r2test) * (len(y)-1)/(len(y)-X.shape[1]-1)
    
    eval_results[model_name] = {
        "MSE (Train)" : mse_train,
        "MSE (Test)" : mse_test,
        "RMSE (Train)" : rmse_train,
        "RMSE (Test)" : rmse_test,
        "R2 (Train)" : r2train,
        "R2 (Test)" : r2test,
        "Adj R2 (Train)" : adjr2_train,
        "Adj R2 (Test)" : adjr2_test
    }
    
    return eval_results[model_name]

In [4]:
best_model = ""
def get_best_model(model, best_model):
    if best_model == "":
        best_model = model

    #test adj r2
    best_adjr2_test = 1 - (1-best_model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)
    model_adjr2_test = 1 - (1-model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)

    if model_adjr2_test > best_adjr2_test:
        best_model = model

    return best_model

In [5]:
def compare_results(desired_model):
    metric_data = []
    col_names = ["Models"]
    col_done = False

    for model in eval_results:
        eval_list = []
        #Append Model names
        if (desired_model.lower() in str(model).lower()) or (desired_model.lower() == "all"):
            eval_list.append(model)
            for metric in eval_results[model]:
                if not col_done:
                    col_names.append(metric)
                eval_list.append(eval_results[model][metric])
            col_done = True
            metric_data.append(eval_list)
        
    df = pd.DataFrame(metric_data, columns=col_names)
    df = df.sort_values(by=['Adj R2 (Test)'], ascending=False)
    df = df.style.set_table_attributes("style='display:inline'").set_caption(f'{str(desired_model).capitalize()} Models (Sort by Adj R2 (Test))')
    
    return df

# Trying Linear Regression with Label Encoding

In [6]:
df_label_encoded = df.copy()

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df_label_encoded['Region']= le.fit_transform(df_label_encoded['Region'])
df_label_encoded['Dwelling Type']= le.fit_transform(df_label_encoded['Dwelling Type'])
df_label_encoded['Towns']= le.fit_transform(df_label_encoded['Towns'])
del df_label_encoded['Highest 30 min Rainfall (mm)']
del df_label_encoded['Highest 60 min Rainfall (mm)']

In [8]:
df_label_encoded.sample(5)

Unnamed: 0,Dwelling Type,Year,Month,Region,Towns,Avg kWh,Daily Rainfall Total (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
21063,3,2007,8,0,17,507.1,7.9,5.0,27.5,31.3,25.1,7.2,35.3
10635,1,2016,1,2,30,258.8,2.7,2.3,27.9,31.8,25.7,8.5,34.3
23396,3,2013,2,4,5,419.5,14.9,5.8,26.2,30.3,24.0,5.2,28.8
19611,2,2020,8,3,23,411.9,3.4,3.1,27.3,32.1,24.4,14.9,34.0
37411,5,2008,9,2,14,547.2,7.2,5.0,27.6,31.6,24.4,8.9,32.5


In [9]:
X = df_label_encoded.loc[:, df_label_encoded.columns !='Avg kWh']
y = df_label_encoded['Avg kWh']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [11]:
# importing module
from sklearn.linear_model import LinearRegression
# creating an object of LinearRegression class
label_encode_LR = LinearRegression()
# fitting the training data
label_encode_LR.fit(X_train,y_train)

LinearRegression()

In [12]:
#best_model = get_best_model(label_encode_LR, best_model)
res_val = perform_eval(label_encode_LR, "Linear Regression (Label Encoding)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 160427.61282516192
MSE (Test): 154379.18459183435
RMSE (Train): 400.53415937365685
RMSE (Test): 392.9111662855032
R2 (Train): 0.13582202457538295
R2 (Test): 0.1367596442792528
Adj R2 (Train): 0.13565077305684448
Adj R2 (Test): 0.13658857856595663


# Trying Linear Regression with One Hot Encoding

In [13]:
df_one_hot_encoded = df.copy()

In [None]:
features_df = pd.get_dummies(df_one_hot_encoded, columns=['Dwelling Type', 'Month', 'Towns', 'Region'])
del features_df['Avg kWh']
del features_df['Highest 30 min Rainfall (mm)']
del features_df['Highest 60 min Rainfall (mm)']

In [16]:
# apply normalization techniques
for column in features_df.columns:
    features_df[column] = (features_df[column] - features_df[column].min()) / (features_df[column].max() - features_df[column].min())

In [17]:
features_df

Unnamed: 0,Year,Daily Rainfall Total (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Dwelling Type_1-room / 2-room,Dwelling Type_3-room,...,Towns_Tanglin,Towns_Toa Payoh,Towns_West Region,Towns_Woodlands,Towns_Yishun,Region_Central Region,Region_East Region,Region_North East Region,Region_North Region,Region_West Region
0,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.044892,0.206612,0.519231,0.416667,0.574468,0.238095,0.452273,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60563,1.0,0.151703,0.388430,0.634615,0.650000,0.446809,0.321429,0.375000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60564,1.0,0.116099,0.276860,0.653846,0.550000,0.595745,0.101190,0.252273,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60565,1.0,0.131579,0.326446,0.653846,0.600000,0.531915,0.208333,0.313636,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60566,1.0,0.131579,0.326446,0.653846,0.600000,0.531915,0.208333,0.313636,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
# Training and testing only accept matrix not data frame
X = features_df
y = df_one_hot_encoded['Avg kWh']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
# importing module
from sklearn.linear_model import LinearRegression
# creating an object of LinearRegression class
ohe_LR = LinearRegression()
# fitting the training data
ohe_LR.fit(X_train,y_train)

LinearRegression()

In [21]:
#best_model = get_best_model(ohe_LR, best_model)
res_val = perform_eval(ohe_LR, "Linear Regression (One Hot Encoding)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 48223.016269070606
MSE (Test): 47399.41083456493
RMSE (Train): 219.5973958613139
RMSE (Test): 217.71405750333378
R2 (Train): 0.7402363107298093
R2 (Test): 0.7349572458361857
Adj R2 (Train): 0.7398927477304604
Adj R2 (Test): 0.7346067007548772


In [22]:
lr_df = compare_results("linear regression")
lr_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
1,Linear Regression (One Hot Encoding),48223.016269,47399.410835,219.597396,217.714058,0.740236,0.734957,0.739893,0.734607
0,Linear Regression (Label Encoding),160427.612825,154379.184592,400.534159,392.911166,0.135822,0.13676,0.135651,0.136589


<b> Here, we save our trained model first for future use </b>

In [25]:
import pickle, os
filename = 'lr_ohe.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(ohe_LR, open(save_location, 'wb'))

<b>  It seems that there is a big difference between the two methods. It is clear that one hot label encoding is better. From here, we will stick with the one hot label encoding method </b>