In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')
# to remove warning calls/emitted

In [2]:
df = pd.read_excel('./datasets/combined.xlsx', index_col=0)

In [3]:
df.head()

Unnamed: 0,Dwelling Type,Year,Month,Region,Towns,Avg kWh,Daily Rainfall Total (mm),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h)
0,1-room / 2-room,2005,1,Central Region,Bishan,104.9,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
1,1-room / 2-room,2005,1,Central Region,Bukit Merah,140.7,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
2,1-room / 2-room,2005,1,Central Region,Central Region,136.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
3,1-room / 2-room,2005,1,Central Region,Geylang,148.5,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3
4,1-room / 2-room,2005,1,Central Region,Kallang,115.6,2.9,3.6,4.4,5.0,27.5,31.3,25.1,7.2,35.3


In [None]:
df_one_hot_encoded = df.copy()

In [4]:
# # Using 120 min data as identified in the notebook with K-NN
features_df = pd.get_dummies(df_one_hot_encoded, columns=['Dwelling Type', 'Month', 'Towns', 'Region'])
del features_df['Avg kWh']
del features_df['Highest 30 min Rainfall (mm)']
del features_df['Highest 60 min Rainfall (mm)']
features_df

Dwelling Type                    0
Year                             0
Month                            0
Region                           0
Towns                            0
Avg kWh                          0
Daily Rainfall Total (mm)        0
Highest 30 min Rainfall (mm)     0
Highest 60 min Rainfall (mm)     0
Highest 120 min Rainfall (mm)    0
Mean Temperature (°C)            0
Maximum Temperature (°C)         0
Minimum Temperature (°C)         0
Mean Wind Speed (km/h)           0
Max Wind Speed (km/h)            0
dtype: int64

In [None]:
# apply normalization techniques
for column in features_df.columns:
    features_df[column] = (features_df[column] - features_df[column].min()) / (features_df[column].max() - features_df[column].min())

In [17]:
X = features_df
y = df_one_hot_encoded['Avg kWh']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
eval_results = {}
def perform_eval(model, model_name):
    #Mean squared error 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    #Train
    mse_train = mean_squared_error(y_train, y_train_pred)
    rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
    #Test
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)

    #R Squared Score
    #Train
    r2train = model.score(X_train, y_train)
    adjr2_train = 1 - (1-r2train) * (len(y)-1)/(len(y)-X.shape[1]-1)
    #Test
    r2test = model.score(X_test, y_test)
    adjr2_test = 1 - (1-r2test) * (len(y)-1)/(len(y)-X.shape[1]-1)
    
    eval_results[model_name] = {
        "MSE (Train)" : mse_train,
        "MSE (Test)" : mse_test,
        "RMSE (Train)" : rmse_train,
        "RMSE (Test)" : rmse_test,
        "R2 (Train)" : r2train,
        "R2 (Test)" : r2test,
        "Adj R2 (Train)" : adjr2_train,
        "Adj R2 (Test)" : adjr2_test
    }
    
    return eval_results[model_name]

In [20]:
def get_best_model(model, best_model):
    if best_model == "":
        best_model = model

    #test adj r2
    best_adjr2_test = 1 - (1-best_model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)
    model_adjr2_test = 1 - (1-model.score(X_test, y_test)) * (len(y)-1)/(len(y)-X.shape[1]-1)

    if model_adjr2_test > best_adjr2_test:
        best_model = model

    return best_model

In [21]:
def compare_results(desired_model):
    metric_data = []
    col_names = ["Models"]
    col_done = False

    for model in eval_results:
        eval_list = []
        #Append Model names
        if (desired_model.lower() in str(model).lower()) or (desired_model.lower() == "all"):
            eval_list.append(model)
            for metric in eval_results[model]:
                if not col_done:
                    col_names.append(metric)
                eval_list.append(eval_results[model][metric])
            col_done = True
            metric_data.append(eval_list)
        
    df = pd.DataFrame(metric_data, columns=col_names)
    df = df.sort_values(by=['Adj R2 (Test)'], ascending=False)
    df = df.style.set_table_attributes("style='display:inline'").set_caption(f'{str(desired_model).capitalize()} Models (Sort by Adj R2 (Test))')
    
    return df

# K-Neighbor Regressor

In [22]:
#Reset best model for each section
best_model = ""

In [23]:
initial_knn = KNeighborsRegressor(n_neighbors=3)
initial_knn.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [24]:
best_model = get_best_model(initial_knn, best_model)
res_val = perform_eval(initial_knn, "KNN (Initial)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 4379.342031323341
MSE (Test): 8037.020545082551
RMSE (Train): 66.17659730843934
RMSE (Test): 89.64943137065929
R2 (Train): 0.9764097285768042
R2 (Test): 0.9550594823219517
Adj R2 (Train): 0.9763785281252385
Adj R2 (Test): 0.955000044072175


# Grid Search CV for best K

In [25]:
params = {'n_neighbors':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]}
knn = KNeighborsRegressor()
model = GridSearchCV(knn, params, scoring='r2')
model.fit(X_train,y_train)
model.best_params_

{'n_neighbors': 3}

We will use K = 3

Since now we have the best hyperparameter of K = 3, this can be used to fit a KNN model and check its performance on the unseen test dataset.

In [26]:
best_param_knn = KNeighborsRegressor(n_neighbors=3)
best_param_knn.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [27]:
best_model = get_best_model(best_param_knn, best_model)
res_val = perform_eval(best_param_knn, "KNN (Best Params)")
for key in res_val:
    print(f"{key}: {res_val[key]}")

MSE (Train): 4379.342031323341
MSE (Test): 8037.020545082551
RMSE (Train): 66.17659730843934
RMSE (Test): 89.64943137065929
R2 (Train): 0.9764097285768042
R2 (Test): 0.9550594823219517
Adj R2 (Train): 0.9763785281252385
Adj R2 (Test): 0.955000044072175


In [28]:
knn_df = compare_results("knn")
knn_df

Unnamed: 0,Models,MSE (Train),MSE (Test),RMSE (Train),RMSE (Test),R2 (Train),R2 (Test),Adj R2 (Train),Adj R2 (Test)
0,KNN (Initial),4379.342031,8037.020545,66.176597,89.649431,0.97641,0.955059,0.976379,0.955
1,KNN (Best Params),4379.342031,8037.020545,66.176597,89.649431,0.97641,0.955059,0.976379,0.955


In [29]:
import pickle, os
filename = 'knn.pkl'
save_location = os.path.join(".","trained_models", filename)
pickle.dump(best_model, open(save_location, 'wb'))