# KNN

## Importing Required Libraries

In [29]:
import operator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bokeh.io import output_notebook
from bokeh.layouts import row, column
from bokeh.models import DatetimeTickFormatter
from bokeh.plotting import figure, show
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor

## Defining Required Functions for Visualization

In [30]:
def analysis_plot(x, y):
    """Show an interactive Bokeh line chart of ``y`` against ``x``.

    Parameters
    ----------
    x : values for the horizontal axis; the axis is configured as a
        datetime axis and labelled 'Date'.
    y : values for the vertical axis, labelled 'MW'.

    The chart is rendered inline in the notebook; nothing is returned.
    """
    figure_options = dict(
        title="Date vs MW",
        sizing_mode="stretch_width",
        x_axis_type='datetime',
        x_axis_label='Date',
        y_axis_label='MW',
    )
    chart = figure(**figure_options)
    chart.line(x, y, legend_label="Visualisation", line_width=1)

    # Month-resolution ticks are rendered as e.g. "Jan 2019".
    chart.xaxis[0].formatter = DatetimeTickFormatter(months="%b %Y")

    # Direct Bokeh output to the notebook, then display the chart.
    output_notebook()
    show(chart)

In [31]:
def visualisation_plot(x, y1, y2, y1_label="Actual", y2_label="Predicted"):
    """Overlay two series on one interactive Bokeh chart for comparison.

    Parameters
    ----------
    x : values for the shared horizontal axis (configured as a datetime
        axis and labelled 'Date').
    y1 : first series, drawn in the default line colour. Every call in this
        notebook passes the observed MW values here.
    y2 : second series, drawn in red. Every call in this notebook passes the
        model predictions here.
    y1_label, y2_label : str, optional
        Legend entries for the two series. They used to share the single
        label "Visualisation", which made the legend unable to tell the
        lines apart.

    The chart is rendered inline in the notebook; nothing is returned.
    """
    p = figure(title="Date vs MW",
               sizing_mode="stretch_width",
               x_axis_type='datetime',
               x_axis_label='Date',
               y_axis_label='MW')

    # Distinct legend labels so each line is identifiable in the legend.
    p.line(x, y1, legend_label=y1_label, line_width=1)
    p.line(x, y2, legend_label=y2_label, line_width=1, color='red')

    # Month-resolution ticks are rendered as e.g. "Jan 2019".
    p.xaxis[0].formatter = DatetimeTickFormatter(months="%b %Y")

    output_notebook()
    show(p)

## KNN Regression Algorithm

In [32]:
def KNNRegression(trainDataX, trainDataY, testDataX, testDataY, maxK=199):
    """Sweep K for a KNN regressor and record the mean squared error per K.

    For every K from 1 up to ``maxK`` (capped at the number of training
    rows) a ``KNeighborsRegressor`` is fitted on the training data and its
    predictions on the evaluation data are scored.

    Parameters
    ----------
    trainDataX, trainDataY : array-like
        Features and targets used to fit each model.
    testDataX, testDataY : array-like
        Features and targets the fitted model is evaluated on. Passing the
        training data again yields the training error.
    maxK : int, optional
        Largest K to try (inclusive). The default of 199 reproduces the
        previously hard-coded ``range(1, 200)``.

    Returns
    -------
    dict
        Maps ``1/K`` (float) to the MSE obtained with that K, rounded to
        three decimals. Callers recover K with ``round(1 / key)``.

    Raises
    ------
    ValueError
        If the training data contains no rows.
    """
    trainDataX = np.array(trainDataX)
    trainDataY = np.array(trainDataY)
    testDataX = np.array(testDataX)
    testDataY = np.array(testDataY)

    sampleCount = trainDataX.shape[0] if trainDataX.ndim else 0
    if sampleCount == 0:
        raise ValueError("trainDataX is empty; cannot fit a KNN model")

    # scikit-learn rejects n_neighbors larger than the number of fitted
    # samples, so never sweep past the size of the training set.
    largestK = min(maxK, sampleCount)

    errorDict = {}
    for K in range(1, largestK + 1):
        KNN = KNeighborsRegressor(n_neighbors=K)

        # Fit training data on KNN regression model
        KNN.fit(trainDataX, trainDataY)

        # Predict on the evaluation data
        predictedY = KNN.predict(testDataX)

        # Mean squared error, keyed by 1/K as the callers expect
        MSE = np.mean((predictedY - testDataY) ** 2)
        errorDict[1 / K] = round(MSE, 3)

    return errorDict

# Dataset - 3

In [33]:
# Load dataset 3 (hourly load with weather columns) from an absolute path.
# NOTE(review): '/content/...' looks like a Colab session path — confirm before running elsewhere.
df = pd.read_excel('/content/3-hour-load-weather-data.xlsx')
# Display the frame as the cell output.
df

Unnamed: 0,DATE,max-temp,min-temp,RH-0830,RH-1730,MW
0,2017-01-01 00:00:00,20.3,9.2,100,80,1815.571045
1,2017-01-01 01:00:00,20.3,9.2,100,80,1576.699585
2,2017-01-01 02:00:00,20.3,9.2,100,80,1428.967896
3,2017-01-01 03:00:00,20.3,9.2,100,80,1356.272705
4,2017-01-01 04:00:00,20.3,9.2,100,80,1354.029175
...,...,...,...,...,...,...
26275,2019-12-31 19:00:00,9.4,4.8,91,69,4157.812988
26276,2019-12-31 20:00:00,9.4,4.8,91,69,4008.450439
26277,2019-12-31 21:00:00,9.4,4.8,91,69,3757.650391
26278,2019-12-31 22:00:00,9.4,4.8,91,69,3556.840576


In [34]:
# Plot the full MW series against its timestamps.
analysis_plot(x=df['DATE'], y=df['MW'])

In [35]:
# Column being predicted and the weather columns used as model inputs.
label = 'MW'
features = ['max-temp', 'min-temp', 'RH-0830', 'RH-1730']

# Chronological hold-out: rows dated before 2019 train the model,
# rows from 2019 onwards are kept for testing.
train, test = (
    df.loc[df['DATE'].dt.year < 2019],
    df.loc[df['DATE'].dt.year >= 2019],
)

X_train, X_test = train[features], test[features]
y_train, y_test = train[label], test[label]

In [36]:
# Training error per 1/K: the model is scored on the rows it was fitted on.
trainErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_train, testDataY=y_train,
)

# Testing error per 1/K: the model is scored on the held-out rows.
testErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_test, testDataY=y_test,
)

In [37]:
# TODO: plotting train/test error vs 1/K is disabled here; the
# plotTrainTestError helper is not defined in the visible notebook.

# Rank the (1/K, MSE) pairs by MSE; the first entry is the best K.
# The key is 1/K, so it is inverted and rounded to recover the integer K.
# NOTE(review): K is selected on the same held-out rows that the metrics
# below are reported on, so those metrics are optimistic.
sortedTestErrorDict = sorted(testErrorDict.items(), key=lambda entry: entry[1])
KNNMSE = sortedTestErrorDict[0][1]
bestK = int(round(1 / sortedTestErrorDict[0][0]))
print('Best K:', bestK, 'with MSE:', KNNMSE)


Best K: 148 with MSE: 543537.133


In [38]:
# Refit one model with the selected K and predict the held-out rows.
KNN = KNeighborsRegressor(n_neighbors=bestK).fit(X_train, y_train)
predictedY = KNN.predict(X_test)

# Mean squared error on the held-out rows, rounded to three decimals.
MSE = round(np.mean((predictedY - y_test) ** 2), 3)

In [39]:
# Display the predictions made for the held-out rows.
predictedY

array([2805.612774  , 2805.612774  , 2805.612774  , ..., 2724.07889041,
       2724.07889041, 2724.07889041])

In [40]:
 # R^2 of the fitted model on the held-out rows, rounded to two decimals.
 # NOTE(review): this cell carries a stray one-space indent; dedent it if the notebook is exported to a script.
 score = round(KNN.score(X_test, y_test), 2)
 score

0.65

In [41]:
# Error metrics of the selected model on the held-out rows.
# `metrics` is sklearn.metrics; it must be imported before this cell runs
# (this cell used to execute before any import of it on a fresh kernel).
mae = metrics.mean_absolute_error(y_test, predictedY)
mse = metrics.mean_squared_error(y_test, predictedY)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

MAE: 610.9741839744995
MSE: 543537.1332842925
RMSE: 737.2497089075672


In [42]:
# Coefficient of determination (R^2) of the held-out predictions, unrounded.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

0.645773987963381

In [43]:
# Overlay the observed MW (y1) and the KNN predictions (y2, drawn in red)
# for the held-out rows.
visualisation_plot(x=test['DATE'], y1=test['MW'], y2=predictedY)

# Dataset - 4

In [44]:
# Load dataset 4 (daily load with weather columns) and rename the 'date'
# column to 'DATE', the name the cells below index by.
df = pd.read_excel('/content/4-day-load-weather-data.xlsx').rename(columns={'date': 'DATE'})
df

Unnamed: 0,DATE,max-temp,min-temp,RH-0830,RH-1730,MW
0,2017-01-01,20.3,9.2,100,80,3536.238770
1,2017-01-02,23.2,9.3,100,82,3639.738770
2,2017-01-03,24.3,9.5,100,77,3673.321289
3,2017-01-04,24.0,8.9,97,66,3898.860840
4,2017-01-05,25.2,10.4,97,71,3547.965820
...,...,...,...,...,...,...
1090,2019-12-27,13.4,4.2,86,76,4976.180664
1091,2019-12-28,14.4,2.4,100,83,4708.879395
1092,2019-12-29,13.3,3.1,94,79,4831.750488
1093,2019-12-30,15.8,2.6,100,97,5298.331055


In [45]:
# Plot the full MW series against its dates.
analysis_plot(x=df['DATE'], y=df['MW'])

In [46]:
# Column being predicted and the weather columns used as model inputs.
label = 'MW'
features = ['max-temp', 'min-temp', 'RH-0830', 'RH-1730']

# Chronological hold-out: rows dated before 2019 train the model,
# rows from 2019 onwards are kept for testing.
train, test = (
    df.loc[df['DATE'].dt.year < 2019],
    df.loc[df['DATE'].dt.year >= 2019],
)

X_train, X_test = train[features], test[features]
y_train, y_test = train[label], test[label]

In [47]:
# Training error per 1/K: the model is scored on the rows it was fitted on.
trainErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_train, testDataY=y_train,
)

# Testing error per 1/K: the model is scored on the held-out rows.
testErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_test, testDataY=y_test,
)

In [48]:
# TODO: plotting train/test error vs 1/K is disabled here; the
# plotTrainTestError helper is not defined in the visible notebook.

# Rank the (1/K, MSE) pairs by MSE; the first entry is the best K.
# The key is 1/K, so it is inverted and rounded to recover the integer K.
# NOTE(review): K is selected on the same held-out rows that the metrics
# below are reported on, so those metrics are optimistic.
sortedTestErrorDict = sorted(testErrorDict.items(), key=lambda entry: entry[1])
KNNMSE = sortedTestErrorDict[0][1]
bestK = int(round(1 / sortedTestErrorDict[0][0]))
print('Best K:', bestK, 'with MSE:', KNNMSE)


Best K: 8 with MSE: 286639.813


In [49]:
# Refit one model with the selected K and predict the held-out rows.
KNN = KNeighborsRegressor(n_neighbors=bestK).fit(X_train, y_train)
predictedY = KNN.predict(X_test)

# Mean squared error on the held-out rows, rounded to three decimals.
MSE = round(np.mean((predictedY - y_test) ** 2), 3)

In [50]:
# Display the predictions made for the held-out rows.
predictedY

array([4076.09158325, 3829.277771  , 3900.94692993, 3974.92681885,
       3846.38397217, 3913.8163147 , 4009.7142334 , 3982.99414062,
       3904.53115845, 3930.56665039, 3822.82519531, 3900.94692993,
       3746.6463623 , 3530.13928223, 3933.01937866, 3901.50418091,
       3896.93023682, 3898.73114014, 4034.89956665, 3917.61343384,
       3704.08868408, 3710.15179443, 3811.44232178, 3782.64846802,
       3778.73031616, 4104.66098022, 3882.78756714, 3949.03259277,
       3896.80282593, 3934.06008911, 3816.58786011, 3757.61297607,
       3833.73828125, 3947.08288574, 3861.61605835, 3611.52810669,
       3730.74237061, 3912.55651855, 4122.03918457, 3682.85821533,
       3960.90628052, 3850.05429077, 3695.8026123 , 3867.73675537,
       3689.11291504, 3563.19854736, 3804.99508667, 3859.37466431,
       3837.9180603 , 3589.74398804, 3723.12484741, 3806.04837036,
       3571.15097046, 3701.09484863, 3732.41870117, 3508.06607056,
       3820.43341064, 3865.64279175, 3943.99111938, 3840.77612

In [51]:
 # R^2 of the fitted model on the held-out rows, rounded to two decimals.
 # NOTE(review): this cell carries a stray one-space indent; dedent it if the notebook is exported to a script.
 score = round(KNN.score(X_test, y_test), 2)
 score

0.76

In [52]:
from sklearn import metrics

In [53]:
# Error metrics of the selected model on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predictedY)
mse = metrics.mean_squared_error(y_test, predictedY)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

MAE: 415.2458523684985
MSE: 286639.8133363235
RMSE: 535.3875356564845


In [54]:
# Coefficient of determination (R^2) of the held-out predictions, unrounded.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

0.7594494677633123

In [55]:
# NOTE(review): this cell repeats the metrics cell two cells above verbatim
# (same inputs, same printed values) — candidate for deletion.
print('MAE:', metrics.mean_absolute_error(y_test, predictedY))
print('MSE:', metrics.mean_squared_error(y_test, predictedY))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictedY)))

MAE: 415.2458523684985
MSE: 286639.8133363235
RMSE: 535.3875356564845


In [56]:
# NOTE(review): this cell repeats the r2_score cell two cells above verbatim
# (same inputs, same value) — candidate for deletion.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

0.7594494677633123

In [57]:
# Overlay the observed MW (y1) and the KNN predictions (y2, drawn in red)
# for the held-out rows.
visualisation_plot(x=test['DATE'], y1=test['MW'], y2=predictedY)

# Dataset - 5

In [58]:
# Load dataset 5 (hourly load with a 'day' column) from an absolute path.
# NOTE(review): '/content/...' looks like a Colab session path — confirm before running elsewhere.
df = pd.read_excel('/content/5-hour-load-holiday-data.xlsx')
# Display the frame as the cell output.
df

Unnamed: 0,DATE,day,MW
0,2017-01-01 00:00:00,2,1815.571045
1,2017-01-01 01:00:00,2,1576.699585
2,2017-01-01 02:00:00,2,1428.967896
3,2017-01-01 03:00:00,2,1356.272705
4,2017-01-01 04:00:00,2,1354.029175
...,...,...,...
26275,2019-12-31 19:00:00,0,4157.812988
26276,2019-12-31 20:00:00,0,4008.450439
26277,2019-12-31 21:00:00,0,3757.650391
26278,2019-12-31 22:00:00,0,3556.840576


In [59]:
# Plot the full MW series against its timestamps.
analysis_plot(x=df['DATE'], y=df['MW'])

In [60]:
# Column being predicted and the single input column.
# NOTE(review): 'day' appears to hold only a few small integer codes, so
# many training rows presumably tie at distance 0 from each query and the
# neighbours picked would then depend on tie order — verify.
label = 'MW'
features = ['day']

# Chronological hold-out: rows dated before 2019 train the model,
# rows from 2019 onwards are kept for testing.
train, test = (
    df.loc[df['DATE'].dt.year < 2019],
    df.loc[df['DATE'].dt.year >= 2019],
)

X_train, X_test = train[features], test[features]
y_train, y_test = train[label], test[label]

In [61]:
# Training error per 1/K: the model is scored on the rows it was fitted on.
trainErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_train, testDataY=y_train,
)

# Testing error per 1/K: the model is scored on the held-out rows.
testErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_test, testDataY=y_test,
)

In [62]:
# TODO: plotting train/test error vs 1/K is disabled here; the
# plotTrainTestError helper is not defined in the visible notebook.

# Rank the (1/K, MSE) pairs by MSE; the first entry is the best K.
# The key is 1/K, so it is inverted and rounded to recover the integer K.
# NOTE(review): K is selected on the same held-out rows that the metrics
# below are reported on, so those metrics are optimistic.
sortedTestErrorDict = sorted(testErrorDict.items(), key=lambda entry: entry[1])
KNNMSE = sortedTestErrorDict[0][1]
bestK = int(round(1 / sortedTestErrorDict[0][0]))
print('Best K:', bestK, 'with MSE:', KNNMSE)


Best K: 16 with MSE: 2719734.55


In [63]:
# Refit one model with the selected K and predict the held-out rows.
KNN = KNeighborsRegressor(n_neighbors=bestK).fit(X_train, y_train)
predictedY = KNN.predict(X_test)

# Mean squared error on the held-out rows, rounded to three decimals.
MSE = round(np.mean((predictedY - y_test) ** 2), 3)

In [64]:
# Display the predictions made for the held-out rows.
predictedY

array([2355.29010013, 2355.29010013, 2355.29010013, ..., 2896.26857758,
       2896.26857758, 2896.26857758])

In [65]:
 # R^2 of the fitted model on the held-out rows, rounded to two decimals.
 # NOTE(review): this cell carries a stray one-space indent; dedent it if the notebook is exported to a script.
 score = round(KNN.score(X_test, y_test), 2)
 score

-0.77

In [66]:
# Error metrics of the selected model on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predictedY)
mse = metrics.mean_squared_error(y_test, predictedY)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

MAE: 1339.6553008434644
MSE: 2719734.549931535
RMSE: 1649.1617719106682


In [67]:
# Coefficient of determination (R^2) of the held-out predictions, unrounded.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

-0.7724653283560625

In [68]:
# Overlay the observed MW (y1) and the KNN predictions (y2, drawn in red)
# for the held-out rows.
visualisation_plot(x=test['DATE'], y1=test['MW'], y2=predictedY)

# Dataset - 6

In [69]:
# Load dataset 6 (daily load with a 'day' column) and rename the 'date'
# column to 'DATE', the name the cells below index by.
df = pd.read_excel('/content/6-day-load-holiday-data.xlsx').rename(columns={'date': 'DATE'})
df

Unnamed: 0,DATE,day,MW
0,2017-01-01,2,3536.238770
1,2017-01-02,0,3639.738770
2,2017-01-03,0,3673.321289
3,2017-01-04,0,3898.860840
4,2017-01-05,2,3547.965820
...,...,...,...
1090,2019-12-27,0,4976.180664
1091,2019-12-28,1,4708.879395
1092,2019-12-29,1,4831.750488
1093,2019-12-30,0,5298.331055


In [70]:
# Plot the full MW series against its dates.
analysis_plot(x=df['DATE'], y=df['MW'])

In [71]:
# Column being predicted and the single input column.
# NOTE(review): 'day' appears to hold only a few small integer codes, so
# many training rows presumably tie at distance 0 from each query and the
# neighbours picked would then depend on tie order — verify.
label = 'MW'
features = ['day']

# Chronological hold-out: rows dated before 2019 train the model,
# rows from 2019 onwards are kept for testing.
train, test = (
    df.loc[df['DATE'].dt.year < 2019],
    df.loc[df['DATE'].dt.year >= 2019],
)

X_train, X_test = train[features], test[features]
y_train, y_test = train[label], test[label]

In [72]:
# Training error per 1/K: the model is scored on the rows it was fitted on.
trainErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_train, testDataY=y_train,
)

# Testing error per 1/K: the model is scored on the held-out rows.
testErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_test, testDataY=y_test,
)

In [73]:
# TODO: plotting train/test error vs 1/K is disabled here; the
# plotTrainTestError helper is not defined in the visible notebook.

# Rank the (1/K, MSE) pairs by MSE; the first entry is the best K.
# The key is 1/K, so it is inverted and rounded to recover the integer K.
# NOTE(review): K is selected on the same held-out rows that the metrics
# below are reported on, so those metrics are optimistic.
sortedTestErrorDict = sorted(testErrorDict.items(), key=lambda entry: entry[1])
KNNMSE = sortedTestErrorDict[0][1]
bestK = int(round(1 / sortedTestErrorDict[0][0]))
print('Best K:', bestK, 'with MSE:', KNNMSE)


Best K: 160 with MSE: 1181123.181


In [74]:
# Refit one model with the selected K and predict the held-out rows.
KNN = KNeighborsRegressor(n_neighbors=bestK).fit(X_train, y_train)
predictedY = KNN.predict(X_test)

# Mean squared error on the held-out rows, rounded to three decimals.
MSE = round(np.mean((predictedY - y_test) ** 2), 3)

In [75]:
# Display the predictions made for the held-out rows.
predictedY

array([4360.64295807, 4758.27862854, 4758.27862854, 4758.27862854,
       4523.41375275, 4523.41375275, 4758.27862854, 4758.27862854,
       4758.27862854, 4758.27862854, 4758.27862854, 4523.41375275,
       4360.64295807, 4360.64295807, 4360.64295807, 4758.27862854,
       4758.27862854, 4758.27862854, 4523.41375275, 4523.41375275,
       4758.27862854, 4758.27862854, 4758.27862854, 4758.27862854,
       4758.27862854, 4360.64295807, 4523.41375275, 4758.27862854,
       4758.27862854, 4758.27862854, 4758.27862854, 4758.27862854,
       4523.41375275, 4523.41375275, 4758.27862854, 4758.27862854,
       4758.27862854, 4758.27862854, 4758.27862854, 4523.41375275,
       4360.64295807, 4758.27862854, 4758.27862854, 4758.27862854,
       4758.27862854, 4758.27862854, 4523.41375275, 4523.41375275,
       4758.27862854, 4360.64295807, 4758.27862854, 4758.27862854,
       4758.27862854, 4523.41375275, 4523.41375275, 4758.27862854,
       4758.27862854, 4758.27862854, 4758.27862854, 4360.64295

In [76]:
 # R^2 of the fitted model on the held-out rows, rounded to two decimals.
 # NOTE(review): this cell carries a stray one-space indent; dedent it if the notebook is exported to a script.
 score = round(KNN.score(X_test, y_test), 2)
 score

0.01

In [77]:
# Error metrics of the selected model on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predictedY)
mse = metrics.mean_squared_error(y_test, predictedY)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

MAE: 936.5564018270422
MSE: 1181123.1808523764
RMSE: 1086.7949120475198


In [78]:
# Coefficient of determination (R^2) of the held-out predictions, unrounded.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

0.008791533583083933

In [79]:
# Overlay the observed MW (y1) and the KNN predictions (y2, drawn in red)
# for the held-out rows.
visualisation_plot(x=test['DATE'], y1=test['MW'], y2=predictedY)

# Dataset - 7

In [80]:
# Load dataset 7 (hourly load with weather columns and a 'day' column) from an absolute path.
# NOTE(review): '/content/...' looks like a Colab session path — confirm before running elsewhere.
df = pd.read_excel('/content/7-hour-load-weather-holiday-data.xlsx')
# Display the frame as the cell output.
df

Unnamed: 0,DATE,max-temp,min-temp,RH-0830,RH-1730,day,MW
0,2017-01-01 00:00:00,20.3,9.2,100,80,2,1815.571045
1,2017-01-01 01:00:00,20.3,9.2,100,80,2,1576.699585
2,2017-01-01 02:00:00,20.3,9.2,100,80,2,1428.967896
3,2017-01-01 03:00:00,20.3,9.2,100,80,2,1356.272705
4,2017-01-01 04:00:00,20.3,9.2,100,80,2,1354.029175
...,...,...,...,...,...,...,...
26275,2019-12-31 19:00:00,9.4,4.8,91,69,0,4157.812988
26276,2019-12-31 20:00:00,9.4,4.8,91,69,0,4008.450439
26277,2019-12-31 21:00:00,9.4,4.8,91,69,0,3757.650391
26278,2019-12-31 22:00:00,9.4,4.8,91,69,0,3556.840576


In [81]:
# Plot the full MW series against its timestamps.
analysis_plot(x=df['DATE'], y=df['MW'])

In [82]:
# Column being predicted and the input columns (weather readings plus 'day').
label = 'MW'
features = ['max-temp', 'min-temp', 'RH-0830', 'RH-1730', 'day']

# Chronological hold-out: rows dated before 2019 train the model,
# rows from 2019 onwards are kept for testing.
train, test = (
    df.loc[df['DATE'].dt.year < 2019],
    df.loc[df['DATE'].dt.year >= 2019],
)

X_train, X_test = train[features], test[features]
y_train, y_test = train[label], test[label]

In [83]:
# Training error per 1/K: the model is scored on the rows it was fitted on.
trainErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_train, testDataY=y_train,
)

# Testing error per 1/K: the model is scored on the held-out rows.
testErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_test, testDataY=y_test,
)

In [84]:
# TODO: plotting train/test error vs 1/K is disabled here; the
# plotTrainTestError helper is not defined in the visible notebook.

# Rank the (1/K, MSE) pairs by MSE; the first entry is the best K.
# The key is 1/K, so it is inverted and rounded to recover the integer K.
# NOTE(review): K is selected on the same held-out rows that the metrics
# below are reported on, so those metrics are optimistic.
sortedTestErrorDict = sorted(testErrorDict.items(), key=lambda entry: entry[1])
KNNMSE = sortedTestErrorDict[0][1]
bestK = int(round(1 / sortedTestErrorDict[0][0]))
print('Best K:', bestK, 'with MSE:', KNNMSE)


Best K: 128 with MSE: 540757.935


In [85]:
# Refit one model with the selected K and predict the held-out rows.
KNN = KNeighborsRegressor(n_neighbors=bestK).fit(X_train, y_train)
predictedY = KNN.predict(X_test)

# Mean squared error on the held-out rows, rounded to three decimals.
MSE = round(np.mean((predictedY - y_test) ** 2), 3)

In [86]:
# Display the predictions made for the held-out rows.
predictedY

array([2798.27512741, 2798.27512741, 2798.27512741, ..., 2743.30033875,
       2743.30033875, 2743.30033875])

In [87]:
 # R^2 of the fitted model on the held-out rows, rounded to two decimals.
 # NOTE(review): this cell carries a stray one-space indent; dedent it if the notebook is exported to a script.
 score = round(KNN.score(X_test, y_test), 2)
 score

0.65

In [88]:
# Error metrics of the selected model on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predictedY)
mse = metrics.mean_squared_error(y_test, predictedY)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

MAE: 609.5098666521677
MSE: 540757.9348371692
RMSE: 735.362451337549


In [89]:
# Coefficient of determination (R^2) of the held-out predictions, unrounded.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

0.6475852062266748

In [90]:
# Overlay the observed MW (y1) and the KNN predictions (y2, drawn in red)
# for the held-out rows.
visualisation_plot(x=test['DATE'], y1=test['MW'], y2=predictedY)

# Dataset - 8

In [91]:
# Load dataset 8 (daily load with weather columns and a 'day' column) and
# rename the 'date' column to 'DATE', the name the cells below index by.
df = pd.read_excel('/content/8-day-load-weather-holiday-data.xlsx').rename(columns={'date': 'DATE'})
df

Unnamed: 0,DATE,max-temp,min-temp,RH-0830,RH-1730,MW,day
0,2017-01-01,20.3,9.2,100,80,3536.238770,2
1,2017-01-02,23.2,9.3,100,82,3639.738770,0
2,2017-01-03,24.3,9.5,100,77,3673.321289,0
3,2017-01-04,24.0,8.9,97,66,3898.860840,0
4,2017-01-05,25.2,10.4,97,71,3547.965820,2
...,...,...,...,...,...,...,...
1090,2019-12-27,13.4,4.2,86,76,4976.180664,0
1091,2019-12-28,14.4,2.4,100,83,4708.879395,1
1092,2019-12-29,13.3,3.1,94,79,4831.750488,1
1093,2019-12-30,15.8,2.6,100,97,5298.331055,0


In [92]:
# Plot the full MW series against its dates.
analysis_plot(x=df['DATE'], y=df['MW'])

In [93]:
# Column being predicted and the input columns (weather readings plus 'day').
label = 'MW'
features = ['max-temp', 'min-temp', 'RH-0830', 'RH-1730', 'day']

# Chronological hold-out: rows dated before 2019 train the model,
# rows from 2019 onwards are kept for testing.
train, test = (
    df.loc[df['DATE'].dt.year < 2019],
    df.loc[df['DATE'].dt.year >= 2019],
)

X_train, X_test = train[features], test[features]
y_train, y_test = train[label], test[label]

In [94]:
# Training error per 1/K: the model is scored on the rows it was fitted on.
trainErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_train, testDataY=y_train,
)

# Testing error per 1/K: the model is scored on the held-out rows.
testErrorDict = KNNRegression(
    trainDataX=X_train, trainDataY=y_train,
    testDataX=X_test, testDataY=y_test,
)

In [95]:
# TODO: plotting train/test error vs 1/K is disabled here; the
# plotTrainTestError helper is not defined in the visible notebook.

# Rank the (1/K, MSE) pairs by MSE; the first entry is the best K.
# The key is 1/K, so it is inverted and rounded to recover the integer K.
# NOTE(review): K is selected on the same held-out rows that the metrics
# below are reported on, so those metrics are optimistic.
sortedTestErrorDict = sorted(testErrorDict.items(), key=lambda entry: entry[1])
KNNMSE = sortedTestErrorDict[0][1]
bestK = int(round(1 / sortedTestErrorDict[0][0]))
print('Best K:', bestK, 'with MSE:', KNNMSE)


Best K: 8 with MSE: 283234.16


In [96]:
# Refit one model with the selected K and predict the held-out rows.
KNN = KNeighborsRegressor(n_neighbors=bestK).fit(X_train, y_train)
predictedY = KNN.predict(X_test)

# Mean squared error on the held-out rows, rounded to three decimals.
MSE = round(np.mean((predictedY - y_test) ** 2), 3)

In [97]:
# Display the predictions made for the held-out rows.
predictedY

array([4105.03527832, 3829.277771  , 3900.94692993, 3974.92681885,
       3846.38397217, 3913.8163147 , 4031.00628662, 3982.99414062,
       3880.69198608, 3930.56665039, 3840.97619629, 3900.94692993,
       3746.6463623 , 3530.13928223, 3949.90078735, 3901.50418091,
       3896.93023682, 3898.73114014, 4034.89956665, 3917.61343384,
       3704.08868408, 3770.16845703, 3811.44232178, 3782.64846802,
       3778.73031616, 4105.24758911, 3882.78756714, 3949.03259277,
       3972.62072754, 3934.06008911, 3816.58786011, 3757.61297607,
       3833.73828125, 3947.08288574, 3894.79998779, 3611.52810669,
       3672.02941895, 3912.55651855, 4122.03918457, 3682.85821533,
       3949.90078735, 3869.19857788, 3695.8026123 , 3867.73675537,
       3692.20910645, 3563.19854736, 3804.99508667, 3848.97250366,
       3837.9180603 , 3613.38671875, 3808.19082642, 3806.04837036,
       3571.15097046, 3664.27911377, 3732.41870117, 3508.06607056,
       3820.43341064, 3842.63259888, 3943.99111938, 3840.77612

In [98]:
 # R^2 of the fitted model on the held-out rows, rounded to two decimals.
 # NOTE(review): this cell carries a stray one-space indent; dedent it if the notebook is exported to a script.
 score = round(KNN.score(X_test, y_test), 2)
 score

0.76

In [99]:
# Error metrics of the selected model on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predictedY)
mse = metrics.mean_squared_error(y_test, predictedY)

print('MAE:', mae)
print('MSE:', mse)
# RMSE is the square root of the MSE computed above.
print('RMSE:', np.sqrt(mse))

MAE: 410.7808522681668
MSE: 283234.1601328927
RMSE: 532.1974822684647


In [100]:
# Coefficient of determination (R^2) of the held-out predictions, unrounded.
from sklearn.metrics import r2_score 
r2_score(y_test, predictedY)

0.7623075204572614

In [101]:
# Overlay the observed MW (y1) and the KNN predictions (y2, drawn in red)
# for the held-out rows.
visualisation_plot(x=test['DATE'], y1=test['MW'], y2=predictedY)