<a href="https://colab.research.google.com/github/ilija-ra/ElectricityConsumptionForecast.BE/blob/main/ElectricityConsumptionForecast_LinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn import datasets

In [None]:
# Load the preprocessed dataset. The path is relative, so the CSV must be present in the
# kernel's working directory before this cell is run.
nyYears_merged = pd.read_csv('NYSProcessedData.csv')

In [None]:
# Target is the 'Load' column; every remaining column is used as a feature.
y = nyYears_merged['Load']
x = nyYears_merged.drop(columns='Load')

# Quick sanity check of the feature matrix: dimensions, column names, first rows.
print(x.shape, x.columns, sep='\n')
x.head()

(32117, 19)
Index(['temp', 'feelslike', 'dew', 'humidity', 'windspeed', 'winddir',
       'sealevelpressure', 'cloudcover', 'visibility', 'uvindex', 'conditions',
       'Time Zone', 'year', 'month', 'day', 'hour', 'day_of_week',
       'Load_avg_prev_day', 'temp_avg_prev_day'],
      dtype='object')


Unnamed: 0,temp,feelslike,dew,humidity,windspeed,winddir,sealevelpressure,cloudcover,visibility,uvindex,conditions,Time Zone,year,month,day,hour,day_of_week,Load_avg_prev_day,temp_avg_prev_day
0,45.9,40.0,44.0,93.11,12.8,51.0,1016.4,99.0,2.3,0,2,1,2020,3,29,0,7,4455.92,45.52
1,45.9,39.9,44.0,93.11,12.9,53.0,1016.8,100.0,2.7,0,5,1,2020,3,29,1,7,4455.92,45.52
2,45.2,39.7,44.0,95.65,10.9,59.0,1016.5,100.0,4.0,0,2,1,2020,3,29,2,7,4455.92,45.52
3,45.2,38.3,43.2,92.69,15.5,59.0,1016.0,100.0,2.3,0,5,1,2020,3,29,3,7,4455.92,45.52
4,45.0,38.3,43.0,92.84,14.8,48.0,1015.8,99.0,3.2,0,2,1,2020,3,29,4,7,4455.92,45.52


In [None]:
def model_evaluation(y, y_predicted, n_rows=20):
    """Print the MAPE of a prediction and a side-by-side preview of the values.

    Parameters
    ----------
    y : array-like
        True target values. May be a pandas Series, a NumPy array or a list;
        only the values are used, any index is ignored.
    y_predicted : array-like
        Predicted values, aligned by position with ``y``.
    n_rows : int, default=20
        Number of leading rows of the comparison table to print.

    Returns
    -------
    None
        The metric and the preview table are printed, nothing is returned.
    """
    # Coerce both inputs to flat arrays so the function does not depend on
    # ``y`` exposing a pandas-only ``.values`` attribute.
    y_true = np.asarray(y).ravel()
    y_pred = np.asarray(y_predicted).ravel()

    mape = mean_absolute_percentage_error(y_true, y_pred)
    # The metric is returned as a fraction; scale by 100 to print a percentage.
    print('Mean absolute percentage error: ' + str((mape * 100)) + '%')

    # Positional pairing of true and predicted values for a quick eyeball check.
    res = pd.DataFrame({'y': y_true, 'y_pred': y_pred})
    print(res.head(n_rows))

In [None]:
# Hold out 15% of the rows for testing; random_state pins the split so reruns are comparable.
# NOTE(review): train_test_split shuffles rows by default, and the features include calendar
# columns (year/month/day/hour) plus previous-day averages, so test rows end up interleaved
# in time with training rows. Confirm this is intended rather than a chronological hold-out.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=42)

In [None]:
# Baseline: ordinary least squares on the raw (unscaled) features.
# fit() returns the estimator itself, so construction and training are chained.
first_regression_model = LinearRegression(fit_intercept=True).fit(x_train, y_train)

# Score the baseline on the held-out rows.
y_predicted = first_regression_model.predict(x_test)
model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 6.207134282082223%
          y       y_pred
0   5844.90  5536.979930
1   6087.59  5872.221991
2   5707.20  5362.189059
3   7307.50  6748.484681
4   5208.20  5591.853113
5   4404.80  5240.661537
6   6209.90  5695.644802
7   6297.97  6079.157122
8   6306.80  6395.076146
9   4850.60  4611.000915
10  6190.70  5562.052647
11  4563.10  5052.812187
12  4593.20  3778.378784
13  6151.40  6145.144573
14  6589.00  6453.908713
15  5480.60  5470.403764
16  4071.50  4087.877310
17  5284.70  5013.847763
18  5419.10  5371.655713
19  7924.50  7550.967371


In [None]:
# Standardise the features. The scaler is fitted on the training rows only and the same
# fitted statistics are then applied to the test rows.
scaler = StandardScaler()

# transform() returns a bare array, so wrap it back into a DataFrame carrying the
# original feature names (the row index is the default 0..n-1 range).
x_train_std = pd.DataFrame(scaler.fit_transform(x_train), columns=list(x.columns))
x_test_std = pd.DataFrame(scaler.transform(x_test), columns=list(x.columns))

x_train_std.head()

Unnamed: 0,temp,feelslike,dew,humidity,windspeed,winddir,sealevelpressure,cloudcover,visibility,uvindex,conditions,Time Zone,year,month,day,hour,day_of_week,Load_avg_prev_day,temp_avg_prev_day
0,1.574127,1.501398,1.124156,-0.554213,0.338775,0.604734,-1.338274,-0.199829,0.41511,2.119309,-0.226099,-0.719636,-0.339058,0.244569,-0.424751,0.359671,0.501715,2.042323,1.28666
1,-0.315178,-0.164319,0.269397,1.383914,-1.336435,0.726286,-1.036838,-0.806141,-1.398925,-0.507758,-0.226099,-0.719636,-1.275988,-0.646066,1.39107,-0.79421,1.001691,-1.023148,0.022004
2,0.713819,0.701665,0.182903,-0.967097,-0.344535,-1.235918,0.671294,-0.47267,0.41511,2.119309,-0.226099,-0.719636,-0.339058,-0.052309,-0.424751,-0.217269,-0.498236,-0.030166,0.593571
3,0.876884,0.838897,0.71204,-0.220196,-1.336435,-1.50507,0.683854,-0.560861,0.41511,-0.507758,-0.226099,-0.719636,1.534803,0.541448,0.029205,-1.371151,-1.498187,1.038839,1.068035
4,0.556377,0.569165,-0.081666,-1.176889,0.184479,-0.280863,0.30706,-0.886064,0.41511,-0.507758,-0.872004,-0.719636,-1.275988,-0.052309,-0.538239,1.080847,-1.498187,-0.158018,0.423787


In [None]:
# Same linear model as the baseline, this time trained on the standardised features.
regression_model_std = LinearRegression().fit(x_train_std, y_train)

# Evaluate on the standardised test features.
y_predicted = regression_model_std.predict(x_test_std)
model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 6.207134282082214%
          y       y_pred
0   5844.90  5536.979930
1   6087.59  5872.221991
2   5707.20  5362.189059
3   7307.50  6748.484681
4   5208.20  5591.853113
5   4404.80  5240.661537
6   6209.90  5695.644802
7   6297.97  6079.157122
8   6306.80  6395.076146
9   4850.60  4611.000915
10  6190.70  5562.052647
11  4563.10  5052.812187
12  4593.20  3778.378784
13  6151.40  6145.144573
14  6589.00  6453.908713
15  5480.60  5470.403764
16  4071.50  4087.877310
17  5284.70  5013.847763
18  5419.10  5371.655713
19  7924.50  7550.967371


In [None]:
# Expand the standardised features with pairwise interaction terms only
# (no squared terms, no constant column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
x_inter_train = poly.fit_transform(x_train_std)
x_inter_test = poly.transform(x_test_std)

# Linear regression on the interaction-expanded feature set.
regression_model_inter = LinearRegression().fit(x_inter_train, y_train)

y_predicted = regression_model_inter.predict(x_inter_test)
model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 4.9746330368222464%
          y       y_pred
0   5844.90  5833.783272
1   6087.59  5969.780215
2   5707.20  5375.899441
3   7307.50  6941.288696
4   5208.20  5498.627407
5   4404.80  4025.446290
6   6209.90  5448.592896
7   6297.97  6179.400873
8   6306.80  5877.141669
9   4850.60  4366.595285
10  6190.70  5468.837623
11  4563.10  4847.736783
12  4593.20  3871.855695
13  6151.40  5915.284579
14  6589.00  6430.492918
15  5480.60  5412.685848
16  4071.50  3884.748661
17  5284.70  5207.169592
18  5419.10  5341.184896
19  7924.50  7485.787115


In [None]:
# Full second-degree expansion: interactions and squared terms, without a constant column.
# Note that this rebinds poly / x_inter_train / x_inter_test from the previous cell.
poly = PolynomialFeatures(degree=2, include_bias=False)
x_inter_train = poly.fit_transform(x_train_std)
x_inter_test = poly.transform(x_test_std)

# Linear regression on the degree-2 feature set.
regression_model_degree = LinearRegression().fit(x_inter_train, y_train)

y_predicted = regression_model_degree.predict(x_inter_test)
model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 4.6082876618033755%
          y       y_pred
0   5844.90  5944.658422
1   6087.59  5919.228268
2   5707.20  5533.674331
3   7307.50  7004.033367
4   5208.20  5623.083921
5   4404.80  4085.277167
6   6209.90  5658.804757
7   6297.97  6211.001872
8   6306.80  6053.699953
9   4850.60  4001.536493
10  6190.70  5648.270355
11  4563.10  4626.708048
12  4593.20  3522.338458
13  6151.40  5887.288548
14  6589.00  6459.708665
15  5480.60  5530.317826
16  4071.50  3631.274365
17  5284.70  4875.729547
18  5419.10  5260.152552
19  7924.50  7615.741311


In [None]:
# Third-degree polynomial expansion of the standardised features (no constant column).
# The x_inter_train / x_inter_test arrays produced here are the ones the Ridge and Lasso
# cells further down train and evaluate on.
poly = PolynomialFeatures(degree=3, include_bias=False)
x_inter_train = poly.fit_transform(x_train_std)
x_inter_test = poly.transform(x_test_std)

# Linear regression on the degree-3 feature set; rebinds regression_model_degree.
regression_model_degree = LinearRegression().fit(x_inter_train, y_train)

y_predicted = regression_model_degree.predict(x_inter_test)
model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 2.763383858325656%
          y       y_pred
0   5844.90  5951.931396
1   6087.59  5969.309326
2   5707.20  5431.196045
3   7307.50  7107.578613
4   5208.20  5453.034424
5   4404.80  4326.784424
6   6209.90  5819.193848
7   6297.97  6289.041992
8   6306.80  6101.863525
9   4850.60  4418.528076
10  6190.70  5843.349854
11  4563.10  4667.750732
12  4593.20  4214.371582
13  6151.40  6137.025391
14  6589.00  6469.041504
15  5480.60  5735.644043
16  4071.50  3769.065430
17  5284.70  5523.871338
18  5419.10  5388.780518
19  7924.50  7632.834473


In [None]:
# L2-regularised regression on whatever x_inter_train currently holds; when the notebook
# is run top to bottom that is the degree-3 polynomial expansion from the previous cell.
ridge_model = Ridge(alpha=1).fit(x_inter_train, y_train)

# Evaluate on the matching polynomial test features.
y_predicted = ridge_model.predict(x_inter_test)
model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 2.7621086636665253%
          y       y_pred
0   5844.90  5940.691352
1   6087.59  5980.999854
2   5707.20  5440.209067
3   7307.50  7109.113197
4   5208.20  5451.917166
5   4404.80  4331.999369
6   6209.90  5816.884817
7   6297.97  6320.315479
8   6306.80  6097.007805
9   4850.60  4437.639200
10  6190.70  5846.818602
11  4563.10  4669.044534
12  4593.20  4220.103683
13  6151.40  6160.695842
14  6589.00  6464.833416
15  5480.60  5729.946274
16  4071.50  3738.971817
17  5284.70  5524.213286
18  5419.10  5390.896917
19  7924.50  7633.756247


In [None]:

# L1-regularised regression on whatever x_inter_train currently holds; when the notebook
# is run top to bottom that is the degree-3 polynomial expansion built above.
# The recorded output of this cell ends with a fragment of scikit-learn's coordinate-descent
# warning, which suggests the solver stopped at the default iteration cap (max_iter=1000)
# before converging. A larger budget lets the optimisation finish; the reported error may
# shift slightly as a result.
lasso_model = Lasso(alpha=0.1, max_iter=10000)

lasso_model.fit(x_inter_train, y_train)

y_predicted = lasso_model.predict(x_inter_test)

model_evaluation(y_test, y_predicted)

Mean absolute percentage error: 2.767419188349065%
          y       y_pred
0   5844.90  5966.379823
1   6087.59  6010.630036
2   5707.20  5453.698540
3   7307.50  7123.133551
4   5208.20  5439.293883
5   4404.80  4307.923355
6   6209.90  5751.858039
7   6297.97  6407.553860
8   6306.80  6079.628855
9   4850.60  4426.616863
10  6190.70  5831.221722
11  4563.10  4679.288881
12  4593.20  4204.246717
13  6151.40  6198.954704
14  6589.00  6493.366402
15  5480.60  5730.114553
16  4071.50  3639.056082
17  5284.70  5554.912688
18  5419.10  5464.771933
19  7924.50  7627.578654


  model = cd_fast.enet_coordinate_descent(


In [None]:
# Disabled: persist the trained Ridge model under a fixed file name.
# NOTE(review): ridge_model was fitted on standardised, polynomial-expanded features, so
# the fitted `scaler` and `poly` objects are needed alongside it to score new raw data.
# import pickle

# with open("primary_linear_regression_ridge.pkl", 'wb') as file:
#     pickle.dump(ridge_model, file)

In [None]:
# Disabled: persist the trained Ridge model under a unique name built from the first
# 10 characters of a random UUID, so earlier dumps are not overwritten.
# import uuid
# import pickle
# file_name = f"linear_regression_ridge_{str(uuid.uuid4())[:10]}.pkl"
# with open(file_name, 'wb') as file:
#     pickle.dump(ridge_model, file)

In [None]:
# Disabled: reload the persisted Ridge model and predict on the polynomial test features.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only load files
# that come from a trusted source.
# import pickle

# with open("primary_linear_regression_ridge.pkl", 'rb') as file:
#     model = pickle.load(file)

# # evaluate model
# y_predicted = model.predict(x_inter_test)