# Electricity Usage Analysis

In [1]:
for name in dir():
    if not name.startswith("_"):
        del globals()[name]

In [2]:
import sys
!{sys.executable} -m pip install hvplot



In [27]:
# Libraries
import pandas as pd
import numpy as np
import os
from datetime import datetime
import matplotlib.pyplot as plt
import hvplot.pandas

from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [5]:
df = pd.read_csv('df3.csv', header='infer')
df.set_index('date', inplace=True)
df['Hour'] = df['Hour'].astype(str)

In [6]:
print(df.shape)
df.head()

(16459, 29)


Unnamed: 0_level_0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,v1,Weekday,Hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2016-01-11 17:00:00,60.0,30.0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,Mon,17
2016-01-11 17:10:00,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,21.79,48.863333,18.89,45.56,6.483333,733.6,92.0,6.666667,40.0,5.2,18.606195,Mon,17
2016-01-11 17:20:00,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,755.0,92.0,6.333333,55.333333,5.1,28.642668,Mon,17
2016-01-11 17:30:00,50.0,40.0,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,84.333333,6.0,51.5,5.0,45.41039,Mon,17
2016-01-11 17:40:00,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,34.927778,18.1,48.59,17.0,45.4,6.133333,733.9,84.333333,5.666667,47.666667,4.9,10.084097,Mon,17


In [7]:
df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,v1
count,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0,16459.0
mean,96.233064,3.955283,21.411526,40.169764,20.03625,40.440233,21.93522,39.375334,20.468408,39.095497,19.286999,51.174386,7.166662,59.619036,19.787542,35.457365,21.69246,43.134914,19.029985,41.566813,6.848526,754.79426,80.719657,4.120142,38.61759,3.335968,24.934725
std,102.028297,8.060528,1.389577,3.513932,1.875805,3.643092,1.668433,3.038812,1.807956,3.971781,1.54812,8.694324,5.296161,28.978911,1.760854,4.457492,1.717435,4.855274,1.674996,3.856741,4.901699,6.937394,14.110826,2.358126,10.59493,3.680742,14.517159
min,10.0,0.0,16.79,27.023333,16.1,20.596667,17.2,30.663333,15.1,27.66,15.33,29.815,-6.03,1.0,15.39,23.2,16.362222,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322
25%,50.0,0.0,20.79,37.9,18.89,38.4,20.823333,37.29,19.39,36.29,18.2,46.09,3.933333,40.725,18.7,32.5,20.79,39.568036,18.0,38.9765,3.663889,751.1,73.0,2.333333,32.5,1.166667,12.425313
50%,60.0,0.0,21.39,39.626667,19.76,40.59,21.89,38.73,20.39,38.5,19.26,49.4,6.656667,62.261667,19.6,34.927778,21.79,42.6,18.89,41.0,6.4,755.0,84.333333,3.833333,40.0,3.2,24.774977
75%,100.0,0.0,22.1,42.433333,20.79,42.826667,22.79,41.29,21.39,41.465,20.2,53.334,9.726667,84.945,20.926667,37.905409,22.73,46.06,20.251429,43.76,9.6,759.266667,91.166667,5.333333,40.0,5.486111,37.583769
max,1080.0,70.0,25.7,63.36,29.856667,56.026667,27.6,50.163333,26.2,51.09,25.745,96.321667,28.236,99.9,25.39,51.4,27.23,58.78,24.2,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653


In [203]:
# some testing ...
if 0:
    x = df3['Appliances'] - df3['Appliances'].min() + 1
    print(x[0:10].tolist())

    xbox, opt_lambda = boxcox(x)
    print(xbox[0:10].tolist())
    print(opt_lambda)

    xinvbox = inv_boxcox(xbox, opt_lambda)
    print(xinvbox[0:10].tolist())

[51.0, 51.0, 41.0, 41.0, 51.0, 41.0, 51.0, 51.0, 51.0, 61.0]
[2.646922942509263, 2.646922942509263, 2.5515385251450624, 2.5515385251450624, 2.646922942509263, 2.5515385251450624, 2.646922942509263, 2.646922942509263, 2.646922942509263, 2.7218760785681684]
-0.21655783519823987
[51.00000000000002, 51.00000000000002, 41.00000000000001, 41.00000000000001, 51.00000000000002, 41.00000000000001, 51.00000000000002, 51.00000000000002, 51.00000000000002, 60.99999999999995]


In [8]:
# df
def get_filtered_df(df, excluded_cols):
    features = [col for col in df.columns if col not in excluded_cols]
    X = df[features].copy()
    y = df['Appliances']
    return (X, y)

In [9]:
def log_transform(v, subtract_min=1, add_one=0):
    vt = v
    if (subtract_min):
        vt = v - v.min() + 1
    elif (add_one):
        vt = v + 1
    
    return np.log(vt)

In [10]:
# log transformation of selected features in a dataaframe
def log_transform_features(X, features, subtract_min=1, add_one=0):
    Xt = X.copy()
    for col in features:
        vt = log_transform(X[col], subtract_min, add_one)
        Xt[col] = vt

    return (Xt)

In [11]:
def boxcox_transform(v, subtract_min=1):
    v1 = v
    vmin = v.min()
    if (subtract_min):
        v1 = v - vmin + 1
    vt, opt_lambda = boxcox(v1)
    return (vt, vmin, opt_lambda)

In [12]:
# Box-Cox transformation of selected features in a dataaframe
def boxcox_transform_features(X, features):
    Xt = X.copy()
    opt_lambda_dict = {}
    for col in features:
        vt, vmin, opt_lambda = boxcox_transform(X[col])
        Xt[col] = vt
        opt_lambda_dict[col] = opt_lambda

    return (Xt, opt_lambda_dict)

In [15]:
def one_hot_encoding(X, cat_col_list):
    df_cat = X[cat_col_list]

    encoding_cat = OneHotEncoder()
    encoding_cat.fit(df_cat)
    df_cat_enc = pd.DataFrame(
        encoding_cat.transform(df_cat).toarray(), columns=encoding_cat.get_feature_names()
    )

    df_cat_enc.set_index(df_cat.index, inplace=True)
    Xt = pd.merge(X, df_cat_enc, how='left', left_index=True, right_index=True)
    Xt = Xt.drop(cat_col_list, axis=1)
    
    return (Xt)

In [13]:
#scoring = make_scorer(r2_score)
scoring = make_scorer(root_mean_squared_error, greater_is_better=False)

In [14]:
#-------------------------------------------------------------------------------
# Execute the cross-validation for all parameter combinations and plots the
# evaluation metrics. At each iteration of the cross validation process, the 
# model is evaluated using all the below metrics but the optimal model is 
# determined using the AUC score (as this minimizes both false positives &
# false negatives).
#-------------------------------------------------------------------------------
def build_optimal_model(model_type, model, model_parameter_grid, X, y, scoring):
    grid_object = GridSearchCV(model, param_grid=model_parameter_grid, cv=5, scoring=scoring, refit=True, return_train_score=True, n_jobs=-1)
    print('Performing grid search to determine the optimal parameters ...')
    grid_object.fit(X, y)
    grid_search_output = pd.DataFrame(grid_object.cv_results_)

    optimal_model = grid_object.best_estimator_
    optimal_model = optimal_model.fit(X, y)

    return grid_object, grid_search_output, optimal_model

In [16]:
# Ridge regression setup
model_type = 'Linear Model'
model = Ridge(copy_X=False)
model_parameter_grid = {
         'normalize': [True, False],
         'alpha': [0, 0.001, 0.01, 0.05, 0.1, 0.5, 1]
        }

In [17]:
%%time

excluded_cols = ['Appliances', 'Weekday', 'Hour']
X, y = get_filtered_df(df, excluded_cols)

# log transform of Appliances variable
Xt = X
yt = log_transform(y, subtract_min=0)

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

Performing grid search to determine the optimal parameters ...
CPU times: user 1.3 s, sys: 273 ms, total: 1.58 s
Wall time: 2.63 s


In [19]:
print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

0.5977335089944273
{'alpha': 0.05, 'normalize': True}


Unnamed: 0,feature,coef
0,lights,20.119781
2,RH_1,19.962973
5,T3,19.053394
15,T8,11.209785
6,RH_3,8.797314
11,T6,5.553496
10,RH_5,3.489372
3,T2,2.781603
19,T_out,1.86676
22,Windspeed,1.752574


In [20]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(np.exp(predicted), np.exp(yt)))

print(r2_score(yt, predicted))
print(r2_score(y, np.exp(predicted)))

0.5812281481041298
98.70622856552946
0.22159871512311147
0.06400348911715137


In [22]:
%%time

# 2. Box-Cox transformation of features

excluded_cols = ['Appliances', 'Weekday', 'Hour']
X, y = get_filtered_df(df, excluded_cols)

Xt, opt_lambda_dict = boxcox_transform_features(X, X.columns.tolist())
yt, _, opt_lambda = boxcox_transform(y, subtract_min=0)
opt_lambda_dict['Appliances'] = opt_lambda # Appliances > 0

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

Performing grid search to determine the optimal parameters ...
CPU times: user 1.5 s, sys: 37.4 ms, total: 1.54 s
Wall time: 477 ms


In [23]:
print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

0.05658156256379262
{'alpha': 0.05, 'normalize': True}


Unnamed: 0,feature,coef
0,lights,2.031315
2,RH_1,1.746945
5,T3,1.510629
15,T8,1.17409
3,T2,0.590084
11,T6,0.526776
6,RH_3,0.51183
19,T_out,0.440486
10,RH_5,0.396229
12,RH_6,0.291279


In [24]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(inv_boxcox(predicted, opt_lambda_dict['Appliances']), y))

print(r2_score(yt, predicted))
print(r2_score(y, inv_boxcox(predicted, opt_lambda_dict['Appliances'])))

0.05536059542875124
101.00397373013176
0.24311669532665703
0.019918856548492236


In [28]:
%%time

# 3. One-hot encoding of Weekday & Hour features

excluded_cols = ['Appliances']
X, y = get_filtered_df(df, excluded_cols)

Xt = one_hot_encoding(X, ['Weekday', 'Hour'])
yt = log_transform(y, subtract_min=0, add_one=0)

#print(Xt.shape)
#Xt.head()

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

Performing grid search to determine the optimal parameters ...
0.5538258696531965
{'alpha': 0.01, 'normalize': True}
CPU times: user 1.69 s, sys: 219 ms, total: 1.91 s
Wall time: 1.83 s


Unnamed: 0,feature,coef
5,T3,22.335213
0,lights,15.368073
43,x1_18,14.745988
15,T8,14.704659
6,RH_3,13.922963
42,x1_17,10.483675
44,x1_19,8.973289
36,x1_11,7.173118
2,RH_1,6.877059
37,x1_12,6.644773


In [29]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(np.exp(predicted), np.exp(yt)))

print(r2_score(yt, predicted))
print(r2_score(np.exp(yt), np.exp(predicted)))

0.5322354982566241
93.84781790138422
0.3472934942439606
0.1538770572000654


In [30]:
%%time

# 4. One-hot encoding of Weekday & Hour features followed by Box-Cox transformation of numerical features

excluded_cols = ['Appliances']
X, y = get_filtered_df(df, excluded_cols)

cat_col_list = ['Weekday', 'Hour']
features_to_transform = [col for col in X.columns if col not in cat_col_list]
Xt1, opt_lambda_dict = boxcox_transform_features(X, features_to_transform)
yt, _, opt_lambda = boxcox_transform(y, subtract_min=0)
opt_lambda_dict['Appliances'] = opt_lambda # Appliances > 0

Xt = one_hot_encoding(Xt1, cat_col_list)

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

Performing grid search to determine the optimal parameters ...




0.0529453126842919
{'alpha': 0.05, 'normalize': True}
CPU times: user 2.3 s, sys: 135 ms, total: 2.43 s
Wall time: 984 ms


Unnamed: 0,feature,coef
5,T3,1.560526
0,lights,1.493836
43,x1_18,1.270487
15,T8,1.23544
42,x1_17,0.898427
44,x1_19,0.896893
6,RH_3,0.725022
46,x1_20,0.656011
37,x1_12,0.595248
36,x1_11,0.554005


In [31]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(inv_boxcox(predicted, opt_lambda_dict['Appliances']), y))

print(r2_score(yt, predicted))
print(r2_score(y, inv_boxcox(predicted, opt_lambda_dict['Appliances'])))

0.05082725317096058
97.18192825821772
0.3619999677739726
0.0926890819724474


In [33]:
%%time

# 5. One-hot encoding of Weekday & Hour features followed by log transformation of numerical features

excluded_cols = ['Appliances']
X, y = get_filtered_df(df, excluded_cols)

cat_col_list = ['Weekday', 'Hour']
features_to_transform = [col for col in X.columns if col not in cat_col_list]
Xt1 = log_transform_features(X, features_to_transform, subtract_min=1, add_one=0)
yt = log_transform(y, subtract_min=0, add_one=0)

Xt = one_hot_encoding(Xt1, cat_col_list)

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

Performing grid search to determine the optimal parameters ...




0.5526373643530169
{'alpha': 0.05, 'normalize': True}
CPU times: user 1.81 s, sys: 225 ms, total: 2.04 s
Wall time: 2.05 s


Unnamed: 0,feature,coef
5,T3,15.578716
0,lights,15.430128
43,x1_18,14.373386
15,T8,12.861226
42,x1_17,9.996765
6,RH_3,9.647591
44,x1_19,8.794239
2,RH_1,6.337391
36,x1_11,6.055101
37,x1_12,5.88711


In [34]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(np.exp(predicted), np.exp(yt)))

print(r2_score(yt, predicted))
print(r2_score(np.exp(yt), np.exp(predicted)))

0.5341515322762614
94.46136266431134
0.3425855822314995
0.14277757139917557


# Experiments on scaled data

In [35]:
df = pd.read_csv('df3_scaled_range.csv', header='infer')
df.set_index('date', inplace=True)
df['Hour'] = df['Hour'].astype(str)

print(df.shape)
df.head()

(16459, 29)


Unnamed: 0_level_0,Appliances,Weekday,Hour,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,v1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2016-01-11 17:00:00,60.0,Mon,17,0.428571,0.347924,0.566187,0.225345,0.682849,0.249038,0.721368,0.351351,0.764262,0.176348,0.381691,0.381039,0.841827,0.181,0.653428,0.169103,0.661412,0.230218,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449
2016-01-11 17:10:00,60.0,Mon,17,0.428571,0.347924,0.541326,0.225345,0.680944,0.249038,0.724444,0.351351,0.782437,0.176348,0.381691,0.375396,0.839872,0.181,0.651064,0.499438,0.660155,0.429646,0.678532,0.369239,0.1,0.894737,0.47619,0.6,0.533937,0.372083
2016-01-11 17:20:00,50.0,Mon,17,0.428571,0.347924,0.530502,0.225345,0.678239,0.249038,0.731795,0.344745,0.778062,0.176348,0.380037,0.36742,0.830704,0.181,0.646572,0.169103,0.655586,0.226638,0.676049,0.365488,0.597674,0.894737,0.452381,0.835897,0.529412,0.572848
2016-01-11 17:30:00,50.0,Mon,17,0.571429,0.347924,0.52408,0.225345,0.677204,0.249038,0.735214,0.341441,0.770949,0.176348,0.380037,0.363723,0.833401,0.174333,0.641489,0.159902,0.650788,0.226638,0.671909,0.361736,0.104651,0.79386,0.428571,0.776923,0.524887,0.908261
2016-01-11 17:40:00,60.0,Mon,17,0.571429,0.347924,0.531419,0.225345,0.67551,0.249038,0.735214,0.341441,0.762697,0.179549,0.380037,0.361777,0.848264,0.181,0.415879,0.159902,0.650788,0.226638,0.671909,0.357985,0.106977,0.79386,0.404762,0.717949,0.520362,0.201611


In [36]:
%%time

# 3. One-hot encoding of Weekday & Hour features

excluded_cols = ['Appliances']
X, y = get_filtered_df(df, excluded_cols)

Xt = one_hot_encoding(X, ['Weekday', 'Hour'])
yt = log_transform(y, subtract_min=0, add_one=0)

#print(Xt.shape)
#Xt.head()

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

Performing grid search to determine the optimal parameters ...




0.5538258696531964
{'alpha': 0.01, 'normalize': True}
CPU times: user 1.74 s, sys: 129 ms, total: 1.87 s
Wall time: 1.37 s


Unnamed: 0,feature,coef
5,T3,22.335213
0,lights,15.368073
43,x1_18,14.745988
15,T8,14.704659
6,RH_3,13.922963
42,x1_17,10.483675
44,x1_19,8.973289
36,x1_11,7.173118
2,RH_1,6.877059
37,x1_12,6.644773


In [37]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(np.exp(predicted), np.exp(yt)))

print(r2_score(yt, predicted))
print(r2_score(np.exp(yt), np.exp(predicted)))

0.532235498256624
93.84781790138419
0.3472934942439607
0.15387705720006617


In [38]:
%%time

# 4. One-hot encoding of Weekday & Hour features followed by Box-Cox transformation of numerical features

excluded_cols = ['Appliances']
X, y = get_filtered_df(df, excluded_cols)

cat_col_list = ['Weekday', 'Hour']
features_to_transform = [col for col in X.columns if col not in cat_col_list]
Xt1, opt_lambda_dict = boxcox_transform_features(X, features_to_transform)
yt, _, opt_lambda = boxcox_transform(y, subtract_min=0)
opt_lambda_dict['Appliances'] = opt_lambda # Appliances > 0

Xt = one_hot_encoding(Xt1, cat_col_list)

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

Performing grid search to determine the optimal parameters ...




0.052946184406459616
{'alpha': 0.05, 'normalize': True}
CPU times: user 2.12 s, sys: 152 ms, total: 2.27 s
Wall time: 1.51 s


Unnamed: 0,feature,coef
5,T3,1.537725
0,lights,1.520836
43,x1_18,1.266457
15,T8,1.21751
44,x1_19,0.900435
42,x1_17,0.896007
6,RH_3,0.710559
46,x1_20,0.663535
37,x1_12,0.592587
36,x1_11,0.551877


In [39]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(inv_boxcox(predicted, opt_lambda_dict['Appliances']), y))

print(r2_score(yt, predicted))
print(r2_score(y, inv_boxcox(predicted, opt_lambda_dict['Appliances'])))

0.050839924613537986
97.1860418943107
0.3616818160737354
0.09261226880356832


In [40]:
%%time

# 5. One-hot encoding of Weekday & Hour features followed by log transformation of numerical features

excluded_cols = ['Appliances']
X, y = get_filtered_df(df, excluded_cols)

cat_col_list = ['Weekday', 'Hour']
features_to_transform = [col for col in X.columns if col not in cat_col_list]
Xt1 = log_transform_features(X, features_to_transform, subtract_min=1, add_one=0)
yt = log_transform(y, subtract_min=0, add_one=0)

Xt = one_hot_encoding(Xt1, cat_col_list)

grid_object, grid_search_output, optimal_model = build_optimal_model(model_type, model, model_parameter_grid, Xt, yt, scoring)

print(-grid_object.best_score_)
print(grid_object.best_params_)
pd.DataFrame.from_dict({'feature': Xt.columns.tolist(), 'coef': optimal_model.coef_}).sort_values('coef', ascending=False)

Performing grid search to determine the optimal parameters ...




0.552474106580279
{'alpha': 1, 'normalize': False}
CPU times: user 1.79 s, sys: 246 ms, total: 2.04 s
Wall time: 2.79 s


Unnamed: 0,feature,coef
5,T3,1.583365
0,lights,1.212695
15,T8,1.091359
6,RH_3,1.019509
2,RH_1,0.72816
43,x1_18,0.584864
42,x1_17,0.41692
44,x1_19,0.358441
10,RH_5,0.326735
1,T1,0.309107


In [41]:
# model evaluation
predicted = optimal_model.predict(Xt)

print(root_mean_squared_error(predicted, yt))
print(root_mean_squared_error(np.exp(predicted), np.exp(yt)))

print(r2_score(yt, predicted))
print(r2_score(np.exp(yt), np.exp(predicted)))

0.5322166442761317
93.7788677867745
0.34733973654760375
0.15511989574114837
