In [8]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Load the dataset from CSV and discard the 'date' and 'lights' columns;
# every other column is kept for the steps below.

df = pd.read_csv('energydata_complete.csv').drop(columns=['date', 'lights'])

df.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [10]:
# Min-max scale every column of df (the 'Appliances' target included) and write
# the scaled values back into the same frame.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed further down, so test rows influence the scaling. Consider
# fitting on the training rows only.

scaler = MinMaxScaler().fit(df)
df[df.columns] = scaler.transform(df)
df.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [41]:
# Predictors: every column except the target; target: the 'Appliances' column.
X = df.drop(columns=['Appliances'])
y = df['Appliances']

In [42]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [43]:
# Ordinary least-squares fit on the training rows, then predictions for the test rows.
lr = LinearRegression().fit(X_train, y_train)
pred_lr = lr.predict(X_test)

In [44]:
# Score the linear model on the held-out rows.
# y_test comes from the min-max scaled frame, so these errors are in scaled units.

mse_lr = mean_squared_error(y_test, pred_lr)
print('Test rmse is ', round(np.sqrt(mse_lr), 3))
print('The R-square value is, ', round(r2_score(y_test, pred_lr), 2))
print('Test mae is ', round(mean_absolute_error(y_test, pred_lr), 2))

# Residual sum of squares over the test rows.
residuals = y_test - pred_lr
rss = np.sum(np.square(residuals))
print('The rss error is', round(rss, 2))

Test rmse is  0.088
The R-square value is,  0.15
Test mae is  0.05
The rss error is 45.35


In [45]:
# Fitted linear-regression weights, one row per feature, shown in ascending order.

coefficients = pd.DataFrame({'Value': lr.coef_}, index=X.columns)
coefficients.sort_values(by='Value')

Unnamed: 0,Value
RH_2,-0.456698
T_out,-0.32186
T2,-0.236178
T9,-0.189941
RH_8,-0.157595
RH_out,-0.077671
RH_7,-0.044614
RH_9,-0.0398
T5,-0.015657
T1,-0.003281


In [46]:
# Ridge regression with alpha=0.4, trained and evaluated on the same split
# as the plain linear model.

ridge = Ridge(alpha=0.4).fit(X_train, y_train)
pred_ridge = ridge.predict(X_test)

In [47]:
# Test-set RMSE of the ridge model.

mse_ridge = mean_squared_error(y_test, pred_ridge)
print('Test rmse is ', round(np.sqrt(mse_ridge), 3))

Test rmse is  0.088


In [48]:
# Lasso regression with alpha=0.001 on the same split.
# Predictions are produced for both the test rows and the training rows;
# train_lasso is not used in the cells shown here.

lasso = Lasso(alpha=0.001).fit(X_train, y_train)
pred_lasso = lasso.predict(X_test)
train_lasso = lasso.predict(X_train)

In [49]:
# Fitted Lasso weights, one row per feature, shown in ascending order.
# Note: this rebinds `coefficients`, replacing the linear-regression table.

coefficients = pd.DataFrame({'Value': lasso.coef_}, index=X.columns)
coefficients.sort_values(by='Value')

Unnamed: 0,Value
RH_out,-0.049557
RH_8,-0.00011
T1,0.0
Tdewpoint,0.0
Visibility,0.0
Press_mm_hg,-0.0
T_out,0.0
RH_9,-0.0
T9,-0.0
T8,0.0


In [50]:
# Single-feature regression: predict column 'T6' from column 'T2' alone,
# using the same split settings (30% test, random_state=42) as above.

X_2 = df[['T2']]
y_2 = df['T6']

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_2, y_2, test_size=0.3, random_state=42
)

lr2 = LinearRegression().fit(X_train2, y_train2)
pred_lr2 = lr2.predict(X_test2)

print('The R-square value is, ', round(r2_score(y_test2, pred_lr2), 2))

The R-square value is,  0.64
