In [36]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the energy dataset from CSV and preview its first five rows.
energy = pd.read_csv("energydata_complete.csv")
energy.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [14]:
# Show only the first three rows of the frame.
energy.iloc[:3]

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668


In [15]:
# Count the missing entries in each column.
energy.isna().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [37]:
# BUILDING A MULTIPLE LINEAR REGRESSION MODEL FOR THE DATASET

# Drop the columns that are not relevant to the model. The result goes into a
# new frame instead of back into `energy`, so this cell can be re-run without a
# KeyError and the raw frame previewed earlier stays intact.
energy_model_data = energy.drop(columns=['date', 'lights'])

# Normalise every remaining column (the target included) with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset before the split, so
# test-set minima/maxima leak into the scaling. Left unchanged so the metrics
# reported in the following cells stay reproducible; fit on the training split
# only if this model is ever used beyond this exercise.
scaler = MinMaxScaler()
df = pd.DataFrame(
    scaler.fit_transform(energy_model_data),
    columns=energy_model_data.columns,
)

# 'Appliances' is the dependent variable; every other column is an
# independent variable.
X = df.drop(columns=['Appliances'])
Y = df['Appliances']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Train the model.
model = LinearRegression()
model.fit(x_train, y_train)

# Predict the dependent variable for the held-out rows.
y_predict = model.predict(x_test)

# The total number of non-zero feature weights of a Lasso Model with an alpha of 0.001

In [46]:
from sklearn.linear_model import Lasso

# Lasso regression with alpha = 0.001, trained on the training split.
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

# Predictions of the dependent variable for the held-out rows.
lasso_predicted_y = lasso_reg.predict(x_test)

# One row per feature, ordered by fitted weight.
lasso_reg_weights = (
    pd.DataFrame(lasso_reg.coef_, index=X.columns, columns=['coeff'])
    .sort_values(by='coeff')
    .rename_axis("features")
)

# Count the features whose weight is not exactly zero.
lasso_reg_weights[lasso_reg_weights['coeff'] != 0].count()

coeff    4
dtype: int64

# Features with the lowest and highest values of model weight

In [40]:
# Linear-model weights, one row per feature, sorted from lowest to highest.
model_weights = (
    pd.DataFrame(model.coef_, index=X.columns, columns=['coeff'])
    .sort_values(by='coeff')
    .rename_axis("features")
)

# Print the row(s) holding the smallest weight, then the row(s) holding the largest.
is_lowest = model_weights['coeff'] == model_weights['coeff'].min()
is_highest = model_weights['coeff'] == model_weights['coeff'].max()
print(model_weights[is_lowest])
print(model_weights[is_highest])

             coeff
features          
RH_2     -0.456698
             coeff
features          
RH_1      0.553547


# Coefficient of determination

In [20]:
# Coefficient of determination (R^2) of the linear model on the test split,
# rounded to 2 decimal places.
r2 = round(metrics.r2_score(y_test, y_predict), 2)
r2

0.15

# Mean Absolute Error

In [21]:
# Mean absolute error of the linear model on the test split, to 2 decimal places.
mae = round(metrics.mean_absolute_error(y_test, y_predict), 2)
mae

0.05

# Residual sum of squares in 2 dp

In [22]:
# Residual sum of squares of the linear model on the test split, to 2 decimal places.
residuals = y_test - y_predict
rss = round(np.square(residuals).sum(), 2)
rss

45.35

# Root Mean Square Error in 3 dp

In [23]:
# Root mean squared error of the linear model on the test split, to 3 decimal places.
mse = metrics.mean_squared_error(y_test, y_predict)
rmse = round(np.sqrt(mse), 3)
rmse

0.088

# Does the RMSE of a Ridge Regression model change between alpha = 0.5 and alpha = 0.4?

In [48]:

def ridge_test_rmse(alpha, x_train, y_train, x_test, y_test):
    """Fit a Ridge model and score it on the test split.

    Parameters
    ----------
    alpha : float
        Regularisation strength passed to ``Ridge``.
    x_train, y_train
        Training features and target used to fit the model.
    x_test, y_test
        Held-out features and target used for scoring.

    Returns
    -------
    tuple
        ``(fitted model, test-set predictions, test-set RMSE)``.
    """
    reg = Ridge(alpha=alpha)
    reg.fit(x_train, y_train)
    predictions = reg.predict(x_test)
    score = np.sqrt(metrics.mean_squared_error(y_test, predictions))
    return reg, predictions, score


# RIDGE REGRESSION ALPHA 0.5 (r_rmse is its test RMSE)
ridge_reg, ridge_predicted_y_05, r_rmse = ridge_test_rmse(0.5, x_train, y_train, x_test, y_test)

# RIDGE REGRESSION ALPHA 0.4 (r_rmse1 is its test RMSE)
ridge_reg1, ridge_predicted_y_04, r_rmse1 = ridge_test_rmse(0.4, x_train, y_train, x_test, y_test)

# `ridge_predicted_y` previously ended up holding the alpha = 0.4 predictions;
# keep that binding so anything reading it later sees the same values.
ridge_predicted_y = ridge_predicted_y_04

# NOTE(review): the conclusion printed last is hard-coded; re-check it against
# the two values if the data or the alphas change.
print ('The rsme for alpha of 0.4 is', r_rmse1)
print('The rsme for alpha of 0.5 is', r_rmse)
print ('From the results, there is more or less no change ')


The rsme for alpha of 0.4 is 0.08753385704628004
The rsme for alpha of 0.5 is 0.08754118590838059
From the results, there is more or less no change 


# RMSE of the Lasso regression model, to 3 dp

In [31]:
# RMSE of the Lasso model (alpha = 0.001) on the test split, to 3 decimal places.
# `lasso_predicted_y` holds the test-set predictions of the Lasso model fitted
# in the earlier non-zero-weights cell. The alpha and the training data are the
# same, so that model is reused rather than imported and refitted a second time.
l_rmse = round(np.sqrt(metrics.mean_squared_error(y_test, lasso_predicted_y)), 3)
l_rmse

0.094

# Coefficient of determination (R²), to 2 dp, of a simple linear model with x = T2 and y = T6

In [32]:
# Simple linear regression: T2 is the single predictor, T6 is the target.
# Both are reshaped into single-column 2-D arrays.
xx = energy['T2'].values.reshape(-1, 1)
yy = energy['T6'].values.reshape(-1, 1)

# Split with the same 70/30 ratio and seed as the multiple-regression model,
# but under dedicated names: rebinding x_train/x_test/y_train/y_test/model here
# would overwrite the objects the earlier cells depend on, so re-running any of
# them afterwards would fail or report on the wrong model.
t2_train, t2_test, t6_train, t6_test = train_test_split(xx, yy, test_size=0.3, random_state=42)

# Train the single-feature model.
t2_t6_model = LinearRegression()
t2_t6_model.fit(t2_train, t6_train)

# Predict T6 for the held-out rows.
predicted_y = t2_t6_model.predict(t2_test)

# R^2 to 2 decimal places, kept separate from the multiple-regression `r2`.
r2_t2_t6 = round(metrics.r2_score(t6_test, predicted_y), 2)
r2_t2_t6

0.64