## Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Load and Inspect Dataset

In [2]:
# load the contents of energy_data.csv (path relative to the working directory) into a DataFrame
DATA_FILE = 'energy_data.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# let pandas render up to 30 columns so the wide frame is shown in full
pd.options.display.max_columns = 30

# preview the first five rows
df.head(5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
# normalise column names in one pass: lowercase every name, then give
# tdewpoint, rv1 and rv2 underscore-separated names
column_aliases = {'tdewpoint': 't_dewpoint', 'rv1': 'rv_1', 'rv2': 'rv_2'}
df.columns = [column_aliases.get(name, name) for name in df.columns.str.lower()]

# preview the frame to verify the new names
df.head()

Unnamed: 0,date,appliances,lights,t1,rh_1,t2,rh_2,t3,rh_3,t4,rh_4,t5,rh_5,t6,rh_6,t7,rh_7,t8,rh_8,t9,rh_9,t_out,press_mm_hg,rh_out,windspeed,visibility,t_dewpoint,rv_1,rv_2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


## 12. What is the $R^2$ value (to two decimal places)?

In [5]:
# estimator used for the single-feature regression
from sklearn.linear_model import LinearRegression

# separate the feature matrix from the target vector:
# t2 is the only predictor (kept 2-D with a single column), t6 is the response
X = df[['t2']].to_numpy()
y = df['t6']

# fit the regression on the full data
model = LinearRegression().fit(X, y)

# R^2 on the same data the model was fitted to, rounded to two decimal places
simple_r2 = model.score(X, y)
round(simple_r2, 2)

0.64

## 13. What is the Mean Absolute Error (in two decimal places)?

In [6]:
# tools used in this cell: error metric, splitter and scaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# discard the date and lights columns before modelling
df1 = df.drop(['date', 'lights'], axis=1)

# min-max scale every remaining column (target included)
# NOTE: the scaler is fitted on all rows before the train/test split below,
# so the test rows contribute to the min/max used for scaling
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df1)
normalised_df = pd.DataFrame(scaled_values, columns=df1.columns)

# appliances is the target; every other column is a predictor
target = normalised_df['appliances']
features_df = normalised_df.drop('appliances', axis=1)

# hold out 30% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    features_df, target, test_size=0.3, random_state=42
)

# train the multi-feature linear regression on the training split
linear_model = LinearRegression().fit(X_train, y_train)

# predict the held-out rows
predicted_values = linear_model.predict(X_test)

# mean absolute error on the test split, rounded to two decimal places
mae = mean_absolute_error(y_test, predicted_values)
round(mae, 2)

0.05

## 14. What is the Residual Sum of Squares (in two decimal places)?

In [7]:
# residual sum of squares on the test split: sum of the squared prediction errors
residuals = y_test - predicted_values
rss = np.square(residuals).sum()
round(rss, 2)

45.35

## 15. What is the Root Mean Squared Error (in three decimal places)?

In [8]:
# mean_squared_error is aliased to mse; the alias is reused by later cells
from sklearn.metrics import mean_squared_error as mse

# RMSE of the linear model on the test split = square root of its MSE
lin_reg_mse = mse(y_test, predicted_values)
lin_reg_rmse = np.sqrt(lin_reg_mse)
round(lin_reg_rmse, 3)

0.088

## 16. What is the Coefficient of Determination (in two decimal places)?

In [9]:
# R^2 of the multi-feature linear model on the held-out test split
test_r2 = linear_model.score(X_test, y_test)
round(test_r2, 2)

0.15

## 17. Which features have the lowest and highest weights respectively?

In [10]:
# pair each fitted coefficient with the name of its feature
weights = linear_model.coef_
feats = X_train.columns

# order the (weight, feature) pairs by signed weight, smallest first
feat_weights = list(zip(weights, feats))
feat_weights.sort()

# the two ends of the ordered list carry the extreme weights
lowest, highest = feat_weights[0], feat_weights[-1]
(lowest[1], highest[1])

('rh_2', 'rh_1')

## 18.  Is there any change to the root mean squared error (RMSE)?

In [11]:
from sklearn.linear_model import Ridge

# ridge regression with alpha=0.4, trained on the same training split
ridge_reg = Ridge(alpha=0.4).fit(X_train, y_train)

# RMSE of the ridge model on the test split
ridge_preds = ridge_reg.predict(X_test)
ridge_reg_rmse = np.sqrt(mse(y_test, ridge_preds))

# True when the ridge RMSE differs from the linear-regression RMSE at three decimal places
rmse_changed = round(ridge_reg_rmse, 3) != round(lin_reg_rmse, 3)
rmse_changed

False

## 19.  How many of the features have non-zero feature weights?

In [12]:
from sklearn.linear_model import Lasso

# lasso regression with alpha=0.001, trained on the training split
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

# count the coefficients whose magnitude is greater than zero
nonzero_mask = np.abs(lasso_reg.coef_) > 0
nonzero_mask.sum()

4

## 20. What is the new RMSE with the lasso regression? 

In [13]:
# RMSE of the lasso model on the test split, rounded to three decimal places
lasso_preds = lasso_reg.predict(X_test)
lasso_mse = mse(y_test, lasso_preds)
lasso_rmse = np.sqrt(lasso_mse)
round(lasso_rmse, 3)

0.094