In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
energy = pd.read_csv('energydata_complete.csv')
energy.head(3)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668


In [3]:
energy.shape

(19735, 29)

In [4]:
# drop date and lights column
energy.drop(["date","lights"],axis = 1, inplace =True)

In [5]:
# normalise the dataset
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised = pd.DataFrame(scaler.fit_transform(energy), columns=energy.columns)
features= normalised.drop(columns=['Appliances'])
target = normalised['Appliances']

In [6]:
# split the data.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test =  train_test_split(features,target, test_size=0.3, random_state=42)

In [7]:
from sklearn.linear_model import LinearRegression
Lmodel = LinearRegression()

In [8]:
#Train the model
Lmodel.fit(x_train,y_train)

#obtain predictions
predicted_values = Lmodel.predict(x_test)

In [9]:
def get_weights(model, feat, col_name):
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df

#### Question 1

In [10]:
# A best fit line relating X and Y has a R-Squared value of 0.75.
# How do I interpret this information?

"The correlation between X and Y is 0.75"


'The correlation between X and Y is 0.75'

#### Question 2

In [11]:
# Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [12]:
lasso_model_weights =  get_weights(lasso_reg, x_train, 'lasso_weights')

In [13]:
t = 0
for i in lasso_model_weights['lasso_weights']:
    if i != 0.000000 or -0.000000:
        t+=1
print(f'There are {t} none zero values.')

There are 4 none zero values.


#### Question 3

In [14]:
# The Lasso can be interpreted as least-squares linear regression where:

"Weights are regularized with the L1 norm"

'Weights are regularized with the L1 norm'

#### Question 4

In [15]:
# Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

linear_model_weights =  get_weights(Lmodel, x_train, 'linear_model_weights')

In [16]:
min_val =  min(linear_model_weights['linear_model_weights'])

In [17]:
# MIN VALUE
linear_model_weights[linear_model_weights['linear_model_weights'] == min_val]

Unnamed: 0,Features,linear_model_weights
0,RH_2,-0.456698


In [18]:
max_val = max(linear_model_weights['linear_model_weights'])

In [19]:
# MAX VALUE
linear_model_weights[linear_model_weights['linear_model_weights'] == max_val]

Unnamed: 0,Features,linear_model_weights
25,RH_1,0.553547


#### Question 5

In [20]:
# In the different terms of the bias-variance tradeoff, which of the following is substantially more harmful to the test error 
# than the training error?

'Answer: Bias'

'Answer: Bias'

#### Question 6

In [21]:
# What is the Residual Sum of Squares (in two decimal places)?
rss = np.sum(np.square(y_test-predicted_values))
output = "{:.2f}".format(rss)
print(output)

45.35


#### Question 7

In [22]:
#How many coefficients do you need to estimate a simple linear regression model (One independent variable)?
"Answer: 1"

'Answer: 1'

#### Question 8

In [23]:
# What is the Coefficient of Determination (in two decimal places)?

from sklearn.metrics import r2_score
r2score = r2_score(y_test, predicted_values)
output = "{:.2f}".format(r2score)
print(output)

0.15


#### Question 9

In [24]:
# In linear regression, L2 regularization is equivalent to imposing a:
"Answer: Gaussian prior"

'Answer: Gaussian prior'

#### Question 10

In [25]:
# What is the Mean Absolute Error (in two decimal places)?

from sklearn.metrics import mean_absolute_error
mae =  mean_absolute_error(y_test, predicted_values)
output = "{:.2f}".format(mae)
print(output)

0.05


#### Question 11

In [26]:
# What is the new RMSE with the Lasso Regression (in 3 decimal places)?  

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [27]:
pred = lasso_reg.predict(x_test)

In [29]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, pred))
output = "{:.3f}".format(rmse)
print(output)

0.094


#### Question 12

In [30]:
# Which of these is not an assumption of Linear Regression?
"Answer: Heteroscedasticity "

'Answer: Heteroscedasticity '

#### Question 13

In [31]:
# Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE)
# when evaluated on the test set?

from sklearn.linear_model import Ridge
ridge_reg =  Ridge(alpha=0.4)
ridge_reg.fit(x_train, y_train)

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [32]:
pred_val =  ridge_reg.predict(x_test)

In [33]:
rmse = np.sqrt(mean_squared_error(y_test, pred_val))
output = "{:.3f}".format(rmse)
print(output) # NO

0.088


###### There us no change in RMSE 

#### QUESTION 14

In [34]:
# What can you use to find the best fit line for data in Linear Regression
'Answer: Least Square Error'

'Answer: Least Square Error'

#### QUESTION 15

In [35]:
# Adding more basis functions in a linear model:
'Answer: Decreases model bias '

'Answer: Decreases model bias '

#### QUESTION 16

In [36]:
#Cross validation:

"Answer:Is often used to select hyperparameters "

'Answer:Is often used to select hyperparameters '

#### QUESTION 17

In [37]:
# What is the Root Mean Squared Error (in three decimal places)?

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
output = "{:.3f}".format(rmse)
print(output)

0.088


#### QUESTION 18

In [38]:
# From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) 
# and the temperature outside the building (y = T6). What is the R^2 value in two D.P?

x= normalised['T2']
y= normalised['T6']

In [39]:
from sklearn.metrics import r2_score
r2score = r2_score(x, y)
output = "{:.2f}".format(r2score)
print(output)

0.16


#### QUESTION 19

In [40]:
'Ridge regression: Reduces variance at the expense of higher bias'

'Ridge regression: Reduces variance at the expense of higher bias'

#### QUESTION 20

In [41]:
# Which of the following sentence is true about outliers in Linear Regression:

"Answer: Linear regression is sensitive to outliers"

'Answer: Linear regression is sensitive to outliers'