In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the CSV (path is relative to the working directory) into df,
# then show the first five rows as a quick sanity check.
DATA_FILE = "energydata_complete (1).csv"
df = pd.read_csv(DATA_FILE)
df.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
# Remove the "date" and "lights" columns before the feature matrix is built.
# errors="ignore" keeps this cell safe to re-run: df is rebound from itself, so a
# second execution would otherwise raise KeyError because the columns are already gone.
df = df.drop(columns=["date", "lights"], errors="ignore")

In [4]:
from sklearn.model_selection import train_test_split

# Target is the "Appliances" column; every remaining column is a predictor.
Y = df["Appliances"]
X = df.drop(columns=["Appliances"])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.3
)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply that same
# mapping to both splits so the test rows do not influence the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a linear regression on the scaled training data and predict on both splits.
model = LinearRegression().fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

# Training-set errors: MAE, and RMSE as the square root of the MSE.
mae_train = mean_absolute_error(Y_train, Y_train_pred)
rmse_train = np.sqrt(mean_squared_error(Y_train, Y_train_pred))

# The same two metrics on the held-out test set.
mae_test = mean_absolute_error(Y_test, Y_test_pred)
rmse_test = np.sqrt(mean_squared_error(Y_test, Y_test_pred))

# Report all four numbers, training set first.
print(f"The Mean Absolute Error on the training set is: {mae_train:.3f}")
print(f"The Root Mean Squared Error on the training set is: {rmse_train:.3f}")
print(f"Mean Absolute Error on the test set: {mae_test:.3f}")
print(f"Root Mean Squared Error on the test set: {rmse_test:.3f}")

The Mean Absolute Error on the training set is: 53.742
The Root Mean Squared Error on the training set is: 95.216
Mean Absolute Error on the test set: 53.641
Root Mean Squared Error on the test set: 93.637


In [7]:
from sklearn.linear_model import Lasso

# Lasso regression with a small regularisation strength (alpha = 0.001).
lasso_model = Lasso(alpha=0.001).fit(X_train, Y_train)

# Score on the held-out split: RMSE is the square root of the test MSE.
Y_test_pred = lasso_model.predict(X_test)
mse_test_lasso = mean_squared_error(Y_test, Y_test_pred)
rmse_test_lasso = np.sqrt(mse_test_lasso)
print(f"RMSE with Lasso Regression on the test set: {rmse_test_lasso:.3f}")

RMSE with Lasso Regression on the test set: 93.641


In [8]:
# Refit Lasso with its default settings and pull out the learned coefficients.
lasso_model = Lasso().fit(X_train, Y_train)
feature_weights = lasso_model.coef_

# Count the coefficients that are not exactly zero.
non_zero_features = (feature_weights != 0).sum()
print(f"Number of features with non-zero feature weights: {non_zero_features}")

Number of features with non-zero feature weights: 4


In [9]:
from sklearn.linear_model import Ridge

# Fit ridge regression with its default settings and predict on both splits.
ridge_model = Ridge()
ridge_model.fit(X_train, Y_train)

Y_train_pred = ridge_model.predict(X_train)
Y_test_pred = ridge_model.predict(X_test)

# RMSE is the square root of the mean squared error on each split.
rmse_train = np.sqrt(mean_squared_error(Y_train, Y_train_pred))
rmse_test = np.sqrt(mean_squared_error(Y_test, Y_test_pred))

# Compare training and test RMSE. Two independently computed floats are almost never
# bit-for-bit equal, so the "same" branch is decided with a tolerance (np.isclose)
# instead of ==, which would make that branch effectively unreachable.
if np.isclose(rmse_train, rmse_test):
    print("RMSE on training and test sets is the same.")
else:
    print(f"RMSE on the training set: {rmse_train:.3f}")
    print(f"RMSE on the test set: {rmse_test:.3f}")
    if rmse_train < rmse_test:
        print("RMSE on the training set is lower.")
    else:
        print("RMSE on the test set is lower.")


RMSE on the training set: 95.260
RMSE on the test set: 93.709
RMSE on the test set is lower.


In [10]:
# Single-feature linear regression: predict T6 from T2.
# NOTE: this rebinds X, Y, the train/test splits and `model` from the earlier cells.
X = df[["T2"]].values  # list selector keeps X two-dimensional (one column)
Y = df["T6"].values

# Same 70/30 split settings and seed as the earlier split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.3
)

model = LinearRegression().fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# RMSE on the held-out rows: square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse:.3f}")

Root Mean Squared Error (RMSE): 3.630


In [11]:
# Check for overfitting: cross-validate a fresh linear model on the current X and Y
# and summarise the per-fold RMSE.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
from sklearn.linear_model import LinearRegression

k_folds = 5
model = LinearRegression()
cv = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# greater_is_better=False makes the scorer report MSE with its sign flipped.
scoring = make_scorer(mean_squared_error, greater_is_better=False)
scores = cross_val_score(model, X, Y, scoring=scoring, cv=cv)

# Strip the sign with abs() before taking the square root to get each fold's RMSE.
rmse_scores = [abs(fold_score) ** 0.5 for fold_score in scores]
mean_rmse, std_rmse = np.mean(rmse_scores), np.std(rmse_scores)

print(f"The mean rmse: {mean_rmse}")
print(f"The std rmse: {std_rmse}")

The mean rmse: 3.6449218985868086
The std rmse: 0.032607197188006844


In [12]:
"""The RMSE on the test set (3.630) is close to the mean RMSE from cross-validation
    (3.645), indicating that the model is performing consistently and generalizing well.
    The low standard deviation (about 0.033) shows that the model's performance is stable
    across different data splits, which is another good sign.
"""

"The RMSE on the test set (3.630) is close to the mean RMSE from cross-validation\n    (3.645), indicating that the model is performing consistently and generalizing well.\n    The low standard deviation (about 0.033) shows that the model's performance is stable\n    across different data splits, which is another good sign.\n"