In [None]:
# General Libraries
import pandas as pd  # for handling dataframes
import seaborn as sns  # for visualization
import matplotlib.pyplot as plt  # for plotting graphs
import numpy as np  # for numerical computing

In [None]:
# Scikit-Learn Libraries
from sklearn.preprocessing import StandardScaler  # for feature scaling
from sklearn.model_selection import train_test_split  # for splitting data
from sklearn.metrics import mean_squared_error, confusion_matrix, recall_score, precision_score

In [None]:
# TensorFlow & Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model  # tensorflow keras API
from tensorflow.keras.layers import Dense, Dropout, LSTM, Activation  # tensorflow keras layers
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint  # callbacks for training
from tensorflow.keras.optimizers import Adam  # optimizer
from tensorflow.keras.utils import pad_sequences  # utility function for sequence padding
from tensorflow.keras import layers # import layers explicitly

**Feature set description**

|**Feature**         |**Description**|
|----------------|:----------------|
|Date Time       | year-month-day hour:minute:second   |
|Appliances      | energy use in Wh|
|lights          | energy use of light fixtures in the house in Wh|
|T1              | Temperature in kitchen area, in Celsius|
|RH_1            | Humidity in kitchen area, in %|
|T2              | Temperature in living room area, in Celsius
|RH_2            | Humidity in living room area, in %
|T3              | Temperature in laundry room area
|RH_3            | Humidity in laundry room area, in %
|T4              | Temperature in office room, in Celsius
|RH_4            | Humidity in office room, in %
|T5              | Temperature in bathroom, in Celsius
|RH_5            | Humidity in bathroom, in %
|T6              | Temperature outside the building (north side), in Celsius
|RH_6            | Humidity outside the building (north side), in %
|T7              | Temperature in ironing room , in Celsius
|RH_7            | Humidity in ironing room, in %
|T8              | Temperature in teenager room 2, in Celsius
|RH_8            | Humidity in teenager room 2, in %
|T9              | Temperature in parents room, in Celsius
|RH_9            | Humidity in parents room, in %
|T_out           | Temperature outside (from Chievres weather station), in Celsius
|Press_mm_hg     | Pressure (from Chievres weather station), in mm Hg
|RH_out          | Humidity outside (from Chievres weather station), in %
|Windspeed       | Wind speed (from Chievres weather station), in m/s
|Visibility      | Visibility (from Chievres weather station), in km
|Tdewpoint       | Dew point temperature (from Chievres weather station), in °C
|rv1             | Random variable 1, nondimensional
|rv2             | Random variable 2, nondimensional


In [None]:
# Location of the dataset, relative to the notebook's working directory
file_path = 'energydata_complete.csv'

# Load the CSV once: `df_raw` keeps the data exactly as read,
# while `df` is an independent copy that the following cells transform
df_raw = pd.read_csv(file_path)
df = df_raw.copy()

# Preview the first rows of the working frame
df.head()

In [None]:
# Count the missing (NaN) entries in every column
df.isnull().sum()


Per our output, we see no missing values in the dataset.

In [None]:
# Parse the 'date' column into a new datetime-typed 'Datetime' column
df = df.assign(Datetime=pd.to_datetime(df['date']))

In [None]:
# Verify our dataframe types: show the dtype of every column,
# including the 'Datetime' column created from 'date'
df.dtypes

In [None]:
# Keep two decimal places throughout the frame for easier processing and display
df = df.round(decimals=2)

In [None]:
# Validate our rounding by previewing the first rows of the rounded frame
df.head()

In [None]:
# The 'date' column is no longer needed once 'Datetime' exists, so remove it
df = df.drop('date', axis=1)

# Put 'Datetime' first and keep every other column in its current order
leading = ['Datetime']
trailing = [name for name in df.columns if name not in leading]
df = df.loc[:, leading + trailing]

In [None]:
# Preview the frame after dropping 'date' and moving 'Datetime' to the front
df.head()

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# `df` still contains the datetime-typed 'Datetime' column at this point; recent
# pandas versions no longer drop non-numeric columns silently in `corr()`, so the
# restriction to numeric columns is requested explicitly.
correlation_matrix = df.corr(numeric_only=True)

In [None]:
# Create the figure and axes explicitly so every call targets the same plot
fig, ax = plt.subplots(figsize=(12, 10))

# Draw the correlation matrix as a colour-coded grid (cell annotations are off)
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt=".2f", cbar=True, ax=ax)

# Title and tick-label orientation
ax.set_title('Heatmap of Correlations Between Variables', fontsize=16)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()

# Render the heatmap
plt.show()

The below features were chosen based on their strong correlation with Appliances (energy consumption) while avoiding redundant or weak predictors. <br>
lights was included because lighting directly contributes to household energy use. Indoor temperature features (T1, T2) were selected as they <br>
impact HVAC operations, which are a major factor in energy consumption. T_out (outdoor temperature) and Tdewpoint were chosen because external <br>
weather conditions influence heating and cooling requirements inside the home. RH_out (outdoor humidity) and Windspeed were kept as they can affect <br>
temperature regulation and ventilation needs. Finally, hour was included to capture daily energy usage patterns, as appliance consumption tends to <br>
vary throughout the day. These features provide a balanced mix of internal conditions, external environmental influences, and time-based patterns, <br>
ensuring the model captures key factors affecting energy consumption.

|**Feature**         |**Description**|
|----------------|:----------------|
|lights                    | Directly affects energy consumption   |
|T1 (Kitchen Temp)         | Strong correlation with Appliances    |
|T2 (Living Room Temp)     | Represents HVAC energy use            |
|T_out (Outside Temp)      | External factor on energy consumption |
|RH_out (Outdoor Humidity) | Impacts cooling and heating           |
|Windspeed                 | Influences temp regulation            |
|Tdewpoint                 | Outdoor conditions                    |
|hour                      | Captures time-based energy patterns   |

In [None]:
# Histogram (30 bins) of the appliance energy readings with a KDE curve on top
ax = sns.histplot(data=df, x='Appliances', bins=30, kde=True)
ax.set_title("Appliances Energy Consumption Distribution")
ax.set_xlabel("Energy Consumption (Wh)")
ax.set_ylabel("Occurrence")
plt.show()


1. We can see from our distribution plot for energy consumption that most of the consumption is < 200 Wh
2. A sharp peak between 50 Wh and 100 Wh suggests many appliances consume low energy.
3. We have a right-skewed plot, demonstrating that anything > 400 Wh is rare.

In [None]:
# Time-based feature: hour of the day (0-23) taken from the 'Datetime' column
df = df.assign(hour=df['Datetime'].dt.hour)


In [None]:
# Mean appliance consumption for each hour of the day
hourly_avg = df.groupby('hour')['Appliances'].mean()

fig, ax = plt.subplots(figsize=(10, 5))

# Every individual reading, drawn semi-transparent so dense regions remain readable
ax.scatter(df['hour'], df['Appliances'], alpha=0.3, color='blue', label="Appliance Energy Consumption")

# The hourly mean drawn on top of the raw points
ax.plot(hourly_avg.index, hourly_avg.values, marker='o', linestyle='-', color='red', label="Hourly Average")

# Labels, legend and grid
ax.set_xlabel("Hour of the Day")
ax.set_ylabel("Energy Consumption (Wh)")
ax.set_title("Appliance Energy Consumption by Hour of the Day")
ax.legend()
ax.grid()
plt.show()

The hourly average did not provide too strong an insight. As expected, energy consumption is lowest during sleeping hours. <br>
Early evening has the peak on average.

In [None]:
# NOTE(review): this pairplot of the target and the selected features is disabled.
# Presumably it was switched off because it is slow to render — confirm, then
# either re-enable it or delete the cell.
#sns.pairplot(df[['Appliances', 'lights', 'T1', 'T2', 'T_out', 'RH_out', 'Windspeed', 'Tdewpoint', 'hour']])
#plt.show()


# Feedforward Neural Network (FNN)

In [None]:
# Column the models are trained to predict
target = 'Appliances'

# Input columns fed to the feedforward network
features = [
    'lights',
    'T1',
    'T2',
    'T_out',
    'RH_out',
    'Windspeed',
    'Tdewpoint',
    'hour',
]

In [None]:
# Prepare our data for training
X = df[features]  # independent variables
y = df[target]  # target variable

# Split into training (80%) and testing (20%) sets BEFORE scaling, so that the
# scaler's mean/variance are estimated from the training rows only and no
# information from the held-out rows leaks into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows, then apply the same transform to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling; kept so that any code that
# still refers to `X_scaled` continues to work
X_scaled = scaler.transform(X)

In [None]:
# Report the (rows, columns) shape of the training and testing feature matrices
print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable from one run of this cell to the next
tf.keras.utils.set_random_seed(42)

# Baseline FNN: two hidden ReLU layers (64 and 32 units) and one linear output unit
model = Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # one input per feature column
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # output layer for energy consumption prediction
])

# Optimise mean squared error; additionally report mean absolute error
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train for a fixed 50 epochs. The test set is passed as validation data for
# monitoring only: no callback in this cell acts on the validation loss.
history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_data=(X_test, y_test))

# Evaluate on the held-out test set
loss, mae = model.evaluate(X_test, y_test)
print(f"Neural Network MAE: {mae:.2f} Wh")


In [None]:
# Training and validation loss curves taken from the Keras History object
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss (MSE)")
ax.set_title("Training & Validation Loss Over Epochs")
ax.legend()
ax.grid(True)
plt.show()

# FNN Optimized

In [None]:
# Seed Python, NumPy and TensorFlow so this training run is repeatable
tf.keras.utils.set_random_seed(42)

# Callbacks: stop once the validation loss has not improved for 10 epochs (restoring
# the best weights), and multiply the learning rate by 0.2 after 5 stagnant epochs
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=5, min_lr=1e-5)

# Larger FNN with dropout between the hidden layers
model = Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # input layer
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),  # dropout to reduce overfitting
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # output layer for regression
])

# Adam with an explicit learning rate of 0.001; ReduceLROnPlateau lowers it
# during training when the validation loss plateaus
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Both callbacks make decisions from `val_loss`, so validation data is carved out of
# the TRAINING set (Keras takes the last 10% of the rows passed to fit). Monitoring
# the test set here would let it influence the stopping epoch and learning rate and
# make the MAE reported below optimistic.
history = model.fit(
    X_train, np.asarray(y_train),
    epochs=100, batch_size=16,
    validation_split=0.1,
    callbacks=[early_stopping, reduce_lr]
)

# Evaluate on the test set, which played no part in training or in the callbacks
loss, mae = model.evaluate(X_test, y_test)
print(f"Optimized Neural Network MAE: {mae:.2f} Wh")


In [None]:

# Loss curves for the most recent training run, one line per history key
fig, ax = plt.subplots(figsize=(8, 5))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set(xlabel="Epochs", ylabel="Loss (MSE)", title="Training & Validation Loss Over Epochs")
ax.legend()
ax.grid(True)
plt.show()


# LSTM

In [None]:
# The LSTM is univariate: the same column is both the input feature and the target
feat_cols = ['Appliances']  # Define the features
target_col = 'Appliances'

# Standardise using statistics from the chronological training portion only (the
# first 80% of rows, the same boundary the split cell uses), so the later
# validation period does not leak into the scaling.
# Values are read from `df_raw`, the untouched copy taken at load time, which makes
# this cell idempotent: re-running it does not standardise an already standardised
# column. This relies on `df` and `df_raw` still having the same rows in the same
# order, which holds for the transformations applied to `df` earlier in this notebook.
n_fit_rows = int(0.8 * len(df))
scaler = StandardScaler()
scaler.fit(df_raw[feat_cols].iloc[:n_fit_rows])
df[feat_cols] = scaler.transform(df_raw[feat_cols])

In [None]:
# Chronological 80/20 split: the earliest rows train the model, the rest validate it
train_size = int(0.8 * len(df))
train_df, val_df = (
    part.reset_index(drop=True)
    for part in (df.iloc[:train_size], df.iloc[train_size:])
)

# Feature matrix of the training portion.
# Note: this rebinds the name `X_train` that the FNN cells above also use.
X_train = train_df[feat_cols].values

# Model dimensions: inputs per time step and number of predicted values
nb_features = X_train.shape[1]
nb_out = 1

In [None]:
# Sequence parameters
seq_length = 60  # number of consecutive rows in each input window
ph = 5           # prediction horizon: the label lies `ph` rows after the window's last row

# Pull the columns out of the DataFrame once. Selecting columns and calling `.iloc`
# inside the loop repeats the same pandas work for every window.
feature_values = train_df[feat_cols].values
target_values = train_df[target_col].values
n_sequences = len(train_df) - seq_length - ph

# Window `start` covers rows [start, start + seq_length); its label is the target
# at row start + seq_length + ph - 1
seq_arrays = [feature_values[start:start + seq_length] for start in range(n_sequences)]
seq_labs = [target_values[start + seq_length + ph - 1] for start in range(n_sequences)]

# Convert sequences to NumPy arrays
seq_arrays = np.array(seq_arrays, dtype=np.float32)
seq_labs = np.array(seq_labs, dtype=np.float32).reshape(-1)

In [None]:
# Define model path (defined for reference; nothing in this cell saves to it)
model_path = 'LSTM_base_model.keras'

# Build the LSTM model: two stacked LSTM layers with dropout and a single linear output.
# The input shape is declared with an explicit Input layer, as in the FNN cells,
# rather than through the `input_shape=` argument of the first LSTM layer.
model = Sequential()
model.add(layers.Input(shape=(seq_length, nb_features)))
model.add(LSTM(units=50, return_sequences=True))   # passes the full sequence to the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(units=25, return_sequences=False))  # returns only the final state
model.add(Dropout(0.2))
model.add(Dense(units=nb_out))
model.add(Activation('linear'))

# Compile the model; 'mse' is also listed as a metric so that evaluate() returns it
# at index 1
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

# Print model summary. summary() prints the table itself and returns None, so
# wrapping it in print() would add a stray "None" line.
model.summary()

# Train the model without callbacks; Keras holds out the last 10% of the
# sequences for validation
history = model.fit(
    seq_arrays, seq_labs,
    epochs=100,
    batch_size=500,
    validation_split=0.1,
    verbose=2
)

In [None]:
# Prepare validation sequences.
# Columns are extracted once, outside the loop. Labels come from `target_col`, the
# same column the training labels were built from.
feature_values = val_df[feat_cols].values
target_values = val_df[target_col].values

# `end` is the exclusive end of each window: rows [end - seq_length, end) are the
# input and the label is the target at row end + ph - 1
window_ends = range(seq_length, len(val_df) - ph)
val_arrays = [feature_values[end - seq_length:end] for end in window_ends]
val_labs = [target_values[end + ph - 1] for end in window_ends]

val_arrays = np.array(val_arrays, dtype=np.float32)
val_labs = np.array(val_labs, dtype=np.float32).reshape(-1)

# Evaluate the model; index 1 is the 'mse' metric requested at compile time
scores_test = model.evaluate(val_arrays, val_labs, verbose=2)
print('\nMSE: {}'.format(scores_test[1]))

# Predictions
y_pred_test = model.predict(val_arrays)
y_true_test = val_labs

# Plot the results: predicted vs. actual values (in the scaled units the model was
# trained on) for the last 500 validation windows
fig_verify = plt.figure(figsize=(10, 5))
plt.plot(y_pred_test[-500:], label='Predicted Value')
plt.plot(y_true_test[-500:], label='Actual Value')
plt.title('Appliances Power Prediction - Last 500 Points', fontsize=22, fontweight='bold')
plt.ylabel('Value')
plt.xlabel('Row')
plt.legend()
plt.show()
fig_verify.savefig("model_regression_verify.png")

# LSTM Optimized

In [None]:
# Univariate setup: 'Appliances' is both the input feature and the target
feat_cols = ['Appliances']

# Chronological split boundary (first 80% of rows for training)
train_size = int(0.8 * len(df))

# Normalize the data.
# The scaler is fitted on the training rows only, and values are read from `df_raw`
# (the untouched copy taken at load time) rather than from `df`, whose 'Appliances'
# column has already been standardised in place by the baseline LSTM section.
# Refitting on those values would produce a scaler that no longer holds the original
# Wh statistics, and fitting on every row would leak the validation period.
# This relies on `df` and `df_raw` still having the same rows in the same order.
scaler = StandardScaler()
scaler.fit(df_raw[feat_cols].iloc[:train_size])
df[feat_cols] = scaler.transform(df_raw[feat_cols])

# Split the data into training and validation sets
train_df = df.iloc[:train_size].reset_index(drop=True)  # First 80% for training
val_df = df.iloc[train_size:].reset_index(drop=True)    # Last 20% for validation

In [None]:
# Sequence parameters
seq_length = 60  # rows per input window
ph = 5           # prediction horizon: the label lies `ph` rows after the window's last row

# Extract the feature columns once; indexing the DataFrame inside the loop repeats
# the same pandas work for every window
feature_values = train_df[feat_cols].values
n_sequences = len(train_df) - seq_length - ph

# Prepare training sequences: window `start` covers rows [start, start + seq_length)
# and is labelled with the row at start + seq_length + ph - 1.
# NOTE(review): labels are taken from `feat_cols`, which only produces one label
# per window while `feat_cols` holds a single column.
seq_arrays = [feature_values[start:start + seq_length] for start in range(n_sequences)]
seq_labs = [feature_values[start + seq_length + ph - 1] for start in range(n_sequences)]

seq_arrays = np.array(seq_arrays, dtype=np.float32)
seq_labs = np.array(seq_labs, dtype=np.float32).reshape(-1)

# Sanity checks on the resulting shapes
assert seq_arrays.shape == (len(train_df) - seq_length - ph, seq_length, len(feat_cols))
assert seq_labs.shape == (len(train_df) - seq_length - ph,)

In [None]:
# Build the LSTM model
model_path = 'LSTM_model1.keras'  # where ModelCheckpoint stores the best model
nb_features = len(feat_cols)
nb_out = 1

# Two stacked LSTM layers with dropout and a single linear output. The input shape
# is declared with an explicit Input layer, as in the FNN cells.
model = Sequential()
model.add(layers.Input(shape=(seq_length, nb_features)))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=25, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=nb_out))
model.add(Activation('linear'))

# `keras` is never imported as a top-level name in this notebook, so the optimizer
# is reached through the imported `tf` module
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# Callbacks

# Learning rate scheduler: Reduces learning rate by a factor of 0.5 if validation loss does not improve for 5 epochs
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1)

# Early stopping: Stops training if validation loss does not improve for 10 consecutive epochs.
# restore_best_weights=True puts the best epoch's weights back into `model` when training
# is stopped early, so later evaluation of `model` does not use the weights from the
# stagnant final epochs (this matches the FNN early-stopping configuration).
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True
)

# Model checkpoint: Saves the model with the best validation loss during training to avoid saving suboptimal weights
model_checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=1)


# Train the model; Keras holds out the last 10% of the sequences for validation
history = model.fit(
    seq_arrays, seq_labs,
    epochs=100,
    batch_size=500,
    validation_split=0.1,
    verbose=2,
    callbacks=[lr_scheduler, early_stopping, model_checkpoint]
)


In [None]:
# Prepare validation sequences.
# The feature columns are extracted once, outside the loop.
feature_values = val_df[feat_cols].values

# `end` is the exclusive end of each window: rows [end - seq_length, end) are the
# input and the label is the row at end + ph - 1
window_ends = range(seq_length, len(val_df) - ph)
val_arrays = [feature_values[end - seq_length:end] for end in window_ends]
val_labs = [feature_values[end + ph - 1] for end in window_ends]

val_arrays = np.array(val_arrays, dtype=np.float32)
val_labs = np.array(val_labs, dtype=np.float32).reshape(-1)

# Evaluate the model; index 1 is the 'mse' metric requested at compile time
scores_test = model.evaluate(val_arrays, val_labs, verbose=2)
print('\nMSE: {}'.format(scores_test[1]))

# Predictions
y_pred_test = model.predict(val_arrays)
y_true_test = val_labs

# Plot the results: predicted vs. actual values (in the scaled units the model was
# trained on) for the last 500 validation windows
fig_verify = plt.figure(figsize=(10, 5))
plt.plot(y_pred_test[-500:], label='Predicted Value')
plt.plot(y_true_test[-500:], label='Actual Value')
plt.title('Appliances Power Prediction - Last 500 Points', fontsize=22, fontweight='bold')
plt.ylabel('Value')
plt.xlabel('Row')
plt.legend()
plt.show()

# Saved under its own file name so the figure written by the baseline LSTM
# evaluation ("model_regression_verify.png") is not overwritten
fig_verify.savefig("model_regression_verify_optimized.png")