In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.signal import find_peaks

# Load datasets
train_data = pd.read_csv('TrainData_A.csv')
aggregated_load = pd.read_csv('AggregatedLoad_A.csv')
test_data = pd.read_csv('TestData_A.csv')
template = pd.read_csv('DisaggregatedLoad_Template.csv')

# Rename columns for convenience
train_data.columns = ['index', 'aggregated_load'] + [f'appliance_{i}' for i in range(1, 22)]
aggregated_load.columns = ['index', 'aggregated_load']
test_data.columns = ['index', 'aggregated_load'] + [f'appliance_{i}' for i in range(1, 22)]
template.columns = ['index'] + [f'appliance_{i}_pred' for i in range(1, 22)]




In [2]:
# Add rolling mean feature to train, test, and aggregated_load datasets
window_size = 10

train_data['agg_load_mean'] = train_data['aggregated_load'].rolling(window=window_size, min_periods=1).mean()
test_data['agg_load_mean'] = test_data['aggregated_load'].rolling(window=window_size, min_periods=1).mean()
aggregated_load['agg_load_mean'] = aggregated_load['aggregated_load'].rolling(window=window_size, min_periods=1).mean()


In [3]:
def calculate_frequency(data, window_size):
    # Find local maxima and minima
    peaks, _ = find_peaks(data)
    troughs, _ = find_peaks(-data)
    
    # Combine and sort indices of peaks and troughs
    extrema = np.sort(np.concatenate([peaks, troughs]))
    
    # Calculate frequency of extrema within the window
    frequency = np.zeros_like(data)
    for i in range(len(data)):
        start = max(0, i - window_size)
        end = i
        frequency[i] = np.sum((extrema >= start) & (extrema < end))
    
    return frequency

# Add frequency feature to train, test, and aggregated_load datasets
train_data['agg_load_freq'] = calculate_frequency(train_data['aggregated_load'].values, window_size)
test_data['agg_load_freq'] = calculate_frequency(test_data['aggregated_load'].values, window_size)
aggregated_load['agg_load_freq'] = calculate_frequency(aggregated_load['aggregated_load'].values, window_size)


In [4]:
from sklearn.preprocessing import StandardScaler

# Normalize the data for aggregated load, its mean, and frequency separately
scaler_agg = StandardScaler()
scaler_agg_mean = StandardScaler()
scaler_agg_freq = StandardScaler()
scaler_appliances = StandardScaler()

# Fit scaler on the aggregated load, its mean, and frequency from training data
train_agg_normalized = scaler_agg.fit_transform(train_data[['aggregated_load']])
train_agg_mean_normalized = scaler_agg_mean.fit_transform(train_data[['agg_load_mean']])
train_agg_freq_normalized = scaler_agg_freq.fit_transform(train_data[['agg_load_freq']])

# Fit scaler on the appliance loads from training data
train_appliances_normalized = scaler_appliances.fit_transform(train_data.iloc[:, 2:-2])  # Exclude agg_load_mean and agg_load_freq

# Concatenate the normalized aggregate load, its mean, and frequency
train_features_normalized = np.concatenate([train_agg_normalized, train_agg_mean_normalized, train_agg_freq_normalized], axis=1)
test_features_normalized = np.concatenate([
    scaler_agg.transform(test_data[['aggregated_load']]),
    scaler_agg_mean.transform(test_data[['agg_load_mean']]),
    scaler_agg_freq.transform(test_data[['agg_load_freq']])
], axis=1)
aggregated_features_normalized = np.concatenate([
    scaler_agg.transform(aggregated_load[['aggregated_load']]),
    scaler_agg_mean.transform(aggregated_load[['agg_load_mean']]),
    scaler_agg_freq.transform(aggregated_load[['agg_load_freq']])
], axis=1)

# Split data into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(train_features_normalized, train_appliances_normalized, test_size=0.2, random_state=42)

# Reshape the training and test data
X_train = X_train.reshape(-1, 1, 3)  # Include 3 features: aggregated load, its mean, and its frequency
X_test = X_test.reshape(-1, 1, 3)

# Prepare features for aggregated load data
X_aggregated = aggregated_features_normalized.reshape(-1, 1, 3)



In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten
from tensorflow.keras.optimizers import Adam

# CNN Model
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(1, 3)))  # Update input_shape to (1, 3)
cnn_model.add(Flatten())
cnn_model.add(Dense(21, activation='linear'))

cnn_model.compile(optimizer=Adam(), loss='mean_squared_error')
cnn_model.summary()

# CNN Training
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Predicting with CNN
y_pred_cnn_test = cnn_model.predict(X_test)
y_pred_cnn_full = cnn_model.predict(X_aggregated)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 848us/step - loss: 0.9464 - val_loss: 0.8197
Epoch 2/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 836us/step - loss: 0.8254 - val_loss: 0.7888
Epoch 3/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 884us/step - loss: 0.8331 - val_loss: 0.7739
Epoch 4/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 870us/step - loss: 0.7856 - val_loss: 0.7646
Epoch 5/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 964us/step - loss: 0.8013 - val_loss: 0.7571
Epoch 6/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 881us/step - loss: 0.7771 - val_loss: 0.7466
Epoch 7/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 940us/step - loss: 0.7833 - val_loss: 0.7420
Epoch 8/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.7653 - val_loss: 0.7326
Epoch 9/10

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# LSTM Model
lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(1, 3)))  # Update input_shape to (1, 3)
lstm_model.add(Dense(21))

lstm_model.compile(optimizer=Adam(), loss='mean_squared_error')
lstm_model.summary()

# LSTM Training
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Predicting with LSTM
y_pred_lstm_test = lstm_model.predict(X_test)
y_pred_lstm_full = lstm_model.predict(X_aggregated)


  super().__init__(**kwargs)


Epoch 1/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 827us/step - loss: 0.9020 - val_loss: 0.8144
Epoch 2/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 779us/step - loss: 0.8009 - val_loss: 0.7898
Epoch 3/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 792us/step - loss: 0.7924 - val_loss: 0.7772
Epoch 4/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 895us/step - loss: 0.8595 - val_loss: 0.7666
Epoch 5/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 934us/step - loss: 0.7832 - val_loss: 0.7545
Epoch 6/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 958us/step - loss: 0.7807 - val_loss: 0.7464
Epoch 7/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 847us/step - loss: 0.7619 - val_loss: 0.7369
Epoch 8/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.7755 - val_loss: 0.7266
Epoch 9/10

In [13]:
# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train.reshape(-1, 3), y_train)  # Reshape X_train for RandomForest

# Predicting with Random Forest
y_pred_rf_test = rf_model.predict(X_test.reshape(-1, 3))
y_pred_rf_full = rf_model.predict(X_aggregated.reshape(-1, 3))


In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to calculate RMSE as in the provided notebook
def calculate_rmse(true_values, pred_values):
    true_values_scaled = true_values / true_values.max(axis=0)
    pred_values_scaled = pred_values / true_values.max(axis=0)
    rmse = np.sqrt(mean_squared_error(true_values_scaled, pred_values_scaled, multioutput='raw_values'))
    return rmse

# Calculate RMSE for each appliance
rmse_cnn = calculate_rmse(y_test, y_pred_cnn_test)
rmse_lstm = calculate_rmse(y_test, y_pred_lstm_test)
rmse_rf = calculate_rmse(y_test, y_pred_rf_test)

# Combine RMSE into a DataFrame for comparison
rmse_df = pd.DataFrame({
    'appliance': [f'appliance_{i}' for i in range(1, 22)],
    'rmse_cnn': rmse_cnn,
    'rmse_lstm': rmse_lstm,
    'rmse_rf': rmse_rf
})

print(rmse_df)

       appliance  rmse_cnn  rmse_lstm   rmse_rf
0    appliance_1  0.115818   0.115697  0.053510
1    appliance_2  0.063365   0.060643  0.017213
2    appliance_3  0.120732   0.101668  0.028800
3    appliance_4  0.072022   0.071875  0.025376
4    appliance_5  0.026112   0.026268  0.016792
5    appliance_6  0.142504   0.141431  0.060933
6    appliance_7  0.037051   0.037127  0.019167
7    appliance_8  0.121368   0.089625  0.030247
8    appliance_9  0.032504   0.031759  0.015649
9   appliance_10  0.504936   0.502516  0.082220
10  appliance_11  0.010030   0.010020  0.010170
11  appliance_12  0.215545   0.205802  0.081439
12  appliance_13  0.007311   0.007312  0.008297
13  appliance_14  0.049319   0.049214  0.028491
14  appliance_15  0.091075   0.091025  0.053619
15  appliance_16  0.127408   0.127915  0.072600
16  appliance_17  0.113832   0.114826  0.049497
17  appliance_18  0.155699   0.152781  0.067042
18  appliance_19  0.044301   0.044274  0.023113
19  appliance_20  0.043366   0.043307  0

In [15]:
# Determine the best model for each appliance
best_models = rmse_df[['rmse_cnn', 'rmse_lstm', 'rmse_rf']].idxmin(axis=1)
best_models = best_models.replace({'rmse_cnn': 'CNN', 'rmse_lstm': 'LSTM', 'rmse_rf': 'RF'})
rmse_df['best_model'] = best_models

print(rmse_df)



       appliance  rmse_cnn  rmse_lstm   rmse_rf best_model
0    appliance_1  0.115818   0.115697  0.053510         RF
1    appliance_2  0.063365   0.060643  0.017213         RF
2    appliance_3  0.120732   0.101668  0.028800         RF
3    appliance_4  0.072022   0.071875  0.025376         RF
4    appliance_5  0.026112   0.026268  0.016792         RF
5    appliance_6  0.142504   0.141431  0.060933         RF
6    appliance_7  0.037051   0.037127  0.019167         RF
7    appliance_8  0.121368   0.089625  0.030247         RF
8    appliance_9  0.032504   0.031759  0.015649         RF
9   appliance_10  0.504936   0.502516  0.082220         RF
10  appliance_11  0.010030   0.010020  0.010170       LSTM
11  appliance_12  0.215545   0.205802  0.081439         RF
12  appliance_13  0.007311   0.007312  0.008297        CNN
13  appliance_14  0.049319   0.049214  0.028491         RF
14  appliance_15  0.091075   0.091025  0.053619         RF
15  appliance_16  0.127408   0.127915  0.072600         

In [18]:
# Initialize the hybrid predictions array for test data
hybrid_predictions_test = np.zeros_like(y_pred_cnn_test)

# Assign the best model's predictions to the hybrid model for test data
for i in range(21):
    if rmse_df.loc[i, 'best_model'] == 'CNN':
        hybrid_predictions_test[:, i] = y_pred_cnn_test[:, i]
    elif rmse_df.loc[i, 'best_model'] == 'LSTM':
        hybrid_predictions_test[:, i] = y_pred_lstm_test[:, i]
    else:
        hybrid_predictions_test[:, i] = y_pred_rf_test[:, i]

# Calculate and show summed RMSE for the hybrid model on test data
rmse_hybrid_test = calculate_rmse(y_test, hybrid_predictions_test)
print("Hybrid Model RMSE Sum on Test Data:", rmse_hybrid_test)

# Initialize the hybrid predictions array for AggregatedLoad_A
hybrid_predictions_full = np.zeros_like(y_pred_cnn_full)

# Assign the best model's predictions to the hybrid model for AggregatedLoad_A
for i in range(21):
    if rmse_df.loc[i, 'best_model'] == 'CNN':
        hybrid_predictions_full[:, i] = y_pred_cnn_full[:, i]
    elif rmse_df.loc[i, 'best_model'] == 'LSTM':
        hybrid_predictions_full[:, i] = y_pred_lstm_full[:, i]
    else:
        hybrid_predictions_full[:, i] = y_pred_rf_full[:, i]

# Save predictions to CSV
def save_predictions(predictions, filename):
    predictions_scaled = scaler_appliances.inverse_transform(predictions)
    template_copy = template.copy()
    template_copy.iloc[:, 1:] = predictions_scaled
    template_copy.to_csv(filename, index=False)
    return predictions_scaled


save_predictions(y_pred_cnn_full, 'Predicted_CNN.csv')
save_predictions(y_pred_lstm_full, 'Predicted_LSTM.csv')
save_predictions(y_pred_rf_full, 'Predicted_RF.csv')
hybrid_predictions_full=save_predictions(hybrid_predictions_full, 'Predicted_Hybrid.csv')


# Load ground truth data for AggregatedLoad_A
true_A = pd.read_csv("TestData_A.csv", index_col=0)
true_A = true_A.iloc[:, 1:]  # Ignore aggregate load column
true_A_values_scaled = true_A.values / true_A.max(axis=0).values

# Calculate and show summed RMSE for the hybrid model on AggregatedLoad_A
pred_A_values_scaled = hybrid_predictions_full / true_A.max(axis=0).values
rmse_sum_hybrid = np.sum(mean_squared_error(true_A_values_scaled, pred_A_values_scaled, multioutput="raw_values"))
print("Hybrid Model RMSE Sum on AggregatedLoad_A:", rmse_sum_hybrid)


Hybrid Model RMSE Sum on Test Data: [0.05351014 0.01721342 0.02880005 0.02537591 0.0167919  0.06093277
 0.01916724 0.03024693 0.01564889 0.08221996 0.01001972 0.08143932
 0.00731135 0.02849126 0.05361942 0.07259979 0.04949721 0.0670421
 0.02311253 0.01652113 0.01183085]
Hybrid Model RMSE Sum on AggregatedLoad_A: 0.14749399928704257
