In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.signal import find_peaks
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, LSTM
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

# Read the four CSV inputs: labelled training data, the aggregated-load series
# to disaggregate, the labelled test data, and the output template.
train_data, aggregated_load, test_data, template = (
    pd.read_csv(csv_name)
    for csv_name in (
        'TrainData_A.csv',
        'AggregatedLoad_A.csv',
        'TestData_A.csv',
        'DisaggregatedLoad_Template.csv',
    )
)

# Give every frame short, predictable column names. The labelled frames hold
# an index column, the aggregated load, and 21 per-appliance columns; the
# template holds an index column and one '<appliance>_pred' column each.
_appliance_names = [f'appliance_{i}' for i in range(1, 22)]
train_data.columns = ['index', 'aggregated_load'] + _appliance_names
aggregated_load.columns = ['index', 'aggregated_load']
test_data.columns = ['index', 'aggregated_load'] + _appliance_names
template.columns = ['index'] + [f'{name}_pred' for name in _appliance_names]

# Trailing rolling mean of the aggregated load. min_periods=1 lets the leading
# rows average over however many samples exist so far instead of yielding NaN
# for lack of history.
window_size = 10
for _frame in (train_data, test_data, aggregated_load):
    _trailing = _frame['aggregated_load'].rolling(window=window_size, min_periods=1)
    _frame['agg_load_mean'] = _trailing.mean()

def calculate_frequency(data, window_size):
    """Count the local extrema found in the window preceding each sample.

    For every position ``i`` the result holds the number of local maxima and
    minima (as detected by ``scipy.signal.find_peaks`` on ``data`` and on
    ``-data``) whose index lies in the half-open range
    ``[max(0, i - window_size), i)``. The sample at ``i`` itself is never
    counted.

    Args:
        data: 1-D numeric sequence (a NumPy array, or anything
            ``np.asarray`` can convert to one).
        window_size: Number of preceding samples to look back over.

    Returns:
        NumPy array with the same length and dtype as ``data`` holding the
        extrema count for each position.
    """
    data = np.asarray(data)

    # Local maxima of the signal and of its negation (i.e. local minima).
    peaks, _ = find_peaks(data)
    troughs, _ = find_peaks(-data)
    extrema = np.sort(np.concatenate([peaks, troughs]))

    # extrema is sorted, so "how many extrema are < k" is a binary search.
    # The count inside [start, i) is the difference of two such ranks, which
    # avoids rescanning the whole extrema array for every sample.
    positions = np.arange(len(data))
    window_starts = np.maximum(positions - window_size, 0)
    below_end = np.searchsorted(extrema, positions, side='left')
    below_start = np.searchsorted(extrema, window_starts, side='left')

    # A negative window_size puts start past end; such empty windows count 0.
    counts = np.maximum(below_end - below_start, 0)
    return counts.astype(data.dtype)

# Derive two more inputs from the aggregated load of each dataset:
#   agg_load_freq   - the per-sample value returned by calculate_frequency
#   lagged_agg_load - the previous sample's aggregated load (shift by one row)
for _frame in (train_data, test_data, aggregated_load):
    _load = _frame['aggregated_load']
    _frame['agg_load_freq'] = calculate_frequency(_load.values, window_size)
    _frame['lagged_agg_load'] = _load.shift(1)

    # shift(1) leaves NaN in the first row; dropna removes that row, along
    # with any other row that has a NaN in any column.
    # NOTE(review): aggregated_load therefore loses its first sample, so
    # anything predicted from it starts at the second row of the file -
    # confirm that downstream row alignment accounts for this.
    _frame.dropna(inplace=True)

# Standardise each model input with its own StandardScaler and the appliance
# targets with a fifth one. Every scaler is fitted on the training data only
# and then re-used, unchanged, for the other datasets.
scaler_agg = StandardScaler()
scaler_agg_mean = StandardScaler()
scaler_agg_freq = StandardScaler()
scaler_agg_lagged = StandardScaler()
scaler_appliances = StandardScaler()

# (input column, scaler) pairs, listed in the column order of the feature matrices.
_feature_scalers = (
    ('aggregated_load', scaler_agg),
    ('agg_load_mean', scaler_agg_mean),
    ('agg_load_freq', scaler_agg_freq),
    ('lagged_agg_load', scaler_agg_lagged),
)

# Fit each input scaler on its training column and keep the scaled column.
(
    train_agg_normalized,
    train_agg_mean_normalized,
    train_agg_freq_normalized,
    train_lagged_normalized,
) = (
    _scaler.fit_transform(train_data[[_column]])
    for _column, _scaler in _feature_scalers
)

# Targets: the appliance columns sit between the two leading columns
# ('index', 'aggregated_load') and the three engineered columns at the end.
train_appliances_normalized = scaler_appliances.fit_transform(train_data.iloc[:, 2:-3])


def _scale_features(frame):
    """Return the four inputs of *frame*, scaled by the fitted scalers, side by side."""
    scaled_columns = [
        _scaler.transform(frame[[_column]])
        for _column, _scaler in _feature_scalers
    ]
    return np.concatenate(scaled_columns, axis=1)


# Assemble the (n_samples, 4) feature matrices.
train_features_normalized = np.concatenate(
    [
        train_agg_normalized,
        train_agg_mean_normalized,
        train_agg_freq_normalized,
        train_lagged_normalized,
    ],
    axis=1,
)
test_features_normalized = _scale_features(test_data)
aggregated_features_normalized = _scale_features(aggregated_load)

# Hold out 20% of the training rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    train_features_normalized,
    train_appliances_normalized,
    test_size=0.2,
    random_state=42,
)

# Give every sample a (1, 4) layout: one step carrying the 4 features
# (aggregated load, its rolling mean, its extrema frequency, its lagged value).
_sequence_shape = (-1, 1, 4)
X_train = X_train.reshape(_sequence_shape)
X_test = X_test.reshape(_sequence_shape)
X_aggregated = aggregated_features_normalized.reshape(_sequence_shape)

# CNN: a kernel_size=1 convolution over the (1, 4) input, flattened into a
# linear layer that emits all 21 appliance outputs at once.
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(1, 4)),
    Flatten(),
    Dense(21, activation='linear'),
])
cnn_model.compile(optimizer=Adam(), loss='mean_squared_error')
cnn_model.summary()

# Train for 10 epochs; the held-out split is passed as validation data.
cnn_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# CNN predictions for the held-out split and for the aggregated-load inputs.
# The targets were standardised, so these outputs are in standardised units.
y_pred_cnn_test = cnn_model.predict(X_test)
y_pred_cnn_full = cnn_model.predict(X_aggregated)

# LSTM: a single 50-unit recurrent layer over the (1, 4) input, followed by a
# dense layer that emits all 21 appliance outputs.
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(1, 4)),
    Dense(21),
])
lstm_model.compile(optimizer=Adam(), loss='mean_squared_error')
lstm_model.summary()

# Same training schedule as the CNN: 10 epochs, batches of 32, held-out split
# passed as validation data.
lstm_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

# LSTM predictions for the held-out split and for the aggregated-load inputs.
y_pred_lstm_test = lstm_model.predict(X_test)
y_pred_lstm_full = lstm_model.predict(X_aggregated)

# Random forest on the same inputs. The arrays were reshaped to (n, 1, 4) for
# the Keras models, so they are flattened back to (n, 4) here.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
_flat_train = X_train.reshape(-1, 4)
rf_model.fit(_flat_train, y_train)

# Random-forest predictions for the held-out split and the aggregated-load inputs.
_flat_test = X_test.reshape(-1, 4)
_flat_full = X_aggregated.reshape(-1, 4)
y_pred_rf_test = rf_model.predict(_flat_test)
y_pred_rf_full = rf_model.predict(_flat_full)

# Function to calculate RMSE
def calculate_rmse(true_values, pred_values):
    """Return the per-column RMSE after scaling by the true column maxima.

    Both inputs are divided, column by column, by the maximum of
    ``true_values`` in that column, and the root-mean-squared error is then
    computed separately for every column.

    Args:
        true_values: Array-like of reference values, one column per output.
        pred_values: Array-like of predictions with the same shape.

    Returns:
        NumPy array holding one RMSE value per column.
    """
    true_values = np.asarray(true_values, dtype=float)
    pred_values = np.asarray(pred_values, dtype=float)

    # Compute the scaling factors once and reuse them for both inputs.
    column_max = true_values.max(axis=0)
    # A column whose maximum is exactly 0 would be divided by zero and turn
    # into NaN/inf; leave such columns unscaled instead.
    column_max = np.where(column_max == 0, 1.0, column_max)

    true_values_scaled = true_values / column_max
    pred_values_scaled = pred_values / column_max
    rmse = np.sqrt(
        mean_squared_error(true_values_scaled, pred_values_scaled, multioutput='raw_values')
    )
    return rmse

# Per-appliance RMSE of each model on the held-out split.
rmse_cnn, rmse_lstm, rmse_rf = (
    calculate_rmse(y_test, _predicted)
    for _predicted in (y_pred_cnn_test, y_pred_lstm_test, y_pred_rf_test)
)

# One row per appliance, one RMSE column per model.
rmse_df = pd.DataFrame({
    'appliance': [f'appliance_{i}' for i in range(1, 22)],
    'rmse_cnn': rmse_cnn,
    'rmse_lstm': rmse_lstm,
    'rmse_rf': rmse_rf,
})

# idxmin(axis=1) yields, for every row, the name of the column with the lowest
# RMSE; those column names are then translated into short model labels.
_model_labels = {'rmse_cnn': 'CNN', 'rmse_lstm': 'LSTM', 'rmse_rf': 'RF'}
best_models = rmse_df[list(_model_labels)].idxmin(axis=1)
best_models = best_models.replace(_model_labels)
rmse_df['best_model'] = best_models

# Build the hybrid prediction for the held-out split: for every appliance,
# copy the column predicted by whichever model rmse_df marks as best.
hybrid_predictions_test = np.zeros_like(y_pred_cnn_test)
_test_predictions = {'CNN': y_pred_cnn_test, 'LSTM': y_pred_lstm_test}
for i in range(21):
    # Any label other than 'CNN' or 'LSTM' falls back to the random forest.
    _source = _test_predictions.get(rmse_df.loc[i, 'best_model'], y_pred_rf_test)
    hybrid_predictions_test[:, i] = _source[:, i]

# Per-appliance RMSE of the hybrid on the held-out split.
# NOTE(review): the best model per appliance was chosen on this same split,
# so this figure is an optimistic estimate.
rmse_hybrid_test = calculate_rmse(y_test, hybrid_predictions_test)

# The label announces a sum, so print the total over all appliances rather
# than the raw per-appliance array (rmse_hybrid_test itself is left as-is).
print("Hybrid Model RMSE Sum on Test Data:", np.sum(rmse_hybrid_test))

# Build the hybrid prediction for AggregatedLoad_A with the same per-appliance
# model choice recorded in rmse_df.
hybrid_predictions_full = np.zeros_like(y_pred_cnn_full)
_full_predictions = {'CNN': y_pred_cnn_full, 'LSTM': y_pred_lstm_full}
for i in range(21):
    # Any label other than 'CNN' or 'LSTM' falls back to the random forest.
    _chosen = rmse_df.loc[i, 'best_model']
    _source = _full_predictions.get(_chosen, y_pred_rf_full)
    hybrid_predictions_full[:, i] = _source[:, i]
# Save predictions to CSV
def save_predictions(predictions, filename):
    """Write predictions into a copy of the template and save it as CSV.

    The predictions are first mapped back to original units with
    ``scaler_appliances.inverse_transform``. If there are fewer prediction
    rows than template rows, zero rows are appended at the end to make up
    the difference.

    Args:
        predictions: 2-D array of standardised predictions, one column per
            template prediction column.
        filename: Path of the CSV file to write (without the index).

    Returns:
        The inverse-transformed (and, if needed, zero-padded) predictions.
    """
    restored = scaler_appliances.inverse_transform(predictions)
    output_frame = template.copy()

    missing_rows = len(output_frame) - len(restored)
    if missing_rows > 0:
        # Pad at the end so the row count matches the template.
        padding = np.zeros((missing_rows, restored.shape[1]))
        restored = np.concatenate([restored, padding])

    # Column 0 is the template's index column; everything after it is overwritten.
    output_frame.iloc[:, 1:] = restored
    output_frame.to_csv(filename, index=False)
    return restored

# Write one CSV per individual model. [:-1] drops the last prediction row;
# save_predictions zero-pads at the end if that leaves fewer rows than the
# template has.
# NOTE(review): aggregated_load lost its first row to dropna, so row k of these
# arrays is the prediction for original sample k + 1 - confirm the rows line
# up with the template as intended.
for _predictions, _csv_name in (
    (y_pred_cnn_full, 'Predicted_CNN.csv'),
    (y_pred_lstm_full, 'Predicted_LSTM.csv'),
    (y_pred_rf_full, 'Predicted_RF.csv'),
):
    save_predictions(_predictions[:-1], _csv_name)

# The hybrid array is rebound to what save_predictions returns, i.e. the
# inverse-transformed (original-unit) and possibly padded values.
hybrid_predictions_full = save_predictions(
    hybrid_predictions_full[:-1], 'Predicted_Hybrid.csv'
)


# Load the ground truth for AggregatedLoad_A; the file's first column becomes
# the index and the aggregate-load column is then dropped.
true_A = pd.read_csv("TestData_A.csv", index_col=0)
true_A = true_A.iloc[:, 1:]  # Ignore aggregate load column

# Scale truth and prediction by the per-appliance maximum of the ground truth
# (computed once and shared by both).
_appliance_max = true_A.max(axis=0).values
true_A_values_scaled = true_A.values / _appliance_max
pred_A_values_scaled = hybrid_predictions_full / _appliance_max

# mean_squared_error returns the per-appliance MSE; take the square root of
# each before summing so the printed figure really is a sum of RMSEs, matching
# calculate_rmse and the label below.
_mse_per_appliance = mean_squared_error(
    true_A_values_scaled, pred_A_values_scaled, multioutput="raw_values"
)
rmse_sum_hybrid = np.sum(np.sqrt(_mse_per_appliance))
print("Hybrid Model RMSE Sum on AggregatedLoad_A:", rmse_sum_hybrid)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 3s 1ms/step - loss: 0.8977 - val_loss: 0.7846
Epoch 2/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 909us/step - loss: 0.8530 - val_loss: 0.7614
Epoch 3/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 851us/step - loss: 0.8301 - val_loss: 0.7491
Epoch 4/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 942us/step - loss: 0.8267 - val_loss: 0.7396
Epoch 5/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 3s 1ms/step - loss: 0.7634 - val_loss: 0.7287
Epoch 6/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 3s 1ms/step - loss: 0.8192 - val_loss: 0.7216
Epoch 7/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 3s 1ms/step - loss: 0.7722 - val_loss: 0.7165
Epoch 8/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 3s 1ms/step - loss: 0.7479 - val_loss: 0.7102
Epoch 9/10
[1m250

  super().__init__(**kwargs)


Epoch 1/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 3s 871us/step - loss: 0.8932 - val_loss: 0.7863
Epoch 2/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 848us/step - loss: 0.8801 - val_loss: 0.7620
Epoch 3/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 865us/step - loss: 0.8604 - val_loss: 0.7504
Epoch 4/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 902us/step - loss: 0.8483 - val_loss: 0.7401
Epoch 5/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 967us/step - loss: 0.7592 - val_loss: 0.7319
Epoch 6/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 878us/step - loss: 0.7930 - val_loss: 0.7218
Epoch 7/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 848us/step - loss: 0.7968 - val_loss: 0.7146
Epoch 8/10
2500/2500 ━━━━━━━━━━━━━━━━━━━━ 2s 866us/step - loss: 0.7623 - val_loss: 0.7064
Epoch 9/