In [None]:
from PyEMD import EEMD
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import mean_squared_error
from scipy.signal import savgol_filter

from scipy.optimize import differential_evolution

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import GridSearchCV, KFold

from sklearn.pipeline import make_pipeline

import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
import time
import copy
import os

from datetime import datetime, timedelta

In [None]:
def read_data(fileName: str) -> pd.DataFrame:
    """Load ``<fileName>.csv`` from the current working directory.

    Parameters
    ----------
    fileName : str
        File name (or path relative to the working directory) without the
        ``.csv`` extension. A leading path separator is tolerated, so both
        ``"data"`` and ``"/data"`` resolve to ``<cwd>/data.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, as returned by ``pd.read_csv``.
    """
    # Plain string concatenation of os.getcwd() and fileName drops the path
    # separator ("/home/user" + "data" -> "/home/userdata"), so the parts are
    # joined explicitly. Leading separators are stripped first because
    # os.path.join would otherwise treat the name as an absolute path.
    relative_name = fileName.lstrip("/\\") + ".csv"
    return pd.read_csv(os.path.join(os.getcwd(), relative_name))

In [None]:
def plot_eemd(nIMFs: object, eIMFs: object, df: object) -> None:
    """Plot the ``grid1-loss`` series and the first ``nIMFs`` components as stacked panels.

    Parameters
    ----------
    nIMFs : number of components to draw (one panel each).
    eIMFs : indexable collection of components; entries ``0 .. nIMFs-1`` are plotted.
    df : frame providing the ``"grid1-loss"`` column shown in the top panel.
    """
    total_panels = nIMFs + 1
    plt.figure(figsize=(15,17))

    # Top panel: the original signal, drawn in red.
    plt.subplot(total_panels, 1, 1)
    plt.plot(df["grid1-loss"].values, 'r')

    # One green panel per component; the y-label numbers components from 1.
    for number in range(1, nIMFs + 1):
        plt.subplot(total_panels, 1, number + 1)
        plt.plot(eIMFs[number - 1], 'g')
        plt.ylabel("eIMF %i" % number)
        plt.locator_params(axis='y', nbins=5)

    # The x-label is attached to whichever panel was created last.
    plt.xlabel("Time [H]")
    plt.tight_layout()
    plt.show()

In [None]:
def plot_results(y_test: [float], y_pred: [float]) -> None:
    """Overlay the true series and the predicted series on one wide figure.

    The figure is created and decorated but ``plt.show()`` is not called here;
    rendering is left to the notebook front end or the caller.
    """
    plt.figure(figsize=(15,5))
    for series, legend_name in ((y_test, 'True Data'), (y_pred, 'Prediction')):
        plt.plot(series, label=legend_name)
    plt.xlabel("Time [H]")
    plt.tight_layout()
    plt.legend()

In [None]:
def make_eemd(df: object, detection: str, trials: int = 12, seed: int = None,
              column: str = "grid1-loss") -> object:
    """Run an EEMD decomposition on one column of ``df``.

    Parameters
    ----------
    df : frame holding the signal to decompose.
    detection : accepted for backward compatibility; this function does not
        use it.
    trials : number of ensemble trials passed to ``EEMD`` (default 12, the
        value that used to be hard-coded).
    seed : optional seed for the EEMD noise generator. ``None`` (default)
        leaves the generator unseeded, so repeated calls can give different
        decompositions.
    column : name of the column to decompose (default ``"grid1-loss"``).

    Returns
    -------
    Whatever ``EEMD.eemd`` returns for the column's values; the rest of the
    notebook indexes it as ``result[component][sample]``.
    """
    eemd = EEMD(trials=trials)
    if seed is not None:
        # Makes the added-noise realisations reproducible across runs.
        eemd.noise_seed(seed)

    return eemd.eemd(df[column].values)

In [None]:
# Load the dataset CSV via read_data; the cell's final expression previews the
# first rows of the frame.
df = read_data(fileName="dataset_all_ewma_curve_fs")
df.head()

In [None]:
# Build sliding-window row ranges: every window is `window_size_hours` rows of
# training data followed by `step_hours` rows of test data, and the window
# advances by `step_hours` rows each step.
# NOTE(review): sizing windows by row count assumes one row per hour with no
# gaps - confirm against the dataset.

# Parse the 'datetime' column, sort chronologically and renumber the rows
# 0..n-1. The reset matters: the ranges built below are positional (they are
# consumed with df.iloc elsewhere), so index labels must match positions.
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values(by='datetime').reset_index(drop=True)

# Windows whose first test row falls on/after this date go to the hold-out
# lists (ttrain_indices / ttest_indices); earlier ones go to train/test.
reference_date = pd.to_datetime("2019-11-30")

window_size_days = 180  # 180 days for training
step_days = 1  # 1 day for testing / sliding step

# Convert window size and step to hourly row counts
window_size_hours = window_size_days * 24
step_hours = step_days * 24

# Row ranges used while developing the models (before reference_date)
train_indices = []
test_indices = []

# Row ranges used for the final hold-out evaluation (from reference_date on)
ttrain_indices = list()
ttest_indices = list()

# The stop value is exclusive, so a window whose test part would start exactly
# at len(df) - step_hours is not generated.
for i in range(window_size_hours, len(df) - step_hours, step_hours):
    # Positional lookup (.iloc) keeps the timestamp consistent with the
    # positional ranges stored below. reference_date is timezone-naive, so
    # tz_localize(reference_date.tzinfo) strips any timezone from the row
    # timestamp and makes the two values comparable.
    current_timestamp = df['datetime'].iloc[i].tz_localize(reference_date.tzinfo)

    train_window = range(i - window_size_hours, i)
    test_window = range(i, i + step_hours)

    if current_timestamp >= reference_date:
        ttrain_indices.append(train_window)
        ttest_indices.append(test_window)
    else:
        train_indices.append(train_window)
        test_indices.append(test_window)

In [None]:
# SVR configuration: one [component_index, hyper-parameters] entry for each of
# the decomposition components 6..11. Every entry gets its own dict, all with
# the same RBF settings. (An entry for component 12 is disabled.)
svr_archi = [
    [component, {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}]
    for component in range(6, 12)
]

# Per-architecture accumulators filled by the SVR training cell:
# error_list_v1 collects the test predictions, true_values_v1 the targets.
error_list_v1 = [list() for _ in svr_archi]
true_values_v1 = [list() for _ in svr_archi]
svr_train_models = []

# Components handled by CatBoost, with matching per-component accumulators.
imfs = list(range(6))
pred_cat_list = [list() for _ in imfs]
true_cat_list = [list() for _ in imfs]
trained_models = []

In [None]:
def evaluation_eemd(grid_loss, adder):
    """Return the MAPE, in percent, of ``adder`` measured against ``grid_loss``."""
    fractional_error = mean_absolute_percentage_error(grid_loss, adder)
    return fractional_error*100

def add_up(decomposition, k):
    """Sum the components of a decomposition sample by sample, skipping the last ``k``.

    ``decomposition`` is indexed as ``decomposition[component][sample]``. The
    result is a list with one entry per sample of the first component; each
    entry is the sum over components ``0 .. len(decomposition) - k - 1``
    (``0`` when that range is empty).
    """
    sample_count = len(decomposition[0])
    kept_components = len(decomposition) - k
    return [
        sum(decomposition[row][col] for row in range(kept_components))
        for col in range(sample_count)
    ]

# Pick the EEMD run (out of 3) and the number k of trailing components to drop
# (0, 1 or 2) whose partial sum has the lowest MAPE against the raw signal.
# The decomposition covers every row up to the end of the last test window.
# Start from +inf rather than an arbitrary threshold so that the best
# candidate is always kept, even when every MAPE is 100 % or more.
mini = float("inf")
best_k = None
best_decomp = None

for i in range(3):
    decomposition = make_eemd(df[:test_indices[-1][-1]+1], "parabolic")
    for k in range(3):
        adder = add_up(decomposition, k)
        mape = evaluation_eemd(df["grid1-loss"][:test_indices[-1][-1]+1], adder)
        if mape < mini:
            mini = mape
            best_k = k
            best_decomp = decomposition

# Can only happen if no comparison succeeded (e.g. every MAPE was NaN); fail
# with a clear message instead of an obscure error inside add_up.
if best_decomp is None:
    raise RuntimeError("No EEMD decomposition produced a comparable MAPE.")

add_up_decom = add_up(best_decomp, best_k)

# Visual check: raw signal against the selected partial sum of components.
plt.plot(df["grid1-loss"][:test_indices[-1][-1]+1], label = "true")
plt.plot(add_up_decom, label = "IMFs")
plt.legend()
plt.show()

In [None]:
# Walk-forward SVR training: for every configured component, refit one SVR on
# each training window and collect its predictions for the following test
# window.

# Columns that are not model features.
non_feature_cols = ["datetime", "grid1-loss", "has incorrect data"]

# Start from empty accumulators so re-running this cell does not append a
# second copy of the models and predictions.
svr_train_models.clear()

for idx, archi in enumerate(svr_archi):
    print(f"SVR Architecture {idx}: {archi}")
    C = archi[1]["C"]
    gamma = archi[1]["gamma"]
    kernel = archi[1]["kernel"]
    model = SVR(C = C, gamma = gamma, kernel = kernel)

    error_list_v1[idx].clear()
    true_values_v1[idx].clear()

    for j in range(len(train_indices)):
        # First training row and last test row of the current window.
        print(train_indices[j][0], test_indices[j][-1])

        # .drop() already returns a new frame, so no explicit copy is needed.
        train_df = df.iloc[train_indices[j]].drop(non_feature_cols, axis= 1)
        test_df = df.iloc[test_indices[j]].drop(non_feature_cols, axis= 1)

        # Target: the component selected by this architecture (archi[0]).
        y_train = best_decomp[archi[0]][train_indices[j]]
        y_test = best_decomp[archi[0]][test_indices[j]]

        # Scaling statistics come from the training window only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(train_df.values)
        X_test_scaled = scaler.transform(test_df.values)

        model.fit(X_train_scaled, y_train)
        y_test_pred = model.predict(X_test_scaled)

        # Despite its name, error_list_v1 stores predictions, not errors.
        error_list_v1[idx].extend(y_test_pred)
        true_values_v1[idx].extend(y_test)

    # The same estimator object is refit on every window, so the stored model
    # is the one fitted on the last training window.
    svr_train_models.append(model)

In [None]:
# Per-architecture report for the SVR models: overlay targets and predictions,
# then print MAE / MSE / RMSE. The values come from the test windows, even
# though the variables are named train_*.
for i, actual_values in enumerate(true_values_v1):
    predicted_values = error_list_v1[i]

    plt.plot(actual_values, color="blue", label = "true data")
    plt.plot(predicted_values, color="red", label = "prediction")
    plt.legend()
    plt.show()

    train_mae =  mean_absolute_error(actual_values, predicted_values)
    train_mse = mean_squared_error(actual_values, predicted_values)
    sklearn_rmse = np.sqrt(train_mse)

    # One-row table, indexed by the architecture position.
    metrics_df = pd.DataFrame(
        [[train_mae, train_mse, sklearn_rmse]],
        columns=["MAE", "MSE", "RMSE"],
        index=[i],
    )

    print(metrics_df, end= "\n\n")

In [None]:
# Walk-forward CatBoost training: for every component listed in `imfs`, refit
# one regressor on each training window and collect its predictions for the
# following test window.

# Columns that are not model features.
non_feature_cols = ["datetime", "grid1-loss", "has incorrect data"]

# Start from an empty model list so re-running this cell does not append a
# second copy of the models.
trained_models.clear()

# `slot` is the position inside the result lists, `i` the component index.
# Keeping them separate means `imfs` does not have to be exactly 0..n-1.
for slot, i in enumerate(imfs):
    model = CatBoostRegressor()

    pred_cat_list[slot].clear()
    true_cat_list[slot].clear()

    for j in range(len(train_indices)):
        # .drop() already returns a new frame, so no explicit copy is needed.
        train_df = df.iloc[train_indices[j]].drop(non_feature_cols, axis= 1)
        test_df = df.iloc[test_indices[j]].drop(non_feature_cols, axis= 1)

        y_train = best_decomp[i][train_indices[j]]
        y_test = best_decomp[i][test_indices[j]]

        # Scaling statistics come from the training window only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(train_df.values)
        X_test_scaled = scaler.transform(test_df.values)

        train_pool = Pool(X_train_scaled, y_train)
        test_pool = Pool(X_test_scaled)

        # Train on this window, then predict the following test window.
        model.fit(train_pool, verbose = False)
        prediction = model.predict(test_pool)

        pred_cat_list[slot].extend(prediction)
        true_cat_list[slot].extend(y_test)

    # The same regressor object is refit on every window, so the stored model
    # is the one fitted on the last training window.
    trained_models.append(model)

In [None]:
# Per-component report for the CatBoost models: overlay targets and
# predictions, then print MAE / MSE / RMSE. The values come from the test
# windows, even though the variables are named train_*.
for i, actual_values in enumerate(true_cat_list):
    predicted_values = pred_cat_list[i]

    plt.plot(actual_values, color="blue", label = "true data")
    plt.plot(predicted_values, color="red", label = "prediction")
    plt.legend()
    plt.show()

    train_mae =  mean_absolute_error(actual_values, predicted_values)
    train_mse = mean_squared_error(actual_values, predicted_values)
    sklearn_rmse = np.sqrt(train_mse)

    # One-row table, indexed by the component position.
    metrics_df = pd.DataFrame(
        [[train_mae, train_mse, sklearn_rmse]],
        columns=["MAE", "MSE", "RMSE"],
        index=[i],
    )

    print(metrics_df, end= "\n\n")

In [None]:
# Rebuild the signal forecast by adding the per-component predictions sample
# by sample: every CatBoost component first, then the SVR components except
# the last `best_k` of them.
svr_kept = error_list_v1[:max(len(error_list_v1) - best_k, 0)]
component_predictions = list(pred_cat_list) + svr_kept

super_pos = []
for t in range(len(pred_cat_list[0])):
    # sum() adds the components in list order, starting from 0.
    super_pos.append(sum(series[t] for series in component_predictions))

super_pos_array = np.array(super_pos)

In [None]:
def evaluate_metrics(start: int, end: int, predictions=None):
    """Print error metrics and plot predictions against ``grid1-loss`` rows ``start:end``.

    Parameters
    ----------
    start, end : positional row bounds (``end`` exclusive) of the slice of
        ``df["grid1-loss"]`` used as ground truth.
    predictions : optional array-like of predicted values, one per row of the
        slice. Defaults to the notebook-level ``super_pos_array``.

    Returns
    -------
    None. MAE, MSE, RMSE and MAPE (in percent) are printed as a one-row
    DataFrame, and the two series are drawn with ``plot_results``.
    """
    if predictions is None:
        predictions = super_pos_array

    actual = df["grid1-loss"].iloc[start:end].reset_index(drop=True)
    grid1_loss_array = actual.values

    sklearn_mae = mean_absolute_error(grid1_loss_array, predictions)
    sklearn_mse = mean_squared_error(grid1_loss_array, predictions)
    sklearn_rmse = np.sqrt(sklearn_mse)
    sklearn_mape = mean_absolute_percentage_error(grid1_loss_array, predictions)*100

    metrics_df = pd.DataFrame({
        "MAE": [sklearn_mae],
        "MSE": [sklearn_mse],
        "RMSE": [sklearn_rmse],
        "MAPE": [sklearn_mape]
    }, index=["sklearn"])

    # Plot the same slice the metrics were computed on, so the figure and the
    # numbers always agree with the start/end arguments.
    plot_results(actual, predictions)
    print(metrics_df)

evaluate_metrics(test_indices[0][0],test_indices[-1][-1]+1)

In [None]:
# For every hold-out window, decompose all rows from the start of the data up
# to the end of that window's training range. Of 5 EEMD runs and k in {0, 1, 2}
# dropped trailing components, keep the combination whose partial sum has the
# lowest MAPE against the raw signal.
decomposition_part = list()

best_ks = list()

for j in range(len(ttest_indices)):
    print(j)
    # Start from +inf rather than an arbitrary threshold so that the best
    # candidate is always kept, even when every MAPE is 100 % or more.
    mini = float("inf")
    best_k_local = 0
    best_decom = None

    for i in range(5):
        decom = make_eemd(df[:ttrain_indices[j][-1]+1], "parabolic")

        for k in range(3):
            adder = add_up(decom, k)
            mape = evaluation_eemd(df["grid1-loss"][:ttrain_indices[j][-1]+1], adder)
            if mape < mini:
                mini = mape
                best_k_local = k
                best_decom = decom

    # Can only happen if no comparison succeeded (e.g. every MAPE was NaN);
    # fail with a clear message instead of an obscure error inside add_up.
    if best_decom is None:
        raise RuntimeError(f"No EEMD decomposition produced a comparable MAPE for window {j}.")

    best_ks.append(best_k_local)

    print(best_k_local, mini)
    adder = add_up(best_decom, best_k_local)

    # Visual check: raw signal against the selected partial sum of components.
    plt.plot(df["grid1-loss"][:ttrain_indices[j][-1]+1], label = "true")
    plt.plot(adder, label = "IMFs")
    plt.legend()
    plt.show()

    decomposition_part.append(best_decom)

In [None]:
# Hold-out evaluation for the CatBoost models: for every hold-out window, refit
# each stored regressor on the training range (targets taken from that
# window's decomposition) and predict the following test range.
predictions_list_cat = [[] for _ in range(len(trained_models))]
true_cat_pred_list = [[] for _ in range(len(trained_models))]  # never filled in this cell

# Columns that are not model features.
non_feature_cols = ["datetime", "grid1-loss", "has incorrect data"]

for i, model in enumerate(trained_models):
    for j in range(len(ttest_indices)):
        print(ttrain_indices[j])

        # .drop() already returns a new frame, so no explicit copy is needed.
        train_df = df.iloc[ttrain_indices[j]].drop(non_feature_cols, axis= 1)
        test_df = df.iloc[ttest_indices[j]].drop(non_feature_cols, axis= 1)

        # NOTE(review): assumes trained_models[i] was trained on component i,
        # which holds while `imfs` is 0..n-1 - confirm if `imfs` changes.
        y_train = decomposition_part[j][i][ttrain_indices[j]]

        # Fit the scaler on the training window and only apply it to the test
        # window. Fitting on the test window would standardise the features
        # with statistics of the very rows being predicted, unlike the
        # training cells earlier in the notebook.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(train_df.values)
        X_test_scaled = scaler.transform(test_df.values)

        train_pool = Pool(X_train_scaled, y_train)
        test_pool = Pool(X_test_scaled)

        # The stored model object is refit in place for every window.
        model.fit(train_pool, verbose = False)

        # make the prediction using the model
        prediction = model.predict(test_pool)
        predictions_list_cat[i].extend(prediction)

In [None]:
# One figure per CatBoost component showing its hold-out predictions. The
# plotted series are model outputs, so the legend says "prediction".
for i in range(len(predictions_list_cat)):
    plt.plot(predictions_list_cat[i], color="blue", label = "prediction")
    plt.legend()
    plt.show()

In [None]:
# Hold-out evaluation for the SVR models: for every hold-out window, refit each
# stored SVR on the training range (targets taken from that window's
# decomposition) and predict the following test range.
predictions_list_svr = [[] for _ in range(len(svr_train_models))]

# Columns that are not model features.
non_feature_cols = ["datetime", "grid1-loss", "has incorrect data"]

for i, model in enumerate(svr_train_models):
    # Component this model was configured for. Read it from svr_archi instead
    # of assuming a fixed offset of 6.
    component_index = svr_archi[i][0]

    for j in range(len(ttest_indices)):
        print(ttrain_indices[j], ttest_indices[j])

        # .drop() already returns a new frame, so no explicit copy is needed.
        train_df = df.iloc[ttrain_indices[j]].drop(non_feature_cols, axis= 1)
        test_df = df.iloc[ttest_indices[j]].drop(non_feature_cols, axis= 1)
        y_train = decomposition_part[j][component_index][ttrain_indices[j]]

        # Fit the scaler on the training window and only apply it to the test
        # window. Fitting on the test window would standardise the features
        # with statistics of the very rows being predicted, unlike the
        # training cells earlier in the notebook.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(train_df.values)
        X_test_scaled = scaler.transform(test_df.values)

        # The stored model object is refit in place for every window.
        model.fit(X_train_scaled, y_train)

        # make the prediction using the model
        prediction = model.predict(X_test_scaled)
        predictions_list_svr[i].extend(prediction)

In [None]:
# One figure per SVR component showing its hold-out predictions. The plotted
# series are model outputs, so the legend says "prediction".
for i in range(len(predictions_list_svr)):
    plt.plot(predictions_list_svr[i], color="blue", label = "prediction")
    plt.legend()
    plt.show()

In [None]:
# Rebuild the hold-out forecast by adding, sample by sample, the predictions of
# every CatBoost component followed by every SVR component.
# NOTE(review): the earlier superposition cell leaves out the last `best_k` SVR
# components, while the per-window `best_ks` collected for the hold-out
# decompositions are not applied here - confirm this is intended.
counter = 0
holdout_components = list(predictions_list_cat) + list(predictions_list_svr)

super_pos1 = []
for t in range(len(predictions_list_cat[0])):
    # sum() adds the components in list order, starting from 0.
    super_pos1.append(sum(series[t] for series in holdout_components))

super_pos_array1 = np.array(super_pos1)

In [None]:
# Ground truth for the hold-out period: the grid1-loss rows spanned by the
# hold-out test windows, re-indexed from 0 and exposed as a NumPy array.
holdout_actual = df["grid1-loss"].iloc[ttest_indices[0][0]:ttest_indices[-1][-1]+1].reset_index(drop=True)
grid1_loss_array = holdout_actual.values

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent, computed with NumPy."""
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error)) * 100

# Hand-computed metrics (NumPy only).
holdout_residual = grid1_loss_array - super_pos_array1
mae = np.mean(np.abs(holdout_residual))
mse = np.mean(holdout_residual ** 2)
mape_val = mape(grid1_loss_array, super_pos_array1)

# The same metrics from scikit-learn; these are the ones reported below.
sklearn_mae = mean_absolute_error(grid1_loss_array, super_pos_array1)
sklearn_mse = mean_squared_error(grid1_loss_array, super_pos_array1)
sklearn_rmse = np.sqrt(sklearn_mse)
sklearn_mape = mean_absolute_percentage_error(grid1_loss_array, super_pos_array1)*100

# One-row summary table.
metrics_df = pd.DataFrame(
    [[sklearn_mae, sklearn_mse, sklearn_rmse, sklearn_mape]],
    columns=["MAE", "MSE", "RMSE", "MAPE"],
    index=["sklearn"],
)

print(metrics_df)
plot_results(holdout_actual, super_pos_array1)