# loading libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import pathlib
import pandas as pd
import numpy as np
import darts
from darts import TimeSeries
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import os
import glob
from tqdm import tqdm
from darts.dataprocessing.transformers.scaler import Scaler

# Helping Functions


In [None]:
import numpy as np


def calculate_metrics(actual, predicted):
    # Convert inputs to numpy arrays for easier calculations
    actual = np.array(actual)
    predicted = np.array(predicted)

    # Calculate individual metrics
    mae = np.mean(np.abs(predicted - actual))
    rmse = np.sqrt(np.mean((predicted - actual) ** 2))
    mape = np.mean(np.abs((predicted - actual) / actual)) * 100
    mse = np.mean((predicted - actual) ** 2)

    metrics = {"MAE": mae, "RMSE": rmse, "MAPE": mape, "MSE": mse}

    return metrics

# Data Reading 😶

In [None]:
fileName = "amazon"
df = pd.read_csv(f"../ProcessedData/{fileName}.csv")
df.head()

In [None]:
df = df[["Date", "Close"]]
df = df.drop_duplicates()
df.head()

# stationary Tesst

In [None]:
def adfuller_test(values):
    result = adfuller(values)
    labels = [
        "ADF Test Statistic",
        "p-value",
        "#Lags Used",
        "Number of Observations Used",
    ]
    for value, label in zip(result, labels):
        print(label + " : " + str(value))
    if result[1] <= 0.05:
        print(
            "P value is less than 0.05 that means we can reject the null hypothesis(Ho). Therefore we can conclude that data has no unit root and is stationary"
        )
    else:
        print(
            "Weak evidence against null hypothesis that means time series has a unit root which indicates that it is non-stationary "
        )

In [None]:
adfuller_test(df["Close"])

In [None]:
adfuller_test(df["Close"].diff(1).iloc[1:])

## Spliting Data into Training & Testing Data

In [None]:
from darts import TimeSeries
import numpy as np
import matplotlib.pyplot as plt

# Assuming df is your DataFrame containing daily data
series = TimeSeries.from_dataframe(
    df, "Date", "Close", freq="1D", fill_missing_dates=True, fillna_value=0
)


split_point = 0.80

train_series, test_series = series.split_after(split_point)

# Set the figure size and style
plt.figure(figsize=(18, 6))
# Plot the training and testing data
train_series.plot(label="Training Data", color="blue", linewidth=1.5, marker="o")
test_series.plot(label="Testing Data", color="orange", linewidth=1.5, marker="o")

# Add title and labels
plt.title("Training and Testing Data")
plt.xlabel("Date")
plt.ylabel("Amazon Close Price")

# Add grid lines
plt.grid(True)

# Add legend
plt.legend()

# Display the plot
plt.show()

# XGB Model

In [None]:
from darts.models.forecasting.xgboost import XGBModel

xgb_model = XGBModel(
    lags=15,
    output_chunk_length=30,
    add_encoders={
        "cyclic": {"future": ["month"]},
        "datetime_attribute": {"future": ["hour", "dayofweek"]},
        "position": {"future": ["relative"]},
        "custom": {"future": [lambda idx: (idx.year - 2013) / 50]},
        "transformer": Scaler(),
    },
    likelihood=None,
    quantiles=False,
    random_state=199,
    multi_models=True,
    use_static_covariates=True,
)

xgb_model.fit(train_series)

horizan = 30

test_series_ = test_series[0:horizan]
plt.figure(figsize=(18, 6))

forcast_arima = xgb_model.predict(horizan)
xgb_model.predict(horizan).plot(marker="o", label="predicted")
test_series_.plot(marker="o", label="Actual/Ground truth")
# Add title and labels
plt.title("Ground truth vs predicted")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.xticks(
    forcast_arima.time_index, forcast_arima.time_index.strftime("%Y-%m-%d"), rotation=90
)

# Add grid lines
plt.grid(True)

# Add legend
plt.legend()

# Display the plot
plt.show()

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm


# Function to predict and evaluate
def predict_and_evaluate(
    window_size,
    prediction_horizon,
    slide_step,
    test_series,
    arima_model,
    result_plot_path,
):
    num_predictions = len(test_series) - window_size - prediction_horizon + 1

    meta_information_evaluation = {
        "Iterations": [],
        "MAE": [],
        "RMSE": [],
        "MAPE": [],
        "MSE": [],
        "input_window_size": [],
        "horizon": [],
        "stride": [],
    }

    try:
        for i in tqdm(range(0, num_predictions, slide_step)):
            input_window = test_series[i : i + window_size]
            ground_truth = test_series[
                i + window_size : i + window_size + prediction_horizon
            ]
            forecast = arima_model.predict(n=prediction_horizon, series=input_window)

            actual = ground_truth.values().flatten().tolist()
            predicted = forecast.values().flatten().tolist()

            metrics = calculate_metrics(actual, predicted)

            meta_information_evaluation["Iterations"].append(i)
            meta_information_evaluation["MAE"].append(metrics["MAE"])
            meta_information_evaluation["RMSE"].append(metrics["RMSE"])
            meta_information_evaluation["MAPE"].append(metrics["MAPE"])
            meta_information_evaluation["MSE"].append(metrics["MSE"])
            meta_information_evaluation["input_window_size"].append(window_size)
            meta_information_evaluation["horizon"].append(prediction_horizon)
            meta_information_evaluation["stride"].append(slide_step)

            bypass_information = {
                "slide_step": slide_step,
                "window_size": window_size,
                "horizon": prediction_horizon,
            }
            create_plots(
                input_window,
                forecast,
                ground_truth,
                result_plot_path,
                bypass_information,
            )

        evalaution_df = pd.DataFrame.from_dict(meta_information_evaluation)

        return evalaution_df

    except Exception as e:
        print("Error Occurred in fuction predict_and_evaluate():", e)
        evalaution_df = pd.DataFrame.from_dict(meta_information_evaluation)

        return evalaution_df


# Function to create plots
def create_plots(
    input_window, forecast, ground_truth, result_plot_path, bypass_information
):
    plt.figure(figsize=(30, 6))
    input_window.plot(label="Input Data", marker="o")
    forecast.plot(label="Predicted", marker="o")
    ground_truth.plot(label="Ground Truth", marker="o")

    combined_time_index = input_window.time_index.append(forecast.time_index).append(
        ground_truth.time_index
    )
    starting_date_of_input_data = input_window.time_index[0].strftime("%Y-%m-%d")
    ending_date_of_input_data = input_window.time_index[-1].strftime("%Y-%m-%d")
    starting_date_predicted = forecast.time_index[0].strftime("%Y-%m-%d")
    ending_date_of_predicted = forecast.time_index[-1].strftime("%Y-%m-%d")

    plt.xticks(
        combined_time_index, combined_time_index.strftime("%Y-%m-%d"), rotation=90
    )
    plt.title(
        f"Results of Input Data from {starting_date_of_input_data} to {ending_date_of_input_data} & Evaluation on from {starting_date_predicted} to {ending_date_of_predicted}",
        fontsize=16,
    )
    plt.ylabel("Amazon Close Price", fontsize=14)
    plt.xlabel("Dates", fontsize=14)
    plt.legend()

    plot_filename = f"{result_plot_path}/{bypass_information['window_size']}_{bypass_information['horizon']}_{bypass_information['slide_step']}.png"
    plt.savefig(plot_filename)
    plt.close()
    # plt.show()

# Model Evaluation

In [None]:
def model_evaluation(model_name, model_object, test_series, FileName):
    result_path = f"../ProcessedData/Results/{model_name}/{FileName}"
    result_plot_path = (
        f"../ProcessedData/Results/{model_name}/{FileName}/{model_name}_Plots"
    )
    os.makedirs(result_path, exist_ok=True)
    os.makedirs(result_plot_path, exist_ok=True)

    # Set your parameters
    window_sizes = [30, 45, 90]
    prediction_horizons = [15, 30, 35]
    slide_steps = [5, 10, 15]

    test_series = test_series
    model = model_object

    for window_size in window_sizes:
        for prediction_horizon in prediction_horizons:
            for slide_step in slide_steps:
                print(
                    f"Iteration : Window size : {window_size} Horizan: {prediction_horizon}, Stride : {slide_step}"
                )
                evaluation_df = predict_and_evaluate(
                    window_size,
                    prediction_horizon,
                    slide_step,
                    test_series,
                    model,
                    result_plot_path,
                )
                evaluation_df.to_csv(
                    f"{result_path}/window_size_{window_size}_horizon_{prediction_horizon}_stride_{slide_step}.csv",
                    index=False,
                )

                print(
                    f"Window_size_{window_size}_prediction_horizon_{prediction_horizon}_slide_step_{slide_step} - Evaluation completed."
                )
        #         break
        #     break
        # break

## XGBModel

In [None]:
model_name = "XGBMODEl"
model_object = xgb_model
FileName = fileName
test_series = test_series
model_evaluation(model_name, model_object, test_series, FileName)

# Evaluate Metrics

In [1]:
import pandas as pd
import glob


def aggregate_evaluation_results(file_pattern):
    eval_dict = {
        "window_size": [],
        "horizan": [],
        "stride": [],
        "AVG_MAE": [],
        "AVG_MSE": [],
        "AVG_RMSE": [],
        "AVG_MAPE": [],
    }

    paths = glob.glob(file_pattern)

    for path in paths:
        window_size = path.split("/")[-1].split("_")[2]
        horizan = path.split("/")[-1].split("_")[4]
        stride = path.split("/")[-1].split("_")[6].split(".")[0]

        df = pd.read_csv(path)
        eval_dict["window_size"].append(window_size)
        eval_dict["horizan"].append(horizan)
        eval_dict["stride"].append(stride)

        eval_dict["AVG_MAE"].append(df["MAE"].mean())
        eval_dict["AVG_MSE"].append(df["MSE"].mean())
        eval_dict["AVG_RMSE"].append(df["RMSE"].mean())
        eval_dict["AVG_MAPE"].append(df["MAPE"].mean())

    eval_df = pd.DataFrame.from_dict(eval_dict)
    eval_df = eval_df.dropna()
    eval_df.sort_values(
        ["window_size", "horizan", "stride"], inplace=True, ascending=True
    )

    return eval_df


# Example usage
file_pattern = "../ProcessedData/Results/XGBMODEl/amazon/*.csv"
result_df = aggregate_evaluation_results(file_pattern)
# result_df.to_csv('../ProcessedData/FinalResults/') # have to add folder for each model

In [2]:
result_df

Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE,AVG_MAPE
5,30,15,10,10.753196,498.880099,19.988719,inf
15,30,15,15,10.22538,447.549338,19.070956,inf
23,30,15,5,10.265047,471.228789,19.378302,inf
14,30,30,10,12.256367,531.103617,21.965,inf
26,30,30,15,11.886851,509.62304,21.281586,inf
10,30,30,5,11.986695,519.188635,21.63896,inf
16,30,35,10,12.437512,534.730327,22.175387,inf
2,30,35,15,12.165002,523.864992,21.752059,inf
3,30,35,5,12.367644,535.265729,22.210347,inf
8,45,15,10,9.671862,448.11766,18.824295,inf


In [4]:
result_df = result_df[['window_size','horizan','stride','AVG_MAE','AVG_MSE','AVG_RMSE']]
for name , group in result_df.groupby(['window_size','horizan']):
    print(name)
    display(group)    

('30', '15')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
5,30,15,10,10.753196,498.880099,19.988719
15,30,15,15,10.22538,447.549338,19.070956
23,30,15,5,10.265047,471.228789,19.378302


('30', '30')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
14,30,30,10,12.256367,531.103617,21.965
26,30,30,15,11.886851,509.62304,21.281586
10,30,30,5,11.986695,519.188635,21.63896


('30', '35')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
16,30,35,10,12.437512,534.730327,22.175387
2,30,35,15,12.165002,523.864992,21.752059
3,30,35,5,12.367644,535.265729,22.210347


('45', '15')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
8,45,15,10,9.671862,448.11766,18.824295
20,45,15,15,10.256615,459.83613,19.350754
18,45,15,5,10.060116,464.292474,19.246717


('45', '30')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
19,45,30,10,11.514305,491.169083,20.980353
11,45,30,15,11.685685,487.869526,20.825143
9,45,30,5,11.745045,499.219024,21.219436


('45', '35')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
4,45,35,10,12.127082,523.398326,21.98948
7,45,35,15,11.961057,506.391719,21.374925
13,45,35,5,12.138824,519.001794,21.863507


('90', '15')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
12,90,15,10,10.452555,476.290911,19.576538
25,90,15,15,10.016456,421.240637,18.545601
24,90,15,5,10.0565,462.995552,19.244234


('90', '30')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
1,90,30,10,11.659254,499.904692,21.274607
22,90,30,15,11.168984,461.250099,20.177616
17,90,30,5,11.492445,500.027198,21.188888


('90', '35')


Unnamed: 0,window_size,horizan,stride,AVG_MAE,AVG_MSE,AVG_RMSE
0,90,35,10,11.812568,509.764371,21.60118
6,90,35,15,11.387665,482.146882,20.82393
21,90,35,5,11.807295,519.604045,21.839725
