# None vs Full vs Technical vs Firm

In [None]:
import os

import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive (forcing a remount), then switch to the project folder so the
# relative 'Data/...' paths used in the cells below resolve against it.
drive.mount("/content/drive", force_remount=True)
os.chdir('/content/drive/My Drive/Volatility_forecasting/')

In [None]:
# Install the third-party packages into the current runtime (shell escape, versions unpinned).
!pip install neuralforecast ruptures dask[dataframe] scikit-learn
import os
# NOTE(review): presumably tells the Nixtla libraries to return `unique_id` as a regular
# column rather than as the index — confirm against the neuralforecast documentation.
os.environ['NIXTLA_ID_AS_COL'] = '1'

## All models results merging

In [None]:
import os
import pandas as pd
import gc  # Garbage collection module to free up memory

def _load_variant_forecasts(base_path, horizon, file_prefix, model_name, renamed_column):
    """Read one FIRM/TECH forecast file and rename its point-forecast column to ``renamed_column``."""
    variant_path = f'{base_path}/horizon_{horizon}/{file_prefix}_model1_horizon_{horizon}.csv'
    variant_df = pd.read_csv(variant_path)
    # Only the point forecast is kept; the cutoff and interval columns are dropped when present.
    variant_df = variant_df.drop(
        ['cutoff', f'{model_name}-median', f'{model_name}-lo-90', f'{model_name}-hi-90'],
        axis=1,
        errors='ignore',
    )
    variant_df = variant_df.rename(columns={model_name: renamed_column})
    return variant_df.drop_duplicates(subset=['ds', 'unique_id'])


def process_model(base_path, horizon, base_df_path, model_name, firm_model, tech_model, output_filename):
    """Merge the base, FIRM and TECH forecasts of one model and write them to a single CSV.

    Args:
        base_path: Root folder containing the ``horizon_<h>`` sub-folders.
        horizon: Forecast horizon; selects the sub-folder and the input file names.
        base_df_path: CSV holding ``unique_id``, ``ds``, ``y``, ``<model_name>`` and
            ``<model_name>1``.
        model_name: Forecast column name, e.g. ``'AutoTFT'``.
        firm_model: File prefix of the FIRM forecasts, e.g. ``'FIRMtft'``.
        tech_model: File prefix of the TECH forecasts, e.g. ``'TECHtft'``.
        output_filename: Name of the CSV written to ``<base_path>/horizon_<horizon>/``.

    Returns:
        None. The merged frame is written to disk and its path is printed.

    Raises:
        FileNotFoundError: If one of the input CSV files does not exist.
        KeyError: If the base CSV lacks one of the expected columns.
    """
    # Pre-bind every frame so the cleanup in ``finally`` cannot raise on a partially
    # completed run and hide the original error (e.g. a missing input file).
    base_df = firm_df = tech_df = merged_df = None
    try:
        # Load the base DataFrame
        base_df = pd.read_csv(base_df_path)
        base_df = base_df[['unique_id', 'ds', 'y', model_name, f'{model_name}1']]
        base_df = base_df.drop_duplicates(subset=['ds', 'unique_id'])

        # Load the FIRM and TECH variants
        firm_df = _load_variant_forecasts(base_path, horizon, firm_model, model_name, f'AutoFirm{model_name}')
        tech_df = _load_variant_forecasts(base_path, horizon, tech_model, model_name, f'AutoTech{model_name}')

        # Left-merge so every base row is kept even when a variant has no forecast for it.
        # NOTE(review): 'y' is part of the join key, so the target values must match exactly
        # across the three files for rows to pair up.
        merged_df = pd.merge(base_df, firm_df, on=['unique_id', 'ds', 'y'], how='left')
        merged_df = pd.merge(merged_df, tech_df, on=['unique_id', 'ds', 'y'], how='left')

        # Save the merged DataFrame
        output_path = os.path.join(base_path, f'horizon_{horizon}/{output_filename}')
        merged_df.to_csv(output_path, index=False)
        print(f"Saved merged DataFrame to {output_path}")
    finally:
        # Drop the references and force a collection before the next file is processed
        del base_df, firm_df, tech_df, merged_df
        gc.collect()

# Run configuration
base_path = 'Data/Test'
horizons = [1, 5, 10, 20]  # Horizons whose result files are merged
output_filename_template = 'Tech_Firm_{model_name}_model0_1_sub_horizon_{horizon}.csv'

# Forecast column name -> (FIRM file prefix, TECH file prefix)
models = {
    'AutoTFT': ('FIRMtft', 'TECHtft'),
    'AutoBiTCN': ('FIRMbitcn', 'TECHbitcn'),
    'AutoTiDE': ('FIRMtide', 'TECHtide'),
    'AutoNHITS': ('FIRMnhits', 'TECHnhits')
}

# Merge the base/FIRM/TECH forecasts of every model at every horizon
for horizon in horizons:
    # All models of one horizon share the same base results file
    base_df_path = f'{base_path}/horizon_{horizon}/bitcn_tide_tft_nhits_model0_1_sub_horizon_{horizon}.csv'

    for model_name in models:
        firm_model, tech_model = models[model_name]
        output_filename = output_filename_template.format(model_name=model_name, horizon=horizon)
        process_model(
            base_path,
            horizon,
            base_df_path,
            model_name,
            firm_model,
            tech_model,
            output_filename,
        )

Saved merged DataFrame to Data/Test/horizon_1/Tech_Firm_AutoTFT_model0_1_sub_horizon_1.csv
Saved merged DataFrame to Data/Test/horizon_1/Tech_Firm_AutoBiTCN_model0_1_sub_horizon_1.csv
Saved merged DataFrame to Data/Test/horizon_1/Tech_Firm_AutoTiDE_model0_1_sub_horizon_1.csv
Saved merged DataFrame to Data/Test/horizon_1/Tech_Firm_AutoNHITS_model0_1_sub_horizon_1.csv
Saved merged DataFrame to Data/Test/horizon_5/Tech_Firm_AutoTFT_model0_1_sub_horizon_5.csv
Saved merged DataFrame to Data/Test/horizon_5/Tech_Firm_AutoBiTCN_model0_1_sub_horizon_5.csv
Saved merged DataFrame to Data/Test/horizon_5/Tech_Firm_AutoTiDE_model0_1_sub_horizon_5.csv
Saved merged DataFrame to Data/Test/horizon_5/Tech_Firm_AutoNHITS_model0_1_sub_horizon_5.csv
Saved merged DataFrame to Data/Test/horizon_10/Tech_Firm_AutoTFT_model0_1_sub_horizon_10.csv
Saved merged DataFrame to Data/Test/horizon_10/Tech_Firm_AutoBiTCN_model0_1_sub_horizon_10.csv
Saved merged DataFrame to Data/Test/horizon_10/Tech_Firm_AutoTiDE_model0_1

## Evaluation

The evaluation follows the error metrics, the Model Confidence Set (MCS) procedure, and the Diebold-Mariano (DM) test used by Souto and Moradi (2024):

https://www.emerald.com/insight/content/doi/10.1108/cfri-01-2024-0032/full/html

https://github.com/hugogobato/Can-Transformers-Transform-Financial-Forecasting-

In [None]:
import os
import pandas as pd
import numpy as np
from itertools import combinations
from arch.bootstrap import MCS

# Evaluation settings: file-name prefix of the merged forecasts and the horizons to score
prefix = 'Tech_Firm'
horizons = [1, 5, 10, 20]

# Model families to evaluate
target_models = [
    'AutoTFT',
    'AutoBiTCN',
    'AutoTiDE',
    'AutoNHITS',
]

# Function for Diebold-Mariano Test
def dm_test(actual, pred1, pred2, h=1, crit="RMSE"):
    """Diebold-Mariano statistic, with a small-sample adjustment, for two competing forecasts.

    The loss differential is ``loss(pred1) - loss(pred2)``. A positive statistic therefore
    means ``pred1`` has the larger average loss, and a negative one means ``pred2`` does.

    Args:
        actual: Realised values.
        pred1: Forecasts of the first model.
        pred2: Forecasts of the second model.
        h: Forecast horizon; autocovariances up to lag ``h - 1`` enter the variance estimate.
        crit: Loss function: ``"RMSE"`` (squared error), ``"MAE"``, ``"MAPE"`` or ``"QLIKE"``.

    Returns:
        The adjusted statistic as a float, or ``np.nan`` when the inputs differ in length,
        no usable observation remains, or the variance estimate is not strictly positive.

    Raises:
        ValueError: If ``crit`` is not one of the supported criteria.
    """
    actual, pred1, pred2 = (np.asarray(series, dtype=float) for series in (actual, pred1, pred2))

    # Ensure predictions and actuals are of the same length
    if not (len(actual) == len(pred1) == len(pred2)):
        return np.nan

    # Keep only the observations where all three series are available
    valid = ~(np.isnan(actual) | np.isnan(pred1) | np.isnan(pred2))
    actual, pred1, pred2 = actual[valid], pred1[valid], pred2[valid]
    if len(actual) == 0:
        return np.nan

    if crit == "RMSE":
        loss_diff = (actual - pred1) ** 2 - (actual - pred2) ** 2
    elif crit == "MAE":
        loss_diff = np.abs(actual - pred1) - np.abs(actual - pred2)
    elif crit == "MAPE":
        with np.errstate(divide='ignore', invalid='ignore'):
            loss_diff = (np.abs(actual - pred1) / actual) - (np.abs(actual - pred2) / actual)
    elif crit == "QLIKE":
        with np.errstate(divide='ignore', invalid='ignore'):
            loss1 = actual / np.abs(pred1) - np.log(actual / np.abs(pred1)) - 1
            loss2 = actual / np.abs(pred2) - np.log(actual / np.abs(pred2)) - 1
            loss_diff = loss1 - loss2
    else:
        raise ValueError("Unsupported criterion")

    # Divisions by zero and logs of zero above yield NaN/inf; drop those observations
    loss_diff = loss_diff[np.isfinite(loss_diff)]
    n_obs = len(loss_diff)
    if n_obs == 0:
        return np.nan

    d_mean = np.mean(loss_diff)
    centered = loss_diff - d_mean
    # Autocovariances of the loss differential at lags 0 .. n_obs - 1
    gamma = np.correlate(centered, centered, 'full')[n_obs - 1:] / n_obs
    V_d = gamma[0] + 2 * np.sum(gamma[1:h])

    # A constant loss differential (or a negative estimate for h > 1) leaves the statistic
    # undefined; return NaN rather than +/-inf so callers do not read it as a significant result.
    if not np.isfinite(V_d) or V_d <= 0:
        return np.nan

    DM_stat = d_mean / np.sqrt(V_d / n_obs)
    # Small-sample adjustment factor
    adj = ((n_obs + 1 - 2 * h + h * (h - 1) / n_obs) / n_obs) ** 0.5
    return DM_stat * adj


# Function to calculate losses for MCS (defined once, not on every loop iteration)
def calculate_losses(metric, model_dfs, actuals):
    """Build the per-date loss series of every model variant for the MCS procedure.

    Args:
        metric: One of 'RMSE' (squared error), 'MAE', 'MAPE' or 'QLIKE'.
        model_dfs: Mapping from variant name to a forecast frame indexed by ``ds`` with one
            column per ``unique_id``.
        actuals: Frame of realised values with the same layout.

    Returns:
        DataFrame with one column per variant holding the mean loss across series for each
        date; dates on which any variant has no loss value are dropped.

    Raises:
        ValueError: If ``metric`` is not supported.
    """
    residuals_dict = {}
    for model_name, predictions in model_dfs.items():
        # Align predictions and actuals
        predictions_aligned, actuals_aligned = predictions.align(actuals, join='inner', axis=0)
        if metric == 'RMSE':
            residuals = (predictions_aligned - actuals_aligned) ** 2
        elif metric == 'MAE':
            residuals = (predictions_aligned - actuals_aligned).abs()
        elif metric == 'MAPE':
            with np.errstate(divide='ignore', invalid='ignore'):
                residuals = (predictions_aligned - actuals_aligned).abs() / actuals_aligned
        elif metric == 'QLIKE':
            with np.errstate(divide='ignore', invalid='ignore'):
                ratio = actuals_aligned / predictions_aligned
                residuals = ratio - np.log(ratio) - 1
        else:
            raise ValueError("Unsupported metric")

        residuals = residuals.replace([np.inf, -np.inf], np.nan)
        residuals_dict[model_name] = residuals.mean(axis=1)

    return pd.DataFrame(residuals_dict).dropna()


# Loop over each horizon and model
for horizon in horizons:
    for model in target_models:
        print(f"Processing Model: '{model}', Horizon: {horizon}")

        # Define file paths based on model, prefix, and horizon
        file_name = f"{prefix}_{model}_model0_1_sub_horizon_{horizon}.csv"
        data_path = f"Data/Test/horizon_{horizon}/{file_name}"
        output_dir = f"Data/Evaluation/horizon_{horizon}"
        os.makedirs(output_dir, exist_ok=True)

        # Check if file exists
        if not os.path.exists(data_path):
            print(f"File not found: {data_path}")
            continue

        # Read the CSV file
        df = pd.read_csv(data_path)
        df = df.sort_values(by=['unique_id', 'ds']).drop_duplicates(subset=['ds', 'unique_id'])
        df['ds'] = pd.to_datetime(df['ds'])

        # Every column whose name contains the model name is a variant (e.g. AutoBiTCN1)
        model_columns = [col for col in df.columns if model in col]

        # One wide frame (dates x series) per variant, plus one for the realised values
        model_dfs = {
            variant: df.pivot(index='ds', columns='unique_id', values=variant)
            for variant in model_columns
        }
        Actuals = df.pivot(index='ds', columns='unique_id', values='y')

        # Calculate error metrics for each model variant
        metrics = ['RMSE', 'MAE', 'MAPE', 'QLIKE']
        error_metrics = {metric: {} for metric in metrics}

        for model_name, predictions in model_dfs.items():
            # Align the predictions and actuals
            predictions, actuals = predictions.align(Actuals, join='inner', axis=0)
            errors = predictions - actuals

            # Each metric is the mean over series of the per-series mean over dates
            mse = (errors ** 2).mean().mean()
            error_metrics['RMSE'][model_name] = np.sqrt(mse)

            error_metrics['MAE'][model_name] = errors.abs().mean().mean()

            with np.errstate(divide='ignore', invalid='ignore'):
                mape = (errors.abs() / actuals).replace([np.inf, -np.inf], np.nan).mean().mean()
            error_metrics['MAPE'][model_name] = mape

            with np.errstate(divide='ignore', invalid='ignore'):
                ratio = actuals / predictions
                qlike = (ratio - np.log(ratio) - 1).replace([np.inf, -np.inf], np.nan).mean().mean()
            error_metrics['QLIKE'][model_name] = qlike

        # Create a DataFrame with error metrics
        error_metrics_df = pd.DataFrame(error_metrics)
        metrics_file = f"{output_dir}/{prefix}_{model}_metrics_horizon_{horizon}.csv"
        error_metrics_df.to_csv(metrics_file)

        # Run MCS procedure for each metric
        pvalues_list = []

        for metric in metrics:
            losses = calculate_losses(metric, model_dfs, Actuals)
            # NOTE(review): block_size=1000 may exceed the number of dates in `losses` — confirm intent.
            mcs = MCS(losses, size=0.05, method="R", block_size=1000)
            mcs.compute()
            pvalues = mcs.pvalues.reset_index().rename(columns={'Pvalue': f'Pvalue_{metric}'})
            pvalues_list.append(pvalues)

        # Merge p-values and save
        merged_pvalues = pvalues_list[0]
        for pvalues in pvalues_list[1:]:
            merged_pvalues = merged_pvalues.merge(pvalues, on='Model name', how='outer')

        mcs_file = f"{output_dir}/{prefix}_{model}_MCS_horizon_{horizon}.csv"
        merged_pvalues.to_csv(mcs_file, index=False)

        # Run DM tests and save results
        stocks = Actuals.columns

        for metric in metrics:
            # Cell [row, col] counts the series on which `row` significantly beats `col`
            better_count_matrix = pd.DataFrame(0, index=model_columns, columns=model_columns)

            for model_a, model_b in combinations(model_columns, 2):
                counts = {'a_better': 0, 'b_better': 0}
                for stock in stocks:
                    actual = Actuals[stock]
                    pred1 = model_dfs[model_a][stock]
                    pred2 = model_dfs[model_b][stock]
                    combined = pd.concat([actual, pred1, pred2], axis=1).dropna()

                    if combined.empty:
                        continue

                    dm_stat = dm_test(combined.iloc[:, 0], combined.iloc[:, 1], combined.iloc[:, 2], h=1, crit=metric)
                    if np.isnan(dm_stat):
                        continue
                    # dm_test uses loss(pred1) - loss(pred2): a significantly negative statistic
                    # means model_a (pred1) has the smaller loss, a positive one favours model_b.
                    # 1.96 is the two-sided 5% critical value of the standard normal.
                    if dm_stat < -1.96:
                        counts['a_better'] += 1
                    elif dm_stat > 1.96:
                        counts['b_better'] += 1

                better_count_matrix.loc[model_a, model_b] = counts['a_better']
                better_count_matrix.loc[model_b, model_a] = counts['b_better']

            # Summarize results: row totals = wins, column totals = losses
            better_count_matrix['Outperform Count'] = better_count_matrix.sum(axis=1)
            better_count_matrix.loc['Outperformed Count'] = better_count_matrix.sum(axis=0)
            # The bottom-right corner would mix both totals, so blank it out
            better_count_matrix.loc['Outperformed Count', 'Outperform Count'] = np.nan
            dm_file = f"{output_dir}/{prefix}_{model}_DM_{metric}_horizon_{horizon}.csv"
            better_count_matrix.to_csv(dm_file)

        print(f"Completed processing for Model: '{model}', Horizon: {horizon}\n")

Processing Model: 'AutoTFT', Horizon: 1
Completed processing for Model: 'AutoTFT', Horizon: 1

Processing Model: 'AutoBiTCN', Horizon: 1
Completed processing for Model: 'AutoBiTCN', Horizon: 1

Processing Model: 'AutoTiDE', Horizon: 1
Completed processing for Model: 'AutoTiDE', Horizon: 1

Processing Model: 'AutoNHITS', Horizon: 1
Completed processing for Model: 'AutoNHITS', Horizon: 1

Processing Model: 'AutoTFT', Horizon: 5
Completed processing for Model: 'AutoTFT', Horizon: 5

Processing Model: 'AutoBiTCN', Horizon: 5
Completed processing for Model: 'AutoBiTCN', Horizon: 5

Processing Model: 'AutoTiDE', Horizon: 5
Completed processing for Model: 'AutoTiDE', Horizon: 5

Processing Model: 'AutoNHITS', Horizon: 5
Completed processing for Model: 'AutoNHITS', Horizon: 5

Processing Model: 'AutoTFT', Horizon: 10
Completed processing for Model: 'AutoTFT', Horizon: 10

Processing Model: 'AutoBiTCN', Horizon: 10
Completed processing for Model: 'AutoBiTCN', Horizon: 10

Processing Model: 'Aut

### Metrics

In [None]:
import os
import pandas as pd

# Define the horizons, metrics, prefixes, and models to treat
horizons = [1, 5, 10, 20]
metrics = ['RMSE', 'MAE', 'MAPE', 'QLIKE']
prefixes = ['Tech_Firm']
models_to_treat = ['AutoTFT', 'AutoTiDE', 'AutoNHITS', 'AutoBiTCN']

# Create the output directory if it doesn't exist
output_dir = 'Data/Evaluation/Final'
os.makedirs(output_dir, exist_ok=True)

# The workbook name carries the last prefix. This is the value the file name previously
# picked up from the leaked loop variable, now made explicit.
file_prefix = prefixes[-1]

# Build one workbook per model, with one sheet per metric
for treated_model in models_to_treat:
    metric_data = {metric: [] for metric in metrics}  # Reset for each model

    # Collect the per-horizon metric files
    for prefix in prefixes:
        for horizon in horizons:
            file_path = f'Data/Evaluation/horizon_{horizon}/{prefix}_{treated_model}_metrics_horizon_{horizon}.csv'
            try:
                df = pd.read_csv(file_path)
            except FileNotFoundError:
                print(f"File not found: {file_path}. Skipping this file.")
                continue

            df = df.rename(columns={'Unnamed: 0': 'model'})  # Handle unnamed index column
            df['model'] = df['model'].str.replace('Auto', '', regex=False)  # Clean up model names
            # Index by model label so horizons are joined on the label, not on row position
            df = df.set_index('model')

            # NOTE(review): the column label has no separator between the prefix and 'horizon'
            # (e.g. 'RMSE_Tech_Firmhorizon_1'); kept as-is so existing workbooks keep matching.
            for metric in metrics:
                metric_data[metric].append(df[metric].rename(f'{metric}_{prefix}horizon_{horizon}'))

    # Without any input there is no sheet to write, and an empty workbook cannot be saved
    if not any(metric_data.values()):
        print(f"No metrics files found for {treated_model}. Skipping Excel export.")
        continue

    # Export each metric to its own sheet
    excel_file = f'{output_dir}/{file_prefix}_{treated_model}_metrics_across_horizons_and_prefixes.xlsx'
    with pd.ExcelWriter(excel_file) as writer:
        for metric in metrics:
            if metric_data[metric]:  # Check if data exists for the metric
                # Columns are aligned on the model label; a model missing from a file gets NaN
                result_df = pd.concat(metric_data[metric], axis=1).rename_axis('model').reset_index()
                result_df.to_excel(writer, sheet_name=metric, index=False)

    print(f"Excel file for {treated_model} metrics has been created: {excel_file}")

Excel file for AutoTFT metrics has been created: Data/Evaluation/Final/Tech_Firm_AutoTFT_metrics_across_horizons_and_prefixes.xlsx
Excel file for AutoTiDE metrics has been created: Data/Evaluation/Final/Tech_Firm_AutoTiDE_metrics_across_horizons_and_prefixes.xlsx
Excel file for AutoNHITS metrics has been created: Data/Evaluation/Final/Tech_Firm_AutoNHITS_metrics_across_horizons_and_prefixes.xlsx
Excel file for AutoBiTCN metrics has been created: Data/Evaluation/Final/Tech_Firm_AutoBiTCN_metrics_across_horizons_and_prefixes.xlsx


### DM

In [None]:
import pandas as pd
import os

# Define the horizons, metrics, prefixes, and models to treat
horizons = [1, 5, 10, 20]
metrics = ['RMSE', 'MAE', 'MAPE', 'QLIKE']
prefixes = ['Tech_Firm']
models_to_treat = ['AutoTFT', 'AutoTiDE', 'AutoNHITS', 'AutoBiTCN']

# Create the output directory if it doesn't exist
output_dir = 'Data/Evaluation/Final'
os.makedirs(output_dir, exist_ok=True)

# The workbook name carries the last prefix. This is the value the file name previously
# picked up from the leaked loop variable, now made explicit.
file_prefix = prefixes[-1]

# Build one workbook per model, with one sheet per metric
for treated_model in models_to_treat:
    metric_data = {metric: [] for metric in metrics}  # Reset data storage for each model

    # Iterate over prefixes, horizons, and metrics
    for prefix in prefixes:
        for horizon in horizons:
            for metric in metrics:
                # Construct the file path for the DM file
                file_path = f"Data/Evaluation/horizon_{horizon}/{prefix}_{treated_model}_DM_{metric}_horizon_{horizon}.csv"

                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}. Skipping.")
                    continue

                try:
                    df = pd.read_csv(file_path)
                    outperform_col = f'{metric}_{prefix.strip("_")}_h{horizon}_Outperform'
                    df = df.rename(columns={
                        'Unnamed: 0': 'model',
                        'Outperform Count': outperform_col
                    })
                    # Drop the totals row and keep only the win count. The explicit copy
                    # makes the assignment below act on an independent frame, not a slice.
                    df = df.loc[df['model'] != 'Outperformed Count', ['model', outperform_col]].copy()
                    # Clean the 'model' column
                    df['model'] = df['model'].str.replace('Auto', '', regex=False)
                    # Index by model label so horizons are joined on the label, not on row position
                    metric_data[metric].append(df.set_index('model')[outperform_col])
                except Exception as e:
                    # Best effort: report the problem and continue with the remaining files
                    print(f"Error processing file {file_path}: {e}")

    # Without any input there is no sheet to write, and an empty workbook cannot be saved
    if not any(metric_data.values()):
        print(f"No DM files found for {treated_model}. Skipping Excel export.")
        continue

    # Create an Excel file for the current model
    excel_file = f"{output_dir}/{file_prefix}_{treated_model}_DM_across_horizons.xlsx"
    with pd.ExcelWriter(excel_file) as writer:
        for metric, data_frames in metric_data.items():
            if data_frames:  # Only process if data exists for the metric
                # Columns are aligned on the model label; a model missing from a file gets NaN
                result_df = pd.concat(data_frames, axis=1).rename_axis('model').reset_index()
                # Write to a sheet named after the metric
                result_df.to_excel(writer, sheet_name=metric, index=False)

    print(f"Excel file created for {treated_model}: {excel_file}")

Excel file created for AutoTFT: Data/Evaluation/Final/Tech_Firm_AutoTFT_DM_across_horizons.xlsx
Excel file created for AutoTiDE: Data/Evaluation/Final/Tech_Firm_AutoTiDE_DM_across_horizons.xlsx
Excel file created for AutoNHITS: Data/Evaluation/Final/Tech_Firm_AutoNHITS_DM_across_horizons.xlsx
Excel file created for AutoBiTCN: Data/Evaluation/Final/Tech_Firm_AutoBiTCN_DM_across_horizons.xlsx
