In [7]:
from os import chdir
chdir('/Users/lananhnguyen/Desktop/thesis/thesis_code')
import main.packages.unchain_chain as chain
import main.packages.mine_generic as mine_g
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import mean_squared_error
from dieboldmariano import dm_test

In [8]:
def take_y_reals():
    cats = ["food", "energy", "neig", "services"]
    
    concatenated_df = pd.DataFrame()

    for cat in cats:
        cat_df = mine_g.load_excel(f"data/hicp_cat_raw/prc_hicp_{cat}.xlsx", name = f"hicp_{cat}", subset=True, verbose=0)
        #cat_df = cat_df[(cat_df.index > mine_g.train_test_split_date)]
        concatenated_df = pd.concat([concatenated_df, cat_df], axis=1)
    return concatenated_df

def extract_forecast_model_h(forecast_all_cat_df, model, h):
    """
    in 4 df, extract columns with model's name and horizon
    """

    pattern = rf"^{model}_(?!l).*h_{h}"
    matching_cols = [col for col in forecast_all_cat_df.columns if re.search(pattern, col)]
    df_specific_model_h = forecast_all_cat_df.loc[:, matching_cols]
    return df_specific_model_h

import os

def concatenate_csv_files(folder_path):
    """
    Concatenates CSV files in a folder column-wise into a single DataFrame.

    Args:
    - folder_path (str): Path to the folder containing CSV files.

    Returns:
    - concatenated_df (DataFrame): Concatenated DataFrame.
    """

    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    concatenated_df = pd.DataFrame()

    # Iterate through each CSV file
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        
        # Add each file's data as new columns in the concatenated DataFrame
        concatenated_df = pd.concat([concatenated_df, df], axis=1)
        concatenated_df = concatenated_df.loc[:, ~concatenated_df.columns.duplicated()]

    return concatenated_df

def transform_to_chained(y_unchain, y_real):
    y_real = y_real[y_real.index <= "2022-12-31"]
    dec_mask = y_real.index.month == 12
    dec_data = y_real.where(dec_mask, other=np.nan)
    dec_data.ffill(inplace=True)
    dec_data = dec_data[dec_data.index > "2015-12-31"]
    # chain data back:
    y_chained = y_unchain * dec_data
    return y_chained

In [9]:
y_real = take_y_reals()


folder_path = "data/forecast_results"
forecast_yoy_all_cat = concatenate_csv_files(folder_path)
date_range = pd.date_range(start=mine_g.train_test_split_date + pd.DateOffset(months=1), end=mine_g.max_X_date, freq='M')

In [10]:
# take the weights:
weight_f = pd.read_excel('data/hicp_cat_raw/prc_hicp_weight_food.xlsx', sheet_name='Sheet 1', skiprows=7)
weight_f = weight_f.iloc[1, :].to_frame().reset_index()
weight_f.columns = weight_f.iloc[0]
weight_f = weight_f[1:]
weight_f.rename(columns={"TIME": "date", "Germany": 'weight_f'}, inplace=True)

weight_e = pd.read_excel('data/hicp_cat_raw/prc_hicp_weight_energy.xlsx', sheet_name='Sheet 1', skiprows=7)
weight_e = weight_e.iloc[1, :].to_frame().reset_index()
weight_e.columns = weight_e.iloc[0]
weight_e = weight_e[1:]
weight_e.rename(columns={"TIME": "date", "Germany": 'weight_e'}, inplace=True)

weight_n = pd.read_excel('data/hicp_cat_raw/prc_hicp_weight_neig.xlsx', sheet_name='Sheet 1', skiprows=7)
weight_n = weight_n.iloc[1, :].to_frame().reset_index()
weight_n.columns = weight_n.iloc[0]
weight_n = weight_n[1:]
weight_n.rename(columns={"TIME": "date", "Germany": 'weight_n'}, inplace=True)

weight_s = pd.read_excel('data/hicp_cat_raw/prc_hicp_weight_services.xlsx', sheet_name='Sheet 1', skiprows=7)
weight_s = weight_s.iloc[1, :].to_frame().reset_index()
weight_s.columns = weight_s.iloc[0]
weight_s = weight_s[1:]
weight_s.rename(columns={"TIME": "date", "Germany": 'weight_s'}, inplace=True)

weights = pd.concat([weight_f, weight_e, weight_n, weight_s], axis=1)
weights = weights.loc[:, ~weights.columns.duplicated()]
weights['date'] = weights['date'].astype(int)
for col in weights.columns[1:]:
    weights[col] = weights[col].astype(float)

weights.loc[:, 'total'] = weights.iloc[:, 1:5].sum(axis=1)


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


In [15]:
forecast_yoy_all_cat.columns


Index(['prc_food_h_1', 'prc_food_h_2', 'prc_food_h_3', 'ridge_food_h_1',
       'ridge_food_h_2', 'ridge_food_h_3', 'lasso_food_h_1', 'lasso_food_h_2',
       'lasso_food_h_3', 'xgb_food_h_1', 'xgb_food_h_2', 'xgb_food_h_3',
       'cb_ridge_lasso_food_h_1', 'cb_ridge_lasso_food_h_2',
       'cb_ridge_lasso_food_h_3', 'cb_ridge_pcr_food_h_1',
       'cb_ridge_pcr_food_h_2', 'cb_ridge_pcr_food_h_3',
       'cb_ridge_prc_lasso_food_h_1', 'cb_ridge_prc_lasso_food_h_2',
       'cb_ridge_prc_lasso_food_h_3', 'ar_1_food_h_1', 'ar_1_food_h_2',
       'ar_1_food_h_3', 'prc_energy_h_1', 'prc_energy_h_2', 'prc_energy_h_3',
       'ridge_energy_h_1', 'ridge_energy_h_2', 'ridge_energy_h_3',
       'lasso_energy_h_1', 'lasso_energy_h_2', 'lasso_energy_h_3',
       'xgb_energy_h_1', 'xgb_energy_h_2', 'xgb_energy_h_3',
       'cb_ridge_lasso_energy_h_1', 'cb_ridge_lasso_energy_h_2',
       'cb_ridge_lasso_energy_h_3', 'cb_ridge_pcr_energy_h_1',
       'cb_ridge_pcr_energy_h_2', 'cb_ridge_pcr_energy_h

In [29]:
new_df

2016-01-31
2016-02-29
2016-03-31
2016-04-30
2016-05-31
...
2022-08-31
2022-09-30
2022-10-31
2022-11-30
2022-12-31


In [28]:
models = ['cb_ridge_prc'] #,'xgb','ridge', 'lasso','cb_ridge_prc', 'cb_ridge_prc_lasso', 'cb_ridge_lasso'

horizons = [1, 2, 3]

for model in models:
    for h in horizons:

        # get the categorical data:
        new_df = extract_forecast_model_h(forecast_yoy_all_cat, model, h)
        new_df.set_index(date_range, inplace=True)


        # transform:
        food_chained = chain.transform_back_chained(new_df.iloc[:, 0], y_real.iloc[:, 0])
        food_unchain = chain.unchain_series(food_chained, y_real.iloc[:, 0])
        energy_chained = chain.transform_back_chained(new_df.iloc[:, 1], y_real.iloc[:, 1])
        energy_unchain = chain.unchain_series(energy_chained, y_real.iloc[:, 1])
        neig_chained = chain.transform_back_chained(new_df.iloc[:, 2], y_real.iloc[:, 2])
        neig_unchained = chain.unchain_series(neig_chained, y_real.iloc[:, 2])
        services_chained = chain.transform_back_chained(new_df.iloc[:, 3], y_real.iloc[:, 3])
        services_unchained = chain.unchain_series(services_chained, y_real.iloc[:, 3])
        
        # take the products of respective category and weights
        food_unchain.name = 'food_unchain'
        food_unchain = food_unchain.to_frame()
        food_unchain.loc[:, 'year'] = food_unchain.index.year
        food_unchain
        merged_df = pd.merge(food_unchain, weights, how='inner', left_on='year', right_on='date')
        mul_food = merged_df.loc[:, 'food_unchain'].mul(merged_df.loc[:, 'weight_f'])

        energy_unchain.name = 'e_unchain'
        e_unchain = energy_unchain.to_frame()
        e_unchain.loc[:, 'year'] = e_unchain.index.year
        merged_df = pd.merge(e_unchain, weights, how='inner', left_on='year', right_on='date')
        mul_e = merged_df.loc[:, 'e_unchain'].mul(merged_df.loc[:, 'weight_e'])


        neig_unchained.name = 'n_unchain'
        n_unchain = neig_unchained.to_frame()
        n_unchain.loc[:, 'year'] = n_unchain.index.year
        merged_df = pd.merge(n_unchain, weights, how='inner', left_on='year', right_on='date')
        mul_n = merged_df.loc[:, 'n_unchain'].mul(merged_df.loc[:, 'weight_n'])

        services_unchained.name = 's_unchain'
        s_unchain = services_unchained.to_frame()
        s_unchain.loc[:, 'year'] = s_unchain.index.year
        merged_df = pd.merge(s_unchain, weights, how='inner', left_on='year', right_on='date')
        mul_s = merged_df.loc[:, 's_unchain'].mul(merged_df.loc[:, 'weight_s'])

        # take the sum of all product:
        mul_sum = pd.concat([mul_food, mul_e, mul_n, mul_s], axis=1)
        mul_sum.loc[:, 'hicp_unchained'] = mul_sum.sum(axis=1)/1000

        date_range = pd.date_range(start=mine_g.train_test_split_date + pd.DateOffset(months=1), end=mine_g.max_X_date, freq='M')
        mul_sum.set_index(date_range, inplace=True)

        # chain the results:
        y_real_hicp = mine_g.load_excel('data/hicp_all.xlsx', 'hicp_all', verbose=0)
        hicp_chained_final = transform_to_chained(mul_sum.loc[:, 'hicp_unchained'], y_real_hicp.iloc[:, 0])

        y_real_hicp.loc[:, 'last_y'] = y_real_hicp.iloc[:, 0].shift(12)
        y_real_hicp = y_real_hicp[(y_real_hicp.index > mine_g.train_test_split_date) & (y_real_hicp.index <= mine_g.max_X_date)]

        hicp_all_yoy = (hicp_chained_final/y_real_hicp.loc[:, 'last_y']-1)*100
        hicp_all_yoy.name = f'{model}_h_{h}'   
        forecast = pd.read_csv('data/headline_forecast/head_infl_forecast.csv')
        new_forecast = pd.concat([forecast, hicp_all_yoy.reset_index(drop=True)], axis=1)

        new_forecast.to_csv('data/headline_forecast/head_infl_forecast.csv', index=False)

IndexError: single positional indexer is out-of-bounds

----

In [None]:
real_hicp_yoy = pd.read_csv('data/preprocessed/head_inflation.csv', parse_dates = True, index_col='date')
real_hicp_yoy_test = real_hicp_yoy[(real_hicp_yoy.index > mine_g.train_test_split_date) & (real_hicp_yoy.index <= mine_g.max_X_date)]

new_forecast = pd.concat([new_forecast, real_hicp_yoy_test.reset_index(drop=True)], axis=1)
new_forecast

# Compare OLS for headline and OLS from individual cat aggregated

In [None]:
forecast = pd.read_csv('data/headline_forecast/head_infl_forecast.csv')

real_hicp_yoy = pd.read_csv('data/preprocessed/head_inflation.csv', parse_dates = True, index_col='date')
real_hicp_yoy_test = real_hicp_yoy[(real_hicp_yoy.index > mine_g.train_test_split_date) & (real_hicp_yoy.index <= mine_g.max_X_date)]


In [None]:
rmse = []

for h in [1, 2, 3]:
    h_cols = []

    for col in forecast.columns:
        if col.endswith(str(h)):
            h_cols.append(col)
    forecast_df_h = forecast[h_cols]
    
    for col in [ col for col in forecast_df_h.columns if "ar_ols" not in col]:
        print(col)
        print(dm_test(real_hicp_yoy_test.iloc[:, 0], forecast_df_h[col], forecast_df_h.loc[:, f'ar_ols_h_{h}'], h = h, harvey_correction=True))


    for col in forecast_df_h.columns:
        rmse_here = np.sqrt(mean_squared_error(forecast_df_h[col], real_hicp_yoy_test))
        rmse.append(rmse_here)


def get_first_part(col_name, separator='_h'):
    return col_name.split(separator)[0]

new_col_names = [get_first_part(col) for col in forecast_df_h.columns]
rmse = pd.DataFrame(rmse)
rmse = pd.DataFrame(rmse.values.reshape(3, int(len(forecast.columns)/3)), columns=new_col_names, index=[f'h_{h}' for h in [1, 2, 3]])


ar_110_h_1
(-1.6769896095002441, 0.0973070531150434)
prc_h_1
(-1.5781254467216208, 0.11834067278353033)
xgb_h_1
(3.4237065320769067, 0.0009619323061758958)
ols_h_1
(3.539041479399328, 0.0006609258505795756)
ridge_h_1
(-1.0653307344634626, 0.289815590386732)
lasso_h_1
(-0.8946303986000829, 0.37357171115027277)
cb_ridge_prc_h_1
(-1.6289865490960187, 0.10710576840526909)
cb_ridge_prc_lasso_h_1
(-1.7453848181394982, 0.08461869679593281)
cb_ridge_lasso_h_1
(-1.3974531417190776, 0.16600170813092985)
ar_110_h_2
(-1.5284227334846807, 0.13020984156895388)
prc_h_2
(-1.2279405052947059, 0.22294023474817415)
xgb_h_2
(2.3304612418639303, 0.022209175970383455)
ols_h_2
(1.724296670959621, 0.08837689553732869)
ridge_h_2
(-1.4989486791627376, 0.13768095075089137)
lasso_h_2
(-1.505347480195841, 0.13603101380567806)
cb_ridge_prc_h_2
(-1.4854956017988998, 0.14120103861820776)
cb_ridge_prc_lasso_h_2
(-1.530942585813958, 0.12958626261347939)
cb_ridge_lasso_h_2
(-1.5466068948854563, 0.1257626711967298)
ar_11

In [None]:
rmse

Unnamed: 0,ar_110,ar_ols,prc,xgb,ols,ridge,lasso,cb_ridge_prc,cb_ridge_prc_lasso,cb_ridge_lasso
h_1,0.626385,0.69637,0.589798,1.15479,1.527996,0.628242,0.644231,0.590541,0.59299,0.61674
h_2,0.883125,1.075871,0.850622,1.423182,1.499872,0.750642,0.779543,0.772864,0.76724,0.754751
h_3,1.116769,1.431566,1.058617,1.484928,1.657973,0.856219,0.929474,0.920781,0.917174,0.881245


In [None]:
rmse.to_csv("data/report_rmse/total.csv")

In [None]:
rmse_comparative = [round(rmse[col]/rmse[f'ar_ols'], 2) for col in rmse.columns]
rmse_comparative

[h_1    0.90
 h_2    0.82
 h_3    0.78
 dtype: float64,
 h_1    1.0
 h_2    1.0
 h_3    1.0
 Name: ar_ols, dtype: float64,
 h_1    0.85
 h_2    0.79
 h_3    0.74
 dtype: float64,
 h_1    1.66
 h_2    1.32
 h_3    1.04
 dtype: float64,
 h_1    2.19
 h_2    1.39
 h_3    1.16
 dtype: float64,
 h_1    0.9
 h_2    0.7
 h_3    0.6
 dtype: float64,
 h_1    0.93
 h_2    0.72
 h_3    0.65
 dtype: float64,
 h_1    0.85
 h_2    0.72
 h_3    0.64
 dtype: float64,
 h_1    0.85
 h_2    0.71
 h_3    0.64
 dtype: float64,
 h_1    0.89
 h_2    0.70
 h_3    0.62
 dtype: float64]