In [1]:
from os import chdir
chdir('/Users/lananhnguyen/Desktop/thesis/thesis_code')
import main.packages.mine_generic as mine_g
import main.packages.mine_specific as mine_s

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error

import xgboost as xgb
import optuna
import json
from sklearn.decomposition import PCA

In [2]:
optuna.logging.set_verbosity(optuna.logging.WARNING)


In [3]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [4]:
# Forecast:
def generate_forecast(X, y, N, T, h, hyperparam, verbose=1): # from common function
    """
    Generate recursive forecast
    """
    print(f"Horizon: {h}")
    print("------------------------")
    y_pred_series = []
    for i in range(0, T):  # T+1-h
        X_train = X.iloc[: N+i, :]
        y_train = y.iloc[h: N+i+h, :]

        X_test = X.iloc[N+i : N+i+1, :] 
        y_test = y.iloc[N+i+h : N+i+h+1, :]  

        if X_test.index[-1] > X.index[-1]-pd.DateOffset(months=h):
            break

        # Forecast:
        y_pred = mine_s.xgb_pred(X_train, X_test, y_train, y_test, hyperparam)

        if verbose == 1:
            print(f"Training period - features: {X_train.index[0]} to {X_train.index[-1]}")
            print(f"Training period - target : {y_train.index[0]} to {y_train.index[-1]}")
            print(f"Test period - features: {X_test.index}")
            print(f"Test period - target : {y_test.index}")
            print(f"Forecast: {y_pred}")
            print("-------------------------------------------------------")

        y_pred_series.append(y_pred[0])
    return y_pred_series

In [5]:
def prepare_X_train_test_pca(X_train, X_test, y, lags, n_components):
    """
    """
    # Scale data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA:
    pca = PCA(n_components = n_components)

    factor_train = pd.DataFrame(pca.fit_transform(X_train_scaled))
    factor_test = pd.DataFrame(pca.transform(X_test_scaled))

    # Create y_lagged:
    y_lagged = pd.DataFrame()
    for lag in range(0, lags + 1):
        y_lagged[f'Lag_{lag}'] = y[y.index <= mine_g.max_X_date].shift(lag)

    # concatenate all:
    X_factor_all = pd.concat([factor_train, factor_test], axis = 0, ignore_index=True)
    factor_col_name = {col: f'factor_{col}' for col in X_factor_all.columns}
    X_factor_all.rename(columns=factor_col_name, inplace=True)

    X_factor_lags_all = pd.concat([y_lagged.reset_index(), X_factor_all], axis=1)
    #X_factor_lags_all.set_index(0, drop=True)
    #X_factor_lags_all.dropna(inplace=True)
    X_factor_lags_all.set_index('date', drop=True, inplace=True)
    print(X_factor_lags_all.shape)
    return X_factor_lags_all

In [6]:
hicp_all_path = 'data/preprocessed/hicp_yoy.csv'
hicp_class_path = 'data/HICP_COICOP10s.xlsx'
model = 'xgb_pca'
date_range = pd.date_range(start=mine_g.train_test_split_date + pd.DateOffset(months=1), end=mine_g.max_X_date, freq='M')


# Food:

In [7]:
category = 'Food'
cat_short = 'food'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'

HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly,
                             fillna=False)
cat_df.dropna(axis=1, inplace=True)

food_cat_prediction = pd.DataFrame(index=date_range)
yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

Number of items in Food group:  180


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df.dropna(axis=1, inplace=True)


## h = 1

In [8]:
h = 1

X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00


In [9]:
X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

(312, 35)


In [10]:

best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False)
best_param


Number of finished trials: 1000


{'learning_rate': 0.27936997139955455,
 'max_depth': 3,
 'gamma': 2.595325381428443,
 'lambda': 0.10111264097809791,
 'alpha': 0.011416217213591429}

In [11]:

# Forecast:

N, T = len(X_cat_train), len(X_cat_test)
y_pred = generate_forecast(X = X_factor_lags_all, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
food_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
print(mean_absolute_error(yoy_real_test, y_pred))
print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
------------------------
0.7735644998741007
1.2106535744448241


## h = 2

In [12]:
h = 2
X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 35)


In [14]:

best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False)
print(best_param)

N, T = len(X_cat_train), len(X_cat_test)

y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
food_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
print(mean_absolute_error(yoy_real_test, y_pred))
print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Number of finished trials: 1000
{'learning_rate': 0.2948680409716552, 'max_depth': 6, 'gamma': 2.840592720274538, 'lambda': 0.8471842805780159, 'alpha': 6.982999417187105}
Horizon: 2
------------------------
1.2568734899728276
2.2328679974437717


## h = 3

In [15]:
h = 3
X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False)
print(best_param)

N, T = len(X_cat_train), len(X_cat_test)

y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
food_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
print(mean_absolute_error(yoy_real_test, y_pred))
print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(312, 35)
Number of finished trials: 1000
{'learning_rate': 0.2722724478571974, 'max_depth': 4, 'gamma': 2.164259929533126, 'lambda': 0.9507825320287131, 'alpha': 0.3791585445933658}
Horizon: 3
------------------------
1.0880493185533975
1.5915509581048204


In [16]:
food_cat_prediction

Unnamed: 0,xgb_pca_food_h_1,xgb_pca_food_h_2,xgb_pca_food_h_3
2016-01-31,1.510642,1.332123,1.329567
2016-02-29,1.431145,1.237375,1.052172
2016-03-31,1.130537,1.174551,1.256293
2016-04-30,1.347342,1.189418,1.373047
2016-05-31,1.475416,1.286819,1.508808
...,...,...,...
2022-08-31,10.184432,6.304696,8.169952
2022-09-30,10.821424,6.862824,10.185497
2022-10-31,12.126287,8.668372,10.950062
2022-11-30,13.557337,9.763294,12.406686


In [17]:
mine_g.save_forecast(forecast_result_df=food_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)

------------------

# Energy:

In [18]:
category = 'Energy'
cat_short = 'energy'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'


HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly,
                             fillna=False)
cat_df.dropna(axis=1, inplace=True)
energy_cat_prediction = pd.DataFrame(index=date_range)

yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

Number of items in Energy group:  14


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df.dropna(axis=1, inplace=True)


In [19]:
for h in [1, 2, 3]:
    X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

    X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
    X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
    X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

    best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False)
    print(best_param)

    N, T = len(X_cat_train), len(X_cat_test)

    y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
    energy_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
    print(mean_absolute_error(yoy_real_test, y_pred))
    print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00
(312, 10)
Number of finished trials: 1000
{'learning_rate': 0.2796980286864401, 'max_depth': 3, 'gamma': 7.627107893286285, 'lambda': 0.3251207295804724, 'alpha': 4.60638800379785}
Horizon: 1
------------------------
2.711848880202763
4.208863323931931
Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 10)
Number of finished trials: 1000
{'learning_rate': 0.21473064465956468, 'max_depth': 4, 'gamma': 5.217099698931682, 'lambda': 0.4751055617116573, 'alpha': 8.422200523233446}
Horizon: 2
------------------------
4.095404591786816
5.909859823509882
Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(312, 10)
Nu

In [20]:
energy_cat_prediction

Unnamed: 0,xgb_pca_energy_h_1,xgb_pca_energy_h_2,xgb_pca_energy_h_3
2016-01-31,-6.104800,-6.226882,-6.193914
2016-02-29,-4.193218,-5.342348,-6.141488
2016-03-31,-6.805299,-5.073478,-6.046509
2016-04-30,-6.919703,-6.227957,-5.946076
2016-05-31,-7.017718,-6.321769,-6.634685
...,...,...,...
2022-08-31,33.472527,29.908594,31.821791
2022-09-30,31.361765,27.997944,32.195183
2022-10-31,33.681252,30.671486,34.017826
2022-11-30,34.983688,32.713638,32.185894


In [21]:
mine_g.save_forecast(forecast_result_df=energy_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)

## NEIG:

In [22]:
cat_short = 'neig'
category = 'Non-energy industrial goods (NEIG)'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'


HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly,
                             fillna=False)
cat_df.dropna(axis=1, inplace=True)

neig_cat_prediction = pd.DataFrame(index=date_range)
yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

Number of items in Non-energy industrial goods (NEIG) group:  302


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df.dropna(axis=1, inplace=True)


In [23]:
for h in [1, 2, 3]:
    X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

    X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
    X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
    X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

    best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False)
    print(best_param)

    N, T = len(X_cat_train), len(X_cat_test)

    y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
    neig_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
    print(mean_absolute_error(yoy_real_test, y_pred))
    print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00
(312, 56)
Number of finished trials: 1000
{'learning_rate': 0.282660321923129, 'max_depth': 9, 'gamma': 0.24222570561891987, 'lambda': 0.14386795749955153, 'alpha': 0.4491393445439831}
Horizon: 1
------------------------
0.7106992460615217
1.061060483362243
Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 56)
Number of finished trials: 1000
{'learning_rate': 0.11356159742000572, 'max_depth': 8, 'gamma': 1.5278102145674186, 'lambda': 0.24979205827786471, 'alpha': 0.6628265189206577}
Horizon: 2
------------------------
0.9785511044661662
1.5083096512380718
Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(3

In [24]:
neig_cat_prediction

Unnamed: 0,xgb_pca_neig_h_1,xgb_pca_neig_h_2,xgb_pca_neig_h_3
2016-01-31,0.639764,0.665797,0.783515
2016-02-29,0.943979,0.704511,0.658100
2016-03-31,0.344499,0.724720,0.711778
2016-04-30,0.295555,0.516369,0.815611
2016-05-31,0.498168,0.509060,0.492938
...,...,...,...
2022-08-31,4.874139,1.816454,2.711798
2022-09-30,5.108075,3.652495,1.069477
2022-10-31,5.403193,3.242584,2.780759
2022-11-30,5.823827,3.728022,3.126815


In [25]:
mine_g.save_forecast(forecast_result_df=neig_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)

# Services:

In [26]:
cat_short = 'services'
category = 'Services'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'


HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly,
                             fillna=False)
cat_df.dropna(axis=1, inplace=True)

services_cat_prediction = pd.DataFrame(index=date_range)
yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

Number of items in Services group:  148


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df.dropna(axis=1, inplace=True)


In [27]:
for h in [1, 2, 3]:
    X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

    X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
    X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
    X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

    best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False)
    print(best_param)

    N, T = len(X_cat_train), len(X_cat_test)

    y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
    services_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
    print(mean_absolute_error(yoy_real_test, y_pred))
    print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00
(312, 34)
Number of finished trials: 1000
{'learning_rate': 0.2950336062725532, 'max_depth': 6, 'gamma': 1.649757864401882, 'lambda': 0.03393276768386008, 'alpha': 0.8861142598532352}
Horizon: 1
------------------------
0.5231414054260826
0.6799060681023014
Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 34)
Number of finished trials: 1000
{'learning_rate': 0.2957637456950312, 'max_depth': 4, 'gamma': 2.3902846469152994, 'lambda': 0.14940282935297614, 'alpha': 0.017922764883711046}
Horizon: 2
------------------------
0.5328371376282987
0.6884264567049797
Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(

In [28]:
services_cat_prediction

Unnamed: 0,xgb_pca_services_h_1,xgb_pca_services_h_2,xgb_pca_services_h_3
2016-01-31,1.788104,2.539376,1.444829
2016-02-29,2.380782,2.151071,1.377225
2016-03-31,1.568678,2.162640,1.474322
2016-04-30,1.398836,1.478015,1.587453
2016-05-31,1.554886,1.422383,1.302446
...,...,...,...
2022-08-31,2.044750,2.049197,1.972254
2022-09-30,2.065027,1.917514,2.434944
2022-10-31,2.381360,2.628388,2.089640
2022-11-30,2.850416,2.792700,2.478073


In [29]:
mine_g.save_forecast(forecast_result_df=services_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)