In [25]:
from os import chdir
chdir('/Users/lananhnguyen/Desktop/thesis/thesis_code')
import main.packages.mine_generic as mine_g
import main.packages.mine_specific as mine_s

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error

import xgboost as xgb
import optuna
import json
from sklearn.decomposition import PCA

In [26]:
optuna.logging.set_verbosity(optuna.logging.WARNING)


In [27]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [28]:
# Forecast:
def generate_forecast(X, y, N, T, h, hyperparam, verbose=1): # from common function
    """
    Generate recursive forecast
    """
    print(f"Horizon: {h}")
    print("------------------------")
    y_pred_series = []
    for i in range(0, T):  # T+1-h
        X_train = X.iloc[: N+i, :]
        y_train = y.iloc[h: N+i+h, :]

        X_test = X.iloc[N+i : N+i+1, :] 
        y_test = y.iloc[N+i+h : N+i+h+1, :]  

        if X_test.index[-1] > X.index[-1]-pd.DateOffset(months=h):
            break

        # Forecast:
        y_pred = mine_s.xgb_pred(X_train, X_test, y_train, y_test, hyperparam)

        if verbose == 1:
            print(f"Training period - features: {X_train.index[0]} to {X_train.index[-1]}")
            print(f"Training period - target : {y_train.index[0]} to {y_train.index[-1]}")
            print(f"Test period - features: {X_test.index}")
            print(f"Test period - target : {y_test.index}")
            print(f"Forecast: {y_pred}")
            print("-------------------------------------------------------")

        y_pred_series.append(y_pred[0])
    return y_pred_series

In [29]:
hicp_all_path = 'data/preprocessed/hicp_yoy.csv'
hicp_class_path = 'data/HICP_COICOP10s.xlsx'
model = 'xgb_pca'
date_range = pd.date_range(start=mine_g.train_test_split_date + pd.DateOffset(months=1), end=mine_g.max_X_date, freq='M')


# Food:

In [30]:
category = 'Food'
cat_short = 'food'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'

HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly)
food_cat_prediction = pd.DataFrame(index=date_range)

Number of items in Food group:  180


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df.fillna(0, inplace=True)


## h = 1

In [31]:
h = 1

X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00


In [32]:
def prepare_X_train_test_pca(X_train, X_test, y, lags, n_components):

    # Scale data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA:
    pca = PCA(n_components = n_components)

    factor_train = pd.DataFrame(pca.fit_transform(X_train_scaled))
    factor_test = pd.DataFrame(pca.transform(X_test_scaled))

    # Create y_lagged:
    y_lagged = pd.DataFrame()
    for lag in range(0, lags + 1):
        y_lagged[f'Lag_{lag}'] = y[y.index <= mine_g.max_X_date].shift(lag)

    # concatenate all:
    X_factor_all = pd.concat([factor_train, factor_test], axis = 0, ignore_index=True)
    factor_col_name = {col: f'factor_{col}' for col in X_factor_all.columns}
    X_factor_all.rename(columns=factor_col_name, inplace=True)

    X_factor_lags_all = pd.concat([y_lagged.reset_index(), X_factor_all], axis=1)
    #X_factor_lags_all.set_index(0, drop=True)
    #X_factor_lags_all.dropna(inplace=True)
    X_factor_lags_all.set_index('date', drop=True, inplace=True)
    print(X_factor_lags_all.shape)
    return X_factor_lags_all

In [33]:
X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

(312, 37)


In [34]:

best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False, use_all_params=False)
best_param
# save best_param:

# with open(f'models/{model}_h_{h}_hyperparam.json', 'w') as json_file:
#     json.dump(best_param, json_file)


Number of finished trials: 1000


{'learning_rate': 0.2878168225266016,
 'max_depth': 3,
 'lambda': 0.4938891500319428,
 'alpha': 0.018372799236502462}

In [35]:

# Forecast:

N, T = len(X_cat_train), len(X_cat_test)
y_pred = generate_forecast(X = X_factor_lags_all, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
food_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred

Horizon: 1
------------------------


In [36]:

yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

print(mean_absolute_error(yoy_real_test, y_pred))
print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

0.8044636471344597
1.2958570776942582


## h = 2

In [38]:
h = 2
X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 37)


In [39]:

best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False, use_all_params=False)
print(best_param)

N, T = len(X_cat_train), len(X_cat_test)

y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
food_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
print(mean_absolute_error(yoy_real_test, y_pred))
print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Number of finished trials: 1000
{'learning_rate': 0.2903095302439779, 'max_depth': 3, 'lambda': 0.6300582812965343, 'alpha': 4.38604865897875}
Horizon: 2
------------------------
1.2039944832792537
1.9787562100301188


## h = 3

In [40]:
h = 3
X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False, use_all_params=False)
print(best_param)

N, T = len(X_cat_train), len(X_cat_test)

y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
food_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
print(mean_absolute_error(yoy_real_test, y_pred))
print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(312, 37)
Number of finished trials: 1000
{'learning_rate': 0.26406373917396314, 'max_depth': 4, 'lambda': 0.04618789659187705, 'alpha': 5.164326443308275}
Horizon: 3
------------------------
1.2505989088753684
2.081885935571803


In [41]:
food_cat_prediction

Unnamed: 0,xgb_pca_food_h_1,xgb_pca_food_h_2,xgb_pca_food_h_3
2016-01-31,1.356598,0.796826,1.647838
2016-02-29,1.350596,0.914567,1.305536
2016-03-31,1.085839,0.981566,1.276447
2016-04-30,1.379725,1.202421,1.051230
2016-05-31,1.056466,0.929876,1.112998
...,...,...,...
2022-08-31,9.098248,6.904180,5.324711
2022-09-30,10.135781,8.649558,7.158765
2022-10-31,12.121883,9.729570,10.069124
2022-11-30,13.700499,11.303638,11.235138


In [57]:
mine_g.save_forecast(forecast_result_df=food_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)

------------------

# Energy:

In [58]:
category = 'Energy'
cat_short = 'energy'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'


HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly)
energy_cat_prediction = pd.DataFrame(index=date_range)

yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

In [49]:
for h in [1, 2, 3]:
    X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

    X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
    X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
    X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

    best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False, use_all_params=False)
    print(best_param)

    N, T = len(X_cat_train), len(X_cat_test)

    y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
    energy_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
    print(mean_absolute_error(yoy_real_test, y_pred))
    print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00
(312, 11)
Number of finished trials: 1000
{'learning_rate': 0.24625842640899043, 'max_depth': 3, 'lambda': 0.03048403178925436, 'alpha': 0.1996033152429902}
Horizon: 1
------------------------
2.5959795561840466
4.016344617812125
Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 11)
Number of finished trials: 1000
{'learning_rate': 0.273507732428635, 'max_depth': 3, 'lambda': 0.028058203118284716, 'alpha': 7.62918116848462}
Horizon: 2
------------------------
3.582320087533878
5.116987738147694
Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(312, 11)
Number of finished trials: 1000
{'learning_rate': 0.21

In [59]:
energy_cat_prediction

Unnamed: 0,xgb_pca_energy_h_1,xgb_pca_energy_h_2,xgb_pca_energy_h_3
2016-01-31,-5.634300,-6.621330,-5.911104
2016-02-29,-4.334748,-6.567018,-5.821362
2016-03-31,-7.002379,-6.497048,-5.960976
2016-04-30,-7.571912,-6.594689,-5.440464
2016-05-31,-7.500488,-6.668287,-6.390052
...,...,...,...
2022-08-31,33.449226,32.758442,31.395893
2022-09-30,31.679750,32.572292,31.314125
2022-10-31,38.874279,34.606262,28.175564
2022-11-30,39.741333,34.445869,33.026100


In [60]:
mine_g.save_forecast(forecast_result_df=energy_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)

## NEIG:

In [61]:
cat_short = 'neig'
category = 'Non-energy industrial goods (NEIG)'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'


HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly)
neig_cat_prediction = pd.DataFrame(index=date_range)
yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

In [51]:
for h in [1, 2, 3]:
    X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

    X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
    X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
    X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

    best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False, use_all_params=False)
    print(best_param)

    N, T = len(X_cat_train), len(X_cat_test)

    y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
    neig_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
    print(mean_absolute_error(yoy_real_test, y_pred))
    print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00
(312, 58)
Number of finished trials: 1000
{'learning_rate': 0.2911594955014518, 'max_depth': 4, 'lambda': 0.47542552532251187, 'alpha': 4.348518128258336}
Horizon: 1
------------------------
0.7934830639677303
1.1557759025632035
Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 58)
Number of finished trials: 1000
{'learning_rate': 0.29998283951820465, 'max_depth': 10, 'lambda': 0.502990435959595, 'alpha': 4.752185634676202}
Horizon: 2
------------------------
0.8177561720763846
1.2216164747789913
Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(312, 58)
Number of finished trials: 1000
{'learning_rate': 0.

In [62]:
neig_cat_prediction

Unnamed: 0,xgb_pca_neig_h_1,xgb_pca_neig_h_2,xgb_pca_neig_h_3
2016-01-31,0.719095,0.966426,0.836978
2016-02-29,0.885443,0.615713,0.747510
2016-03-31,0.453645,0.997458,0.855693
2016-04-30,0.781987,0.576700,0.841586
2016-05-31,0.853814,0.789251,0.301598
...,...,...,...
2022-08-31,4.155545,3.877729,4.251588
2022-09-30,4.298408,4.115819,4.423064
2022-10-31,4.586485,4.558065,4.647146
2022-11-30,4.633268,4.899963,4.773440


In [63]:
mine_g.save_forecast(forecast_result_df=neig_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)

# Services:

In [64]:
cat_short = 'services'
category = 'Services'
hicp_cat_path = f'data/preprocessed/{cat_short}_yoy_infl.csv'
save_cat_file_path = f'data/forecast_results/{cat_short}_forecast.csv'


HICP_monthly, HICP_class, HICP_cat = mine_g.import_data_all(hicp_all_path=hicp_all_path,
                                                     hicp_class_path=hicp_class_path,
                                                     hicp_cat_path=hicp_cat_path)

cat_df = mine_g.split_into_category(category=category,
                             HICP_class=HICP_class,
                             HICP_monthly=HICP_monthly)
services_cat_prediction = pd.DataFrame(index=date_range)
yoy_real = pd.read_csv(f"data/preprocessed/{cat_short}_yoy_infl.csv", parse_dates=True, index_col='date')
yoy_real_test = yoy_real[(yoy_real.index > mine_g.train_test_split_date) & (yoy_real.index <= mine_g.max_X_date)]

In [54]:
for h in [1, 2, 3]:
    X_cat_train, X_cat_test, y_cat_train, y_cat_test = mine_g.split_train_test_set(X = cat_df, y = HICP_cat, h = h)

    X_factor_lags_all = prepare_X_train_test_pca(X_cat_train, X_cat_test, HICP_cat, 3, 0.95)
    X_cat_train_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_train.index)]
    X_cat_test_new = X_factor_lags_all[X_factor_lags_all.index.isin(X_cat_test.index)]

    best_param = mine_s.hyperparam_tuning_optuna(mine_s.objective_xgb, X_cat_train_new, y_cat_train, scaler=False, use_all_params=False)
    print(best_param)

    N, T = len(X_cat_train), len(X_cat_test)

    y_pred = generate_forecast(X = cat_df, y = HICP_cat, N = N, T = T, h = h, hyperparam=best_param, verbose=0)
    services_cat_prediction.loc[:, f'{model}_{cat_short}_h_{h}'] = y_pred
    print(mean_absolute_error(yoy_real_test, y_pred))
    print(np.sqrt(mean_squared_error(yoy_real_test, y_pred)))

Horizon: 1
Training predictor period: 1997-01-31 00:00:00 to 2015-11-30 00:00:00
Training dependent variable period: 1997-02-28 00:00:00 to 2015-12-31 00:00:00
(312, 35)
Number of finished trials: 1000
{'learning_rate': 0.2770137182358671, 'max_depth': 3, 'lambda': 0.9992775064765546, 'alpha': 3.5997087537886507}
Horizon: 1
------------------------
0.5612903062622896
0.7269738557097635
Horizon: 2
Training predictor period: 1997-01-31 00:00:00 to 2015-10-31 00:00:00
Training dependent variable period: 1997-03-31 00:00:00 to 2015-12-31 00:00:00
(312, 35)
Number of finished trials: 1000
{'learning_rate': 0.28622844991129476, 'max_depth': 7, 'lambda': 0.10155262166651956, 'alpha': 0.01874010438900821}
Horizon: 2
------------------------
0.47533973174485716
0.6459962267282484
Horizon: 3
Training predictor period: 1997-01-31 00:00:00 to 2015-09-30 00:00:00
Training dependent variable period: 1997-04-30 00:00:00 to 2015-12-31 00:00:00
(312, 35)
Number of finished trials: 1000
{'learning_rate'

In [65]:
services_cat_prediction

Unnamed: 0,xgb_pca_services_h_1,xgb_pca_services_h_2,xgb_pca_services_h_3
2016-01-31,1.494736,1.707252,1.658896
2016-02-29,1.810799,2.119540,1.753526
2016-03-31,1.187141,1.065606,1.323503
2016-04-30,1.275875,1.175357,1.387650
2016-05-31,1.383212,0.854819,1.021546
...,...,...,...
2022-08-31,1.889655,1.694674,2.071346
2022-09-30,1.855000,1.959387,1.903322
2022-10-31,1.918810,3.246345,2.121928
2022-11-30,2.640146,3.469780,2.306933


In [66]:
mine_g.save_forecast(forecast_result_df=services_cat_prediction.reset_index(drop=True), cat_file_path=save_cat_file_path)