In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import datetime as dt

from utils.dataset import Dataset
from utils.utils import save_files, open_file
from utils.model_utils import prepare_sequential_data, train_test_split_dates

In [3]:
# Load the cleaned dataset; percentage-change features and categorical
# encoding are both switched off for this run.
d = Dataset(
    path="Data/cleaned_data.csv",
    date_col="Date",
    include_pct_change=False,
    encode_categorical=False,
)

# Cast every object-dtype column to bool.
# The columns are assigned by name rather than through `.loc[:, mask] = ...`:
# since pandas 2.0 the `.loc` form writes the values in place and keeps the
# object dtype, so the cast would silently have no effect.
# NOTE(review): astype(bool) on an object column uses Python truthiness (any
# non-empty string becomes True) -- confirm these columns hold boolean-like values.
object_cols = d.data.columns[(d.data.dtypes == "O").to_numpy()].tolist()
if object_cols:
    d.data[object_cols] = d.data[object_cols].astype(bool)
d.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 869 entries, 0 to 868
Data columns (total 38 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   Date                                      869 non-null    datetime64[ns]
 1   unempl_rate_seas                          869 non-null    float64       
 2   EU27_2020_GDP_gth_rate                    300 non-null    float64       
 3   G-20_GDP_gth_rate                         264 non-null    float64       
 4   G-7_GDP_gth_rate                          696 non-null    float64       
 5   OECD_GDP_gth_rate                         696 non-null    float64       
 6   USA_GDP_gth_rate                          869 non-null    float64       
 7   Recession                                 869 non-null    float64       
 8   VIX                                       366 non-null    float64       
 9   potus                           

### Main parameters: features, start and end dates, test set size, lookback

In [4]:
# Label columns: the Fed-rate series, their derived class/trend targets,
# and the date column.
Y_cols = [
    "Fed_rate_month_avg",
    "Fed_rate_spot_EOM",
    "Fed_rate_month_avg_diff",
    "Fed_rate_month_avg_pct_change",
    "Fed_rate_month_avg_diff_3_class",
    "Fed_rate_month_avg_diff_5_class",
    "Fed_rate_month_avg_diff_9_class",
    "Fed_rate_month_avg_trend",
    "Date",
]

# Columns that are excluded from the feature set.
X_non_cols = [
    "Fed_rate_month_avg_diff_3_class",
    "Fed_rate_month_avg_diff_5_class",
    "Fed_rate_month_avg_diff_9_class",
    "Fed_rate_month_avg_trend",
    "Date",
]

# Every remaining dataset column is a candidate feature, in dataset order.
X_cols = [col for col in d.data.columns if col not in X_non_cols]

In [5]:
# Date bounds handed to d.del_rows below; also reused when naming the output folder.
start_date = dt.datetime(year=1992, month=1, day=1)
end_date = dt.datetime(year=2020, month=5, day=1)

d.del_rows(start_date=start_date, end_date=end_date, inplace=True)
# Keep only the columns without any missing value.
d.data = d.data.dropna(axis="columns")

# Train/test split settings.
test_size = 0.2
standardize = True

# Row shifts for the lagged-feature variant (disabled in this run).
# row_shifts = (1, 2, 3, 6, 12)

# Lookback setting for the sequential data.
lookback = 12


## 1. ML training sets: feature shifts

### 1. Add lagged features

In [None]:
# `row_shifts` is commented out in the parameters cell, so a plain reference to
# it raises NameError on Restart & Run All. Look it up defensively and skip the
# lagged-feature step when it is not defined.
lag_shifts = globals().get("row_shifts")
if lag_shifts:
    d.shift_features(row_shifts=lag_shifts, inplace=True)
    # After shifting, the boolean features come back as object dtype; cast them
    # back to bool. Columns are assigned by name because, since pandas 2.0,
    # `.loc[:, mask] = ...` writes in place and would keep the object dtype.
    object_cols = d.data.columns[(d.data.dtypes == "O").to_numpy()].tolist()
    if object_cols:
        d.data[object_cols] = d.data[object_cols].astype(bool)

### 2. Select the lagged feature columns

In [None]:
# Lagged features are recognised by a single "_t-" marker in their name.
lagged_cols = [
    col
    for col in d.data.columns
    if col not in Y_cols and len(col.split("_t-")) == 2
]
# Only override the feature list when lagged columns actually exist. Otherwise
# X_cols would become empty and the sequence-building step further down would
# receive no features.
if lagged_cols:
    X_cols = lagged_cols
d.data.head()

## 2. Sequences for LSTM / GRU

In [6]:
# Keep only the features that are still present in the dataset (columns with
# missing values were dropped earlier).
X_cols = [col for col in X_cols if col in d.columns]
print(X_cols)

# Use the `lookback` parameter rather than a hard-coded 12, so the sequences
# always match the value stored in the saved params.
X, Y = prepare_sequential_data(
    d.data,
    features=X_cols,
    labels=Y_cols,
    lookback=lookback,
)
print("X shape: ", X.shape)
print("Y shape: ", Y.shape)

['unempl_rate_seas', 'G-7_GDP_gth_rate', 'OECD_GDP_gth_rate', 'USA_GDP_gth_rate', 'Recession', 'VIX', 'potus', 'houseOfRep', 'fedChair', 'Perso_consumption_evol_m', 'Fed_rate_month_avg', 'Fed_rate_spot_EOM', 'Fed_rate_month_avg_diff', 'Fed_rate_month_avg_pct_change', 'cpi_all_seas_pct_change', 'cpi_energy_seas_pct_change', 'empl_pop_ratio_seas_pct_change', 'unempl_level_seas_pct_change', 'US_debt_share_in_GDP_pct_change', 'US_debt_nominal_pct_change', 'USDCNY_pct_change', 'GSPC(S&P500)_pct_change', 'WTI_oil_price_pct_change', 'Perso_consumption_real_q_pct_change', 'Non_fin_non_corp_business_inv_pct_change', 'Non_fin_corp_business_inv_pct_change', 'Private_business_domestic_inv_pct_change']
X shape:  (328, 12, 27)
Y shape:  (328, 9)


## 3. Split into train and test sets and save

In [7]:
# Options for the train/test split; `test_date` is not passed, so the helper's
# own default applies.
split_options = dict(
    test_size=test_size,
    standardize=standardize,
    sequential=True,
)
X_train, Y_train, X_test, Y_test = train_test_split_dates(X=X, Y=Y, **split_options)

In [10]:
# Start from the parameters saved with the cleaned data and record this run's settings.
params = open_file(path="Data/cleaned_data_params.yaml")
for key, value in (
    ("start_date", start_date),
    ("end_date", end_date),
    ("test_size", test_size),
    ("standardize", standardize),
    ("lookback", lookback),
):
    params[key] = value
# params["row_shifts"] = row_shifts

# File name -> object to save. The X arrays go to joblib files
# (the CSV alternative is kept below for reference).
files = {
    "X_train.joblib": X_train,
    "X_test.joblib": X_test,
    # "X_train.csv": X_train,
    # "X_test.csv": X_test,
    "Y_train.csv": Y_train,
    "Y_test.csv": Y_test,
    "params.yaml": params,
}

# The output folder name encodes the start month, end month and first test month (YYYYMM).
first_test_date = Y_test['Date'].iloc[0]
month_stamps = [
    stamp.strftime("%Y%m")
    for stamp in (start_date, end_date, first_test_date)
]
path = "./Models/start_{:s}_end_{:s}_test_{:s}".format(*month_stamps) + "_seq"

save_files(path=path, files=files, replace=True)

------------------- 5 file(s) saved succesfully in ./Models/start_199201_end_202005_test_201311_seq/ -------------------
