In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import datetime as dt

from utils.dataset import Dataset
from utils.utils import save_files, open_file
from utils.model_utils import prepare_sequential_data, train_test_split_dates

In [12]:
# Load the cleaned dataset, with include_pct_change and encode_categorical both disabled.
d = Dataset(
    path="Data/cleaned_data.csv",
    date_col="Date",
    include_pct_change=False,
    encode_categorical=False,
)

# Cast every object-typed column to bool. The frame is replaced as a whole (instead of
# writing through `.loc[:, mask] = ...`) because a `.loc` write may be applied in place
# and leave the columns with object dtype, so the cast would not stick.
# NOTE(review): astype(bool) turns NaN into True -- confirm these columns have no gaps.
object_cols = d.data.columns[d.data.dtypes == 'O']
d.data = d.data.astype({col: bool for col in object_cols})
d.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 869 entries, 0 to 868
Data columns (total 38 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   Date                                      869 non-null    datetime64[ns]
 1   unempl_rate_seas                          869 non-null    float64       
 2   EU27_2020_GDP_gth_rate                    300 non-null    float64       
 3   G-20_GDP_gth_rate                         264 non-null    float64       
 4   G-7_GDP_gth_rate                          696 non-null    float64       
 5   OECD_GDP_gth_rate                         696 non-null    float64       
 6   USA_GDP_gth_rate                          869 non-null    float64       
 7   Recession                                 869 non-null    float64       
 8   VIX                                       366 non-null    float64       
 9   potus                           

### Main parameters: features, start and end dates, test set size, lookback

In [13]:
# Columns treated as targets / labels (the "Date" column is carried along with them).
Y_cols = [
    "Fed_rate_month_avg",
    "Fed_rate_spot_EOM",
    "Fed_rate_month_avg_diff",
    "Fed_rate_month_avg_pct_change",
    "Fed_rate_month_avg_diff_3_class",
    "Fed_rate_month_avg_diff_5_class",
    "Fed_rate_month_avg_diff_9_class",
    "Fed_rate_month_avg_trend",
    "Date",
]

# Columns excluded from the model inputs: the class / trend labels and the date.
X_non_cols = [
    "Fed_rate_month_avg_diff_3_class",
    "Fed_rate_month_avg_diff_5_class",
    "Fed_rate_month_avg_diff_9_class",
    "Fed_rate_month_avg_trend",
    "Date",
]

# Every remaining dataset column is a candidate input feature.
X_cols = [col for col in d.data.columns if col not in X_non_cols]

In [14]:
# --- Notebook-level settings -------------------------------------------------
# Sample window. Alternative start that was tried: dt.datetime(1992, 1, 1)
start_date = dt.datetime(year=1966, month=1, day=1)
end_date = dt.datetime(year=2020, month=5, day=1)

test_size = 0.2                # used below to place the train/test cut-off date
standardize = True             # forwarded to train_test_split_dates
row_shifts = (1, 2, 3, 6, 12)  # offsets handed to Dataset.shift_features
lookback = 12                  # window length recorded in params.yaml

# --- Apply the window to the data ----------------------------------------------
d.del_rows(start_date=start_date, end_date=end_date, inplace=True)
# Drop every column that still contains a NaN inside the selected window.
d.data = d.data.dropna(axis=1)


In [15]:
# Display the frame that remains after the date filter and the NaN-column drop.
d.data

Unnamed: 0,Date,unempl_rate_seas,G-7_GDP_gth_rate,OECD_GDP_gth_rate,USA_GDP_gth_rate,Recession,potus,houseOfRep,fedChair,Perso_consumption_evol_m,...,empl_pop_ratio_seas_pct_change,unempl_level_seas_pct_change,US_debt_share_in_GDP_pct_change,US_debt_nominal_pct_change,GSPC(S&P500)_pct_change,WTI_oil_price_pct_change,Perso_consumption_real_q_pct_change,Non_fin_non_corp_business_inv_pct_change,Non_fin_corp_business_inv_pct_change,Private_business_domestic_inv_pct_change
216,1966-01-31,4.0,0.005685,0.005627,0.007622,0.0,False,False,False,0.677935,...,0.001767,-0.014187,-0.006017,-0.002567,0.000414,0.006361,0.012281,0.021890,0.054129,0.183105
217,1966-02-28,3.8,0.005685,0.005627,0.007622,0.0,False,False,False,0.726407,...,-0.001764,-0.056225,-0.006048,-0.002541,-0.011441,-0.008475,0.012281,0.021890,0.054129,0.183105
218,1966-03-31,3.8,0.006357,0.005496,0.008049,0.0,False,False,False,0.727240,...,0.000000,0.023759,-0.006080,-0.002516,-0.026590,-0.015059,0.012281,-0.007726,0.031567,-0.013720
219,1966-04-30,3.8,0.006357,0.005496,0.008049,0.0,False,False,False,0.142240,...,0.003534,-0.020436,0.002554,0.008647,0.013529,-0.005230,-0.000074,-0.007726,0.031567,-0.013720
220,1966-05-31,3.9,0.006357,0.005496,0.008049,0.0,False,False,False,-0.880398,...,-0.001761,0.043140,0.002580,0.008644,-0.047987,0.008644,-0.000074,-0.007726,0.031567,-0.013720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,2019-12-31,3.5,0.000020,0.000532,0.001755,0.0,True,False,True,-0.029010,...,0.000000,-0.009981,0.004366,0.007369,0.024490,-0.057850,0.003151,0.025698,-0.019649,-0.259066
864,2020-01-31,3.6,0.000020,0.000532,0.001755,0.0,True,False,True,0.377935,...,0.003279,0.024161,0.006959,0.009496,-0.007231,-0.114988,-0.018529,0.025698,-0.019649,-0.259066
865,2020-02-29,3.5,0.000020,0.000532,0.001755,0.0,True,False,True,-0.073593,...,-0.001634,-0.017821,0.007007,0.009549,-0.074349,-0.430517,-0.018529,0.025698,-0.019649,-0.259066
866,2020-03-31,4.4,-0.006775,-0.006036,-0.004306,0.0,True,False,True,-6.472760,...,-0.018003,0.233800,0.007056,0.009600,-0.164881,-0.448472,-0.018529,0.025698,-0.019649,-0.259066


## 1. ML training sets: feature shifts

### 1. Add lagged features

In [16]:
# Add the lagged feature columns for every offset listed in `row_shifts`.
d.shift_features(row_shifts=row_shifts, inplace=True)

# Boolean features come back with object dtype after the shift; cast them to bool again.
# The frame is replaced as a whole (instead of writing through `.loc[:, mask] = ...`)
# because a `.loc` write may be applied in place and leave the dtype as object.
# NOTE(review): astype(bool) turns NaN into True, so the leading rows of the lagged
# boolean columns (which have no history) become True -- confirm this is acceptable.
object_cols = d.data.columns[d.data.dtypes == 'O']
d.data = d.data.astype({col: bool for col in object_cols})

### 2. Select the lagged feature columns

In [17]:
# Model inputs: columns whose name contains the lag marker "_t-" exactly once
# and that are not listed among the targets.
X_cols = [
    col
    for col in d.data.columns
    if col.count("_t-") == 1 and col not in Y_cols
]
d.data.head()

Unnamed: 0,Date,unempl_rate_seas,G-7_GDP_gth_rate,OECD_GDP_gth_rate,USA_GDP_gth_rate,Recession,potus,houseOfRep,fedChair,Perso_consumption_evol_m,...,Non_fin_corp_business_inv_pct_change_t-1,Non_fin_corp_business_inv_pct_change_t-2,Non_fin_corp_business_inv_pct_change_t-3,Non_fin_corp_business_inv_pct_change_t-6,Non_fin_corp_business_inv_pct_change_t-12,Private_business_domestic_inv_pct_change_t-1,Private_business_domestic_inv_pct_change_t-2,Private_business_domestic_inv_pct_change_t-3,Private_business_domestic_inv_pct_change_t-6,Private_business_domestic_inv_pct_change_t-12
216,1966-01-31,4.0,0.005685,0.005627,0.007622,0.0,False,False,False,0.677935,...,,,,,,,,,,
217,1966-02-28,3.8,0.005685,0.005627,0.007622,0.0,False,False,False,0.726407,...,0.054129,,,,,0.183105,,,,
218,1966-03-31,3.8,0.006357,0.005496,0.008049,0.0,False,False,False,0.72724,...,0.054129,0.054129,,,,0.183105,0.183105,,,
219,1966-04-30,3.8,0.006357,0.005496,0.008049,0.0,False,False,False,0.14224,...,0.031567,0.054129,0.054129,,,-0.01372,0.183105,0.183105,,
220,1966-05-31,3.9,0.006357,0.005496,0.008049,0.0,False,False,False,-0.880398,...,0.031567,0.031567,0.054129,,,-0.01372,-0.01372,0.183105,,


## 2. Sequences for LSTM / GRU

In [8]:
def prepare_sequential_data(data, features, labels, lookback=12):
    """Build fixed-length look-back windows for a sequence model (LSTM / GRU).

    One sample is produced for every row position ``t >= lookback`` of ``data``.
    It holds the ``lookback`` feature rows that precede ``t``, stored
    most-recent-first: ``X[j, i]`` is the feature row at position
    ``lookback + j - (i + 1)``, so index 0 on axis 1 is the row right before the
    target row and index ``lookback - 1`` is the oldest row of the window.

    Note: in this notebook the definition shadows the function of the same name
    imported from ``utils.model_utils``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table, already sorted in the order the windows should follow.
    features : list of str
        Columns used as model inputs.
    labels : list of str
        Columns returned as targets.
    lookback : int, default 12
        Number of preceding rows in each window.

    Returns
    -------
    X : numpy.ndarray
        Float array of shape ``(len(data) - lookback, lookback, len(features))``.
    Y : pandas.DataFrame
        ``data[labels]`` without its first ``lookback`` rows, aligned with ``X``.

    Raises
    ------
    ValueError
        If ``lookback`` is negative or larger than the number of rows.
    """
    feature_matrix = data[features].to_numpy()
    n_rows, n_features = feature_matrix.shape

    # Fail early with a readable message instead of numpy's "negative dimensions".
    if not 0 <= lookback <= n_rows:
        raise ValueError(
            "lookback must be between 0 and the number of rows ({}), got {}".format(
                n_rows, lookback
            )
        )

    Y = data[labels].iloc[lookback:]
    X = np.zeros(shape=(n_rows - lookback, lookback, n_features))
    for step in range(lookback):
        # Step 0 is the row immediately before each target; larger steps go further back.
        X[:, step, :] = feature_matrix[lookback - (step + 1) : n_rows - (step + 1)]
    return X, Y

In [9]:
# Keep only the requested features that the dataset actually exposes.
X_cols = [col for col in X_cols if col in d.columns]
print(X_cols)

# Pass the notebook-level `lookback` setting instead of a literal 12, so the window
# length of X always matches the value written to params.yaml further down.
X, Y = prepare_sequential_data(d.data, features=X_cols, labels=Y_cols, lookback=lookback)
print("X shape: ", X.shape)
print("Y shape: ", Y.shape)
print(Y)

['unempl_rate_seas_t-1', 'unempl_rate_seas_t-2', 'unempl_rate_seas_t-3', 'unempl_rate_seas_t-6', 'unempl_rate_seas_t-12', 'G-7_GDP_gth_rate_t-1', 'G-7_GDP_gth_rate_t-2', 'G-7_GDP_gth_rate_t-3', 'G-7_GDP_gth_rate_t-6', 'G-7_GDP_gth_rate_t-12', 'OECD_GDP_gth_rate_t-1', 'OECD_GDP_gth_rate_t-2', 'OECD_GDP_gth_rate_t-3', 'OECD_GDP_gth_rate_t-6', 'OECD_GDP_gth_rate_t-12', 'USA_GDP_gth_rate_t-1', 'USA_GDP_gth_rate_t-2', 'USA_GDP_gth_rate_t-3', 'USA_GDP_gth_rate_t-6', 'USA_GDP_gth_rate_t-12', 'Recession_t-1', 'Recession_t-2', 'Recession_t-3', 'Recession_t-6', 'Recession_t-12', 'potus_t-1', 'potus_t-2', 'potus_t-3', 'potus_t-6', 'potus_t-12', 'houseOfRep_t-1', 'houseOfRep_t-2', 'houseOfRep_t-3', 'houseOfRep_t-6', 'houseOfRep_t-12', 'fedChair_t-1', 'fedChair_t-2', 'fedChair_t-3', 'fedChair_t-6', 'fedChair_t-12', 'Perso_consumption_evol_m_t-1', 'Perso_consumption_evol_m_t-2', 'Perso_consumption_evol_m_t-3', 'Perso_consumption_evol_m_t-6', 'Perso_consumption_evol_m_t-12', 'Fed_rate_month_avg_t-1',

## 3. Split in train and test set and save

In [18]:
# Cut-off date placed (1 - test_size) of the way through the [start_date, end_date] window.
split_date = start_date + (end_date - start_date) * (1 - test_size)

X_train, Y_train, X_test, Y_test = train_test_split_dates(
    X=X, Y=Y, test_date=split_date, standardize=standardize, sequential=True
)

In [20]:
# Start from the parameters stored with the cleaned data and record this run's settings.
# row_shifts is currently not written to params.
params = open_file(filepath="Data/cleaned_data_params.yaml")
params.update({
    "start_date": start_date,
    "end_date": end_date,
    "test_size": test_size,
    "standardize": standardize,
    "lookback": lookback,
})

# Objects to persist, keyed by file name: X arrays as joblib, Y frames as CSV.
files = {
    "X_train.joblib": X_train,
    "X_test.joblib": X_test,
    "Y_train.csv": Y_train,
    "Y_test.csv": Y_test,
    "params.yaml": params,
}

# Output folder named after the sample window and the first test-set date (YYYYMM each).
# The lookback length is currently not part of the folder name.
path_template = "./Models/start_{:s}_end_{:s}_test_{:s}"
date_tags = [
    stamp.strftime("%Y%m")
    for stamp in (start_date, end_date, Y_test['Date'].iloc[0])
]
path = path_template.format(*date_tags)

save_files(path=path, files=files, overwrite=True)

--------------------- 5 file(s) saved succesfully in ./Models/start_196601_end_202005_test_200906/ ---------------------
