In [1]:
from datetime import datetime
import os
import numpy as np
import pandas as pd

# import functions
from pysrc.load_data import read_data, sort_data, transform_data, drop_data
from pysrc.load_spec import load_spec
from pysrc.summarize import summarize

In [2]:
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.subplots as psub

In [3]:
from dateutil import relativedelta

In [4]:
# Show up to 30 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 30

In [5]:
def detect_frequency(date_series):
    """Label a datetime series as monthly, quarterly or weekly from its mean spacing.

    Args:
        date_series: pandas Series of datetimes; consecutive differences are
            taken in the order given.

    Returns:
        str: 'monthly' (mean gap in [27, 32) days), 'quarterly' ([80, 100) days),
        'weekly' ([6, 7] days) or 'unknown' for anything else, including a
        series with fewer than two dates (mean gap is NaT).
    """
    avg_gap = date_series.diff().dropna().mean()

    # (label, lower bound in days, upper bound in days, is the upper bound inclusive?)
    bands = (
        ('monthly', 27, 32, False),
        ('quarterly', 80, 100, False),
        ('weekly', 6, 7, True),
    )
    for label, low_days, high_days, closed_right in bands:
        low = pd.Timedelta(days=low_days)
        high = pd.Timedelta(days=high_days)
        below_high = avg_gap <= high if closed_right else avg_gap < high
        if low <= avg_gap and below_high:
            return label
    return 'unknown'


def lag_to_fill_ragged_edges(df):
    """Shift each column down so that its last valid observation sits on the last row.

    A column whose final value is missing is lagged by the number of trailing
    missing rows; columns that already end on a valid value, or that contain
    no valid value at all, are left untouched.

    Args:
        df: wide DataFrame, one series per column.

    Returns:
        The same DataFrame object, modified in place.
    """
    n_rows = len(df)
    for name in df.columns:
        column = df[name]

        # Only columns that end on a missing value are ragged.
        if not column.tail(1).isna().all():
            continue

        last_obs_label = column.last_valid_index()
        if last_obs_label is None:
            # Entirely missing column: there is nothing to realign.
            continue

        trailing_gaps = n_rows - 1 - column.index.get_loc(last_obs_label)
        df[name] = column.shift(trailing_gaps)

    return df

def _pivot_wide(groups, label, src_fpath):
    """Stack long-format groups and pivot them to ReferenceDate x VariableCode."""
    if not groups:
        raise ValueError(f"No {label} series detected in {src_fpath}")
    return pd.concat(groups).pivot(
        index="ReferenceDate", columns="VariableCode", values="VariableValue"
    )


def source_data_prep(src_fpath, Spec, country="US", series_name="GDPC1"):
    """Convert a long-format CSV vintage into the Excel workbook read by `load_data`.

    The CSV must provide the columns ReferenceDate, VariableCode and
    VariableValue. Series are split by detected frequency, pivoted to wide
    form and their ragged trailing edges are lagged; the quarterly target
    `series_name` is kept unlagged. Series detected as weekly or unknown are
    not pivoted and only reach the workbook through the "other" sheet.

    Args:
        src_fpath (str): path of the source CSV; its basename up to the first
            "." becomes the vintage name.
        Spec: model specification; only `Spec["seriesid"]` is read here.
        country (str): sub-folder of `data/` that receives the workbook.
        series_name (str): quarterly series that must not be lagged.

    Returns:
        str: the vintage name. The workbook is written to
        `data/<country>/<vintage>.xlsx` with sheets "data" (series listed in
        the spec, wide) and "other" (remaining series, long).

    Raises:
        ValueError: if no monthly or no quarterly series is detected.
    """
    df_long = pd.read_csv(src_fpath)
    df_long["ReferenceDate"] = pd.to_datetime(df_long["ReferenceDate"])

    # Collect the groups per frequency and concatenate once, instead of
    # growing a DataFrame inside the loop.
    by_frequency = {"monthly": [], "quarterly": []}
    for _, group in df_long.groupby('VariableCode'):
        frequency = detect_frequency(group['ReferenceDate'])
        if frequency in by_frequency:
            by_frequency[frequency].append(group)

    df_m = lag_to_fill_ragged_edges(
        _pivot_wide(by_frequency["monthly"], "monthly", src_fpath)
    )

    df_q = _pivot_wide(by_frequency["quarterly"], "quarterly", src_fpath)
    df_q = pd.merge(
        lag_to_fill_ragged_edges(df_q.drop(columns=[series_name])),
        df_q[[series_name]],
        left_index=True,
        right_index=True
    )

    # Select spec series in the frames' own column order; a set intersection
    # would make the written column order vary from run to run.
    spec_ids = set(Spec["seriesid"])
    core_series = pd.merge(
        df_m[[code for code in df_m.columns if code in spec_ids]],
        df_q[[code for code in df_q.columns if code in spec_ids]],
        right_index=True,
        left_index=True,
        how="left"
    ).reset_index()
    other_series = df_long.loc[~df_long["VariableCode"].isin(Spec["seriesid"])]

    vintage = os.path.basename(src_fpath).split(".")[0]

    out_dir = os.path.join('data', country)
    os.makedirs(out_dir, exist_ok=True)  # the country folder may not exist yet
    # The context manager saves and closes the workbook even if a sheet fails.
    with pd.ExcelWriter(os.path.join(out_dir, f'{vintage}.xlsx'), engine="xlsxwriter") as writer:
        for sheet, frame in (("data", core_series), ("other", other_series)):
            frame.to_excel(writer, sheet_name=sheet, index=False)
    return vintage

In [6]:
def conv_matrix_to_df(matrix, date_col):
    """Turn a matrix whose first row holds the headers into a date-indexed DataFrame.

    Args:
        matrix: 2-D array-like; row 0 supplies the column names and the
            remaining rows the data.
        date_col: dates assigned to the data rows as the "ReferenceDate" column.

    Returns:
        DataFrame indexed by "ReferenceDate", sorted by that index.
    """
    table = pd.DataFrame(matrix)
    table.columns = table.iloc[0].values  # promote the first row to headers
    return (
        table.drop(index=0)
        .assign(ReferenceDate=date_col)
        .set_index("ReferenceDate")
        .sort_index()
    )

In [7]:
def load_data(datafile, Spec, sample=None, load_excel=False):
    """
    Load a data vintage, order it by the model specification and transform it.

    A cached copy is looked up at `<folder of datafile>/mat/<name>.npz` and is
    used when it exists, unless `load_excel` is set; otherwise the Excel
    workbook is read with `read_data`.

    Parameters:
        datafile (str): Path of the Microsoft Excel workbook (.xlsx or .xls)
        Spec: Model specification, forwarded to `sort_data` and `transform_data`
        sample (optional): Start of the estimation sample, forwarded to
            `drop_data`; rows are dropped only when this is not None.
            (This notebook passes a `datetime`.)
        load_excel (bool, optional): Force reading the workbook even when a
            cached .npz file exists

    Returns:
        X: transformed dataset returned by `transform_data`
        Time: observation dates
        Z: raw (untransformed) dataset
        header: header returned by `transform_data` (used as column labels
            for X and Z elsewhere in this notebook)

    Raises:
        ValueError: if no cache is used and the file is not .xlsx / .xls
    """
    print('Loading data...')

    # Split with os.path so that a bare filename (no directory part) or a
    # path written with "/" on Windows still yields the right cache location.
    folder, filename = os.path.split(datafile)
    stem, ext = os.path.splitext(filename)
    datafile_mat = os.path.join(folder, 'mat', stem + '.npz')

    if os.path.exists(datafile_mat) and not load_excel:
        # Load raw data from a NumPy formatted binary (.npz) file.
        # NOTE(security): allow_pickle=True can execute arbitrary code when the
        # file is malicious; only load caches produced by this project.
        with np.load(datafile_mat, allow_pickle=True) as data:
            Z = data['Z']
            Time = data['Time']
            Mnem = data['Mnem']
    elif ext in ['.xlsx', '.xls']:
        # Read raw data from Excel file
        Z, Time, Mnem = read_data(datafile)
        # np.savez(datafile_mat, Z=Z, Time=Time, Mnem=Mnem)
    else:
        raise ValueError('Only Microsoft Excel workbook files supported.')

    # Sort data based on model specification
    Z = sort_data(Z, Mnem, Spec)

    # Transform data based on model specification
    X, Time, Z, header = transform_data(Z, Time, Spec)

    # Drop data not in estimation sample
    if sample is not None:
        X, Time, Z = drop_data(X, Time, Z, sample)

    return X, Time, Z, header

In [8]:
from scipy.signal import lfilter
from scipy.interpolate import splrep, splev
import numpy as np

def filter(x, k):
    """Causal moving average of `x` over 2*k+1 equally weighted taps.

    Note: this name shadows Python's built-in `filter` within the notebook.
    """
    taps = 2 * k + 1
    kernel = np.full(taps, 1.0 / taps)
    return lfilter(kernel, [1], x)

def _centered_moving_average(x, k):
    """Centered (2*k+1)-point moving average of `x`, padding both ends with the edge value."""
    window = 2 * k + 1
    padded = np.concatenate(([x[0]] * k, x, [x[-1]] * k))
    smoothed = lfilter(np.ones(window) / window, [1], padded)
    return smoothed[2 * k:]  # drop the warm-up samples so the length matches x


def _fill_with_median_and_ma(x, missing, k):
    """Overwrite x[missing] in place: series median first, then the moving-average value."""
    if x.size == 0 or not missing.any():
        return
    x[missing] = np.nanmedian(x)
    x[missing] = _centered_moving_average(x, k)[missing]


def _spline_fill_column(x, k):
    """Spline-interpolate interior gaps of `x` in place, then fill the edges via median + MA."""
    observed = np.flatnonzero(~np.isnan(x))
    if observed.size == 0:
        return  # nothing to anchor an interpolation on; the column stays NaN
    first, last = observed[0], observed[-1]
    # A cubic spline needs at least 4 points; use a lower degree for shorter series.
    degree = min(3, observed.size - 1)
    if degree >= 1:
        tck = splrep(observed, x[observed], k=degree, s=0)
        x[first:last + 1] = splev(np.arange(first, last + 1), tck)
    # Only leading/trailing gaps can still be missing at this point.
    _fill_with_median_and_ma(x, np.isnan(x), k)


def _trim_edge_rows(X, row_missing):
    """Drop the leading and trailing runs of rows flagged in `row_missing`."""
    counts = np.arange(1, X.shape[0] + 1)
    leading = np.cumsum(row_missing) == counts
    trailing = (np.cumsum(row_missing[::-1]) == counts)[::-1]
    edge_rows = leading | trailing
    return X[~edge_rows, :], edge_rows


def remNaNs_spline(X, options):
    """
    Treats NaNs in the dataset for use in Dynamic Factor Models (DFM).

    This function processes NaNs in a data matrix `X` according to five cases,
    which are useful for running functions in the `DFM.m` file that do not
    accept missing value inputs.

    Replication files for:
    "Nowcasting", 2010, by Marta Banbura, Domenico Giannone, and Lucrezia Reichlin,
    in Michael P. Clements and David F. Hendry, editors, Oxford Handbook on Economic Forecasting.

    The software can be freely used in applications. Users are kindly requested to
    add acknowledgments to published work and cite the above reference in any resulting publications.

    The input array is never modified; all work is done on a float copy.

    Args:
        X (ndarray): Input data matrix of shape (T, n) where `T` is time and `n` is the number of series.
        options (dict): A dictionary with the following keys:
            - method (int): Determines the method for handling NaNs.
                - 1: Replaces all missing values with the series median, then
                     with the moving average around them.
                - 2: Removes leading and trailing rows with more than 80% NaN,
                     then fills the remaining gaps (spline inside the observed
                     span, median + moving average at the edges).
                - 3: Only removes leading and trailing rows that are entirely NaN.
                - 4: Like 2, but a row counts as missing only if all entries are NaN.
                - 5: Fills gaps as in 2 and 4 without removing any rows.
              Any other value returns the data unchanged.
            - k (int): Half-width of the moving-average filter; the window has
              `2*k+1` equal weights (MATLAB `filter(ones(2*k+1,1)/(2*k+1), 1, x)`).

    Returns:
        tuple:
            - X (ndarray): The processed data matrix (rows removed for methods 2-4).
            - indNaN (ndarray): Boolean mask of missing values, taken after any
              row removal and before filling.
            - nanLE (ndarray or None): Boolean mask over the original rows
              marking the removed leading/trailing rows; None for methods
              that remove no rows.

    Notes:
        A column with no observation at all is left as NaN by the spline-based
        methods. Columns with fewer than four observations are interpolated
        with a lower-degree spline.
    """
    X = np.array(X, dtype=float)  # private copy: never write into the caller's array
    T, N = X.shape
    k = options["k"]
    method = options["method"]
    indNaN = np.isnan(X)
    nanLE = None

    if method == 1:
        for i in range(N):
            _fill_with_median_and_ma(X[:, i], indNaN[:, i], k)

    elif method in (2, 3, 4):
        nan_per_row = indNaN.sum(axis=1)
        # Method 2 treats a row as missing when more than 80% of it is NaN;
        # methods 3 and 4 require the whole row to be NaN.
        row_missing = nan_per_row > N * 0.8 if method == 2 else nan_per_row == N
        X, nanLE = _trim_edge_rows(X, row_missing)
        indNaN = np.isnan(X)
        if method != 3:
            for i in range(N):
                _spline_fill_column(X[:, i], k)

    elif method == 5:
        for i in range(N):
            _spline_fill_column(X[:, i], k)

    return X, indNaN, nanLE


In [9]:
# Path of the long-format CSV vintage to process.
# NOTE(review): the next cell assigns the same value again, so this cell is redundant.
src_fpath = "./data/0_art_vintages/2017-01-01.csv"

In [10]:
## User inputs.
src_fpath = "./data/0_art_vintages/2017-01-01.csv"  # long-format source vintage (CSV)
country = 'US'  # United States macroeconomic data
sample_start = datetime.strptime('2000-01-01', '%Y-%m-%d')  # start of the estimation sample

## Load model specification and dataset.
Spec = load_spec('Spec_US_example.xls')
# Unpack the specification fields used later in the notebook.
SeriesID = Spec['seriesid']
SeriesName = Spec['seriesname']
Units = Spec['units']
UnitsTransformed = Spec['unitstransformed']
Frequency = Spec['frequency']

# Convert the CSV vintage into the workbook that load_data reads.
# (To reuse an existing workbook instead, set `vintage` by hand, e.g. '2016-06-29'.)
vintage = source_data_prep(src_fpath=src_fpath, Spec=Spec, country=country)

# Prefer an .xls workbook for this vintage when one exists, otherwise use the .xlsx.
datafile = os.path.join('data', country, f'{vintage}.xls')
if not os.path.exists(datafile):
    datafile = os.path.join('data', country, f'{vintage}.xlsx')

X, Time, Z, header = load_data(datafile, Spec, sample_start)
summarize(X.astype(float), Time, Spec, vintage);  # summarize data


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



Table 1: Model specification
              SeriesID                   SeriesName                 Units  \
0               PAYEMS           Payroll Employment  Thousands of Persons   
1               JTSJOL                 Job Openings             Thousands   
2             CPIAUCSL         Consumer Price Index                 Index   
3              DGORDER         Durable Goods Orders           $, Millions   
4                RSAFS                 Retail Sales           $, Millions   
5               UNRATE            Unemployment Rate                     %   
6                HOUST               Housing Starts    Thousands of Units   
7               INDPRO        Industrial Production                 Index   
8              DSPIC96              Personal Income   Chained $, Billions   
9              BOPTEXP                      Exports           $, Millions   
10             BOPTIMP                      Imports           $, Millions   
11             TTLCONS        Construction Spen

In [11]:
# Raw observations as a labelled frame: one column per series, one row per date.
source_data = pd.DataFrame(Z, index=Time, columns=header)
# Estimation window [start, end]: start is the latest first-valid date across
# all series; end is filled in once the reference date is known.
date_ranges = [
    source_data.apply(pd.Series.first_valid_index).max(),
    None,
]

In [12]:
# Prepare data -----------------------------------------------------------
# Keep only the rows from the start of the estimation window onwards.
x_est, time_est = (arr[list(Time).index(date_ranges[0]):] for arr in (X, Time))

# Standardize every series (NaN-aware mean and standard deviation).
Mx, Wx = np.nanmean(x_est, axis=0), np.nanstd(x_est, axis=0)
xNaN = (x_est - Mx) / Wx

# Method 4: drop all-NaN edge rows, then fill the remaining gaps.
optNaN = dict(method=4, k=3)
x_est, _, nanLE = remNaNs_spline(xNaN, optNaN)

In [13]:
# remNaNs_spline may have dropped leading/trailing all-NaN rows (flagged in
# nanLE); apply the same mask to the dates so rows and index stay aligned.
X_df = pd.DataFrame(
    data=x_est,
    columns=header,
    index=time_est if nanLE is None else pd.Index(time_est)[~nanLE],
)
Z_df = pd.DataFrame(data=Z, columns=header, index=Time)

fig = psub.make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                            subplot_titles=("Raw Observed Data", "Transformed data"))

## Plot raw and transformed data.
# Target series shown in both panels (GDPC1).
series_name = "GDPC1"
idxSeries = SeriesID.index(series_name)
# Plot raw observed data; markers are used when the series has gaps, because
# a line would not be drawn across the missing observations.
trace1 = go.Scatter(
    x=Z_df.index,
    y=Z_df[series_name],
    mode="markers" if Z_df[series_name].isna().sum() else "lines",
    name='Raw Observed Data',
    line=dict(color="#000000", width=1),
    marker={"size": 4, "symbol": "diamond"},
)
fig.add_trace(trace1, row=1, col=1)

# Plot transformed data
trace2 = go.Scatter(
    x=X_df.index,
    y=X_df[series_name],
    mode='lines',
    name='Transformed Data',
    line=dict(color="#BDC1D6", width=1),
)
fig.add_trace(trace2, row=2, col=1)
fig.update_layout(
    height=600,
    width=800,
    showlegend=False,
    title_text=series_name,
    plot_bgcolor="white",
)
# Both panels span the full sample so that they line up.
fig.update_xaxes(range=[Time[0], Time[-1]], row=1, col=1, gridcolor="lightgrey")
fig.update_yaxes(title_text=Units[idxSeries], row=1, col=1, gridcolor="lightgrey")

fig.update_xaxes(range=[Time[0], Time[-1]], title_text='Time', row=2, col=1, gridcolor="lightgrey")
fig.update_yaxes(title_text=UnitsTransformed[idxSeries], row=2, col=1, gridcolor="lightgrey")
fig.show()

In [14]:
# Prepare data -----------------------------------------------------------
# Monthly block: standardized, gap-filled monthly series.
idx_iM = [i for i in range(len(Spec["seriesid"])) if Spec["frequency"][i] == "m"]
# remNaNs_spline may have dropped edge rows (flagged in nanLE); mask the dates
# the same way so the index matches x_est, as is done for the quarterly block.
df_m = pd.DataFrame(
    data=x_est[:, idx_iM],
    columns=header[idx_iM],
    index=time_est if nanLE is None else pd.Index(time_est)[~nanLE],
)

time_index = pd.DatetimeIndex(Time)
quarterly_index = time_index.to_period('Q')
monthly_index = pd.period_range(start=date_ranges[0], end=quarterly_index.max(), freq='M').to_timestamp()
df_m = df_m.reindex(monthly_index).dropna(how="all")
# Fill values still missing after the reindex from the standardized, unfilled data.
df_xNaN = pd.DataFrame(xNaN[:, idx_iM], columns=header[idx_iM], index=time_est)
df_m = df_m.fillna(df_xNaN)

# Quarterly block: transformed (not standardized) series at quarter-start dates.
idx_iQ = [i for i in range(len(Spec["seriesid"])) if Spec["frequency"][i] == "q"]
xNaN_sQ = pd.DataFrame(
    X[:, idx_iQ], columns=header[idx_iQ], index=Time
).loc[pd.date_range(start=Time[0], end=Time[-1].date(), freq='QS')]
x_est_q, _, nanLE_q = remNaNs_spline(xNaN_sQ.to_numpy(), options=optNaN)
df_q = pd.DataFrame(x_est_q, columns=header[idx_iQ], index=xNaN_sQ.index[~nanLE_q])


In [15]:
# Preview the last rows of each prepared frame.
for caption, frame in (
    ("Transformed monthly series:", df_m),
    ("Transformed quarterly series:", df_q),
    ("Raw observed data:", Z_df),
):
    print(caption)
    display(frame.tail(5))

# The reference date is the latest quarter-start in the quarterly frame
# (one of "20XX-01-01", "20XX-04-01", "20XX-07-01", "20XX-10-01").
# It also closes the estimation window opened earlier in date_ranges[0].
date_ranges[1] = reference_date = df_q[series_name].index.max()
print(f"Reference date: {reference_date}, estimation date ranges: {date_ranges}")

Transformed monthly series:


Unnamed: 0,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI
2016-11-01,0.402135,-0.368354,0.131353,-1.14054,-0.120744,-1.189845,-1.551545,-0.373924,0.036579,-1.345072,0.353432,0.509562,-0.148282,0.293318,-0.295306,0.276442,-0.755903,-0.375091,-0.68978,-0.04146,-0.406443,0.310578,0.178448
2016-12-01,0.362913,0.162726,0.284394,-0.254072,0.662821,0.594922,1.269779,1.037764,-0.071018,-0.360009,0.30158,1.100011,0.195354,0.77562,-1.051462,-0.410595,0.293495,0.94121,0.992143,0.4248,0.126677,0.119591,0.90746
2017-01-01,0.628752,-0.424485,1.223634,0.498675,0.23382,0.594922,-0.338137,-0.520984,-0.080264,1.245858,0.542573,-0.332844,0.308266,1.886966,0.004573,0.137744,1.09694,-0.563134,0.17439,0.19018,-0.018719,0.605335,1.165928
2017-02-01,0.641826,0.34407,-0.146425,0.467667,-0.510221,-0.594922,0.565635,-0.003836,-0.299412,0.191194,0.834078,-0.513394,0.192544,0.59464,1.452661,1.290771,-1.231412,0.000995,0.163535,0.305143,0.853658,-1.065415,2.471522
2017-03-01,0.114507,0.482237,-1.457275,0.098572,-0.467387,-1.189845,-0.820177,0.71198,-0.008794,-0.137859,-0.960626,0.475095,-0.204024,-3.545755,0.514708,-0.087065,0.867384,0.753167,0.118269,0.188182,0.542095,-0.693189,1.775647


Transformed quarterly series:


Unnamed: 0,GDPC1,ULCNFB
2016-01-01,0.834583,5.703889
2016-04-01,1.413788,-0.340885
2016-07-01,3.516445,6.226937
2016-10-01,2.079755,0.732883
2017-01-01,0.694108,1.664405


Raw observed data:


Unnamed: 0,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI,GDPC1,ULCNFB
2016-11-01,145170,5587,242.199,228192,466028,4.6,1149,102.9771,12785.5,186398,228975,1173749,121.1,249.464,111.906,111.365,1212,75.5,1807227,120.8,1.9,11637.1,8.7,,
2016-12-01,145325,5631,242.821,226239,470616,4.7,1275,103.7707,12801.7,185995,231478,1191468,121.6,250.013,111.954,111.434,1228,76.0,1822058,121.3,9.6,11662.9,19.7,,
2017-01-01,145541,5539,244.158,231752,473104,4.8,1241,103.4685,12816.9,191014,235273,1188941,122.3,250.783,112.114,111.632,1293,75.7,1828232,121.6,7.5,11709.4,23.6,16842.4,111.212
2017-02-01,145760,5625,244.456,237085,471865,4.7,1303,103.5317,12807.9,192512,240685,1183840,122.8,251.299,112.428,112.102,1216,75.7,1834310,122.0,20.1,11684.8,43.3,,
2017-03-01,145858,5743,243.752,238713,470844,4.5,1215,104.0998,12831.0,192872,236429,1192822,122.6,250.993,112.643,112.248,1267,76.1,1839919,122.3,15.6,11676.1,32.8,,


Reference date: 2017-01-01 00:00:00, estimation date ranges: [Timestamp('2001-07-01 00:00:00'), Timestamp('2017-01-01 00:00:00')]


In [16]:
# Echo the reference date selected in the previous cell.
reference_date

Timestamp('2017-01-01 00:00:00')

In [17]:
# Aggregate to a quarterly modelling table ------------------------------------
# Quarter-start timestamps spanning the estimation window.
dt_index = pd.period_range(start=date_ranges[0], end=date_ranges[1], freq='Q').to_timestamp()
# Monthly predictors are summed within each quarter and left-joined onto the
# quarterly series.
df_mod_q = df_q.reindex(dt_index).merge(
    df_m.resample('QS').sum(),
    how="left",
    left_index=True,
    right_index=True,
)

# Split target and predictors, then fill the remaining gaps in the predictors.
# NOTE: this rebinds `X` and `X_df` from earlier cells; restart and run all
# before re-executing those cells.
y = df_mod_q[series_name]
X_df = df_mod_q.drop(columns=[series_name])
X, _, _ = remNaNs_spline(X_df.values, options={"method": 1, "k": 3})
X = pd.DataFrame(X, index=X_df.index, columns=X_df.columns)

# Train-test split: the reference quarter is held out as the single test row.
test_index = reference_date
train_index = y.drop(reference_date).index
X_train, y_train = X.loc[train_index], y.loc[train_index]
X_test, y_test = X.loc[[test_index]], y.loc[[test_index]]

for caption, table in (
    ("X train:", X_train),
    ("y train:", y_train),
    ("X test:", X_test),
    ("y test:", y_test),
):
    print(caption)
    display(table)

X train:


Unnamed: 0,ULCNFB,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI
2001-07-01,-5.372532,-3.151598,-1.877936,-0.888092,-1.786988,-2.227358,2.974612e+00,-0.590812,-1.910864,3.120509,-2.917253,-1.605168,-0.530446,-1.483674,1.442330,1.365138,-1.374785,-0.906772,-2.629618,-3.873680,-1.040619,-3.954163,0.544801,-3.772452
2001-10-01,-1.687672,-4.380555,-3.307102,-2.867450,0.036098,2.009864,4.164456e+00,-0.012445,-1.641704,-3.033959,-4.494768,-2.214588,-0.808320,-3.577272,2.745911,-0.632468,-2.246247,1.979073,-2.065489,-6.694282,-2.474664,-4.798846,1.133939,-4.660521
2002-01-01,-1.432232,-2.210269,-0.146528,0.368504,0.321284,-0.605630,-1.363338e-17,0.611065,1.771715,2.221512,-0.097759,0.902067,1.335948,0.787375,-0.658076,-1.913257,-1.475829,0.273801,1.319286,-3.306319,-0.473397,0.359260,0.075148,1.045653
2002-04-01,-6.612353,-1.055398,0.885408,0.357642,-0.534660,0.265670,5.949223e-01,0.530030,2.348055,-0.123574,1.247517,1.241840,-1.545038,0.701197,0.651384,1.321053,1.634705,1.192025,2.259501,-1.474634,0.102775,1.293950,-1.287164,1.662000
2002-07-01,3.259364,-1.626296,-0.539441,0.521532,0.179500,-0.436247,-5.949223e-01,0.581539,-0.361526,-0.855973,0.415523,0.798558,-1.638890,0.754131,1.279269,1.191918,0.382000,0.831294,-0.185058,0.301591,0.672209,-0.741598,1.824114,-0.597937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-10-01,0.830430,2.688128,-1.048933,-1.099083,-0.160240,-0.120389,4.005274e-17,-0.255102,-2.098848,0.038603,-1.621784,-1.804978,-1.168766,-1.761752,0.890585,-1.024645,-1.413517,1.274009,-1.689403,-1.603754,-3.370798,-3.642600,0.052860,-2.553014
2016-01-01,5.703889,1.624775,1.407852,-1.297510,0.373298,-1.429914,-2.220446e-16,-0.479787,-0.833778,-0.012136,-1.559773,-0.211261,1.154186,-1.202697,1.621314,1.096806,-2.301542,-1.939773,-0.373101,-1.467880,-2.480034,-3.829538,0.008998,-1.094990
2016-04-01,-0.340885,1.210764,-0.012679,0.753274,-1.097527,1.225977,-5.949223e-01,0.799073,0.575918,0.273606,-0.543247,-1.037887,-1.432301,1.480688,0.786133,0.058810,0.568855,1.339596,0.755157,-0.127508,2.839861,-1.295488,0.916516,-1.101618
2016-07-01,6.226937,2.182599,-0.474675,-0.060346,0.774860,0.211232,1.161329e-16,-1.501621,-0.472665,0.130491,1.388831,0.514878,1.176829,-0.330379,0.393919,0.010545,-0.562363,1.274009,-0.373101,-0.464167,-0.935379,-2.015546,0.227544,-0.200294


y train:


2001-07-01   -1.259126
2001-10-01    1.115917
2002-01-01    3.734696
2002-04-01    2.223841
2002-07-01    1.962626
                ...   
2015-10-01    0.873102
2016-01-01    0.834583
2016-04-01    1.413788
2016-07-01    3.516445
2016-10-01    2.079755
Freq: QS-OCT, Name: GDPC1, Length: 62, dtype: float64

X test:


Unnamed: 0,ULCNFB,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI
2017-01-01,1.664405,1.385084,0.401822,-0.380065,1.064914,-0.743787,-1.189845,-0.592679,0.18716,-0.38847,1.299194,0.416026,-0.371142,0.296786,-1.06415,1.971942,1.341451,0.732913,0.191028,0.456194,0.683505,1.377033,-1.153269,5.413096


y test:


2017-01-01    0.694108
Name: GDPC1, dtype: float64

In [18]:
# Write the prepared splits and the raw data to a folder named after the reference date.
out_dir = f'./data/1_art_data_prep/{reference_date.strftime("%Y-%m-%d")}'
# makedirs also creates missing parent folders and is a no-op when the folder exists.
os.makedirs(out_dir, exist_ok=True)

X_train.to_csv(os.path.join(out_dir, "X_train.csv"))
X_test.to_csv(os.path.join(out_dir, "X_test.csv"))
y_train.to_csv(os.path.join(out_dir, "y_train.csv"))
y_test.to_csv(os.path.join(out_dir, "y_test.csv"))
Z_df.to_csv(os.path.join(out_dir, "Z_df.csv"))