In [6]:
from datetime import datetime
import os
import numpy as np
import pandas as pd

# import functions
from pysrc.load_data import read_data, sort_data, transform_data, drop_data
from pysrc.load_spec import load_spec
from pysrc.summarize import summarize

In [7]:
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.subplots as psub

In [8]:
from dateutil import relativedelta

In [9]:
# Render up to 30 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 30

In [10]:
def source_data_prep(src_fpath, country="US"):
    """
    Convert a long-format CSV vintage into a wide Excel workbook.

    The CSV is read with "," as the decimal separator, pivoted so that every
    `VariableCode` becomes a column indexed by `ReferenceDate`, and written to
    `data/{country}/{vintage}.xlsx` (sheet "data"). The target folder is
    created when it does not exist yet.

    Parameters:
        src_fpath (str): Path to the source CSV. It must contain the columns
            `ReferenceDate`, `VariableCode` and `VariableValue`.
        country (str, optional): Sub-folder of `data` that receives the workbook.

    Returns:
        vintage (str): File name of `src_fpath` up to its first ".".
    """
    df_long = pd.read_csv(src_fpath, decimal=",")
    df_long["ReferenceDate"] = pd.to_datetime(df_long["ReferenceDate"])
    # pivot() raises if a (ReferenceDate, VariableCode) pair occurs more than once.
    df_pivot = df_long.pivot(index="ReferenceDate", columns="VariableCode", values="VariableValue")
    vintage = os.path.basename(src_fpath).split(".")[0]

    # to_excel() does not create missing folders, so make sure the target exists.
    out_dir = os.path.join('data', country)
    os.makedirs(out_dir, exist_ok=True)
    df_pivot.to_excel(os.path.join(out_dir, f'{vintage}.xlsx'), sheet_name="data")
    return vintage

In [11]:
def conv_matrix_to_df(matrix, date_col):
    """
    Turn a matrix whose first row holds the column names into a date-indexed frame.

    Parameters:
        matrix: 2-D data; row 0 supplies the headers, the remaining rows the values.
        date_col: One date per data row; becomes the `ReferenceDate` index.

    Returns:
        pd.DataFrame: Data rows indexed by `ReferenceDate`, sorted by that index.
    """
    table = pd.DataFrame(matrix)
    table.columns = table.iloc[0].to_numpy()

    # Discard the header row now that it has been promoted to column labels.
    data_rows = table.drop(index=0)
    data_rows["ReferenceDate"] = date_col
    return data_rows.set_index("ReferenceDate").sort_index()

In [12]:
def load_data(datafile, Spec, sample=None, load_excel=False):
    """
    Load a vintage of data from file and transform it per the model specification.

    A cached NumPy archive at `{folder of datafile}/mat/{name}.npz` is used when
    it exists and `load_excel` is False; otherwise the Excel workbook is read.
    This function only reads the cache, it never writes it.

    Parameters:
        datafile (str): Path of the Microsoft Excel workbook (.xlsx or .xls)
        Spec (dict): Model specification, forwarded to `sort_data` and `transform_data`
        sample (optional): Start of the estimation sample, forwarded to `drop_data`;
            None keeps every observation
        load_excel (bool, optional): Flag to force loading from Excel even if a
            cached archive exists

    Returns:
        X: Transformed dataset as returned by `transform_data`
        Time: Observation dates
        Z: Raw (untransformed) dataset
        header: Column labels as returned by `transform_data`

    Raises:
        ValueError: If no cached archive is used and `datafile` is not an
            .xlsx / .xls file.
    """
    print('Loading data...')

    # os.path.split copes with bare file names and with either path separator,
    # unlike slicing at the last os.path.sep.
    folder, filename = os.path.split(datafile)
    stem, ext = os.path.splitext(filename)
    datafile_mat = os.path.join(folder, 'mat', stem + '.npz')

    if os.path.exists(datafile_mat) and not load_excel:
        # Load raw data from a NumPy formatted binary (.npz) file.
        # NOTE(review): allow_pickle=True executes arbitrary code when the archive
        # is malicious; only load caches from trusted locations.
        with np.load(datafile_mat, allow_pickle=True) as data:
            Z = data['Z']
            Time = data['Time']
            Mnem = data['Mnem']
    elif ext.lower() in ('.xlsx', '.xls'):
        # Read raw data from Excel file
        Z, Time, Mnem = read_data(datafile)
        # np.savez(datafile_mat, Z=Z, Time=Time, Mnem=Mnem)  # cache writing is disabled
    else:
        raise ValueError('Only Microsoft Excel workbook files supported.')

    # Sort data based on model specification
    Z = sort_data(Z, Mnem, Spec)

    # Transform data based on model specification
    X, Time, Z, header = transform_data(Z, Time, Spec)

    # Drop data not in estimation sample
    if sample is not None:
        X, Time, Z = drop_data(X, Time, Z, sample)

    return X, Time, Z, header

In [13]:
from scipy.signal import lfilter
from scipy.interpolate import splrep, splev
import numpy as np

def filter(x, k):
    """Smooth `x` with an equally weighted moving average spanning 2*k+1 samples."""
    window = 2 * k + 1
    weights = np.full(window, 1.0 / window)
    # Denominator [1] makes this a pure FIR (numerator-only) filter.
    return lfilter(weights, [1], x)

def _edge_missing_rows(row_missing):
    """Mark rows inside the unbroken run of missing rows at the start or end of the sample."""
    counts = np.arange(1, row_missing.size + 1)
    # cumsum == position holds only while every row so far has been missing.
    lead = np.cumsum(row_missing) == counts
    trail = (np.cumsum(row_missing[::-1]) == counts)[::-1]
    return lead | trail


def _spline_fill_interior(x):
    """Spline-interpolate `x` in place between its first and last observed entries."""
    observed = np.flatnonzero(~np.isnan(x))
    if observed.size < 2:
        return  # nothing lies between fewer than two observations
    first, last = observed[0], observed[-1]
    # splrep needs more points than the spline degree; fall back to a lower
    # degree for very short series instead of failing.
    degree = min(3, observed.size - 1)
    tck = splrep(observed, x[observed], k=degree, s=0)
    x[first:last + 1] = splev(np.arange(first, last + 1), tck)


def _moving_average_fill(x, missing, k):
    """Fill `x[missing]` in place with a centred (2*k+1)-term moving average.

    Missing entries are first set to the series median so the average is
    defined everywhere; the series is padded with its end values so the window
    is centred on every observation.
    """
    if not missing.any() or missing.all():
        return  # nothing to fill, or no observation to fill from (stays NaN)
    x[missing] = np.nanmedian(x)
    padded = np.concatenate(([x[0]] * k, x, [x[-1]] * k))
    window = 2 * k + 1
    # Same kernel as the module-level `filter` helper, applied directly.
    smoothed = lfilter(np.ones(window) / window, [1], padded)[2 * k:]
    x[missing] = smoothed[missing]


def _spline_then_average(X, k):
    """Treat every column of `X` in place: spline the interior, then average-fill the rest."""
    for i in range(X.shape[1]):
        column = X[:, i]  # view: edits land in X
        _spline_fill_interior(column)
        _moving_average_fill(column, np.isnan(column), k)


def remNaNs_spline(X, options):
    """
    Treats NaNs in the dataset for use in Dynamic Factor Models (DFM).

    This function processes NaNs in a data matrix `X` according to five cases,
    which are useful for running functions in the `DFM.m` file that do not
    accept missing value inputs. The input array is never modified; the
    function works on a float copy.

    Replication files for:
    "Nowcasting", 2010, by Marta Banbura, Domenico Giannone, and Lucrezia Reichlin,
    in Michael P. Clements and David F. Hendry, editors, Oxford Handbook on Economic Forecasting.

    The software can be freely used in applications. Users are kindly requested to
    add acknowledgments to published work and cite the above reference in any resulting publications.

    Args:
        X (ndarray): Input data matrix of shape (T, n) where `T` is time and `n` is the number of series.
        options (dict): A dictionary with the following keys:
            - method (int): Determines the method for handling NaNs.
                - 1: Replaces all missing values using a filter.
                - 2: Replaces missing values after removing leading and trailing missing rows
                     (a row is 'missing' if more than 80% is NaN).
                - 3: Only removes leading and trailing rows in which all series are NaN.
                - 4: Replaces missing values after removing leading and trailing missing rows
                     (a row is 'missing' if all are NaN).
                - 5: Replaces missing values using a spline and then applies a filter.
              Any other value leaves the data untreated.
            - k (int): Half-width of the moving average; the filter numerator is
              `ones(2*k+1) / (2*k+1)` with the denominator set to 1.

    Returns:
        tuple:
            - X (ndarray): The processed data matrix (rows removed for methods 2-4).
            - indNaN (ndarray): Boolean matrix marking the NaNs of the returned rows
              before they were filled.
            - nanLE (ndarray or None): Boolean vector of length T marking the removed
              leading/trailing rows; None for methods that remove no rows.

    Notes:
        A series with no observation at all is left as NaN. A series with fewer
        than four observations is interpolated with a lower-degree spline.
    """
    X = np.array(X, dtype=float)  # private float copy; the caller's data stays intact
    T, N = X.shape
    k = options["k"]
    method = options["method"]
    indNaN = np.isnan(X)
    nanLE = None

    if method == 1:
        # Replace every missing value with the moving average; no spline, no row removal.
        for i in range(N):
            _moving_average_fill(X[:, i], indNaN[:, i], k)

    elif method in (2, 3, 4):
        nan_per_row = indNaN.sum(axis=1)
        # Method 2 treats a row as missing above 80% NaN; methods 3 and 4 only when all are NaN.
        row_missing = nan_per_row > N * 0.8 if method == 2 else nan_per_row == N
        nanLE = _edge_missing_rows(row_missing)

        X = X[~nanLE, :]  # remove leading and trailing missing rows
        indNaN = np.isnan(X)

        if method != 3:  # method 3 only trims the sample
            _spline_then_average(X, k)

    elif method == 5:
        _spline_then_average(X, k)

    return X, indNaN, nanLE


In [14]:
## User inputs.
country = 'US'  # United States macroeconomic data
sample_start = datetime.strptime('2000-01-01', '%Y-%m-%d')  # estimation sample

# Convert the long-format CSV vintage into the workbook under data/{country}/.
# `country` is passed explicitly so the workbook is written where it is read below.
src_fpath = "./data/2024-08-26.csv"
vintage = source_data_prep(src_fpath=src_fpath, country=country)
# vintage = '2016-06-29'  # vintage dataset to use for estimation


## Load model specification and dataset.
# Load model specification structure `Spec`
Spec = load_spec('Spec_US_example.xls')
# Parse `Spec`
SeriesID = Spec['seriesid']
SeriesName = Spec['seriesname']
Units = Spec['units']
UnitsTransformed = Spec['unitstransformed']
Frequency = Spec['frequency']

# Load data: use {vintage}.xls when it exists, otherwise {vintage}.xlsx
datafile = os.path.join('data', country, f'{vintage}.xls')
if not os.path.exists(datafile):
    datafile = os.path.join('data', country, f'{vintage}.xlsx')
X, Time, Z, header = load_data(datafile, Spec, sample_start)
summarize(X.astype(float), Time, Spec, vintage);  # summarize data

Table 1: Model specification
              SeriesID                   SeriesName                 Units  \
0               PAYEMS           Payroll Employment  Thousands of Persons   
1               JTSJOL                 Job Openings             Thousands   
2             CPIAUCSL         Consumer Price Index                 Index   
3              DGORDER         Durable Goods Orders           $, Millions   
4                RSAFS                 Retail Sales           $, Millions   
5               UNRATE            Unemployment Rate                     %   
6                HOUST               Housing Starts    Thousands of Units   
7               INDPRO        Industrial Production                 Index   
8              DSPIC96              Personal Income   Chained $, Billions   
9              BOPTEXP                      Exports           $, Millions   
10             BOPTIMP                      Imports           $, Millions   
11             TTLCONS        Construction Spen


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [15]:
# Raw dataset as a frame: one column per series, one row per observation date.
source_data = pd.DataFrame(data=Z, index=Time, columns=header)

# Estimation window [start, end]. The start is the latest "first valid" date
# across all series; the end is filled in once the reference date is known.
first_valid_dates = source_data.apply(lambda series: series.first_valid_index())
date_ranges = [first_valid_dates.max(), None]

In [16]:
# Prepare data -----------------------------------------------------------
# Keep observations from the first date on which every series has a value.
est_start = list(Time).index(date_ranges[0])
x_est = X[est_start:]
time_est = Time[est_start:]

# Standardize series (NaN-aware mean and standard deviation per column).
Mx = np.nanmean(x_est, axis=0)
Wx = np.nanstd(x_est, axis=0)
xNaN = (x_est - Mx) / Wx

optNaN = {"method": 4, "k": 3}
x_est, _, nanLE = remNaNs_spline(xNaN, optNaN)

# Method 4 removes leading/trailing rows in which every series is NaN. Trim the
# dates and the unfilled matrix the same way (as the quarterly path does with
# its own mask) so that later frames built from them keep matching lengths.
if nanLE.any():
    xNaN = xNaN[~nanLE]
    time_est = np.asarray(time_est)[~nanLE]
assert len(time_est) == x_est.shape[0] == xNaN.shape[0], "dates and data rows are out of sync"

In [17]:
## Plot raw and transformed data.
# Series to plot: GDPC1 (fred.stlouisfed.org/series/GDPC1)
series_name = "GDPC1"
idxSeries = SeriesID.index(series_name)

X_df = pd.DataFrame(data=x_est, columns=header, index=time_est)
Z_df = pd.DataFrame(data=Z, columns=header, index=Time)

fig = psub.make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    subplot_titles=("Raw Observed Data", "Transformed data"),
)

# Top panel: raw observed data. Markers are used when the raw series has gaps,
# because a line would silently bridge the missing observations.
raw_has_gaps = Z_df[series_name].isna().any()
trace1 = go.Scatter(
    x=Z_df.index,
    y=Z_df[series_name],
    mode="markers" if raw_has_gaps else "lines",
    name='Raw Observed Data',
    line=dict(color="#000000", width=1),
    marker={"size": 4, "symbol": "diamond"},
)
fig.add_trace(trace1, row=1, col=1)

# Bottom panel: transformed data
trace2 = go.Scatter(
    x=X_df.index,
    y=X_df[series_name],
    mode='lines',
    name='Transformed Data',
    line=dict(color="#BDC1D6", width=1),
)
fig.add_trace(trace2, row=2, col=1)

fig.update_layout(
    height=600,
    width=800,
    showlegend=False,
    title_text=series_name,
    plot_bgcolor="white",
)

# Both panels span the full sample so their time axes line up.
full_span = [Time[0], Time[-1]]
fig.update_xaxes(range=full_span, row=1, col=1, gridcolor="lightgrey")
fig.update_yaxes(title_text=Units[idxSeries], row=1, col=1, gridcolor="lightgrey")

fig.update_xaxes(range=full_span, title_text='Time', row=2, col=1, gridcolor="lightgrey")
fig.update_yaxes(title_text=UnitsTransformed[idxSeries], row=2, col=1, gridcolor="lightgrey")
fig.show()

In [18]:
# Prepare data -----------------------------------------------------------
n_series = len(Spec["seriesid"])

# Monthly series: NaN-treated values, with any remaining gap taken from the
# standardized (untreated) values.
idx_iM = [i for i in range(n_series) if Spec["frequency"][i] == "m"]

time_index = pd.DatetimeIndex(Time)
quarterly_index = time_index.to_period('Q')
monthly_index = pd.period_range(start=date_ranges[0], end=quarterly_index.max(), freq='M').to_timestamp()

df_xNaN = pd.DataFrame(xNaN[:, idx_iM], columns=header[idx_iM], index=time_est)
df_m = (
    pd.DataFrame(data=x_est[:, idx_iM], columns=header[idx_iM], index=time_est)
    .reindex(monthly_index)
    .dropna(how="all")
    .fillna(df_xNaN)
)

# Quarterly series: take the transformed values at quarter-start dates, then
# apply the same NaN treatment; rows dropped by it are removed from the index too.
idx_iQ = [i for i in range(n_series) if Spec["frequency"][i] == "q"]
quarter_starts = pd.date_range(start=Time[0], end=Time[-1].date(), freq='QS')
xNaN_sQ = pd.DataFrame(X[:, idx_iQ], columns=header[idx_iQ], index=Time).loc[quarter_starts]
x_est_q, _, nanLE_q = remNaNs_spline(xNaN_sQ.to_numpy(), options=optNaN)
df_q = pd.DataFrame(x_est_q, columns=header[idx_iQ], index=xNaN_sQ.index[~nanLE_q])

In [19]:
# Preview the last five rows of each dataset.
for caption, frame in (
    ("Transformed monthly series:", df_m),
    ("Transformed quarterly series:", df_q),
    ("Raw observed data:", Z_df),
):
    print(caption)
    display(frame.tail(5))

# target reference dates:
# "20XX-01-01", "20XX-04-01", "20XX-07-01", "20XX-10-01"
# The reference date is the quarter following the last quarterly observation.
last_quarter = df_q[series_name].index.max()
reference_date = last_quarter + relativedelta.relativedelta(months=3)
date_ranges[1] = reference_date
print(f"Reference date: {reference_date}, estimation date ranges: {date_ranges}")

Transformed monthly series:


Unnamed: 0,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI
2024-04-01,0.004169,-1.424568,0.336525,-0.000353,-0.222891,0.129015,0.641013,-0.020116,-0.111498,0.129506,0.8025,0.828327,0.563531,0.587992,0.660314,0.386905,-0.626493,-0.056474,0.088009,0.517671,-1.156464,-0.19711,0.511603
2024-05-01,0.07636,0.924702,-0.634871,-0.02911,-0.04964,0.129015,-0.532581,0.558254,0.0554,-0.370193,-0.225755,-0.594402,-0.187649,-0.260393,-0.357061,-0.638206,-0.56941,0.541661,0.325594,-0.786072,-1.237533,0.192207,-0.119678
2024-06-01,0.051628,-0.198041,-0.830744,-1.468876,-0.247957,0.129015,0.089244,0.185747,-0.038233,0.449573,0.11738,-0.516959,-0.081118,-0.90875,0.057251,-0.426852,0.800583,0.152909,0.08195,-0.398006,-0.638872,0.037855,-0.303324
2024-07-01,0.00818,-0.109534,-0.163091,2.002068,0.284558,0.257422,-0.794473,-0.50009,-0.016639,0.062373,0.103442,0.022635,0.025564,-0.24627,-0.058757,-0.111411,-0.669306,-0.568612,0.110522,0.585015,-0.676288,0.00242,0.419781
2024-08-01,0.055448,0.091518,-0.219578,0.095197,0.001542,0.073984,-0.170276,0.066331,-0.001502,0.052006,-0.010131,-0.07993,-0.01296,-0.268325,-0.180652,-0.170316,-0.069934,0.052765,0.107882,-0.046104,-0.557803,0.030007,-0.779654


Transformed quarterly series:


Unnamed: 0,GDPC1,ULCNFB
2023-04-01,2.060217,2.505186
2023-07-01,4.861686,0.053417
2023-10-01,3.39603,-2.800239
2024-01-01,1.409499,3.793829
2024-04-01,2.841666,0.922269


Raw observed data:


Unnamed: 0,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI,GDPC1,ULCNFB
2024-04-01,158214.0,7919.0,313.207,282737.0,702681.0,3.9,1377.0,102.4955,16910.4,263443.0,337905.0,2163179.0,141.7,317.622,121.947,123.109,1440.0,77.7624,2546223.0,149.8,-14.3,15679.4,15.5,22918.739,120.394
2024-05-01,158430.0,8230.0,313.225,282987.0,704309.0,4.0,1315.0,103.2734,16967.2,262006.0,337012.0,2154816.0,141.5,318.14,122.102,123.146,1399.0,78.2853,2558827.0,148.8,-15.6,15742.7,4.5,,
2024-06-01,158609.0,8184.0,313.049,263571.0,702862.0,4.1,1329.0,103.5494,16984.4,265938.0,339047.0,2148444.0,141.5,318.346,122.324,123.243,1454.0,78.4235,2567535.0,148.4,-6.0,15777.0,1.3,,
2024-07-01,158723.0,,313.534,289645.0,709668.0,4.3,1238.0,102.8887,,,,,141.7,318.872,,,1406.0,77.8477,,149.5,-6.6,,13.9,,
2024-08-01,,,,,,,,,,,,,,,,,,,,,-4.7,,-7.0,,


Reference date: 2024-07-01 00:00:00, estimation date ranges: [Timestamp('2007-01-01 00:00:00'), Timestamp('2024-07-01 00:00:00')]


In [20]:
# data aggregation
# Quarterly modelling frame: quarterly series joined with the monthly series
# summed within each quarter.
dt_index = pd.period_range(start=date_ranges[0], end=date_ranges[1], freq='Q').to_timestamp()
monthly_by_quarter = df_m.resample('QS').sum()
df_mod_q = df_q.reindex(dt_index).merge(monthly_by_quarter, how="left", left_index=True, right_index=True)

# NOTE(review): `X` and `X_df` are rebound here to the modelling features, so
# re-running the earlier cells that read them requires re-running the load cell first.
y = df_mod_q[series_name]
X_df = df_mod_q.drop(columns=[series_name])
X, _, _ = remNaNs_spline(X_df.values, options={"method": 1, "k": 3})
X = pd.DataFrame(X, columns=X_df.columns, index=X_df.index)

# train-test split: quarters with an observed target train the model, the one
# quarter without a target is the one to predict.
target_missing = y.isna()
train_index, test_index = y.index[~target_missing], y.index[target_missing]
assert len(test_index) == 1
X_train, y_train = X.loc[train_index], y.loc[train_index]
X_test, y_test = X.loc[test_index], y.loc[test_index]

for caption, obj in (
    ("X train:", X_train),
    ("y train:", y_train),
    ("X test:", X_test),
    ("y test:", y_test),
):
    print(caption)
    display(obj)

X train:


Unnamed: 0,ULCNFB,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI
2007-01-01,7.920009,0.150874,0.915447,1.436649,-0.092809,-0.036960,0.001826,-1.038610,0.543941,0.018905,1.111035,0.766081,0.044730,0.294827,0.160340,1.507798,2.077746,-0.552299,0.104635,-0.009416,1.482108,2.024570,-0.256018,-0.341818
2007-04-01,-1.460556,-0.010220,-0.484051,1.029529,-0.194768,-0.336126,0.258639,-0.428101,0.519701,-0.175731,0.581728,-0.099030,0.455062,2.396181,-0.945229,-0.730280,0.893920,-2.650102,0.184366,1.321556,0.689859,1.588046,-0.223130,0.237813
2007-07-01,-1.653878,-0.183345,-0.811125,0.041579,-0.338841,0.114327,0.130232,-2.246605,0.164717,-0.115984,0.920727,0.218812,-1.154413,0.887918,-0.372182,0.372785,0.322485,-2.036459,0.050268,0.303016,0.177134,3.178240,0.243568,-0.341818
2007-10-01,1.659716,-0.008214,-0.496631,2.417769,1.504043,-0.284403,0.387045,-1.409287,0.198890,-0.133829,1.038469,0.718833,-4.026658,3.135258,1.010462,1.020388,2.313884,-1.551253,0.341201,1.611408,1.743697,2.959978,-0.242052,-0.617286
2008-01-01,6.625626,-0.297648,-1.166503,1.027363,-0.962166,-0.650719,0.130232,-0.379395,-0.663212,-0.152495,1.023183,1.072514,-1.683648,3.389791,-0.141406,-0.227064,0.896626,-2.550206,-0.441457,0.732609,3.217990,-1.717062,-0.523891,-4.198375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-01,2.505186,0.344721,-1.726303,0.403385,0.887591,0.183743,0.130232,0.697432,-0.368069,-0.047022,-1.365920,-0.516418,1.706229,-0.512417,2.741421,1.852987,0.198009,0.047073,-0.808483,-1.832704,-3.063821,-1.692118,0.009239,-3.997513
2023-07-01,0.053417,0.223733,0.412257,1.446676,-0.409567,0.352819,0.258639,-0.408673,0.634735,-0.219955,1.209691,0.631371,0.555430,0.620530,1.111317,0.183783,1.459115,0.361030,0.374649,-0.374426,1.565413,-1.791895,0.216215,-2.275835
2023-10-01,-2.800239,0.221728,-1.474708,-0.464911,-0.294507,-0.336580,-0.126581,1.542904,-0.545909,-0.063238,-0.715467,-0.244159,1.101340,-1.643690,1.422254,-1.036771,-1.658344,0.261135,-0.755329,-1.142459,-2.697750,-1.417731,0.424488,-3.021895
2024-01-01,3.793829,0.332020,-1.839521,1.600431,-0.510673,-0.428886,0.130232,-1.861268,-0.136160,-0.153115,-0.170680,0.305730,0.597078,0.896666,3.310526,4.346159,2.442660,-0.595111,-0.273507,-1.165529,1.111959,-4.972282,-0.249026,-1.260046


y train:


2007-01-01    1.208784
2007-04-01    2.469595
2007-07-01    2.324088
2007-10-01    2.536727
2008-01-01   -1.696161
                ...   
2023-04-01    2.060217
2023-07-01    4.861686
2023-10-01    3.396030
2024-01-01    1.409499
2024-04-01    2.841666
Freq: QS-OCT, Name: GDPC1, Length: 70, dtype: float64

X test:


Unnamed: 0,ULCNFB,PAYEMS,JTSJOL,CPIAUCSL,DGORDER,RSAFS,UNRATE,HOUST,INDPRO,DSPIC96,BOPTEXP,BOPTIMP,TTLCONS,IR,CPILFESL,PCEPILFE,PCEPI,PERMIT,TCU,BUSINV,IQ,GACDISA066MSFRBNY,PCEC96,GACDFSA066MSFRBPHI
2024-07-01,1.23118,0.063627,-0.018016,-0.382668,2.097266,0.2861,0.331405,-0.964749,-0.433759,-0.018142,0.114379,0.09331,-0.057294,0.012604,-0.514595,-0.239409,-0.281727,-0.739239,-0.515848,0.218404,0.538911,-1.234091,0.032428,-0.359874


y test:


2024-07-01   NaN
Freq: QS-OCT, Name: GDPC1, dtype: float64

In [23]:
# Write the modelling inputs to a folder named after the reference date.
out_dir = f'./data/{reference_date.strftime("%Y-%m-%d")}'
# makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
os.makedirs(out_dir, exist_ok=True)

for csv_name, table in (
    ("X_train.csv", X_train),
    ("X_test.csv", X_test),
    ("y_train.csv", y_train),
    ("y_test.csv", y_test),
    ("Z_df.csv", Z_df),
):
    table.to_csv(os.path.join(out_dir, csv_name))