## TO-DO:

- variable path
- data cleaning and imputation (series-by-series)
- lag variables, accounting for data refresh frequency

In [None]:
# DONE:
# W / M / Q -> D
# M / Q - > W
# Q -> M

In [1]:
import os
import json
import pandas as pd
import numpy as np

In [64]:
with open("../Database/data_info.json", "r") as openfile:
    dct_data_info = json.load(openfile)

In [3]:
# set parameters
TARGET_FREQ = "D"

In [65]:
dct_data_info

{'VAPGDPAER': {'id': 'VAPGDPAER',
  'realtime_start': '2023-10-22',
  'realtime_end': '2023-10-22',
  'title': 'Value Added by Industry: Arts, Entertainment, Recreation, Accommodation, and Food Services: Arts, Entertainment, and Recreation as a Percentage of GDP',
  'observation_start': '2005-01-01',
  'observation_end': '2023-01-01',
  'frequency': 'Quarterly',
  'frequency_short': 'QS',
  'units': 'Percent',
  'units_short': '%',
  'seasonal_adjustment': 'Not Seasonally Adjusted',
  'seasonal_adjustment_short': 'NSA',
  'last_updated': '2023-06-29 11:04:40-05',
  'popularity': 1,
  'notes': 'Value added represents the sum of the costs-incurred and the incomes-earned in production, and consists of compensation of employees, taxes on production and imports, less subsidies, and gross operating surplus.',
  'category': 'Industry'},
 'VAPGDPAERAF': {'id': 'VAPGDPAERAF',
  'realtime_start': '2023-10-22',
  'realtime_end': '2023-10-22',
  'title': 'Value Added by Industry: Arts, Entertainme

# Process FRED Data:

In [134]:
filename = "WEI.csv"
series_id = filename[:-4]
data_freq = dct_data_info[series_id]["frequency_short"]

df = pd.read_csv(f"../Database/{filename}")
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")
df["realtime_start"] = pd.to_datetime(df["realtime_start"], format="%Y-%m-%d")
df["realtime_end"] = pd.to_datetime(df["realtime_end"], format="%Y-%m-%d")

first_release_date = df["realtime_start"].min()
first_data_date = df["date"].min()

In [135]:
# maps frequency tag to interval duration in days
dct_interval = {"Q": 90, "M": 30, "W": 7, "D": 1}
interval = dct_interval[data_freq[0]]

# if the first release date is more than 5 (threshold)
# times the interval, then regarded as retrospective
retrospective = (first_release_date - first_data_date).days > 5*interval
print(retrospective)

True


In [136]:
if retrospective:
    df_extend = df.groupby("date").head(1).copy()
    df_extend["realtime_start"] = df_extend["date"]
    df = pd.concat([df, df_extend], axis=0).sort_values(["date", "realtime_start"])

In [137]:
df

Unnamed: 0,realtime_start,realtime_end,date,WEI
0,2008-01-05,2020-06-08,2008-01-05,1.48
0,2020-04-16,2020-06-08,2008-01-05,1.48
1,2020-06-09,2020-07-29,2008-01-05,1.41
2,2020-07-30,2021-07-28,2008-01-05,1.42
3,2021-07-29,2022-01-19,2008-01-05,1.40
...,...,...,...,...
7775,2023-10-07,2023-10-18,2023-10-07,2.14
7775,2023-10-12,2023-10-18,2023-10-07,2.14
7776,2023-10-19,2023-12-31,2023-10-07,1.99
7777,2023-10-14,2023-12-31,2023-10-14,2.27


In [138]:
# delay between release of datapoint and the date of datapoint
df["release_delay"] = df["realtime_start"] - df["date"]
# filter release delay
df = df[df["release_delay"] <= pd.Timedelta(days=2*365)]

df["realtime_dates"] = df.apply(lambda x: pd.date_range(x.realtime_start, x.realtime_end, freq="D"), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["realtime_dates"] = df.apply(lambda x: pd.date_range(x.realtime_start, x.realtime_end, freq="D"), axis=1)


In [139]:
df2 = (df
       .explode("realtime_dates")
       .sort_values(["realtime_dates", "date"])
      )

In [141]:
df2.head(30)

Unnamed: 0,realtime_start,realtime_end,date,WEI,release_delay,realtime_dates
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-05
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-06
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-07
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-08
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-09
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-10
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-11
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-12
11,2008-01-12,2020-06-08,2008-01-12,1.42,0 days,2008-01-12
0,2008-01-05,2020-06-08,2008-01-05,1.48,0 days,2008-01-13


In [112]:
# example:
# the table lists potential historical values of an external variable to use
# for predicting a target variable on the day 2023-04-30
df2.query("realtime_dates == '2023-04-30'")

Unnamed: 0,realtime_start,realtime_end,date,WEI,realtime_dates
7,2022-10-06,2023-05-17,2008-01-05,1.68,2023-04-30
18,2022-10-06,2023-06-21,2008-01-12,1.63,2023-04-30
27,2022-10-06,2023-05-17,2008-01-19,1.60,2023-04-30
38,2022-10-06,2023-06-21,2008-01-26,1.10,2023-04-30
47,2022-10-06,2023-05-17,2008-02-02,0.79,2023-04-30
...,...,...,...,...,...
7674,2023-04-13,2023-05-17,2023-03-25,1.45,2023-04-30
7678,2023-04-13,2023-05-17,2023-04-01,1.59,2023-04-30
7682,2023-04-20,2023-05-17,2023-04-08,0.97,2023-04-30
7687,2023-04-27,2023-05-17,2023-04-15,0.79,2023-04-30


In [98]:
# for each date to predict,
# pick a historical value of the predictor to use
df3 = (df2
 .groupby("realtime_dates")
 .agg({"date": lambda x: x.tail(1), series_id: lambda x: x.tail(1)})
)

df3

Unnamed: 0_level_0,date,WEI
realtime_dates,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-16,2020-04-11,-11.04
2020-04-17,2020-04-11,-11.04
2020-04-18,2020-04-11,-11.04
2020-04-19,2020-04-11,-11.04
2020-04-20,2020-04-11,-11.04
...,...,...
2023-12-27,2023-10-14,2.27
2023-12-28,2023-10-14,2.27
2023-12-29,2023-10-14,2.27
2023-12-30,2023-10-14,2.27


In [101]:
df3.resample("MS")[series_id].mean()

realtime_dates
2020-04-01   -11.357333
2020-05-01   -11.250968
2020-06-01    -9.315667
2020-07-01    -7.013548
2020-08-01    -5.840968
2020-09-01    -4.694000
2020-10-01    -4.059032
2020-11-01    -2.926000
2020-12-01    -2.339032
2021-01-01    -1.814194
2021-02-01    -2.426071
2021-03-01    -0.210645
2021-04-01    10.292000
2021-05-01    11.650000
2021-06-01    10.334333
2021-07-01     9.160323
2021-08-01     7.901613
2021-09-01     7.764667
2021-10-01     7.510000
2021-11-01     6.925333
2021-12-01     7.533548
2022-01-01     6.275484
2022-02-01     5.738929
2022-03-01     5.390323
2022-04-01     4.543000
2022-05-01     4.091613
2022-06-01     3.357000
2022-07-01     2.862581
2022-08-01     2.996129
2022-09-01     2.554000
2022-10-01     2.137097
2022-11-01     1.920333
2022-12-01     1.153548
2023-01-01     1.203548
2023-02-01     0.998929
2023-03-01     1.006129
2023-04-01     1.308000
2023-05-01     0.994516
2023-06-01     1.026000
2023-07-01     1.352258
2023-08-01     1.522258
2

## check_nans()

In [None]:
def check_nans(df, subset):
    """
    """
    nans = df[subset].isna().sum()
    
    if all(nans == 0):
        return False, False
    else:
        return True, nans

In [None]:
check_nans(df, ["date", series_id])

## change_frequency()

In [None]:
def change_frequency(df, data_freq, target_freq):
    """Downsample / upsample the data to match the target
    frequency.
    """
    # increasing order of frequency
    freq_order = ["Q", "M", "W", "D"]
    data_freq_order = freq_order.index(data_freq[0])
    target_freq_order = freq_order.index(target_freq[0])
    
    # down- or up-sampling
    if data_freq_order < target_freq_order:
        upsample = True
    elif data_freq_order > target_freq_order:
        upsample = False
    else:
        return df
    
    # check if periods are labelled with the left or right edge
    if "S" in data_freq:
        left_label = True
    else:
        left_label = False
        
    

In [74]:
filenames = os.listdir("../Database")

In [80]:
df = pd.read_csv(f"../Database/{filenames[-1]}")

In [81]:
df

Unnamed: 0.1,Unnamed: 0,Date,YIELD,2 MO,3 MO,6 MO,1 YR,2 YR,3 YR,5 YR,7 YR,10 YR,20 YR,30 YR
0,0,1990-01-02,,,7.83,7.89,7.81,7.87,7.90,7.87,7.98,7.94,,8.00
1,1,1990-01-03,,,7.89,7.94,7.85,7.94,7.96,7.92,8.04,7.99,,8.04
2,2,1990-01-04,,,7.84,7.90,7.82,7.92,7.93,7.91,8.02,7.98,,8.04
3,3,1990-01-05,,,7.79,7.85,7.79,7.90,7.94,7.92,8.03,7.99,,8.06
4,4,1990-01-08,,,7.79,7.88,7.81,7.90,7.95,7.92,8.05,8.02,,8.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8454,8454,2023-10-16,5.57,5.58,5.61,5.58,5.42,5.09,4.87,4.72,4.74,4.71,5.06,4.87
8455,8455,2023-10-17,5.58,5.58,5.62,5.60,5.48,5.19,5.01,4.86,4.88,4.83,5.14,4.94
8456,8456,2023-10-18,5.57,5.56,5.61,5.58,5.47,5.19,5.03,4.92,4.95,4.91,5.20,5.00
8457,8457,2023-10-19,5.58,5.57,5.60,5.56,5.44,5.14,5.01,4.95,5.00,4.98,5.30,5.11
