In [1]:
import os
import json
import pandas as pd
import numpy as np

# TO-DO:

- variable path
- data cleaning and imputation (series-by-series)
- add constraint to implement "most recent data >= 2 periods"

# Metadata

In [2]:
# Load the series metadata.
# The file is read as UTF-8 explicitly so that the result does not depend
# on the platform's default text encoding.
with open("../Database/data_info.json", "r", encoding="utf-8") as openfile:
    dct_data_info = json.load(openfile)

In [3]:
# inspect the loaded metadata
dct_data_info

{'VAPGDPAER': {'id': 'VAPGDPAER',
  'realtime_start': '2023-10-22',
  'realtime_end': '2023-10-22',
  'title': 'Value Added by Industry: Arts, Entertainment, Recreation, Accommodation, and Food Services: Arts, Entertainment, and Recreation as a Percentage of GDP',
  'observation_start': '2005-01-01',
  'observation_end': '2023-01-01',
  'frequency': 'Quarterly',
  'frequency_short': 'QS',
  'units': 'Percent',
  'units_short': '%',
  'seasonal_adjustment': 'Not Seasonally Adjusted',
  'seasonal_adjustment_short': 'NSA',
  'last_updated': '2023-06-29 11:04:40-05',
  'popularity': 1,
  'notes': 'Value added represents the sum of the costs-incurred and the incomes-earned in production, and consists of compensation of employees, taxes on production and imports, less subsidies, and gross operating surplus.',
  'category': 'Industry'},
 'VAPGDPAERAF': {'id': 'VAPGDPAERAF',
  'realtime_start': '2023-10-22',
  'realtime_end': '2023-10-22',
  'title': 'Value Added by Industry: Arts, Entertainme

# Nomenclature

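* "date of a datapoint": the observation date that the value refers to (column `date`).
* "realtime_start": the first day on which a given version of a datapoint was the available one.
* "realtime_end": the last day on which that version of the datapoint was the available one.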
* "period_of_release": 

# Input

In [6]:
# choose file to load
filename = "WEI.csv"

# get additional parameters
# (the series id is the file name without its extension; splitext does not
# assume the extension is exactly four characters long)
series_id = os.path.splitext(filename)[0]
data_freq = dct_data_info[series_id]["frequency_short"]

# read and format type: all three date-like columns are stored as
# YYYY-MM-DD strings in the csv
df = pd.read_csv(f"../Database/{filename}")
for date_col in ("date", "realtime_start", "realtime_end"):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

In [7]:
# preview the loaded data
df

Unnamed: 0,realtime_start,realtime_end,date,WEI
0,2020-04-16,2020-06-08,2008-01-05,1.48
1,2020-06-09,2020-07-29,2008-01-05,1.41
2,2020-07-30,2021-07-28,2008-01-05,1.42
3,2021-07-29,2022-01-19,2008-01-05,1.40
4,2022-01-20,2022-01-24,2008-01-05,1.67
...,...,...,...,...
7773,2023-10-05,2023-10-11,2023-09-30,1.83
7774,2023-10-12,2023-12-31,2023-09-30,1.67
7775,2023-10-12,2023-10-18,2023-10-07,2.14
7776,2023-10-19,2023-12-31,2023-10-07,1.99


# check_if_retrospective()

In [8]:
def check_if_retrospective(df, data_freq, threshold):
    """Detect 'retrospective' series and back-date their first releases.

    A series is regarded as retrospective when its earliest release
    (`realtime_start`) comes more than `threshold` periods after its
    earliest observation (`date`). In that case, a copy of the first
    release of every observation is appended with `realtime_start` set
    to the observation date; the original rows are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "date" and "realtime_start".
    data_freq : str
        Frequency tag; only its first letter is used and it must be one
        of "Q", "M", "W" or "D".
    threshold : number
        Number of periods above which the gap between first release and
        first observation marks the series as retrospective.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ("date", "realtime_start"). The input
        frame is left untouched.
    """
    sort_keys = ["date", "realtime_start"]

    earliest_release = df["realtime_start"].min()
    earliest_observation = df["date"].min()

    # approximate length of one period, in days, per frequency letter
    days_per_period = {"Q": 90, "M": 30, "W": 7, "D": 1}[data_freq[0]]

    # gap between first release and first observation, compared with
    # <threshold> periods
    gap_in_days = (earliest_release - earliest_observation).days
    retrospective = gap_in_days > threshold * days_per_period
    print("Is retrospective:", retrospective)

    ordered = df.sort_values(sort_keys)
    if not retrospective:
        return ordered.copy()

    # first release of each observation, back-dated to the observation date
    first_releases = ordered.groupby("date").head(1).copy()
    first_releases["realtime_start"] = first_releases["date"]

    return pd.concat([ordered, first_releases], axis=0).sort_values(sort_keys)

In [10]:
# example: the value for date '2008-01-05'
# was released 'retrospectively' on 2020-04-16.
# to access the value before its date of release,
# extend realtime_start to the date of the data point
df_interim_1 = check_if_retrospective(df, data_freq, threshold=5)
df_interim_1

Is retrospective: True


Unnamed: 0,realtime_start,realtime_end,date,WEI
0,2008-01-05,2020-06-08,2008-01-05,1.48
0,2020-04-16,2020-06-08,2008-01-05,1.48
1,2020-06-09,2020-07-29,2008-01-05,1.41
2,2020-07-30,2021-07-28,2008-01-05,1.42
3,2021-07-29,2022-01-19,2008-01-05,1.40
...,...,...,...,...
7775,2023-10-07,2023-10-18,2023-10-07,2.14
7775,2023-10-12,2023-10-18,2023-10-07,2.14
7776,2023-10-19,2023-12-31,2023-10-07,1.99
7777,2023-10-14,2023-12-31,2023-10-14,2.27


# map_date_to_available_data()

In [None]:
def map_date_to_available_data(df, max_release_delay_days=2*365):
    """Map every calendar day to the data versions available on that day.

    Each input row is one version of one datapoint, valid from
    `realtime_start` to `realtime_end` (both inclusive). The row is
    repeated once per day of that range, with the day stored in the new
    column "realtime_dates".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns "date", "realtime_start" and
        "realtime_end". It is not modified.
    max_release_delay_days : int, default 2*365
        Versions whose `realtime_start` is more than this many days after
        the datapoint's `date` are dropped.

    Returns
    -------
    pandas.DataFrame
        The input columns plus "release_delay" and "realtime_dates",
        sorted by ("realtime_dates", "date").
    """
    # work on a copy so that the caller's frame does not silently gain a
    # "release_delay" column
    versions = df.copy()

    # delay between release of datapoint and the date of datapoint
    versions["release_delay"] = versions["realtime_start"] - versions["date"]

    # filter release delay: drop versions released too long after the
    # date of the datapoint they describe
    max_delay = pd.Timedelta(days=max_release_delay_days)
    versions = versions[versions["release_delay"] <= max_delay].copy()

    # add new column containing the date range of each version; built
    # row by row without DataFrame.apply so that it also works when no
    # row survives the filter
    day_ranges = [
        pd.date_range(start, end, freq="D")
        for start, end in zip(versions["realtime_start"], versions["realtime_end"])
    ]
    versions["realtime_dates"] = pd.Series(day_ranges, index=versions.index, dtype=object)

    # expand into dataframe mapping each historical date to
    # the different available versions of each data point
    return (versions
            .explode("realtime_dates")
            .sort_values(["realtime_dates", "date"]))

In [None]:
# map every calendar day to the data versions available on that day
df_interim_2 = map_date_to_available_data(df_interim_1)

# wide preview: one row per calendar day, one column per observation date.
# check_if_retrospective() keeps both the original and the back-dated copy
# of each first release, so identical (realtime_dates, date) pairs are
# collapsed first; pivot() raises on duplicate entries otherwise
(df_interim_2
 .drop_duplicates(subset=["realtime_dates", "date"], keep="last")
 .pivot(index=["realtime_dates"], columns=["date"], values=[series_id]))

# select_data()

In [None]:
def select_data(df, series_id):
    """Keep, for every day in "realtime_dates", the last row of that day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "realtime_dates", "date" and `series_id`.
        The last row of each day is taken as the most recent version, so
        the rows are assumed to be ordered accordingly within each day
        (as in the output of map_date_to_available_data).
    series_id : str
        Name of the column holding the values of the series.

    Returns
    -------
    pandas.DataFrame
        Indexed by "realtime_dates", with the columns "date" and
        `series_id`.
    """
    def _last_row(values):
        # select most recent version available
        return values.tail(1)

    kept_columns = ("date", series_id)
    per_day = df.groupby("realtime_dates")

    return per_day.agg({column: _last_row for column in kept_columns})

In [None]:
# select, for every calendar day, the most recent data point available
df_result = select_data(df_interim_2, series_id)

# TEST 1:
# if it returns daily data from first date in data to today
# NOTE(review): first_date and today are not defined in any earlier cell;
# the definitions below follow the sentence above -- confirm they are the
# intended bounds
first_date = df["date"].min()
today = pd.Timestamp.today().normalize()

# Index.equals() returns False when the lengths differ, whereas
# all(a == b) raises ValueError in that case
pd.DatetimeIndex(df_result.index).equals(pd.date_range(first_date, today, freq="D"))

In [None]:
# display the result
df_result

# check_nans()

In [None]:
def check_nans(df, subset):
    """Report whether the selected column(s) contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
    subset : column label or list of column labels
        Column(s) to inspect.

    Returns
    -------
    tuple
        (False, False) when no value is missing. Otherwise (True, nans),
        where nans is `df[subset].isna().sum()`: a Series of per-column
        counts for a list of labels, a single count for one label.
    """
    missing = df[subset].isna()

    # reduce through numpy so that both a DataFrame (list of labels) and a
    # Series (single label) are handled; iterating over the scalar count of
    # a single column with all() raises TypeError
    if not missing.to_numpy().any():
        return False, False

    return True, missing.sum()

# change_frequency()

In [None]:
def change_frequency(df, data_freq, target_freq):
    """Downsample / upsample the data to match the target
    frequency.
    """
    # increasing order of frequency
    freq_order = ["Q", "M", "W", "D"]
    data_freq_order = freq_order.index(data_freq[0])
    target_freq_order = freq_order.index(target_freq[0])
    
    # down- or up-sampling
    if data_freq_order < target_freq_order:
        upsample = True
    elif data_freq_order > target_freq_order:
        upsample = False
    else:
        return df
    
    # check if periods are labelled with the left or right edge
    if "S" in data_freq:
        left_label = True
    else:
        left_label = False
        
    