In [1]:
import pandas as pd
import numpy as np
import itertools
import time

from fredapi import Fred

from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# FRED API key required
fred = Fred(api_key='36c37937923a552f29698345022ea734')

# dict of column names to series IDs, dictated by FRED
series_ids = {
    'earnings':'CES0500000003', # average hourly earnings, total private
    'gdp_nom':'GDP', # nominal gdp
    'gdp_real':'GDPC1', # real gdp
    'stocks':'WILLLRGCAPPR', # Wilshire US Large-Cap Price Index, proxy for SP500
    'unemployment':'UNRATE', #U3 unemployment
    'govt_expenditures_total':'W068RCQ027SBEA', # total govt expenditures
    'govt_expenditures_fed':'FGEXPND', # federal govt expenditures
    'public_debt':'GFDEBTN', # total public debt
    'fed_funds':'FEDFUNDS', # effective fed funds rate
    'eitc':'ENINCCTA', # earned income tax credit
    'gini':'SIPOVGINIUSA', # GINI coefficient, measure of income inequality
}

# some series only have annual frequency
annual_series = ['eitc', 'gini'] 

# dataframe to consolidate results
df = pd.DataFrame()

# loop through series, query FRED and add to df
for series in series_ids:
    
    freq = 'q' if series not in annual_series else 'a'
    
    data = pd.DataFrame(
        fred.get_series(
            series_id = series_ids[series],
            observation_start = '2000-12-31',
            observation_end = None,
            frequency = freq,
            units = 'lin'
        ),
        columns = [series]
    )
    
    df = pd.concat([df, data], axis = 1)

df

Unnamed: 0,earnings,gdp_nom,gdp_real,stocks,unemployment,govt_expenditures_total,govt_expenditures_fed,public_debt,fed_funds,eitc,gini
2000-01-01,,,,,,,,,,32296341.0,40.1
2000-10-01,,10435.744,14229.765,3090.50,3.9,3210.083,1938.796,5662216.0,6.47,,
2001-01-01,,10470.231,14183.120,2840.93,4.2,3280.819,1982.626,5773740.0,5.59,33375971.0,40.6
2001-04-01,,10599.000,14271.694,2739.95,4.4,3358.217,2003.581,5726815.0,4.33,,
2001-07-01,,10598.020,14214.516,2557.58,4.8,3377.367,2037.703,5807464.0,3.50,,
...,...,...,...,...,...,...,...,...,...,...,...
2022-10-01,32.79,26408.405,21989.981,9033.40,3.6,9175.833,6175.709,31419689.0,3.65,,
2023-01-01,33.11,26813.601,22112.329,9380.93,3.5,9326.383,6324.782,31458438.0,4.52,,
2023-04-01,33.46,27063.012,22225.350,9847.26,3.6,9422.404,6346.294,32332274.0,4.99,,
2023-07-01,33.83,27623.543,22491.567,10456.58,3.7,9985.533,6392.615,,5.26,,


In [2]:
# create growth rate columns
# for annual-only series, first linearly interpolate
growth_cols = {}

for col in df.columns:
    if col not in annual_series:
        growth_cols[col + '_growth'] = df[col].pct_change()
    else:
        df[col] = df[col].interpolate(method = 'linear', axis = 0)
        growth_cols[col + '_growth'] = df[col].pct_change()

# create differenced columns
diff_cols = {}

for col in df.columns:
    diff_cols[col + '_diff'] = df[col].diff()

# concatenate new columns with original df
df = pd.concat([df, pd.DataFrame(growth_cols), pd.DataFrame(diff_cols)], axis = 1)

# create lead / lag columns columns
# I use +/- 4 to create windows a year before and after
# I also exclude 'earnings' since that is our target variable
lead_lag_cols = {}
earnings_cols = [x for x in df.columns if 'earnings' in x]

for col in df.drop(earnings_cols, axis = 1).columns:
    for k in range(1,5):
        lead_lag_cols[f'{col}_lag_{k}'] = df[col].shift(k)
        lead_lag_cols[f'{col}_lead_{k}'] = df[col].shift(-k)

# concatenate once again
df = pd.concat([df, pd.DataFrame(lead_lag_cols)], axis = 1)

In [3]:
df = df.dropna(subset = earnings_cols)

for col in series_ids:
    print(f'{col}: {len( df[pd.isnull(df[col])] )}')
    
print(f'\nDataframe Dimensions: {df.shape}')

earnings: 0
gdp_nom: 0
gdp_real: 0
stocks: 0
unemployment: 0
govt_expenditures_total: 0
govt_expenditures_fed: 0
public_debt: 1
fed_funds: 0
eitc: 0
gini: 0

Dataframe Dimensions: (69, 273)


In [4]:
df = df.dropna()

In [5]:
growth_cols = [x for x in df.columns if 'growth' in x]
diff_cols = [x for x in df.columns if 'diff' in x]
level_cols = [x for x in df.columns if 'growth' not in x and 'diff' not in x]
earnings_cols = [x for x in df.columns if 'earnings' in x]

print(
    len(growth_cols),'\n',
    len(diff_cols),'\n',
    len(level_cols),'\n',
    len(growth_cols) + len(diff_cols) + len(level_cols),'\n',
    len(df.columns),'\n',
    len(earnings_cols)
)

91 
 91 
 91 
 273 
 273 
 3


In [6]:
n = 270
k = 4

f'{np.math.factorial(n)/(np.math.factorial(n-k)*np.math.factorial(k)):,}'

  f'{np.math.factorial(n)/(np.math.factorial(n-k)*np.math.factorial(k)):,}'


'216,546,345.0'

In [8]:
variable_groups = [growth_cols, diff_cols, level_cols]
variable_group_names = ['growth', 'diff', 'level']
number_features = 3

for group, group_name in zip(variable_groups, variable_group_names):
    subset = df[group]
    variables = [x for x in subset.columns if 'earnings' not in x]
    all_combos = itertools.combinations(variables, number_features)
    globals()['var_combos_' + group_name] = []

    start = time.perf_counter()
    for i, combo in enumerate(all_combos):

        exclude = False
        for series in series_ids.keys():
            if ' '.join(combo).count(series) > 1:
                exclude = True
        if exclude == False:
            globals()['var_combos_' + group_name].append(combo)


    print(
        f'Final no. of combinations for {"var_combos_" + group_name}: {len(globals()["var_combos_" + group_name]):,}'
    )

Final no. of combinations for var_combos_growth: 87,480

Final no. of combinations for var_combos_diff: 87,480

Final no. of combinations for var_combos_level: 87,480



In [12]:
variable_group_combos = [var_combos_growth, var_combos_diff, var_combos_level]
model_fits = {}
counter = 0

start = time.perf_counter()
for group, combos in zip(variable_groups, variable_group_combos):
    subset = df[group]
    y = subset.filter(like = 'earnings')
    
    for combo in combos:
        X = subset[list(combo)]

        model = LinearRegression()
        model.fit(X, y)

        model_fits[counter] = model

        counter += 1
        
        
        if counter % 10**5 == 0:
            now = time.perf_counter()
            print(
                f'Done with {counter:,} models\n',
                f'Time to complete: {(now-start)/60:.2f}'
            )

Done with 100,000 models
 Time to complete: 1.50
Done with 200,000 models
 Time to complete: 3.00


In [10]:
X

Unnamed: 0,fed_funds_lead_4,eitc_lead_4,gini_lead_4
2006-07-01,5.07,49604628.50,40.80
2006-10-01,4.50,50136945.75,40.80
2007-01-01,3.18,50669263.00,40.80
2007-04-01,2.09,52811812.75,40.75
2007-07-01,1.94,54954362.50,40.70
...,...,...,...
2021-04-01,0.77,66723338.00,39.80
2021-07-01,2.19,66723338.00,39.80
2021-10-01,3.65,66723338.00,39.80
2022-01-01,4.52,66723338.00,39.80
