In [382]:
import numpy as np
import pandas as pd
import datetime as dt
from sqlalchemy import create_engine
import psycopg2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In this project, I will attempt to build a regression model that accuarately forecasts the S&P500 using traditional leading economic indicators.

The first step is to retrieve the data I will be using from my database.

In [383]:
engine = create_engine('postgresql://Jballas223:Password1@localhost:5432/securities_db')

data = pd.read_sql_table('economic_data', engine)
data

Unnamed: 0,metric,date,value
0,man_hours,2010-01-01,39.9
1,man_hours,2010-02-01,39.6
2,man_hours,2010-03-01,40.0
3,man_hours,2010-04-01,40.2
4,man_hours,2010-05-01,40.5
...,...,...,...
4932,cons_sentiment,2021-02-01,76.8
4933,cons_sentiment,2021-03-01,84.9
4934,cons_sentiment,2021-04-01,88.3
4935,cons_sentiment,2021-05-01,82.9


In [384]:
data.dtypes

metric            object
date      datetime64[ns]
value            float64
dtype: object

In [385]:
data.set_index('date')

Unnamed: 0_level_0,metric,value
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01,man_hours,39.9
2010-02-01,man_hours,39.6
2010-03-01,man_hours,40.0
2010-04-01,man_hours,40.2
2010-05-01,man_hours,40.5
...,...,...
2021-02-01,cons_sentiment,76.8
2021-03-01,cons_sentiment,84.9
2021-04-01,cons_sentiment,88.3
2021-05-01,cons_sentiment,82.9


My data is structured in a way that has the date, value, and metric type as the columns. Therefore I need to pivot this data to have one uniform date index, with the columns being the different economic metrics that I will be using. I can do this in three steps. 

**Step  1:** Break up the data into a single series for each individual metric.

**Step  2:** Find a common index to merge the series on.

**Step  3:** Merge the series

Here I will perfrom step one by creating a dictionary that contains each series

In [386]:
dfs = {metric: df for metric, df in data.groupby('metric')}
for metric, df in dfs.items():
    print('\n'+metric+'\n')
    print(df)
    print('-'*40)
    print('-'*40)


cons_sentiment

              metric       date  value
4799  cons_sentiment 2010-01-01   74.4
4800  cons_sentiment 2010-02-01   73.6
4801  cons_sentiment 2010-03-01   73.6
4802  cons_sentiment 2010-04-01   72.2
4803  cons_sentiment 2010-05-01   73.6
...              ...        ...    ...
4932  cons_sentiment 2021-02-01   76.8
4933  cons_sentiment 2021-03-01   84.9
4934  cons_sentiment 2021-04-01   88.3
4935  cons_sentiment 2021-05-01   82.9
4936  cons_sentiment 2021-06-01   85.5

[138 rows x 3 columns]
----------------------------------------
----------------------------------------

initail_claims

             metric       date     value
139  initail_claims 2010-01-02  456000.0
140  initail_claims 2010-01-09  469000.0
141  initail_claims 2010-01-16  507000.0
142  initail_claims 2010-01-23  471000.0
143  initail_claims 2010-01-30  496000.0
..              ...        ...       ...
741  initail_claims 2021-07-17  424000.0
742  initail_claims 2021-07-24  399000.0
743  initail_claims 202

In [387]:
for df in dfs.values():
    df.set_index('date', inplace=True)

In [388]:
for metric, df in dfs.items():
    date_count = len(df)
    print(f'{metric}: {date_count}')

cons_sentiment: 138
initail_claims: 607
int_rate_spread: 3037
man_hours: 139
money_supply: 601
new_manufacturing_orders: 138
new_orders_capgoods: 138
residential_permits: 139


It looks like consumer sentiment, initial claims, man_hours, new_manufacturing_orders, new_orders_capgoods, and residential permits are all updated monthly, money_supply and inistial_claims are updated weekly, and int_rate_spread is updated daily.

This means that the common time period that we can merge on is month. To do that we must make sure that there is a specific day of the month that is common between each metric, if not, we must resample the dataframe. 

In [389]:
for metric, df in dfs.items():
    temp_df = pd.to_datetime(df.index).day.value_counts()
    print(metric+'\n', temp_df)

cons_sentiment
 1    138
Name: date, dtype: int64
initail_claims
 16    21
2     21
14    21
7     21
23    21
9     21
19    20
12    20
13    20
6     20
17    20
5     20
20    20
21    20
24    20
3     20
26    20
27    20
28    20
10    20
4     19
8     19
1     19
11    19
15    19
30    19
18    19
22    19
25    19
29    18
31    12
Name: date, dtype: int64
int_rate_spread
 5     101
4     101
12    101
19    101
18    101
11    101
1     100
6     100
8     100
16    100
20    100
22    100
26    100
2     100
15    100
25    100
23    100
9     100
13    100
14     99
27     99
10     99
21     99
3      99
17     99
7      99
28     99
24     98
29     93
30     91
31     57
Name: date, dtype: int64
man_hours
 1    139
Name: date, dtype: int64
money_supply
 18    21
4     21
25    21
11    21
16    20
2     20
5     20
7     20
8     20
9     20
14    20
15    20
1     20
23    20
28    20
21    20
22    20
24    19
3     19
27    19
6     19
26    19
19    19
17    19
10 

It looks like for all of the metrics that are recorded monthly, they are recorded on the first day of the months. for the other metrics, it seems to be an even split between all days of the month, therefore we will not have first day of the month data for the metrics that are recorded more frequently. This means that we must resample.

The Pandas resample method will not give us what we want in this instance. In this instance we want the earliest day given for each month for every dataframe. We create our own function for this.

In [392]:
def resample(data):
    '''
    Must take in a Pandas dataframe with a datetime index called 'date'.
    '''
    data.reset_index(inplace=True)
    data['date2'] = data['date']
    data['Y-m'] = data['date'].apply(lambda x: x.strftime('%Y-%m'))
    data.set_index('date2', inplace=True)
    
    unique = data['Y-m'].unique()
    df = pd.concat([pd.DataFrame(data.loc[i].min()).transpose() for i in unique])
    #df.drop('index', axis=1, inplace=True)
    return df

In [393]:
for metric in dfs.keys():
    dfs[metric] = resample(dfs[metric]).drop(['date','metric'], axis=1).set_index('Y-m')


In [394]:
dfs['money_supply']

Unnamed: 0_level_0,value
Y-m,Unnamed: 1_level_1
2010-01,8411.7
2010-02,8429.2
2010-03,8528.5
2010-04,8526.6
2010-05,8549.2
...,...
2021-03,19677.2
2021-04,20150.3
2021-05,20230
2021-06,20363.4


In [395]:
dfs['residential_permits']

Unnamed: 0_level_0,value
Y-m,Unnamed: 1_level_1
2010-01,636
2010-02,650
2010-03,687
2010-04,637
2010-05,575
...,...
2021-03,1755
2021-04,1733
2021-05,1683
2021-06,1594


In [396]:
counter = 0
for metric, df in dfs.items():
  if counter < 1:
    combined_df = df
  else:
    combined_df = pd.merge(combined_df, df, how='left', left_index=True, right_index=True, suffixes=['',f'_{metric}'])
  counter+=1

In [397]:
combined_df

Unnamed: 0_level_0,value,value_initail_claims,value_int_rate_spread,value_man_hours,value_money_supply,value_new_manufacturing_orders,value_new_orders_capgoods,value_residential_permits
Y-m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01,74.4,456000,2.75,39.9,8411.7,181016,56916,636
2010-02,73.6,466000,2.8,39.6,8429.2,180450,62116,650
2010-03,73.6,459000,2.66,40,8528.5,182148,58873,687
2010-04,72.2,449000,2.68,40.2,8526.6,182948,59108,637
2010-05,73.6,451000,2.37,40.5,8549.2,189932,64043,575
...,...,...,...,...,...,...,...,...
2021-02,76.8,747000,0.98,40.3,19434.3,246261,78315,1726
2021-03,84.9,658000,1.29,40.5,19677.2,249467,77437,1755
2021-04,88.3,566000,1.4,40.4,20150.3,247644,81405,1733
2021-05,82.9,388000,1.41,40.4,20230,255529,86253,1683


In [357]:
dfs['money_supply'].drop(['date','metric'], axis=1).set_index('Y-m')

Unnamed: 0_level_0,value
Y-m,Unnamed: 1_level_1
2010-01,8411.7
2010-02,8429.2
2010-03,8528.5
2010-04,8526.6
2010-05,8549.2
...,...
2021-03,19677.2
2021-04,20150.3
2021-05,20230
2021-06,20363.4


In [349]:
money_supply

Unnamed: 0_level_0,date,index,metric,value,Y-m
date2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-04,2010-01-04,0,money_supply,8534.5,2010-01
2010-01-11,2010-01-11,1,money_supply,8495.5,2010-01
2010-01-18,2010-01-18,2,money_supply,8476.8,2010-01
2010-01-25,2010-01-25,3,money_supply,8411.7,2010-01
2010-02-01,2010-02-01,4,money_supply,8429.2,2010-02
...,...,...,...,...,...
2021-06-07,2021-06-07,596,money_supply,20418.0,2021-06
2021-06-14,2021-06-14,597,money_supply,20504.1,2021-06
2021-06-21,2021-06-21,598,money_supply,20409.2,2021-06
2021-06-28,2021-06-28,599,money_supply,20363.4,2021-06
