# S&P 500 index prediction

`Dataset to prepare: (Data name(source))` 
* historical S&P index (Yahoo Finance)
* oil & gas price (Yahoo Finance)
* unemployment rate (US Bureau of Labor)
* prime rate (banks)
* Other stock exchange index(Hang seng, Zurich)
* trading volume (Yahoo Finance)
* CPI (US Bureau of Labor)
* Bond yield(treasury bond/bill) (Yahoo Finance)
* Consumer_Sentiment_Index(Nasdaq)
* gold price
* Hangseng_index
* S&P monthly Price/Earning Ratio(Nasdaq)
* S&P monthly Dividend rate(Nasdaq)
* S&P monthly earning yield ratio(Nasdaq)
* +a: future price (crude oil etc)     

#### These dataset are separate csv files, so first of all we need to merge them into 1 dataframe.

### 1. Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm

### 2. importing data using Pandas and calling the DataFrame

In [2]:
# Oldest date is 1928-01-06 (Volume records since 1951-Dec-24)
SP = pd.read_csv('data/Raw_data/S&P500.csv')
SP.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,02-Sep-22,3994.66,4018.43,3906.21,3924.26,3924.26,4134920000
1,01-Sep-22,3936.73,3970.23,3903.65,3966.85,3966.85,3754570000
2,31-Aug-22,4000.67,4015.37,3954.53,3955.0,3955.0,3797860000
3,30-Aug-22,4041.25,4044.98,3965.21,3986.16,3986.16,3190580000
4,29-Aug-22,4034.58,4062.99,4017.42,4030.61,4030.61,2963020000


In [3]:
SP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23783 entries, 0 to 23782
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         23783 non-null  object
 1   Open         23783 non-null  object
 2   High         23783 non-null  object
 3   Low          23783 non-null  object
 4   Close*       23783 non-null  object
 5   Adj Close**  23783 non-null  object
 6   Volume       23783 non-null  object
dtypes: object(7)
memory usage: 1.3+ MB


### 3. Data Combining

1. Date column adjustment - extracting year, month, day
2. Year 1927 to 1972 are shown as year 2027-2072, so we need to convert it to 1927 - 1972.
3. We will use only Open, Close price and (trading) Volume of the day.
4. We will combine all dataset into one file. 

In [4]:
# Date column adjustment - extracting year, month and day
SP['Date']= pd.to_datetime(SP['Date'])
SP['Year'] = SP['Date'].dt.year
SP['Month'] = SP['Date'].dt.month
SP['Day'] = SP['Date'].dt.day

# Manually converting 2027 ~ 2072 to Year 1927 ~ 1972 and combine 'Date' as 'Date_adj' again
SP['Year'] = np.where((SP['Year'] >= 2027), SP['Year'] - 100, SP.Year)
SP['Date_adj'] = pd.to_datetime(SP[['Year','Month','Day']])

SP = SP.drop(['Date','Year','Month','Day'], axis=1)
SP = SP.rename(columns = {'Open':'S&P_Open','Close*':'S&P_Close','Volume':'S&P_Volume','Date_adj':'Date'})
SP = SP.set_index('Date')
SP = SP.drop(['High','Low','Adj Close**'], axis=1)
SP.replace(to_replace=',', value='', inplace=True, regex = True)
SP.replace(to_replace='-', value=np.nan, inplace=True)
SP = SP.astype(float)

In [5]:
SP.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23783 entries, 2022-09-02 to 1927-12-30
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   S&P_Open    23783 non-null  float64
 1   S&P_Close   23783 non-null  float64
 2   S&P_Volume  18287 non-null  float64
dtypes: float64(3)
memory usage: 743.2 KB


#### Combining all data into one dataframe

In [6]:
# Prime rate - Oldest date is 1955-08-04
Prime = pd.read_csv('data/Raw_data/prime_rate.csv')
Prime['Date']= pd.to_datetime(Prime['DATE'])
Prime = Prime.drop('DATE', axis=1)

# Treasury bill rate - Oldest date is 1960-01-04
treasury_3m = pd.read_csv('data/Raw_data/treasury_3m_yield.csv')
treasury_3m['Date']= pd.to_datetime(treasury_3m['Date'])
treasury_3m = treasury_3m[['Date','Close']]
treasury_3m = treasury_3m.rename(columns={'Close':'t_3m_Close'})

# Treasury bond rate - Oldest date is 1962-01-04
treasury_10y = pd.read_csv('data/Raw_data/treasury_10yr_yield.csv')
treasury_10y['Date']= pd.to_datetime(treasury_10y['Date'])
treasury_10y = treasury_10y[['Date','Close']]
treasury_10y = treasury_10y.rename(columns={'Close':'t_10y_Close'})

# CPI
cpi = pd.read_csv('data/Raw_data/CPI.csv')
cpi['Month'] = cpi['Period'].str.replace('M','').astype(int)
cpi['Day'] = "01"
cpi['Date'] = pd.to_datetime(cpi[['Year','Month','Day']])
cpi = cpi.drop(['Year','Period','Month','Day','Series Id'], axis=1)
cpi = cpi.rename(columns={'Value':'cpi_Value','1-Month percent change':'cpi_MoM%'})

# Crude Oil - Oldest info 2000-08-23
oil = pd.read_csv('data/Raw_data/Crude_1986.csv')
oil['Date'] = pd.to_datetime(oil['Date'])
oil = oil.rename(columns={'Cushing, OK WTI Spot Price FOB (Dollars per Barrel)':'oil_Price'})

# Unemployment rate
unemployment = pd.read_csv('data/Raw_data/unemployment_rate.csv')
unemployment['Month'] = unemployment['Period'].str.replace('M','').astype(int)
unemployment['Day'] = "01"
unemployment['Date'] = pd.to_datetime(unemployment[['Year','Month','Day']])
unemployment = unemployment.drop(['Year','Period','Month','Day', 'Series id'], axis=1)
unemployment = unemployment.rename(columns={'Value':'unemp_rate'})

# S&P PE(Price/Earning) ratio
SP_PER = pd.read_csv('data/Raw_data/SP500_PE_RATIO_MONTH.csv')
SP_PER['Date']= pd.to_datetime(SP_PER['Date'])
SP_PER = SP_PER.rename(columns={'Value':'PE_Ratio'})

# S&P Dividend ratio
SP_DIV = pd.read_csv('data/Raw_data/MULTPL-SP500_DIV_MONTH.csv')
SP_DIV['Date']= pd.to_datetime(SP_DIV['Date'])
SP_DIV = SP_DIV.rename(columns={'Value':'Div_Ratio'})

# S&P EY(Earning Yield) ratio
SP_EYR = pd.read_csv('data/Raw_data/MULTPL-SP500_EARNINGS_YIELD_MONTH.csv')
SP_EYR['Date']= pd.to_datetime(SP_EYR['Date'])
SP_EYR = SP_EYR.rename(columns={'Value':'EY_Ratio'})

# Consumer Sentiment Index
CSI = pd.read_csv('data/Raw_data/Consumer_Sentiment_Index.csv')
CSI['Date']= pd.to_datetime(CSI['Date'])
CSI = CSI.rename(columns={'Index':'CS_Index'})

# Put/Call ratio
PutCallRatio = pd.read_csv('data/Raw_data/spxputcallratio.csv')
PutCallRatio['Date'] = pd.to_datetime(PutCallRatio['Date'])
PutCallRatio = PutCallRatio.drop(['SPX Put Volume','SPX Call Volume','Total SPX Options Volume'], axis=1)
PutCallRatio = PutCallRatio.rename(columns={'SPX Put/Call Ratio':'Put/Call_Ratio'})


In [7]:
# Merging all data into one file
Filename = [treasury_3m, treasury_10y,cpi, oil, unemployment, SP_PER, SP_DIV, SP_EYR, CSI,PutCallRatio]

SP_all = SP.merge(Prime, on='Date', how='left')
for file in Filename:
    SP_all = SP_all.merge(file, on='Date', how='left')

In [8]:
SP_all.head()

Unnamed: 0,Date,S&P_Open,S&P_Close,S&P_Volume,PRIME,t_3m_Close,t_10y_Close,cpi_Value,cpi_MoM%,oil_Price,unemp_rate,PE_Ratio,Div_Ratio,EY_Ratio,CS_Index,Put/Call_Ratio
0,2022-09-02,3994.66,3924.26,4134920000.0,,,,,,,,,,,,1.53
1,2022-09-01,3936.73,3966.85,3754570000.0,,,,,,,,20.04,,4.99,,1.28
2,2022-08-31,4000.67,3955.0,3797860000.0,,2.86,3.133,,,,,19.98,,5.0,,1.46
3,2022-08-30,4041.25,3986.16,3190580000.0,,2.873,3.11,,,,,,,,,1.47
4,2022-08-29,4034.58,4030.61,2963020000.0,,2.798,3.11,,,97.4,,,,,,1.49


### 4. Data Cleaning
#### Missing value replacement
Let's see how much there are missing values and how we can fill them.
`Some data is monthly data shown only on 1st day of the month, so we need to fill missing values by using backfill.
Prime rate is not monthly, but once there is a new rate it continues until the next prime rate is announced.
Some data is monthly data shown only on last day of the month, so we need to fill missing values by using fowardfill.`

In [9]:
cols_tobfill = ['PRIME','t_3m_Close','t_10y_Close','unemp_rate','cpi_Value',
               'cpi_MoM%','oil_Price','PE_Ratio','Div_Ratio','EY_Ratio','CS_Index','Put/Call_Ratio']

for col in cols_tobfill:
    SP_all[col].fillna(method='bfill', inplace=True)         

In [10]:
SP_all.head(5)

Unnamed: 0,Date,S&P_Open,S&P_Close,S&P_Volume,PRIME,t_3m_Close,t_10y_Close,cpi_Value,cpi_MoM%,oil_Price,unemp_rate,PE_Ratio,Div_Ratio,EY_Ratio,CS_Index,Put/Call_Ratio
0,2022-09-02,3994.66,3924.26,4134920000.0,5.5,2.86,3.133,295.271,0.0,97.4,3.5,20.04,64.01,4.99,50.0,1.53
1,2022-09-01,3936.73,3966.85,3754570000.0,5.5,2.86,3.133,295.271,0.0,97.4,3.5,20.04,64.01,4.99,50.0,1.28
2,2022-08-31,4000.67,3955.0,3797860000.0,5.5,2.86,3.133,295.271,0.0,97.4,3.5,19.98,64.01,5.0,50.0,1.46
3,2022-08-30,4041.25,3986.16,3190580000.0,5.5,2.873,3.11,295.271,0.0,97.4,3.5,20.93,64.01,4.78,50.0,1.47
4,2022-08-29,4034.58,4030.61,2963020000.0,5.5,2.798,3.11,295.271,0.0,97.4,3.5,20.93,64.01,4.78,50.0,1.49


In [11]:
#missing data
total = SP_all.isnull().sum().sort_values(ascending=False)
percent = (SP_all.isnull().sum()/SP_all.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
Put/Call_Ratio,20719,0.871168
oil_Price,14539,0.611319
t_10y_Close,8509,0.357777
t_3m_Close,8007,0.336669
PRIME,6896,0.289955
CS_Index,6412,0.269604
S&P_Volume,5496,0.231089
unemp_rate,5034,0.211664
cpi_Value,4807,0.202119
cpi_MoM%,4807,0.202119


In [12]:
treasury_10y.head()

Unnamed: 0,Date,t_10y_Close
0,1962-01-02,4.06
1,1962-01-03,4.03
2,1962-01-04,3.99
3,1962-01-05,4.02
4,1962-01-07,


In [13]:
oil.head()

Unnamed: 0,Date,oil_Price
0,1986-01-02,25.56
1,1986-01-03,26.0
2,1986-01-06,26.53
3,1986-01-07,25.85
4,1986-01-08,25.87


In [14]:
# I will drop Put/Call_Ratio and oil_price columns and missing values for the rest and do EDA from year 1962 data
SP_all.drop(['Put/Call_Ratio','oil_Price'], axis=1, inplace=True)
SP_all.dropna(inplace=True)

In [15]:
SP_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15274 entries, 0 to 15273
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         15274 non-null  datetime64[ns]
 1   S&P_Open     15274 non-null  float64       
 2   S&P_Close    15274 non-null  float64       
 3   S&P_Volume   15274 non-null  float64       
 4   PRIME        15274 non-null  float64       
 5   t_3m_Close   15274 non-null  float64       
 6   t_10y_Close  15274 non-null  object        
 7   cpi_Value    15274 non-null  float64       
 8   cpi_MoM%     15274 non-null  float64       
 9   unemp_rate   15274 non-null  float64       
 10  PE_Ratio     15274 non-null  float64       
 11  Div_Ratio    15274 non-null  float64       
 12  EY_Ratio     15274 non-null  float64       
 13  CS_Index     15274 non-null  float64       
dtypes: datetime64[ns](1), float64(12), object(1)
memory usage: 1.7+ MB


#### The goal of S&P index prediction is to predict future('t+1') index based on the current information('t'). Let's create our target variable 'Close_t+1' and 'return rate_t+1'. 

In [16]:
#Creating return rate of t before creating 't+1'
SP_all['Close_t-1'] = SP_all['S&P_Close'].shift(-1)
SP_all['return(%)'] = (SP_all['S&P_Close']-SP_all['Close_t-1'])/SP_all['Close_t-1'] * 100
# return_class: if daily return is higher than 0.5% -> then return 'class1'(positive), if between -0.5 to 0.5% -> 'class0'(neutral)
                #if daily return is lower than -0.5% then return 'class-1' (negative)
SP_all['return_class'] = SP_all['return(%)'].apply(lambda x: 1 if x >= 0.5 
                                              else( 0 if x < 0.5 and x > -0.5 else -1))

In [17]:
SP_all['Close_t+1'] = SP_all['S&P_Close'].shift(1)
SP_all.head(2)

Unnamed: 0,Date,S&P_Open,S&P_Close,S&P_Volume,PRIME,t_3m_Close,t_10y_Close,cpi_Value,cpi_MoM%,unemp_rate,PE_Ratio,Div_Ratio,EY_Ratio,CS_Index,Close_t-1,return(%),return_class,Close_t+1
0,2022-09-02,3994.66,3924.26,4134920000.0,5.5,2.86,3.133,295.271,0.0,3.5,20.04,64.01,4.99,50.0,3966.85,-1.073648,-1,
1,2022-09-01,3936.73,3966.85,3754570000.0,5.5,2.86,3.133,295.271,0.0,3.5,20.04,64.01,4.99,50.0,3955.0,0.299621,0,3924.26


In [18]:
SP_all.dropna(inplace=True)

In [19]:
SP_all['return(%)_t+1'] = (SP_all['Close_t+1']-SP_all['S&P_Close'])/SP_all['S&P_Close'] * 100
# return_class: if daily return is higher than 0.5% -> then return 'class1'(positive), if between -0.5 to 0.5% -> 'class0'(neutral)
                #if daily return is lower than -0.5% then return 'class-1' (negative)
SP_all['return_class_t+1'] = SP_all['return(%)_t+1'].apply(lambda x: 1 if x >= 0.5 
                                              else( 0 if x < 0.5 and x > -0.5 else -1))

In [20]:
SP_all.head()

Unnamed: 0,Date,S&P_Open,S&P_Close,S&P_Volume,PRIME,t_3m_Close,t_10y_Close,cpi_Value,cpi_MoM%,unemp_rate,PE_Ratio,Div_Ratio,EY_Ratio,CS_Index,Close_t-1,return(%),return_class,Close_t+1,return(%)_t+1,return_class_t+1
1,2022-09-01,3936.73,3966.85,3754570000.0,5.5,2.86,3.133,295.271,0.0,3.5,20.04,64.01,4.99,50.0,3955.0,0.299621,0,3924.26,-1.073648,-1
2,2022-08-31,4000.67,3955.0,3797860000.0,5.5,2.86,3.133,295.271,0.0,3.5,19.98,64.01,5.0,50.0,3986.16,-0.781705,-1,3966.85,0.299621,0
3,2022-08-30,4041.25,3986.16,3190580000.0,5.5,2.873,3.11,295.271,0.0,3.5,20.93,64.01,4.78,50.0,4030.61,-1.102811,-1,3955.0,-0.781705,-1
4,2022-08-29,4034.58,4030.61,2963020000.0,5.5,2.798,3.11,295.271,0.0,3.5,20.93,64.01,4.78,50.0,4057.66,-0.66664,-1,3986.16,-1.102811,-1
5,2022-08-26,4198.74,4057.66,3175260000.0,5.5,2.755,3.035,295.271,0.0,3.5,20.93,64.01,4.78,50.0,4199.12,-3.368801,-1,4030.61,-0.66664,-1


Let's save the file.

In [None]:
SP_all.to_csv('data/Processed/S&P_daily.csv')

### Monthly data

#### Almost same process as daily data, but there are some difference described below.

1. Daily S&P index converting to monthly S&P open, close index.
    - 1) extracting year & month from 'Date' column
    - 2) extracting S&P_Open index of the first day of the month
    - 3) extracting S&P_Close index of the last day of the month

In [22]:
# Oldest date is 1928-01-06 (trading volume records since 1951-Dec-24)
SP = pd.read_csv('data/Raw_data/S&P500.csv')
SP['Date']= pd.to_datetime(SP['Date'])
SP['Year'] = SP['Date'].dt.year
SP['Month'] = SP['Date'].dt.month
SP['Day'] = SP['Date'].dt.day
SP['Dayofweek'] = SP['Date'].dt.dayofweek

# Manually converting 2027 ~ 2072 to Year 1927 ~ 1972 and combine 'Date' again
SP['Year'] = np.where((SP['Year'] >= 2027), SP['Year'] - 100, SP.Year)
SP['Date_adj'] = pd.to_datetime(SP[['Year','Month','Day']])
SP = SP.drop(['Date','Adj Close**'], axis=1)
SP = SP.rename(columns = {'Open':'S&P_Open','Close*':'S&P_Close','Volume':'S&P_Volume','Date_adj':'Date'})
SP = SP.set_index('Date')
SP.replace(to_replace=',', value='', inplace=True, regex = True)
SP.replace(to_replace='-', value=np.nan, inplace=True)


In [29]:
#daily information of S&P index
SP.head()

Unnamed: 0_level_0,S&P_Open,High,Low,S&P_Close,S&P_Volume,Year,Month,Day,Dayofweek
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-09-02,3994.66,4018.43,3906.21,3924.26,4134920000,2022,9,2,4
2022-09-01,3936.73,3970.23,3903.65,3966.85,3754570000,2022,9,1,3
2022-08-31,4000.67,4015.37,3954.53,3955.0,3797860000,2022,8,31,2
2022-08-30,4041.25,4044.98,3965.21,3986.16,3190580000,2022,8,30,1
2022-08-29,4034.58,4062.99,4017.42,4030.61,2963020000,2022,8,29,0


In [25]:
## Converting daily to monthly information. 
# year list = 1928 - 2021 (which has all months info)
years = SP.index.year.unique()
years = years[1:-1]

months = [12,11,10,9,8,7,6,5,4,3,2,1]
SP_month_open = []
SP_month_close = []
SP_month = []

for year in years:
    for month in months:
        sp_open = SP.loc[(SP.index.year == year) & (SP.index.month == month), 'S&P_Open'][-1]
        sp_close = SP.loc[(SP.index.year == year) & (SP.index.month == month), 'S&P_Close'][0]
        sp_month = str(year)+'-'+str(month)
        SP_month_open.append(sp_open)
        SP_month_close.append(sp_close)
        SP_month.append(sp_month)

In [26]:
SP_monthly = pd.DataFrame(list(zip(SP_month,SP_month_open,SP_month_close)))
SP_monthly.columns = ['Date','S&P_Open','S&P_Close']

In [28]:
year = 2022
months = [8,7,6,5,4,3,2,1]
SP_2022_open = []
SP_2022_close = []
SP_2022month = []

for month in months:
    sp_open = SP.loc[(SP.index.year == year) & (SP.index.month == month), 'S&P_Open'][-1]
    sp_close = SP.loc[(SP.index.year == year) & (SP.index.month == month), 'S&P_Close'][0]
    sp_month = str(year)+'-'+str(month)
    SP_2022_open.append(sp_open)
    SP_2022_close.append(sp_close)
    SP_2022month.append(sp_month)

In [None]:
SP_2022 = pd.DataFrame(list(zip(SP_2022month,SP_2022_open,SP_2022_close)))
SP_2022.columns = ['Date','S&P_Open','S&P_Close']

In [None]:
SP_monthly = pd.concat([SP_2022, SP_monthly])
SP_monthly.head()

In [None]:
SP_volume = SP['S&P_Volume']
#SP_volume = SP_volume.set_index('Date')
SP_vol_month = SP_volume.resample('MS').mean()

In [None]:
SP_vol_month

In [None]:
SP_monthly['Date'] = pd.to_datetime(SP_monthly['Date'])
SP_monthly[['S&P_Open','S&P_Close']] = SP_monthly[['S&P_Open','S&P_Close']].astype(float)
SP_monthly.info()

In [None]:
SP_monthly['Close_t-1'] = SP_monthly['S&P_Close'].shift(-1)
SP_monthly['return(%)'] = (SP_monthly['S&P_Close']-SP_monthly['Close_t-1'])/SP_monthly['Close_t-1'] * 100
# return_class: if daily return is higher than 0.5% -> 1 (positive), if between -0.5 to 0.5% -> 0(neutral)
# if daily return is lower than -0.5% then -1 (negative)
'''SP_monthly['return_class'] = SP_monthly['return(%)'].apply(lambda x: 2 if x >= 3 
                                              else( 1 if x < 3 and x > -3 else 0))
'''
# Binary classification
SP_monthly['return_class'] = SP_monthly['return(%)'].apply(lambda x: 1 if x > 0 else 0)
SP_monthly.head(5)

In [None]:
SP_monthly['Close_t+1'] = SP_monthly['S&P_Close'].shift(1)
SP_monthly['t+1_return_class'] = SP_monthly['return_class'].shift(1)

In [None]:
SP_monthly['return_class'].value_counts()

In [None]:
oil = oil.set_index('Date')
oil_monthly = oil.resample('MS').mean()
oil_monthly.head(2)

In [None]:
PutCallRatio = PutCallRatio.set_index('Date')
PCR_monthly = PutCallRatio.resample('MS').mean()
PCR_monthly.head(2)

In [None]:
treasury_3m = treasury_3m.set_index('Date')
t3m_monthly = treasury_3m.resample('MS').mean()

In [None]:
t3m_monthly.info()

In [None]:
treasury_10y.replace(to_replace='-', value=np.nan, inplace=True)
treasury_10y_r = treasury_10y.dropna()
treasury_10y_r['year'] = treasury_10y_r['Date'].dt.year
treasury_10y_r['month'] = treasury_10y_r['Date'].dt.month
#treasury_10y_r['year_month']= str(treasury_10y_r['year']) +'-'+ str(treasury_10y_r['month'])
treasury_10y_r.head()

In [None]:
treasury_10y_r['t_10y_Close'].unique()

In [None]:
treasury_10y_r.loc[:,('t_10y_Close')] = treasury_10y_r.loc[:,('t_10y_Close')].astype(float)
#treasury_10y_r.groupby(['year','month'])['t_10y_Close'].mean()
t10y_monthly = treasury_10y_r.groupby(['year','month']).mean()
t10y_monthly = t10y_monthly.reset_index(level=['year','month'])
t10y_monthly = t10y_monthly.sort_values(['year','month'], ascending=False, ignore_index=True)

In [None]:
t10y_monthly.tail(2)

In [None]:
# Merging all data into one file
SP_monthly.set_index('Date')

SP_all_month = SP_monthly.merge(t3m_monthly, on='Date', how='left')
SP_all_month = SP_all_month.join(t10y_monthly)
SP_all_month.drop(['year','month'], axis=1)

In [None]:
SP_all_month.head()

In [None]:
Filename = [cpi, oil_monthly, unemployment, SP_PER, SP_EYR, PCR_monthly]
for file in Filename:
    SP_all_month = SP_all_month.merge(file, on='Date', how='left')

In [None]:
SP_all_month.head()

In [None]:
def year_month(date_data):
    year_month = []
    for date in date_data:
        date_year = date.year
        date_month = date.month
        year_month.append(str(date_year)+'-'+str(date_month))
    return year_month

In [None]:
SP_all_month['year_month'] = year_month(SP_all_month['Date'])

In [None]:
SP_all_month.head()

In [None]:
CSI['year_month'] = year_month(CSI['Date'])
SP_DIV['year_month'] = year_month(SP_DIV['Date'])

Filename = [SP_DIV, CSI]
for file in Filename:
    SP_all_month = SP_all_month.merge(file, on='year_month', how='left')

In [None]:
SP_all_monthly = SP_all_month.drop(['Date_x','Date_y','Date'], axis=1)
SP_all_monthly = SP_all_monthly.set_index('year_month')
SP_all_monthly.head(2)

In [None]:
err

In [None]:
cols_tobfill = ['t_3m_Close','t_10y_Close','unemp_rate','cpi_Value',
               'cpi_MoM%','oil_Price','PE_Ratio','Div_Ratio','EY_Ratio','CS_Index','Put/Call_Ratio']

for col in cols_tobfill:
    SP_all_monthly[col].fillna(method='bfill', inplace=True)         

In [None]:
# S&P_Volume data to add later?
cols_toffill = ['t_3m_Close','t_10y_Close','cpi_Value','oil_Price','PE_Ratio',
                'Div_Ratio','EY_Ratio']

for col in cols_toffill:
    SP_all_monthly[col].fillna(method='ffill', inplace=True)         

In [None]:
cols_tofill_mean = ['cpi_MoM%','unemp_rate','CS_Index','Put/Call_Ratio']
for col in cols_tofill_mean:
    SP_all_monthly[col].fillna(SP_all[col].mean(), inplace=True)

In [None]:
SP_all_monthly = SP_all_monthly.drop(['year','month'], axis=1)
SP_all_monthly

In [None]:
SP_all_monthly.to_csv('SP_all_monthly_binary.csv')

Data explanation: 

`filename` 
* SP_all_monthly : same month return prediction
* SP_all_monthly_t+1 : next month return prediction based on current month information (multiclass)
* SP_all_monthly_binary : next month return prediction based on current month information (binary)
* (SP_all_monthly_t+2 : next month return prediction based on 1-month prior information (multiclass))