<a href="https://colab.research.google.com/github/jwxiong/ORIE5741-Project/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime

In [2]:
# Input files come from the Kaggle NYSE dataset:
# https://www.kaggle.com/dgawlik/nyse
securities = pd.read_csv("securities.csv")
price_adj = pd.read_csv("prices-split-adjusted.csv")
# The fundamentals file carries an 'Unnamed: 0' column (presumably a saved
# row index -- confirm); it is not a feature, so discard it.
funda = pd.read_csv("fundamentals.csv").drop(columns=['Unnamed: 0'])

Reduce the scope to financial companies and REITs that report on December 31st.

In [3]:
# Tickers that are either in the Financials sector or in one of the listed
# REIT sub-industries.
reit_sub_industries = ['REITs','Residential REITs','Office REITs','Retail REITs']
is_financial = securities['GICS Sector'] == 'Financials'
is_reit = securities['GICS Sub Industry'].isin(reit_sub_industries)
finance_reit_tickers = securities.loc[is_financial | is_reit, 'Ticker symbol'].tolist()

# Of those, keep the tickers that have a fundamentals row whose period ends
# on 2012-12-31.
in_scope = funda['Ticker Symbol'].isin(finance_reit_tickers)
ends_dec_2012 = funda['Period Ending'] == '2012-12-31'
finance_reit__dec_file_tickers = funda.loc[in_scope & ends_dec_2012, 'Ticker Symbol'].tolist()

# Two plain dataframes restricted to the selected tickers: one with the
# fundamentals, one with general information about the securities.
relevant_fundamentals = funda[funda['Ticker Symbol'].isin(finance_reit__dec_file_tickers)]
relevant_securities = securities[securities['Ticker symbol'].isin(finance_reit__dec_file_tickers)]

Create a stock price dataframe with a hierarchical (multi-level) column index. The upper level selects the category (returns, volume, close price, etc.), and the lower level selects a stock.

In [4]:
# Build a wide price panel: rows are dates, columns are a two-level index of
# (field, symbol). Two per-row fields are derived before pivoting:
#   daily_return = (close - open) / open
#   dispersion   = (high - low) / open
intermediate = (price_adj
 .assign(date = lambda df: pd.to_datetime(df.date))
 .query(" `symbol` == @finance_reit__dec_file_tickers ")
 .set_index(['date','symbol'])
 .assign(daily_return = lambda df: (df.close - df.open)/ df.open)
 .assign(dispersion = lambda df: (df.high - df.low)/ df.open)
 .unstack()
)

# Symbol labels shared by the derived blocks below. Passing the Index itself
# keeps the lower column level named 'symbol', matching `intermediate`.
symbols = intermediate['close'].columns

# Close-to-close return per symbol.
returns = intermediate['close'].pct_change()
returns.columns = pd.MultiIndex.from_product([['returns'], symbols])

# Gap open: today's open relative to the previous row's close.
# shift(1) moves each close down one row so that it lines up with the next
# row's open; the first row has no previous close and is therefore NaN.
gap_open = intermediate['open'] / intermediate['close'].shift(1) - 1
gap_open.columns = pd.MultiIndex.from_product([['gap_open'], symbols])

# Combine all fields and drop the first row, where `returns` and `gap_open`
# are undefined.
multi_index = pd.concat([intermediate,gap_open,returns],axis=1)[1:]

Create a dataframe of annual returns from our stock price data. Note that we only have 3 years of returns data, so the fundamentals data must be reduced to match.

In [5]:
returns = multi_index['returns'].copy()

# For each of the three windows (March 8th of year n through March 8th of
# year n+1, both ends inclusive), compound the daily returns and keep the
# value on the last available row of the window.
yearly_frames = []
for n in range(2013,2016):
    year_start = str(n)+'-03-08'
    year_end = str(n+1)+'-03-08'
    compounded = (returns.loc[year_start:year_end] + 1).cumprod() - 1
    # tail(1) keeps the final row; it replaces the deprecated .last("D").
    yearly_frames.append(compounded.tail(1))

# Concatenate once instead of growing a frame inside the loop. The index name
# is cleared so that a later reset_index() still produces the 'level_1'
# column that the following cell sorts on.
annual_rtns = pd.concat(yearly_frames).rename_axis(index=None)

Annual returns in the same shape as the fundamentals, prepped for regression. This drops 3 companies that were missing returns in our dataset.

In [6]:
# Tickers with an annual return in every window. Intersecting with the
# columns of annual_rtns also excludes any selected ticker that has no price
# column at all, which would otherwise appear in the features but not in y.
complete_return_cols = annual_rtns.columns[annual_rtns.notna().all()]
non_na_tickers = sorted(set(finance_reit__dec_file_tickers) & set(complete_return_cols))

# Flatten to one value per (date, symbol), ordered by date and then symbol.
# After reset_index() the date level is the unnamed 'level_1' column and the
# values are in column 0.
y = (annual_rtns.T
  .stack()
  .reset_index()
  .sort_values(['level_1','symbol'])
  .query(" `symbol` == @non_na_tickers ")
)[0].values
pd.DataFrame(y).to_csv("y_dataframe")

Benchmark

In [7]:
#!pip install yfinance
import yfinance as yf  
# Daily percentage change of XLF's 'Adj Close'; the first row is dropped
# because pct_change() leaves it NaN.
# NOTE(review): the first stock-return window starts on 2013-03-08, whereas
# this series starts one row after the first downloaded date -- confirm that
# the benchmark and stock windows are meant to line up.
Benchmark = yf.download('XLF','2013-03-08','2016-03-08')['Adj Close'].pct_change()[1:]

# Compound the daily changes over the same three March-to-March windows used
# for the stock returns and keep the value on the last row of each window.
benchmark_frames = []
for n in range(2013,2016):
    year_start = str(n)+'-03-08'
    year_end = str(n+1)+'-03-08'
    window = pd.DataFrame(Benchmark[year_start:year_end])
    compounded = (window + 1).cumprod() - 1
    # tail(1) keeps the final row; it replaces the deprecated .last("D").
    benchmark_frames.append(compounded.tail(1))

# Concatenate once; clear the index name so the CSV header is unchanged.
annual_benchmark = pd.concat(benchmark_frames).rename_axis(index=None)

#financial benchmark
annual_benchmark.to_csv("benchmark_dataframe")

[*********************100%***********************]  1 of 1 completed


Interest rate section

In [None]:
# Download the 'Adj Close' series for the four rate tickers. The cells below
# read `IR`, so this assignment must run for the notebook to work from a
# fresh kernel. Relies on `yf` imported in the benchmark cell above.
IR = yf.download(["^IRX",'^FVX','^TNX','^TYX'],'2012-03-07','2015-03-08')['Adj Close']

In [65]:
#interest-rate features:
#nominal interest rates
#percent change from 1 month ago
#percent change from 6 months ago

In [90]:
# Rate levels on the three dates that the fundamentals rows are later mapped to.
ir_snapshot_dates = ["2013-03-08","2014-03-07","2015-03-06"]
nominal_rates = IR.loc[ir_snapshot_dates]
# Tag the columns so they stay distinguishable once the feature blocks are combined.
nominal_rates.columns = nominal_rates.columns + "Nominal"

In [91]:
#6 mon change
# Compound the daily percentage changes of each rate from September 8th of
# year n through March 8th of year n+1 and keep the value on the last row of
# the window.
IR_change = IR.pct_change()
six_mon_frames = []
for n in range(2012,2015): 
    year_start = str(n)+'-09-08'
    year_end = str(n+1)+'-03-08'
    compounded = (IR_change.loc[year_start:year_end] + 1).cumprod() - 1
    # tail(1) keeps the final row; it replaces the deprecated .last("D").
    six_mon_frames.append(compounded.tail(1))

# Concatenate once instead of growing a frame inside the loop; the index is
# left unnamed, as it was before.
six_mon_changeIR = pd.concat(six_mon_frames).rename_axis(index=None)
six_mon_changeIR.columns = six_mon_changeIR.columns+"Pct_chng_6"

In [92]:
#1 mon change
# Compound the daily percentage changes of each rate from February 8th
# through March 8th of year n+1 and keep the value on the last row of the
# window.
IR_change = IR.pct_change()
one_mon_frames = []
for n in range(2012,2015): 
    year_start = str(n+1)+'-02-08'
    year_end = str(n+1)+'-03-08'
    compounded = (IR_change.loc[year_start:year_end] + 1).cumprod() - 1
    # tail(1) keeps the final row; it replaces the deprecated .last("D").
    one_mon_frames.append(compounded.tail(1))

# Concatenate once instead of growing a frame inside the loop; the index is
# left unnamed, as it was before.
one_mon_changeIR = pd.concat(one_mon_frames).rename_axis(index=None)
one_mon_changeIR.columns = one_mon_changeIR.columns+"Pct_chng_1"

In [95]:
# Place the three interest-rate feature blocks side by side, aligned on their index.
ir_feature_blocks = [nominal_rates, six_mon_changeIR, one_mon_changeIR]
IR_features = pd.concat(ir_feature_blocks, axis=1)

Company fundamentals: drop companies with missing returns data, drop columns that are missing more than 1/4 of their values, and use only the first 3 years. Then add the interest-rate (IR) features.

In [None]:
## data preprocessing for fundamental data
def cols_NaNRatio_largerThan_Pct(ser,pct = 1/4):
  """Return the column labels whose share of NaN values exceeds `pct`.

  Parameters
  ----------
  ser : DataFrame to inspect.
  pct : threshold on the per-column fraction of NaN rows (default 1/4).
        The comparison is strict, so a column exactly at the threshold is
        not returned.

  Returns
  -------
  The subset of `ser.columns` whose NaN fraction is greater than `pct`.
  """
  # Mean of the boolean NaN mask = fraction of missing rows per column.
  nan_share = ser.isna().mean()
  return ser.columns[nan_share > pct]

# The first three distinct 'Period Ending' values, in order of appearance.
first3years = relevant_fundamentals['Period Ending'].unique()[:3].tolist()

# Drop the columns that are too sparse, zero-fill what remains, then keep
# only the rows for tickers with complete returns and the first three periods.
sparse_columns = cols_NaNRatio_largerThan_Pct(relevant_fundamentals).tolist()
fundamentals_filled = relevant_fundamentals.drop(sparse_columns, axis=1).fillna(0)
x = fundamentals_filled.query(" `Ticker Symbol` == @non_na_tickers and `Period Ending` == @first3years ")

In [98]:
# Display the assembled interest-rate feature table.
IR_features

Unnamed: 0,^FVXNominal,^IRXNominal,^TNXNominal,^TYXNominal,^FVXPct_chng_6,^IRXPct_chng_6,^TNXPct_chng_6,^TYXPct_chng_6,^FVXPct_chng_1,^IRXPct_chng_1,^TNXPct_chng_1,^TYXPct_chng_1
2013-03-08,0.897,0.087,2.056,3.255,0.405956,-0.13,0.237809,0.151805,0.087273,0.338462,0.053819,0.029086
2014-03-07,1.64,0.045,2.79,3.722,-0.072398,2.0,-0.050374,-0.038988,0.121751,-0.375,0.042991,0.015553
2015-03-06,1.696,0.005,2.24,2.839,0.004144,-0.75,-0.089801,-0.122953,0.15847,-0.666667,0.155831,0.127035


In [110]:
# Relabel each fiscal period end with the date on which the interest-rate
# features are observed, so that the two tables share index values.
period_to_feature_date = {
    "2012-12-31": "2013-03-08",
    "2013-12-31": "2014-03-07",
    "2014-12-31": "2015-03-06",
}
x_dated = x.set_index("Period Ending").rename(index=period_to_feature_date)
# Convert the relabelled index from strings to datetimes before joining.
x_dated.index = pd.to_datetime(x_dated.index)
# Index-on-index join: every fundamentals row picks up the IR features of its date.
X = x_dated.join(IR_features)

In [113]:
# Write the combined feature matrix to disk (the file name has no extension).
X.to_csv("x_dataframe")

In [114]:
# Preview the first rows of the combined feature matrix.
X.head()

Unnamed: 0,Ticker Symbol,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash and Cash Equivalents,Changes in Inventories,Common Stocks,...,^TNXNominal,^TYXNominal,^FVXPct_chng_6,^IRXPct_chng_6,^TNXPct_chng_6,^TYXPct_chng_6,^FVXPct_chng_1,^IRXPct_chng_1,^TNXPct_chng_1,^TYXPct_chng_1
2013-03-08,AFL,3858000000.0,-199000000.0,0.0,18.0,0.0,1505000000.0,2041000000.0,0.0,67000000.0,...,2.056,3.255,0.405956,-0.13,0.237809,0.151805,0.087273,0.338462,0.053819,0.029086
2013-03-08,AIG,0.0,3085000000.0,-6768000000.0,4.0,0.0,80410000000.0,1151000000.0,0.0,4766000000.0,...,2.056,3.255,0.405956,-0.13,0.237809,0.151805,0.087273,0.338462,0.053819,0.029086
2013-03-08,AIV,344358000.0,30716000.0,30743000.0,14.0,-359926000.0,3712684000.0,229998000.0,0.0,1456000.0,...,2.056,3.255,0.405956,-0.13,0.237809,0.151805,0.087273,0.338462,0.053819,0.029086
2013-03-08,ALL,0.0,-1685000000.0,18000000.0,11.0,-285000000.0,3162000000.0,806000000.0,0.0,9000000.0,...,2.056,3.255,0.405956,-0.13,0.237809,0.151805,0.087273,0.338462,0.053819,0.029086
2013-03-08,AMG,0.0,-41600000.0,22000000.0,8.0,-20000000.0,868500000.0,430400000.0,0.0,500000.0,...,2.056,3.255,0.405956,-0.13,0.237809,0.151805,0.087273,0.338462,0.053819,0.029086
