<a href="https://colab.research.google.com/github/jwxiong/ORIE5741-Project/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime

In [4]:
funda = pd.read_csv("fundamentals.csv").drop(['Unnamed: 0'], axis = 1)
price_adj = pd.read_csv("prices-split-adjusted.csv")
securities = pd.read_csv("securities.csv")
#https://www.kaggle.com/dgawlik/nyse

reduce scope to finance and REITs that report on Dec 31st

In [5]:
#find finance and REIT company tickers
finance_reit_tickers = list((securities
 .query("`GICS Sector` == 'Financials' or `GICS Sub Industry` == ['REITs','Residential REITs','Office REITs','Retail REITs']")
)['Ticker symbol'])

#finance and REIT companies that file on dec 31st
finance_reit__dec_file_tickers = list((funda
 .query(" `Ticker Symbol` == @finance_reit_tickers and `Period Ending` == '2012-12-31' ")
)['Ticker Symbol'])

#two normal dataframes. one for fundamentals data and another for general information about the securities
relevant_fundamentals = (funda.query(" `Ticker Symbol` == @finance_reit__dec_file_tickers "))
relevant_securities = (securities.query(" `Ticker symbol` == @finance_reit__dec_file_tickers "))

Create stock price dataframe with heirarchical multi index. so there is the higher index which allows you to choose category returns, volume or close price etc. then the lower level index lets you select a stock

In [6]:
#create multi index 

intermediate = (price_adj
 .assign(date = lambda df: pd.to_datetime(df.date))
 .query(" `symbol` == @finance_reit__dec_file_tickers ")
 .set_index(['date','symbol'])
 .assign(daily_return = lambda df: (df.close - df.open)/ df.open)
 .assign(dispersion = lambda df: (df.high - df.low)/ df.open)
 .unstack()
)

#create returns variable
returns = intermediate['close'].pct_change()
returns.columns = pd.MultiIndex.from_product([['returns'],intermediate['close']])

#create gap open variable
opening = intermediate['open'][:-1].reset_index().drop('date',axis=1)
previous_close = intermediate['close'][1:].reset_index().drop('date',axis=1)
gap_open = opening/previous_close - 1
gap_open = pd.DataFrame([[np.nan] * len(gap_open.columns)], columns=gap_open.columns).append(gap_open, ignore_index=True)
gap_open.index = returns.index
gap_open.columns = pd.MultiIndex.from_product([['gap_open'],intermediate['close']])

#append variables
multi_index = pd.concat([intermediate,gap_open,returns],axis=1)[1:]

Create dataframe of annual returns based on our stock price data, note we only have 3 years of returns data and therefore must reduce our fundamentals data

In [7]:
returns = multi_index['returns'].copy()
annual_rtns = pd.DataFrame(columns=returns.columns)
for n in range(2013,2016):
    year_start = str(n)+'-03-08'
    year_end = str(n+1)+'-03-08'
    yearly_rtn = returns[year_start:year_end].apply(lambda x: ((x + 1).cumprod()-1).last("D"))
    annual_rtns = pd.concat([annual_rtns,yearly_rtn])

annual returns in same shape as fundamentals, prepped for regression, this drops 3 companies who were missing returns in our dataset

In [8]:
non_na_tickers = list(set(finance_reit__dec_file_tickers)-set(annual_rtns.columns[annual_rtns.isna().sum()>0]))

y = (annual_rtns.T
  .stack()
  .reset_index()
  .sort_values(['level_1','symbol'])
  .query(" `symbol` == @non_na_tickers ")
)[0].values
y.shape

(171,)

In [12]:
pd.DataFrame(y).to_csv("y_dataframe")

Company fundamentals, dropping companies with missing returns data, dropping columns that are missing more than 1/4 of values, only use first 3 years

In [9]:
## data preprocessing for fundamental data
## find the columns that has more than 1/4 of NaN values
def cols_NaNRatio_largerThan_Pct(ser,pct = 1/4):
  return ser.columns[(ser.isna()*1).apply(sum)/ser.shape[0]>pct]

#list of first 3 years data
first3years = list(relevant_fundamentals['Period Ending'].unique()[:3])

x = (relevant_fundamentals
                  .drop(cols_NaNRatio_largerThan_Pct(relevant_fundamentals).tolist(),axis=1)
                  .fillna(0)
                  .query(" `Ticker Symbol` == @non_na_tickers and `Period Ending` == @first3years ")
                 )

In [13]:
x.to_csv("x_dataframe")

In [11]:
x.head()

Unnamed: 0,Ticker Symbol,Period Ending,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash and Cash Equivalents,Changes in Inventories,...,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,For Year,Earnings Per Share,Estimated Shares Outstanding
52,AFL,2012-12-31,3858000000.0,-199000000.0,0.0,18.0,0.0,1505000000.0,2041000000.0,0.0,...,0.0,0.0,15978000000.0,115116000000.0,131094000000.0,25364000000.0,-5696000000.0,2012.0,6.14,466775200.0
53,AFL,2013-12-31,3718000000.0,-8000000.0,0.0,22.0,0.0,1644000000.0,2543000000.0,0.0,...,0.0,0.0,14620000000.0,106687000000.0,121307000000.0,23939000000.0,-6413000000.0,2013.0,6.8,464411800.0
54,AFL,2014-12-31,5293000000.0,-7000000.0,0.0,16.0,0.0,1711000000.0,4658000000.0,0.0,...,0.0,0.0,18347000000.0,101420000000.0,119767000000.0,22728000000.0,-7566000000.0,2014.0,6.54,451223200.0
56,AIG,2012-12-31,0.0,3085000000.0,-6768000000.0,4.0,0.0,80410000000.0,1151000000.0,0.0,...,0.0,0.0,98002000000.0,450631000000.0,548633000000.0,71214000000.0,-13924000000.0,2012.0,0.0,0.0
57,AIG,2013-12-31,0.0,2174000000.0,-699000000.0,9.0,0.0,80899000000.0,2241000000.0,0.0,...,0.0,0.0,100470000000.0,440859000000.0,541329000000.0,68874000000.0,-14520000000.0,2013.0,6.16,1474838000.0
