In [135]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from sklearn import linear_model

In [117]:
data = pd.read_csv('data_v1.csv')
name = data.columns.values
name = np.array(['gvkey', 'date', 'fyear', 'fquarter', 'industry_format', 'consol',
       'popsrc', 'datafmt', 'ticker', 'cusip', 'conpany_name', 'currency', 'calendaryr_quarter',
       'fiscalyear_qtr', 'total_asset', 'cash', 'st_debt', 'lt_debt', 'net_income', 'ebitda',
       'int_exp', 'exchange', 'status', 'mkt_val', 'group', 'industry', 'sector',
       'subindustry'])

In [118]:
data.columns = name
data.head()

Unnamed: 0,gvkey,date,fyear,fquarter,industry_format,consol,popsrc,datafmt,ticker,cusip,...,net_income,ebitda,int_exp,exchange,status,mkt_val,group,industry,sector,subindustry
0,1004,19900228,1989,3,INDL,C,D,STD,AIR,361105,...,6.109,14.251,2.758,11,A,,2010.0,201010.0,20.0,20101010.0
1,1004,19900531,1989,4,INDL,C,D,STD,AIR,361105,...,6.224,13.137,2.309,11,A,,2010.0,201010.0,20.0,20101010.0
2,1004,19900831,1990,1,INDL,C,D,STD,AIR,361105,...,6.697,15.4,2.607,11,A,,2010.0,201010.0,20.0,20101010.0
3,1004,19901130,1990,2,INDL,C,D,STD,AIR,361105,...,0.126,8.871,2.708,11,A,,2010.0,201010.0,20.0,20101010.0
4,1004,19910228,1990,3,INDL,C,D,STD,AIR,361105,...,3.977,11.344,2.587,11,A,,2010.0,201010.0,20.0,20101010.0


In [119]:
# there is no dual-listing
a = data[['ticker','exchange']].drop_duplicates().groupby('ticker').count()
a[a['exchange'] != 1]

Unnamed: 0_level_0,exchange
ticker,Unnamed: 1_level_1


In [120]:
# how many company compustst is inactively updating the data
# when compustat stop track the data, we believe the company will not be included in the top 1000 market cap range.
# so not affecting our analysis
len(data[data['status']== 'I']['ticker'].unique())

8536

In [121]:
len(data[data['status']== 'A']['ticker'].unique())

5576

In [122]:
# check if any company have 2 status
# if a company is not updated by the database anymore,
# the status changed to I for each time period before

inactive = data[data['status']== 'I']['ticker'].unique()
active = data[data['status']== 'A']['ticker'].unique()
duplicates = []
for ticker in inactive:
    if ticker in active:
        duplicates.append(inactive[i])
duplicates


[]

In [123]:
# 0(n)
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3 
   
print(intersection(inactive, active)) 

[]


In [124]:
len(data['ticker'].unique())

14111

In [125]:
data[data['ticker']=='TAM']

Unnamed: 0,gvkey,date,fyear,fquarter,industry_format,consol,popsrc,datafmt,ticker,cusip,...,net_income,ebitda,int_exp,exchange,status,mkt_val,group,industry,sector,subindustry
666554,311524,20100331,2010,1,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666555,311524,20100630,2010,2,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666556,311524,20100930,2010,3,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666557,311524,20101231,2010,4,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666558,311524,20110331,2011,1,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666559,311524,20110630,2011,2,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666560,311524,20110930,2011,3,INDL,C,D,STD,TAM,87509U106,...,,,,11,I,,1510.0,151010.0,15.0,15101050.0
666561,311524,20111231,2011,4,INDL,C,D,STD,TAM,87509U106,...,30.0,226.9,75.4,11,I,,1510.0,151010.0,15.0,15101050.0
666562,311524,20120331,2012,1,INDL,C,D,STD,TAM,87509U106,...,-43.0,9.0,11.0,11,I,,1510.0,151010.0,15.0,15101050.0
666563,311524,20120630,2012,2,INDL,C,D,STD,TAM,87509U106,...,7.0,68.0,19.0,11,I,,1510.0,151010.0,15.0,15101050.0


In [126]:
# make sure data is sorted by ticker and time ascendingly
data.sort_values(by = ['ticker','date'],ascending = True, inplace = True)

In [127]:
# ret is the table we will be used for analysis
ret = data[['date', 'ticker', 'conpany_name','calendaryr_quarter', 'total_asset', 'st_debt', 'lt_debt', 'net_income', 'ebitda',
       'int_exp', 'exchange', 'status', 'mkt_val','industry', 'sector']].copy()
ret['ROA'] = ret['net_income']/ret['total_asset']
ret['total_debt'] = ret['st_debt']+ret['lt_debt']
ret['D/A'] = ret['total_debt']+ret['total_asset']

In [128]:
# match forward 1year ROA to current leverage
ret['lead_ROA'] = ret.groupby(['ticker'])['ROA'].shift(-4)
ret[ret['ticker']=='AAPL'].head()
# calculate change in current leverage
ret['1q_before_D/A'] = ret.groupby(['ticker'])['D/A'].shift(1)
ret['change_D/A']=(ret['D/A']-ret['1q_before_D/A'])/ret['1q_before_D/A']
ret

Unnamed: 0,date,ticker,conpany_name,calendaryr_quarter,total_asset,st_debt,lt_debt,net_income,ebitda,int_exp,...,status,mkt_val,industry,sector,ROA,total_debt,D/A,lead_ROA,1q_before_D/A,change_D/A
528523,19980131,A,AGILENT TECHNOLOGIES INC,1997Q4,,,,161.0,,0.0,...,A,,352030.0,35.0,,,,,,
528524,19980430,A,AGILENT TECHNOLOGIES INC,1998Q1,,,,93.0,,0.0,...,A,,352030.0,35.0,,,,,,
528525,19980731,A,AGILENT TECHNOLOGIES INC,1998Q2,,,,54.0,,0.0,...,A,,352030.0,35.0,,,,0.026733,,
528526,19981031,A,AGILENT TECHNOLOGIES INC,1998Q3,4987.0,0.0,0.0,-51.0,,0.0,...,A,,352030.0,35.0,-0.010227,0.0,4987.0,0.026819,,
528527,19990131,A,AGILENT TECHNOLOGIES INC,1998Q4,,,,74.0,225.0,0.0,...,A,,352030.0,35.0,,,,0.018433,4987.0,
528528,19990430,A,AGILENT TECHNOLOGIES INC,1999Q1,,,,157.0,351.0,0.0,...,A,,352030.0,35.0,,,,0.022674,,
528529,19990731,A,AGILENT TECHNOLOGIES INC,1999Q2,5050.0,0.0,0.0,135.0,376.0,0.0,...,A,,352030.0,35.0,0.026733,0.0,5050.0,0.019803,,
528530,19991031,A,AGILENT TECHNOLOGIES INC,1999Q3,5444.0,0.0,0.0,146.0,315.0,0.0,...,A,,352030.0,35.0,0.026819,0.0,5444.0,0.036202,5050.0,0.078020
528531,20000131,A,AGILENT TECHNOLOGIES INC,1999Q4,7107.0,111.0,,131.0,,0.0,...,A,,352030.0,35.0,0.018433,,,0.010860,5444.0,
528532,20000430,A,AGILENT TECHNOLOGIES INC,2000Q1,7321.0,98.0,0.0,166.0,473.0,0.0,...,A,,352030.0,35.0,0.022674,98.0,7419.0,0.011233,,


In [144]:
# get rid of NAs
ret2=ret[['date', 'ticker','ROA', 'change_D/A', 'mkt_val']].copy()
ret2=ret2.dropna()
ret2

Unnamed: 0,date,ticker,ROA,change_D/A,mkt_val
528555,20060131,A,0.350860,0.411050,14581.3000
528556,20060430,A,0.014273,0.003254,16366.9200
528557,20060731,A,0.029370,-0.034320,11631.9600
528558,20061031,A,0.020220,-0.039007,14524.8000
528559,20070131,A,0.020854,-0.019844,12960.0000
528560,20070430,A,0.016889,0.010353,13609.0765
528561,20070731,A,0.026338,-0.029489,14746.8062
528562,20071031,A,0.023828,0.131042,13634.5000
528563,20080131,A,0.016088,-0.005809,12460.4800
528564,20080430,A,0.022544,0.048305,10878.8627


In [145]:
# select the top 1000 market cap company for each quarter
ret2['rank']=ret2.groupby(['date'])['mkt_val'].rank(ascending = False)

In [146]:
def solve_sv_regression(X, Y):
    model_iid = sm.OLS(Y, X).fit()
    model_hetero = sm.OLS(Y, X).fit(cov_type='HC1')
    model_serial = sm.OLS(Y, X).fit(cov_type='HAC',cov_kwds={'maxlags':4}) 
    return (model_iid, model_hetero, model_serial)

dates = ret2['date'].unique()
for date in dates:
    subdata = ret2[ret2['date']==date]
    X = subdata['change_D/A']
    Y = subdata['ROA']
    solve_sv_regression(Y, X)



MissingDataError: exog contains inf or nans