In [1]:
#Note accounting CRSP info seems to be in millions

import pandas as pd
import numpy as np
import os


import pandas_datareader as pdr  # to install: !pip install pandas_datareader
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt 
from scipy import stats
from sklearn.linear_model import LinearRegression

from statsmodels.formula.api import ols as sm_ols
from statsmodels.iolib.summary2 import summary_col # nicer tables
import statsmodels.api as sm

# importing required modules
from zipfile import ZipFile
import zipfile
import time

In [2]:
#Compustat data - roughly 1min to load 
fundamentals = pd.concat((chunk for chunk in pd.read_csv('fundamentals.csv',chunksize=100000, low_memory=False)))
fundamentals.shape #~500k rows, 659 columns 

(501147, 659)

In [3]:
fundamentals['cusip'] = fundamentals['cusip'].astype(str).str[:-1]

In [4]:
fundamentals['cusip']

0         00036110
1         00036110
2         00036110
3         00036110
4         00036110
            ...   
501142    21077C10
501143    21077C10
501144    21077C10
501145    21077C10
501146    21077C10
Name: cusip, Length: 501147, dtype: object

In [5]:
#CRSP data - roughly 10 seconds to load 
returns = pd.concat((chunk for chunk in pd.read_csv('returns.csv',chunksize=100000, low_memory=False)))
returns.shape #~5.7 million rows, 16 columns

(5711711, 16)

In [6]:
#cleaning up returns dataset - roughly 10 sec to load

returns['date'] = pd.to_datetime(returns['date'], format='%Y%m%d', errors='coerce') #change dates format 
returns.sort_values(by='date') # This now sorts in date order
returns #returns in number format, not percentages 
returns[['prc','vol','ret','shrout','sprtrn']] = returns[['prc','vol','ret','shrout','sprtrn']].apply(pd.to_numeric, errors='coerce') #had to change the num to to numeric values 
#returns['ticker'].nunique() #have stock return data on 9,670 firms 
returns.head()

Unnamed: 0.1,Unnamed: 0,permno,date,siccd,ncusip,ticker,comnam,shrcls,naics,prc,vol,ret,shrout,cfacpr,cfacshr,sprtrn
0,0,10026,2018-01-02,2052,46603210,JJSF,J & J SNACK FOODS CORP,,311821.0,149.17999,190618.0,-0.017454,18668.0,1.0,1.0,0.008303
1,1,10026,2018-01-03,2052,46603210,JJSF,J & J SNACK FOODS CORP,,311821.0,147.69,63693.0,-0.009988,18668.0,1.0,1.0,0.006399
2,2,10026,2018-01-04,2052,46603210,JJSF,J & J SNACK FOODS CORP,,311821.0,149.73,127552.0,0.013813,18668.0,1.0,1.0,0.004029
3,3,10026,2018-01-05,2052,46603210,JJSF,J & J SNACK FOODS CORP,,311821.0,148.3,44647.0,-0.00955,18668.0,1.0,1.0,0.007034
4,4,10026,2018-01-08,2052,46603210,JJSF,J & J SNACK FOODS CORP,,311821.0,148.41,55014.0,0.000742,18668.0,1.0,1.0,0.001662


In [7]:
fundamentals.sort_values(by='fyearq').reset_index() # This now sorts in date order
fundamentals.head()

Unnamed: 0.1,Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,indfmt,consol,popsrc,datafmt,...,prirow,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,ipodate
0,0,1004,2010-02-28,2009.0,3.0,5.0,INDL,C,D,STD,...,,1.0,5080.0,110.0,925.0,B,IL,0.0,www.aarcorp.com,1988-01-01
1,1,1004,2010-05-31,2009.0,4.0,5.0,INDL,C,D,STD,...,,1.0,5080.0,110.0,925.0,B,IL,0.0,www.aarcorp.com,1988-01-01
2,2,1004,2010-08-31,2010.0,1.0,5.0,INDL,C,D,STD,...,,1.0,5080.0,110.0,925.0,B,IL,0.0,www.aarcorp.com,1988-01-01
3,3,1004,2010-11-30,2010.0,2.0,5.0,INDL,C,D,STD,...,,1.0,5080.0,110.0,925.0,B,IL,0.0,www.aarcorp.com,1988-01-01
4,4,1004,2011-02-28,2010.0,3.0,5.0,INDL,C,D,STD,...,,1.0,5080.0,110.0,925.0,B,IL,0.0,www.aarcorp.com,1988-01-01


In [8]:
#group returns by quarter
#group return dates by 3/31, 6/30, 9/30, 12/31 


returns['Gross_Ret'] = 1 + returns['ret']

q2_2018 = ((returns[(returns['date'] >= '2018-03-31') & (returns['date'] <= '2018-06-30')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q3_2018 = ((returns[(returns['date'] >= '2018-06-30') & (returns['date'] <= '2018-09-30')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q4_2018 = ((returns[(returns['date'] >= '2018-09-30') & (returns['date'] <= '2018-12-31')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 

q1_2019 = ((returns[(returns['date'] >= '2018-12-31') & (returns['date'] <= '2019-03-31')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q2_2019 = ((returns[(returns['date'] >= '2019-03-31') & (returns['date'] <= '2019-06-30')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q3_2019 = ((returns[(returns['date'] >= '2019-06-30') & (returns['date'] <= '2019-09-30')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q4_2019 = ((returns[(returns['date'] >= '2019-09-30') & (returns['date'] <= '2019-12-31')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 

q1_2020 = ((returns[(returns['date'] >= '2019-12-31') & (returns['date'] <= '2020-03-31')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q2_2020 = ((returns[(returns['date'] >= '2020-03-31') & (returns['date'] <= '2020-06-30')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q3_2020 = ((returns[(returns['date'] >= '2020-06-30') & (returns['date'] <= '2020-09-30')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 
q4_2020 = ((returns[(returns['date'] >= '2020-09-30') & (returns['date'] <= '2020-12-31')]).groupby(['ncusip','ticker'])['Gross_Ret'].prod()-1).reset_index() #Gross Returns 

In [9]:
q1_2019

Unnamed: 0,ncusip,ticker,Gross_Ret
0,00030710,AAC,0.304962
1,00036020,AAON,0.325110
2,00036110,AIR,-0.116952
3,00037520,ABB,-0.003171
4,00081T10,ACCO,0.280592
...,...,...,...
7688,Y8564W10,TK,0.184951
7689,Y8565J10,TOO,-0.024388
7690,Y8565N10,TNK,0.032023
7691,Y8897Y80,TOPS,-0.029583


In [10]:
# Merge = pd.merge(fundamentals.loc[fundamentals['datadate']]=='2020-06-30', q3_2020a,left_on='cusip', right_on='ncusip', how='right',indicator=True,validate='many_to_one')
# Merge.tail(20)

q1_2018_acct = fundamentals[fundamentals['datadate'] == '2018-03-31']
q2_2018_acct = fundamentals[fundamentals['datadate'] == '2018-06-30']
q3_2018_acct = fundamentals[fundamentals['datadate'] == '2018-09-30']
q4_2018_acct = fundamentals[fundamentals['datadate'] == '2018-12-31']

q1_2019_acct = fundamentals[fundamentals['datadate'] == '2019-03-31']
q2_2019_acct = fundamentals[fundamentals['datadate'] == '2019-06-30']
q3_2019_acct = fundamentals[fundamentals['datadate'] == '2019-09-30']
q4_2019_acct = fundamentals[fundamentals['datadate'] == '2019-12-31']

q1_2020_acct = fundamentals[fundamentals['datadate'] == '2020-03-31']
q2_2020_acct = fundamentals[fundamentals['datadate'] == '2020-06-30']
q3_2020_acct = fundamentals[fundamentals['datadate'] == '2020-09-30']
q4_2020_acct = fundamentals[fundamentals['datadate'] == '2020-12-31']

q1_2018_acct


#df[(df['date'] > '2013-01-01') & (df['date'] < '2013-02-01')]

# Merge = pd.merge(q2_2020_acct, q3_2020,left_on='cusip', right_on='ncusip',how='left',indicator=True)
# Merge

Unnamed: 0.1,Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,indfmt,consol,popsrc,datafmt,...,prirow,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,ipodate
80,80,1045,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,,4.0,4512.0,605.0,600.0,C,TX,0.0,www.aa.com,
124,124,1050,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,,1.0,3564.0,345.0,925.0,C,TX,0.0,www.cecoenviro.com,
211,211,1072,2018-03-31,2017.0,4.0,3.0,INDL,C,D,STD,...,,1.0,3670.0,220.0,925.0,B-,SC,0.0,www.avx.com,1973-03-01
251,251,1075,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,,1.0,4911.0,705.0,700.0,A,AZ,0.0,www.pinnaclewest.com,
295,295,1076,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,,1.0,6141.0,175.0,976.0,B,UT,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501048,501048,329141,2018-03-31,2018.0,3.0,6.0,INDL,C,D,STD,...,01W,90.0,7372.0,,,,,0.0,www.renalytixai.com,2020-07-17
501064,501064,330227,2018-03-31,2018.0,2.0,9.0,INDL,C,D,STD,...,01W,1.0,4412.0,,,,,0.0,www.castormaritime.com,2019-02-11
501102,501102,335466,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,,1.0,,,,,,3.0,www.hermitage-offshore.com,
501123,501123,345920,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,,1.0,3524.0,,,,PA,0.0,www.hydrofarm.com,2020-12-10


In [11]:
Merge_q1_2018 = pd.merge(q1_2018_acct, q2_2018, left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q2_2018 = pd.merge(q2_2018_acct, q3_2018, left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q3_2018 = pd.merge(q3_2018_acct, q4_2018, left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q4_2018 = pd.merge(q4_2018_acct, q1_2019, left_on='cusip', right_on='ncusip',how='inner',indicator=True)

Merge_q1_2019 = pd.merge(q1_2019_acct, q2_2019,left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q2_2019 = pd.merge(q2_2019_acct, q3_2019,left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q3_2019 = pd.merge(q3_2019_acct, q4_2019,left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q4_2019 = pd.merge(q4_2019_acct, q1_2020,left_on='cusip', right_on='ncusip',how='inner',indicator=True)

Merge_q1_2020 = pd.merge(q1_2020_acct, q2_2020,left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q2_2020 = pd.merge(q2_2020_acct, q3_2020,left_on='cusip', right_on='ncusip',how='inner',indicator=True)
Merge_q3_2020 = pd.merge(q3_2020_acct, q4_2020,left_on='cusip', right_on='ncusip',how='inner',indicator=True)





In [12]:
frames = [Merge_q1_2018, Merge_q2_2018, Merge_q3_2018, Merge_q4_2018, 
          Merge_q1_2019, Merge_q2_2019, Merge_q3_2019, Merge_q4_2019, 
          Merge_q1_2020, Merge_q2_2020, Merge_q3_2020]

Final = pd.concat(frames)
#Final.sort_values(by='datadate',ascending=True).reset_index() # This now sorts in date order

Final.tail(20)
Final.iloc[350:370]

Unnamed: 0.1,Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,indfmt,consol,popsrc,datafmt,...,spcseccd,spcsrc,state,stko,weburl,ipodate,ncusip,ticker,Gross_Ret,_merge
350,27667,6900,2018-03-31,2018.0,2.0,9.0,INDL,C,D,STD,...,940.0,B-,MN,0.0,www.mts.com,,55377710,MTSC,0.025086,both
351,27750,6932,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,700.0,A-,WI,0.0,www.mgeenergy.com,,55277P10,MGEE,0.129965,both
352,27822,6946,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,976.0,A,ON,0.0,www.magna.com,,55922240,MGA,0.036857,both
353,27917,6994,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,925.0,C,WI,0.0,www.manitowoc.com,,56357140,MTW,-0.091356,both
354,27979,7017,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,935.0,C,TX,0.0,www.marathonoil.com,,56584910,MRO,0.296284,both
355,28025,7022,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,976.0,B,WI,0.0,www.marcuscorp.com,,56633010,MCS,0.075908,both
356,28069,7034,2018-03-31,2018.0,3.0,6.0,INDL,C,D,STD,...,800.0,B-,TX,0.0,www.marps-marine.com,,56842310,MARPS,0.282305,both
357,28113,7063,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,800.0,B,MD,0.0,www.hosthotels.com,,44107P10,HST,0.141204,both
358,28157,7065,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,800.0,A,NY,0.0,www.mmc.com,,57174810,MMC,-0.002891,both
359,28206,7085,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,976.0,B,MI,0.0,masco.com,,57459910,MAS,-0.072298,both


In [13]:
Final.shape

(69668, 663)

In [14]:
Final['ticker'].nunique() #8012 unique tickers 
Final['ncusip'].nunique() #7944 unique ncusip
#ticker and ncusip should match up!!!!

Final['cusip'].nunique() #11,453 unique cusip identifers 
Final['ticker'].value_counts()
#Final['ncusip'].value_counts()


VVPR    14
CTHR    14
RXN     14
FOR     13
AM      12
        ..
SEAH     1
SGY      1
AHGP     1
DYSL     1
LNSR     1
Name: ticker, Length: 8012, dtype: int64

In [15]:
_Final = Final
_Final.drop_duplicates()
_Final
_Final[_Final['ticker'].str.contains("CTHR")]


Unnamed: 0.1,Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,indfmt,consol,popsrc,datafmt,...,spcseccd,spcsrc,state,stko,weburl,ipodate,ncusip,ticker,Gross_Ret,_merge
3593,254743,65850,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.213236,both
3594,254744,65850,2018-03-31,2018.0,3.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.213236,both
3708,254745,65850,2018-06-30,2018.0,4.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.184206,both
3709,254746,65850,2018-06-30,2018.0,2.0,12.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.184206,both
3798,254747,65850,2018-09-30,2018.0,3.0,12.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.026237,both
3799,254748,65850,2018-09-30,2019.0,1.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.026237,both
3877,254749,65850,2018-12-31,2019.0,2.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,0.310346,both
3999,254750,65850,2019-03-31,2019.0,3.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,0.385964,both
4118,254751,65850,2019-06-30,2019.0,4.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,0.031643,both
4204,254752,65850,2019-09-30,2020.0,1.0,6.0,INDL,C,D,STD,...,976.0,C,NC,0.0,www.charlesandcolvard.com,1997-11-14,15976510,CTHR,-0.089741,both


In [16]:
_Final.columns[120:130]
_Final.columns[175]


'invrmq'

In [17]:
_Final.groupby(['ticker','datadate'])['niq'].mean()
_Final.groupby(['ticker','datadate'])['atq'].mean()
_Final['dvpsxq']
_Final['prccq']
_Final['revtq']

0       10401.000
1          74.139
2         446.632
3         692.714
4        7390.000
          ...    
6788          NaN
6789        0.000
6790        0.288
6791       96.658
6792      606.000
Name: revtq, Length: 69668, dtype: float64

### useful columns
niq: net income (in millions)
dvpsxq: dividends per share
revtq: revenue 


In [49]:
metrics = pd.read_stata('metrics.dta')
metrics.head(15)

#evm: enterprise value multiple
#pe_exi: price/earnings excluding extraordianry income 
#npm: net profit margin
#gpm: gross profit margin
#roa: retrun on assets
#roe: return on equity
#de_ratio: debt/equity
#ptb: price/book
#PEG_trailing: PEG 12ttm 
#divyield: dividend yield 



Unnamed: 0,adate,qdate,public_date,evm,pe_exi,npm,gpm,roa,roe,efftax,de_ratio,ptb,PEG_trailing,divyield,TICKER,cusip
0,2017-05-31,2017-11-30,2018-01-31,13.627491,25.29375,0.032812,0.177485,0.070778,0.058999,0.324361,0.67503,1.524219,,0.007413,AIR,36110
1,2017-05-31,2017-11-30,2018-02-28,13.627491,26.6125,0.032812,0.177485,0.070778,0.058999,0.324361,0.67503,1.603688,,0.007046,AIR,36110
2,2017-05-31,2017-11-30,2018-03-31,13.627491,27.56875,0.032812,0.177485,0.070778,0.058999,0.324361,0.67503,1.657007,,0.006801,AIR,36110
3,2017-05-31,2018-02-28,2018-04-30,12.982963,20.817308,0.041674,0.185011,0.077603,0.076687,0.324361,0.66429,1.614672,,0.006928,AIR,36110
4,2017-05-31,2018-02-28,2018-05-31,12.982963,21.485577,0.041674,0.185011,0.077603,0.076687,0.324361,0.66429,1.666505,,0.006713,AIR,36110
5,2017-05-31,2018-02-28,2018-06-30,12.982963,22.350962,0.041674,0.185011,0.077603,0.076687,0.324361,0.66429,1.733628,,0.006453,AIR,36110
6,2018-05-31,2018-05-31,2018-07-31,13.665281,22.469194,0.042155,0.191672,0.083531,0.07744,0.045337,0.628431,1.730067,,0.006328,AIR,36110
7,2018-05-31,2018-05-31,2018-08-31,13.665281,22.118483,0.042155,0.191672,0.083531,0.07744,0.045337,0.628431,1.703063,,0.006428,AIR,36110
8,2018-05-31,2018-05-31,2018-09-30,13.665281,22.696682,0.042155,0.191672,0.083531,0.07744,0.045337,0.628431,1.762724,,0.006264,AIR,36110
9,2018-05-31,2018-08-31,2018-10-31,13.105509,20.333333,0.044917,0.189685,0.086178,0.086684,0.045337,0.65957,1.773482,,0.006305,AIR,36110


In [48]:
#for each qdate and cusip, take the first qdate option 

metrics_adj = metrics.drop_duplicates(
  subset = ['qdate', 'cusip'],
  keep = 'first').reset_index(drop = True)
metrics_adj

Unnamed: 0,adate,qdate,public_date,evm,pe_exi,npm,gpm,roa,roe,efftax,de_ratio,ptb,PEG_trailing,divyield,TICKER,cusip
0,2017-05-31,2017-11-30,2018-01-31,13.627491,25.293750,0.032812,0.177485,0.070778,0.058999,0.324361,0.675030,1.524219,,0.007413,AIR,00036110
1,2017-05-31,2018-02-28,2018-04-30,12.982963,20.817308,0.041674,0.185011,0.077603,0.076687,0.324361,0.664290,1.614672,,0.006928,AIR,00036110
2,2018-05-31,2018-05-31,2018-07-31,13.665281,22.469194,0.042155,0.191672,0.083531,0.077440,0.045337,0.628431,1.730067,,0.006328,AIR,00036110
3,2018-05-31,2018-08-31,2018-10-31,13.105509,20.333333,0.044917,0.189685,0.086178,0.086684,0.045337,0.659570,1.773482,,0.006305,AIR,00036110
4,2018-05-31,2018-11-30,2019-01-31,12.598434,16.599119,0.042077,0.186303,0.090734,0.084956,0.045337,0.662693,1.397717,,0.007962,AIR,00036110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45741,2019-12-31,2019-12-31,2020-02-29,-2.110395,-1.498039,,,,,,0.744055,2.148020,,,ARMP,04216R10
45742,2019-12-31,2020-03-31,2020-05-31,-2.383779,-1.556485,,,-0.627996,-1.371752,,0.505825,1.897191,,,ARMP,04216R10
45743,2019-12-31,2020-06-30,2020-08-31,-2.946872,-1.714286,-688.032258,,-0.560890,-0.843502,,0.547478,1.848924,,,ARMP,04216R10
45744,2019-12-31,2020-09-30,2020-11-30,-3.583620,-2.156463,-63.144201,,-0.477488,-0.744982,,0.609360,2.140574,,,ARMP,04216R10


In [67]:
#metrics['qdate'] = pd.to_datetime(metrics['qdate'], format='%Y%m%d', errors='coerce') #change dates format 

data_with_metrics = pd.merge(Final, metrics_adj,  how='inner', left_on=['cusip','datadate'], right_on = ['cusip','qdate'])
data_with_metrics.sort_values(['TICKER','datadate']).reset_index()# This now sorts in date order
data_with_metrics.head(20)

Unnamed: 0.1,Unnamed: 0,gvkey,datadate,fyearq,fqtr,fyr,indfmt,consol,popsrc,datafmt,...,npm,gpm,roa,roe,efftax,de_ratio,ptb,PEG_trailing,divyield,TICKER
0,80,1045,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.025484,0.328241,0.129348,0.292062,0.377756,18.904739,5.148619,1.7223,0.009187,AAL
1,124,1050,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.008256,0.335112,0.048277,0.01321,3.149752,1.296207,0.975482,0.213359,,CECE
2,211,1072,2018-03-31,2017.0,4.0,3.0,INDL,C,D,STD,...,0.003142,0.245048,0.094114,0.002195,0.974469,0.191368,1.141916,,0.030085,AVX
3,251,1075,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.130818,0.398014,0.085254,0.059396,0.337072,2.3596,1.288686,3.634455,0.03492,PNW
4,339,1078,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.013218,0.6283,0.097544,0.011759,0.832447,1.301136,3.435576,,0.018203,ABT
5,433,1094,2018-03-31,2018.0,3.0,6.0,INDL,C,D,STD,...,-0.37909,0.235754,0.05064,-0.68632,0.344738,1.852784,0.609787,,0.098485,ACET
6,556,1104,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.030904,0.390004,0.101426,0.084574,0.375943,1.205887,1.510188,,0.019469,ACU
7,689,1121,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.001078,0.018175,0.062366,0.010233,,0.804912,1.264578,1.68923,0.019923,AE
8,733,1161,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.014156,0.37749,0.105835,0.164634,0.306452,5.300928,18.614116,,,AMD
9,835,1177,2018-03-31,2018.0,1.0,12.0,INDL,C,D,STD,...,0.057748,0.270891,0.107536,0.229092,0.363302,2.613357,3.513363,0.268924,0.011355,AET


In [78]:
a =data_with_metrics[data_with_metrics['TICKER'] == 'AAL']
sm_ols('Gross_Ret ~ npm + gpm ', data=data_with_metrics).fit().summary()
#evm: enterprise value multiple
#pe_exi: price/earnings excluding extraordianry income 
#npm: net profit margin
#gpm: gross profit margin
#roa: retrun on assets
#roe: return on equity
#de_ratio: debt/equity
#ptb: price/book
#PEG_trailing: PEG 12ttm 
#divyield: dividend yield 



0,1,2,3
Dep. Variable:,Gross_Ret,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.09079
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.913
Time:,02:23:19,Log-Likelihood:,-15707.0
No. Observations:,30364,AIC:,31420.0
Df Residuals:,30361,BIC:,31450.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0623,0.002,26.723,0.000,0.058,0.067
npm,-1.417e-07,1.18e-05,-0.012,0.990,-2.33e-05,2.3e-05
gpm,-3.881e-06,1.4e-05,-0.277,0.782,-3.13e-05,2.36e-05

0,1,2,3
Omnibus:,37203.42,Durbin-Watson:,1.565
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17772390.619
Skew:,6.166,Prob(JB):,0.0
Kurtosis:,120.879,Cond. No.,359.0


In [110]:
a = data_with_metrics[data_with_metrics['datadate'] >= '2018-03-31' & data_with_metrics['datadate'] <= '2019-03-31']

a['gpm'] = a['gpm'][a['gpm'].between(a['gpm'].quantile(.15), a['gpm'].quantile(.85))] # without outliers
a['npm'] = a['npm'][a['npm'].between(a['npm'].quantile(.15), a['npm'].quantile(.85))] # without outliers
a['roe'] = a['roe'][a['roe'].between(a['roe'].quantile(.15), a['roe'].quantile(.85))] # without outliers
a['roa'] = a['roa'][a['roa'].between(a['roa'].quantile(.15), a['roa'].quantile(.85))] # without outliers

#sns.scatterplot(x='gpm', y='Gross_Ret', data=a)
sns.scatterplot(x='roe', y='Gross_Ret', data=a)


TypeError: Cannot perform 'rand_' with a dtyped [object] array and scalar of type [bool]

In [106]:
sm_ols('Gross_Ret ~ npm + gpm', data=a).fit().summary()


  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,Gross_Ret,R-squared:,0.105
Model:,OLS,Adj. R-squared:,-0.119
Method:,Least Squares,F-statistic:,0.4695
Date:,"Mon, 12 Apr 2021",Prob (F-statistic):,0.642
Time:,02:39:42,Log-Likelihood:,9.6709
No. Observations:,11,AIC:,-13.34
Df Residuals:,8,BIC:,-12.15
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.3343,5.136,0.649,0.534,-8.510,15.179
npm,0.5187,1.908,0.272,0.793,-3.881,4.918
gpm,-8.3799,12.722,-0.659,0.529,-37.717,20.958

0,1,2,3
Omnibus:,4.834,Durbin-Watson:,1.798
Prob(Omnibus):,0.089,Jarque-Bera (JB):,1.44
Skew:,0.362,Prob(JB):,0.487
Kurtosis:,4.618,Cond. No.,418.0
