In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

We start off by loading and then merging the datasets we want to use:

In [116]:
# Read the three input files: the key table (key.csv), the financial ratios
# (SP500_finratios.csv) and the ratings (ratings2.csv). The date columns of
# the latter two are parsed while reading.
dat1 = pd.read_csv("key.csv", sep=',')
dat2 = pd.read_csv(
    "SP500_finratios.csv",
    sep=',',
    parse_dates=['adate', 'qdate', 'public_date'],
)
dat3 = pd.read_csv(
    "ratings2.csv",
    sep=',',
    parse_dates=['datadate'],
)

In [117]:
# Give the key table the column names used by the other data sets so that the
# join keys line up. The assignment is positional: it relies on key.csv having
# exactly these nine columns, in this order.
dat1.columns = [
    "gvkey", "linktype", "permno", "permco",
    "linkdt", "linkenddt",
    "conm", "tic", "cusip",
]

In [118]:
# pd.merge() is later called with validate="one_to_many", which needs unique
# keys on the left side: index the key table by permno and keep only the first
# row for each permno.
dat1 = (
    dat1
    .set_index("permno")
    .loc[lambda keyed: ~keyed.index.duplicated(keep="first")]
)

In [120]:
# Align the date key of dat3 with dat2: move 'datadate' into a column named
# 'public_date' (it becomes the last column of dat3), then make sure both
# 'public_date' columns are datetimes so that pd.merge() treats them as one
# and the same key.
dat3['public_date'] = dat3.pop('datadate')
dat2['public_date'] = pd.to_datetime(dat2['public_date'])
dat3['public_date'] = pd.to_datetime(dat3['public_date'])

In [138]:
# Attach the key-table columns to every financial-ratio row with the same
# permno. The outer join keeps unmatched rows from both sides; validate raises
# if a permno occurs more than once in the key table.
dat1and2 = dat1.merge(dat2, how="outer", on="permno", validate="one_to_many")

In [140]:
# Add the columns of dat3, matched on company identifiers and date. Outer join
# again; validate raises if a key combination is repeated on either side.
dat = dat1and2.merge(
    dat3,
    how="outer",
    on=["gvkey", "public_date", "conm", "tic", "cusip"],
    validate="one_to_one",
)

In [141]:
# Show the merged frame; the notebook renders a truncated preview of the
# first and last rows.
dat

Unnamed: 0,permno,gvkey,linktype,permco,linkdt,linkenddt,conm,tic,cusip,adate,...,debt_capital,de_ratio,cash_ratio,quick_ratio,curr_ratio,at_turn,ptb,PEG_trailing,DIVYIELD,splticrm
0,21020.0,1045.0,LC,20010.0,19500101.0,19620130,AMERICAN AIRLINES GROUP INC,AAL,02376R102,2008-12-31,...,1.338,-9.366,0.428,0.603,0.664,0.816,,,,B-
1,21020.0,1045.0,LC,20010.0,19500101.0,19620130,AMERICAN AIRLINES GROUP INC,AAL,02376R102,2009-12-31,...,1.376,-8.291,0.629,0.787,0.859,0.787,,,,B-
2,21020.0,1045.0,LC,20010.0,19500101.0,19620130,AMERICAN AIRLINES GROUP INC,AAL,02376R102,2009-12-31,...,1.376,-8.291,0.629,0.787,0.859,0.787,,,,B-
3,21020.0,1045.0,LC,20010.0,19500101.0,19620130,AMERICAN AIRLINES GROUP INC,AAL,02376R102,2009-12-31,...,1.376,-8.291,0.629,0.787,0.859,0.787,,,,B-
4,21020.0,1045.0,LC,20010.0,19500101.0,19620130,AMERICAN AIRLINES GROUP INC,AAL,02376R102,2009-12-31,...,1.366,-8.617,0.551,0.712,0.780,0.799,,,,B-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41711,,316056.0,,,,,ALLEGION PLC,ALLE,G0176J109,NaT,...,,,,,,,,,,BBB-
41712,,316056.0,,,,,ALLEGION PLC,ALLE,G0176J109,NaT,...,,,,,,,,,,BBB-
41713,,316056.0,,,,,ALLEGION PLC,ALLE,G0176J109,NaT,...,,,,,,,,,,BBB-
41714,,316056.0,,,,,ALLEGION PLC,ALLE,G0176J109,NaT,...,,,,,,,,,,BBB-


In [136]:
# Number of missing values in each column of the merged frame.
dat.isnull().sum()

permno             4517
gvkey                 0
linktype           4517
permco             4517
linkdt             4517
linkenddt          4517
conm                  0
tic                   0
cusip                 0
adate              4535
qdate              4517
public_date           0
bm                 5462
ps                 4534
pcf                4564
dpr                6791
npm                4534
gpm                4534
cfm                4632
roa                4564
roe                5492
roce               4730
efftax             7142
GProf              4534
equity_invcap      4543
debt_invcap        4659
totdebt_invcap     4682
capital_ratio      4650
int_debt           9010
int_totdebt        8756
cash_lt            4569
invt_act           9994
debt_at            4673
debt_ebitda        4759
short_debt         6384
curr_debt          9697
lt_debt            4650
ocf_lct            9683
cash_debt          4812
fcf_ocf            5354
dltt_be            5514
debt_assets     

In [125]:
# Share of missing values per column: the mean of the boolean null mask is
# the null count divided by the number of rows.
dat.isnull().mean()

permno            0.000000
gvkey             0.000000
linktype          0.000000
permco            0.000000
linkdt            0.000000
linkenddt         0.000000
conm              0.000000
tic               0.000000
cusip             0.000000
adate             0.000435
qdate             0.000000
public_date       0.000000
bm                0.021502
ps                0.000402
pcf               0.000904
dpr               0.062563
npm               0.000402
gpm               0.000402
cfm               0.003282
roa               0.000904
roe               0.022440
roce              0.003651
efftax            0.070802
GProf             0.000402
equity_invcap     0.000636
debt_invcap       0.002210
totdebt_invcap    0.002780
capital_ratio     0.001976
int_debt          0.107609
int_totdebt       0.101949
cash_lt           0.000871
invt_act          0.143245
debt_at           0.002545
debt_ebitda       0.005426
short_debt        0.038382
curr_debt         0.134570
lt_debt           0.001976
o

In [135]:
# How often each value of splticrm occurs in dat3, most frequent first.
dat3.splticrm.value_counts()

BBB     5599
BBB+    5104
A-      4053
A       3451
BBB-    3175
A+      1962
BB+     1591
BB-      957
AA-      941
BB       888
AA       417
B+       361
AAA      296
AA+      199
B        135
B-       133
CCC+      50
D         25
CCC        2
Name: splticrm, dtype: int64

In [13]:
# Summary statistics for the columns from "bm" to the end of the frame.
des = dat.loc[:, "bm":].describe()

# Pick the rows to report by label instead of hard-coded positions, so that a
# missing or reordered statistic raises a KeyError rather than silently
# printing the wrong row. `ind` still holds integer row positions
# ([3, 1, 5, 7] for the default describe() layout).
stat_rows = ["min", "mean", "50%", "max"]
ind = [des.index.get_loc(row) for row in stat_rows]
print(des.iloc[ind])

              bm          ps         pcf        dpr        npm        gpm  \
min     0.001000    0.047000 -225.440000  -1.795000 -51.493000 -37.707000   
mean    0.521213    2.380448   11.931066   0.500626   0.084048   0.424128   
50%     0.407000    1.736000   10.846000   0.318000   0.090000   0.393000   
max   137.237000  145.774000  280.893000  80.554000  14.836000   1.169000   

            cfm       roa        roe      roce  ...     dltt_be  debt_assets  \
min  -47.694000 -0.595000 -34.647000 -3.560000  ...    0.000000      0.04300   
mean   0.154931  0.140678   0.178237  0.180012  ...    0.916859      0.62851   
50%    0.151000  0.135000   0.135000  0.153000  ...    0.504000      0.62900   
max   14.836000  1.010000  37.037000  6.615000  ...  290.238000      1.91900   

      debt_capital     de_ratio  cash_ratio  quick_ratio  curr_ratio  \
min       0.002000 -1228.100000    0.000000      0.09100    0.113000   
mean      0.509278     3.428223    0.719937      1.43636    1.838751 