In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

We start off by loading and then merging the datasets we want to use:

In [None]:
# Read the three input files; date columns are parsed while loading.
# ',' is the default separator of pd.read_csv, so it is not spelled out.
dat1 = pd.read_csv('key.csv')
dat2 = pd.read_csv(
    'SP500_finratios.csv',
    parse_dates=['adate', 'qdate', 'public_date'],
)
dat3 = pd.read_csv('ratings2.csv', parse_dates=['datadate'])

In [None]:
# Rename the columns of the first data set so that they match the names
# used in the other data sets.
dat1.columns = [
    'gvkey', 'linktype', 'permno', 'permco', 'linkdt',
    'linkenddt', 'conm', 'tic', 'cusip',
]

In [None]:
# Keep only the first row per permno so the key side of pd.merge() is unique,
# then use permno as the index.
dat1 = dat1.drop_duplicates(subset='permno', keep='first').set_index('permno')

In [None]:
# datadate becomes public_date (moved to the last column of dat3), and both
# public_date columns are converted to datetime so that pd.merge() treats
# them as the same key.
dat3['public_date'] = pd.to_datetime(dat3.pop('datadate'))
dat2['public_date'] = pd.to_datetime(dat2['public_date'])

In [None]:
# Join the key table to the financial ratios on permno; validate raises if
# permno is not unique on the key side.
dat1and2 = dat1.merge(dat2, how='inner', on='permno', validate='one_to_many')

In [None]:
# Add the ratings; validate raises unless each key combination occurs at
# most once on both sides.
dat = dat1and2.merge(
    dat3,
    how='inner',
    on=['gvkey', 'public_date', 'conm', 'tic', 'cusip'],
    validate='one_to_one',
)

In [None]:
# Show the merged data frame as the cell output.
dat

In [None]:
# just a little indulgence
class color:
    """ANSI escape sequences for styling text printed to the console."""

    # foreground colours
    purple = '\033[95m'
    cyan = '\033[36m'
    blue = '\033[94m'
    green = '\033[92m'
    yellow = '\033[93m'
    red = '\033[91m'

    # text attributes
    bold = '\033[1m'
    underline = '\033[4m'

    # resets all styling
    end = '\033[0m'

Now that we have the data, let's have a look at it. First off, we're interested in the distribution of the ratings:

In [None]:
# Number of observations per value of splticrm.
dat.loc[:, 'splticrm'].value_counts()

Seeing as there are only four observations of rating D, and only two observations of rating CCC, our data set does not allow us to draw any conclusions for these ratings and we have to drop them from our sample.

In [None]:
# Drop the rows whose rating is CCC or D.
dat = dat[~dat['splticrm'].isin(['CCC', 'D'])]

Following that, we consider our numerical data:

In [None]:
# Only four rows of the .describe() table are printed; they are picked by
# position.
ind = [3, 1, 5, 7]

# The columns are printed in two batches so that none of them is hidden
# behind "...".
des = dat.loc[:, 'bm':'cash_lt'].describe()
print(des.iloc[ind], '\n')

des = dat.loc[:, 'invt_act':].describe()
print(des.iloc[ind])

The output suggests that several variables have extreme outliers - for instance, bm has a minimum of 0.001000 and a mean of 0.506463, but a maximum of 137.237000. Visualising the data with boxplots makes this very apparent. Since our models will work a lot better if the data are more or less normally distributed, we apply monotonic transformations to all variables where it makes sense.

In [None]:
# Boxplot of the untransformed bm column.
sns.boxplot(data=dat, x='bm')

In [None]:
# Boxplot of bm after a natural-log transformation.
logbm = np.log(dat['bm'])
sns.boxplot(x=logbm)

In [None]:
# Boxplot of the untransformed ps column.
sns.boxplot(data=dat, x='ps')

In [None]:
# Boxplot of ps after a natural-log transformation.
logps = np.log(dat['ps'])
sns.boxplot(x=logps)

In [None]:
# Boxplot of the untransformed pcf column.
sns.boxplot(data=dat, x='pcf')

In [None]:
# Boxplot of the untransformed dpr column.
sns.boxplot(data=dat, x='dpr')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdpr = safe_ln(dat.loc[:, 'dpr'])
sns.boxplot(x=logdpr)

In [None]:
# Boxplot of the untransformed npm column.
sns.boxplot(data=dat, x='npm')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
lognpm = safe_ln(dat.loc[:, 'npm'])
sns.boxplot(x=lognpm)

In [None]:
# Boxplot of the untransformed gpm column.
sns.boxplot(data=dat, x='gpm')

In [None]:
# Boxplot of gpm after an exponential transformation.
expgpm = np.exp(dat['gpm'])
sns.boxplot(x=expgpm)

In [None]:
# Boxplot of the untransformed cfm column.
sns.boxplot(data=dat, x='cfm')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcfm = safe_ln(dat.loc[:, 'cfm'])
sns.boxplot(x=logcfm)

In [None]:
# Boxplot of the untransformed roa column.
sns.boxplot(data=dat, x='roa')

In [None]:
# Boxplot of the untransformed roe column.
sns.boxplot(data=dat, x='roe')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logroe = safe_ln(dat.loc[:, 'roe'])
sns.boxplot(x=logroe)

In [None]:
# Boxplot of the untransformed roce column.
sns.boxplot(data=dat, x='roce')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logroce = safe_ln(dat.loc[:, 'roce'])
sns.boxplot(x=logroce)

In [None]:
# Boxplot of the untransformed efftax column.
sns.boxplot(data=dat, x='efftax')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logefftax = safe_ln(dat.loc[:, 'efftax'])
sns.boxplot(x=logefftax)

In [None]:
# Boxplot of the untransformed GProf column.
sns.boxplot(data=dat, x='GProf')

In [None]:
# Boxplot of the untransformed equity_invcap column.
sns.boxplot(data=dat, x='equity_invcap')

In [None]:
# Boxplot of equity_invcap after an exponential transformation.
expequity_invcap = np.exp(dat['equity_invcap'])
sns.boxplot(x=expequity_invcap)

In [None]:
# Boxplot of the untransformed debt_invcap column.
sns.boxplot(data=dat, x='debt_invcap')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdebt_invcap = safe_ln(dat.loc[:, 'debt_invcap'])
sns.boxplot(x=logdebt_invcap)

In [None]:
# Boxplot of the untransformed totdebt_invcap column.
sns.boxplot(data=dat, x='totdebt_invcap')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logtotdebt_invcap = safe_ln(dat.loc[:, 'totdebt_invcap'])
sns.boxplot(x=logtotdebt_invcap)

In [None]:
# Boxplot of the untransformed capital_ratio column.
sns.boxplot(data=dat, x='capital_ratio')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcapital_ratio = safe_ln(dat.loc[:, 'capital_ratio'])
sns.boxplot(x=logcapital_ratio)

In [None]:
# Boxplot of the untransformed int_debt column.
sns.boxplot(data=dat, x='int_debt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logint_debt = safe_ln(dat.loc[:, 'int_debt'])
sns.boxplot(x=logint_debt)

In [None]:
# Boxplot of the untransformed int_totdebt column.
sns.boxplot(data=dat, x='int_totdebt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logint_totdebt = safe_ln(dat.loc[:, 'int_totdebt'])
sns.boxplot(x=logint_totdebt)

In [None]:
# Boxplot of the untransformed cash_lt column.
sns.boxplot(data=dat, x='cash_lt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcash_lt = safe_ln(dat.loc[:, 'cash_lt'])
sns.boxplot(x=logcash_lt)

In [None]:
# Boxplot of the untransformed invt_act column.
sns.boxplot(data=dat, x='invt_act')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
loginvt_act = safe_ln(dat.loc[:, 'invt_act'])
sns.boxplot(x=loginvt_act)

In [None]:
# Boxplot of the untransformed debt_at column.
sns.boxplot(data=dat, x='debt_at')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdebt_at = safe_ln(dat.loc[:, 'debt_at'])
sns.boxplot(x=logdebt_at)

In [None]:
# Boxplot of the untransformed debt_ebitda column.
sns.boxplot(data=dat, x='debt_ebitda')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdebt_ebitda = safe_ln(dat.loc[:, 'debt_ebitda'])
sns.boxplot(x=logdebt_ebitda)

In [None]:
# Boxplot of the untransformed short_debt column.
sns.boxplot(data=dat, x='short_debt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logshort_debt = safe_ln(dat.loc[:, 'short_debt'])
sns.boxplot(x=logshort_debt)

In [None]:
# Boxplot of the untransformed curr_debt column.
sns.boxplot(data=dat, x='curr_debt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcurr_debt = safe_ln(dat.loc[:, 'curr_debt'])
sns.boxplot(x=logcurr_debt)

In [None]:
# Boxplot of the untransformed lt_debt column.
sns.boxplot(data=dat, x='lt_debt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
loglt_debt = safe_ln(dat.loc[:, 'lt_debt'])
sns.boxplot(x=loglt_debt)

In [None]:
# Boxplot of the untransformed ocf_lct column.
sns.boxplot(data=dat, x='ocf_lct')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logocf_lct = safe_ln(dat.loc[:, 'ocf_lct'])
sns.boxplot(x=logocf_lct)

In [None]:
# Boxplot of the untransformed cash_debt column.
sns.boxplot(data=dat, x='cash_debt')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcash_debt = safe_ln(dat.loc[:, 'cash_debt'])
sns.boxplot(x=logcash_debt)

In [None]:
# Boxplot of the untransformed fcf_ocf column.
sns.boxplot(data=dat, x='fcf_ocf')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logfcf_ocf = safe_ln(dat.loc[:, 'fcf_ocf'])
sns.boxplot(x=logfcf_ocf)

In [None]:
# Boxplot of the untransformed dltt_be column.
sns.boxplot(data=dat, x='dltt_be')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdltt_be = safe_ln(dat.loc[:, 'dltt_be'])
sns.boxplot(x=logdltt_be)

In [None]:
# Boxplot of the untransformed debt_assets column.
sns.boxplot(data=dat, x='debt_assets')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdebt_assets = safe_ln(dat.loc[:, 'debt_assets'])
sns.boxplot(x=logdebt_assets)

In [None]:
# Boxplot of the untransformed debt_capital column.
sns.boxplot(data=dat, x='debt_capital')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logdebt_capital = safe_ln(dat.loc[:, 'debt_capital'])
sns.boxplot(x=logdebt_capital)

In [None]:
# Boxplot of the untransformed de_ratio column.
sns.boxplot(data=dat, x='de_ratio')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logde_ratio = safe_ln(dat.loc[:, 'de_ratio'])
sns.boxplot(x=logde_ratio)

In [None]:
# Boxplot of the untransformed cash_ratio column.
sns.boxplot(data=dat, x='cash_ratio')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcash_ratio = safe_ln(dat.loc[:, 'cash_ratio'])
sns.boxplot(x=logcash_ratio)

In [None]:
# Boxplot of the untransformed quick_ratio column.
sns.boxplot(data=dat, x='quick_ratio')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logquick_ratio = safe_ln(dat.loc[:, 'quick_ratio'])
sns.boxplot(x=logquick_ratio)

In [None]:
# Boxplot of the untransformed curr_ratio column.
sns.boxplot(data=dat, x='curr_ratio')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logcurr_ratio = safe_ln(dat.loc[:, 'curr_ratio'])
sns.boxplot(x=logcurr_ratio)

In [None]:
# Boxplot of the untransformed at_turn column.
sns.boxplot(data=dat, x='at_turn')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logat_turn = safe_ln(dat.loc[:, 'at_turn'])
sns.boxplot(x=logat_turn)

In [None]:
# Boxplot of the untransformed ptb column.
sns.boxplot(data=dat, x='ptb')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logptb = safe_ln(dat.loc[:, 'ptb'])
sns.boxplot(x=logptb)

In [None]:
# Boxplot of the untransformed PEG_trailing column.
sns.boxplot(data=dat, x='PEG_trailing')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logPEG_trailing = safe_ln(dat.loc[:, 'PEG_trailing'])
sns.boxplot(x=logPEG_trailing)

In [None]:
# Boxplot of the untransformed DIVYIELD column.
sns.boxplot(data=dat, x='DIVYIELD')

In [None]:
def safe_ln(x):
    """Return ln(x) element-wise as a pandas Series.

    Zero and negative entries have no real logarithm and yield 0 instead;
    missing entries stay NaN.
    """
    values = pd.Series(x, dtype=float)
    # ln(1) == 0, so swapping in 1 produces the 0 fallback without warnings.
    logged = np.log(values.where(values > 0, 1.0))
    return logged.where(values.notna())


# The helper has to be called here: the plot reads the module-level result.
logDIVYIELD = safe_ln(dat.loc[:, 'DIVYIELD'])
sns.boxplot(x=logDIVYIELD)

After looking at the data we have, we take a look at the data we do not have:

In [None]:
# Per column: the number of missing values and their share of all rows.
col_Names = dat.columns.values
total_NAs = dat.isna().sum()
percentage_NAs = total_NAs / len(dat)

# Fixed field widths are used instead of tabs so the columns line up.
header = f"{'Column Names':<20} {'Total NAs':<14} NAs per observations"
print(color.bold + header + color.end)

# One table row per column of dat.
for name, n_missing, share in zip(col_Names, total_NAs, percentage_NAs):
    print(f"{name!s:<20} {n_missing:<14d} {share:.6f}")

As shown above, one third of the observations of PEG_trailing are missing. The large number of missing values does not mean we have to dispense with the variable, though. As shown in https://www.sciencedirect.com/science/article/pii/S0895435618308710, we can use multiple imputation even with large amounts of missing data.

However, multiple imputation is based on the assumption of **missing at random**, where "conditional on the observed data, the probability of missingness is independent of unobserved data" (same source). This is of course impossible to test, since we will never know what the unobserved data are. What makes the evaluation even harder is that we were not involved in the data collection and have no way of finding out why data are missing. However, as the data we have are quite extensive and all companies are bound by reporting standards - meaning they cannot change what data to publish on a whim - we believe the assumption is reasonable. By contrast, it is relatively easy to show that the data are not missing completely at random: simply plotting the relationship between, for instance, pcf and the missingness of PEG_trailing shows that there is a pattern.

(For further explanations on the differences between missing at random, missing not at random, and missing completely at random, see our paper.)

In [None]:
# Regress the missingness of PEG_trailing on pcf. x and y are passed by
# keyword because recent seaborn releases no longer accept them positionally,
# and the boolean flag is cast to 0/1 so the fit runs on numeric data.
sns.regplot(x=dat['pcf'], y=dat['PEG_trailing'].isna().astype(int))

We now fill in the missing data using multiple imputation: each feature with missing values is modelled as a function of the other features in a round-robin regression.

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer