In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

We start off by loading and then merging the datasets we want to use:

In [60]:
dat1 = pd.read_csv('key.csv', sep=',')
dat2 = pd.read_csv('SP500_finratios.csv', sep=',', parse_dates=['adate', 'qdate', 'public_date'])
dat3 = pd.read_csv('ratings2.csv', sep=',', parse_dates=['datadate'])

In [61]:
#the columns of the first data set are renamed to match the names of the other data sets
dat1.columns = ['gvkey','linktype','permno','permco','linkdt','linkenddt','conm','tic','cusip']

In [62]:
#the duplicates in the key data set are removed, so that pd.merge() will work
dat1 = dat1.set_index('permno')
dat1 = dat1[~dat1.index.duplicated(keep='first')]

In [63]:
#the variable datadate is renamed public_date and both are transformed to the same format, so
# that pd.merge()recognises them as one and the same
dat3['public_date'] = dat3['datadate']
del dat3['datadate']
dat2['public_date'] = pd.to_datetime(dat2.public_date)
dat3['public_date'] = pd.to_datetime(dat3.public_date)

In [64]:
dat1and2 = pd.merge(dat1, dat2, on='permno', how='inner', validate='one_to_many')

KeyError: 'permno'

In [None]:
dat = pd.merge(dat1and2, dat3, on=['gvkey', 'public_date', 'conm', 'tic', 'cusip'], how='inner', validate='one_to_one')

In [None]:
dat

In [None]:
#just a little indulgence
class color:
   purple = '\033[95m'
   cyan = '\033[36m'
   blue = '\033[94m'
   green = '\033[92m'
   yellow = '\033[93m'
   red = '\033[91m'
   bold = '\033[1m'
   underline = '\033[4m'
   end = '\033[0m'

Now that we have the data, let's have a look at it. First of, we're interested in the distribution of the ratings:

In [None]:
dat['splticrm'].value_counts()

Seeing as there are only four observations of rating D, and only two observations of rating CCC, our data set does not allow us to draw any conclusions for these ratings and we have to drop them from our sample.

In [None]:
dat = dat[dat['splticrm'] != 'CCC']
dat = dat[dat['splticrm'] != 'D']

Following that, we consider our numerical data:

In [None]:
des = dat.loc[:, 'bm':'cash_lt'].describe()
ind = [3, 1, 5, 7]   #printing the entire .describe() information consumes unnecessarily much computation power, so I index the lines I'm interested in
print(des.iloc[ind], '\n')
des = dat.loc[:, 'invt_act':].describe()  #I do this in two steps, because I don't want any variables hidden behind "..."
ind = [3, 1, 5, 7]
print(des.iloc[ind])

The output suggests that several variables have extreme outliers - for instance bm has a minimum of 0.001000, a mean of  0.506463, but a maximum of 137.237000. Visualising the data with boxplots shows this quite notably. Since our models will work a lot better if the data is more or less normally distributed, we use monotonic transformations on all variables where it makes sense.

In [None]:
sns.boxplot(x = 'bm', data = dat)

In [None]:
logbm = np.log(dat.loc[:, 'bm'])
sns.boxplot(x = logbm)

In [None]:
sns.boxplot(x = 'ps', data = dat)

In [None]:
logps = np.log(dat.loc[:, 'ps'])
sns.boxplot(x = logps)

In [None]:
sns.boxplot(x = 'pcf', data = dat)

In [None]:
sns.boxplot(x = 'dpr', data = dat)

In [None]:
def safe_ln(x):
    logdpr = np.log(dat.loc[:, 'dpr'])
    logdpr = 0
    return logdpr
sns.boxplot(x = logdpr)

In [None]:
sns.boxplot(x = 'npm', data = dat)

In [None]:
def safe_ln(x):
    lognpm = np.log(dat.loc[:, 'npm'])
    lognpm = 0
    return lognpm
sns.boxplot(x = lognpm)

In [None]:
sns.boxplot(x = 'gpm', data = dat)

In [None]:
expgpm = np.exp(dat.loc[:, 'gpm'])
sns.boxplot(x = expgpm)

In [None]:
sns.boxplot(x = 'cfm', data = dat)

In [None]:
def safe_ln(x):
    logcfm = np.log(dat.loc[:, 'cfm'])
    logcfm = 0
    return logcfm
sns.boxplot(x = logcfm)

In [None]:
sns.boxplot(x = 'roa', data = dat)

In [None]:
sns.boxplot(x = 'roe', data = dat)

In [None]:
def safe_ln(x):
    logroe = np.log(dat.loc[:, 'roe'])
    logroe = 0
    return logroe
sns.boxplot(x = logroe)

In [None]:
sns.boxplot(x = 'roce', data = dat)

In [None]:
def safe_ln(x):
    logroce = np.log(dat.loc[:, 'roce'])
    logroce = 0
    return logroe
sns.boxplot(x = logroce)

In [65]:
sns.boxplot(x = 'efftax', data = dat)

NameError: name 'dat' is not defined

In [66]:
def safe_ln(x):
    logefftax = np.log(dat.loc[:, 'efftax'])
    logefftax = 0
    return logefftax
sns.boxplot(x = logefftax)

NameError: name 'logefftax' is not defined

In [67]:
sns.boxplot(x = 'GProf', data = dat)

NameError: name 'dat' is not defined

In [68]:
sns.boxplot(x = 'equity_invcap', data = dat)

NameError: name 'dat' is not defined

In [69]:
expequity_invcap = np.exp(dat.loc[:, 'equity_invcap'])
sns.boxplot(x = expequity_invcap)

NameError: name 'dat' is not defined

In [70]:
sns.boxplot(x = 'debt_invcap', data = dat)

NameError: name 'dat' is not defined

In [71]:
def safe_ln(x):
    logdebt_invcap = np.log(dat.loc[:, 'debt_invcap'])
    logdebt_invcap = 0
    return logdebt_invcap
sns.boxplot(x = logdebt_invcap)

NameError: name 'logdebt_invcap' is not defined

In [72]:
sns.boxplot(x = 'totdebt_invcap', data = dat)

NameError: name 'dat' is not defined

In [73]:
def safe_ln(x):
    logtotdebt_invcap = np.log(dat.loc[:, 'totdebt_invcap'])
    logtotdebt_invcap = 0
    return logtotdebt_invcap
sns.boxplot(x = logtotdebt_invcap)

NameError: name 'logtotdebt_invcap' is not defined

In [74]:
sns.boxplot(x = 'capital_ratio', data = dat)

NameError: name 'dat' is not defined

In [75]:
def safe_ln(x):
    logcapital_ratio = np.log(dat.loc[:, 'capital_ratio'])
    logcapital_ratio = 0
    return logcapital_ratio
sns.boxplot(x = logcapital_ratio)

NameError: name 'logcapital_ratio' is not defined

In [76]:
sns.boxplot(x = 'int_debt', data = dat)

NameError: name 'dat' is not defined

In [77]:
def safe_ln(x):
    logint_debt = np.log(dat.loc[:, 'int_debt'])
    logint_debt = 0
    return logint_debt
sns.boxplot(x = logint_debt)

NameError: name 'logint_debt' is not defined

In [78]:
sns.boxplot(x = 'int_totdebt', data = dat)

NameError: name 'dat' is not defined

In [79]:
def safe_ln(x):
    logint_totdebt = np.log(dat.loc[:, 'int_totdebt'])
    logint_totdebt = 0
    return logint_totdebt
sns.boxplot(x = logint_totdebt)

NameError: name 'logint_totdebt' is not defined

In [80]:
sns.boxplot(x = 'cash_lt', data = dat)

NameError: name 'dat' is not defined

In [81]:
def safe_ln(x):
    logcash_lt = np.log(dat.loc[:, 'cash_lt'])
    logcash_lt = 0
    return logcash_lt
sns.boxplot(x =  logcash_lt)

NameError: name 'logcash_lt' is not defined

In [53]:
sns.boxplot(x = 'invt_act', data = dat)

NameError: name 'dat' is not defined

In [54]:
def safe_ln(x):
    loginvt_act = np.log(dat.loc[:, 'invt_act'])
    loginvt_act = 0
    return loginvt_act
sns.boxplot(x = loginvt_act)

NameError: name 'loginvt_act' is not defined

In [55]:
sns.boxplot(x = 'debt_at', data = dat)

NameError: name 'dat' is not defined

After looking at the data we have, we take a look at the data we do not have:

In [56]:
col_Names = dat.columns.values
total_NAs = pd.isna(dat).sum()
percentage_NAs = dat.isna().sum()/len(dat)
print(color.bold + "%-20s %-14s %s" %("Column Names", "Total NAs", "NAs per observations") + color.end )
#I used the % operator because tab didn't work and this allows me to define the spaces between the items

#the loop prints one line after another
for item_a, item_b, item_c in zip(col_Names, total_NAs, percentage_NAs):
    print("%-20s %-14d %.6f" %(item_a, item_b, item_c))

NameError: name 'dat' is not defined

As shown above, one third of the observations of PEG_trailing is missing. The large amount of missing values does not mean we have to dispense with the variable, though. As shown in https://www.sciencedirect.com/science/article/pii/S0895435618308710 we can use multiple imputation even with large amounts of missing data.

However, multiple imputation is based on the assumption of **missing at random**, where "conditional on the observed data, the probability of missingness is independent of unobserved data". (Same source.) This is of course impossible to test, since we will never know what the unobserved data is. What makes the evaluation even harder is that we were not involved in the data collection, and have no way of finding out why data is missing. However, as the data we have is quite extensive and all companies are bound to reporting standards - meaning they cannot change what data to publish on a whim - we believe the assumption is reasonable. By contrast, it is relatively easy to show that the data is not missing completely at random: Simply plotting the relationship between for instance pcf and the missingness of PEG_trailing shows that there exists a pattern.

(For further explanations on the differences between missing at random, missing not at random, and missing completely at random, see our paper.)

In [57]:
sns.regplot(dat.pcf, dat.PEG_trailing.isna())

NameError: name 'dat' is not defined

So now, we will go about filling in the missing data by using multiple imputation, the imputation method being a modeling of each feature with missing values as a function of other features in a round-robin regression.

In [58]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

ModuleNotFoundError: No module named 'sklearn.experimental'