In [54]:
import pandas as pd
import numpy as np
import sqlite3
import statsmodels.formula.api as smf
from regtabletotext import prettify_result

In [55]:
crsp_db = sqlite3.connect('../data/crsp.db.sqlite')

crsp_data = pd.read_sql(
    sql=("SELECT permno, gvkey, date, ret_excess, mktcap, "
         "mktcap_lag, exchange FROM crsp_monthly"), 
    con = crsp_db,
    parse_dates=['date']
    ).dropna()

compustat_db = sqlite3.connect('../data/compustat.db.sqlite')

compustat_data = pd.read_sql(
    sql ="SELECT at, act, che, lct, "
    "dlc, txp, dp, ib, gvkey, fyr, datadate "
    "FROM compustat",
    con = compustat_db,
    parse_dates=['datadate']
    ).dropna()

In [56]:
size = (crsp_data
  .query("date.dt.month == 12")
  .assign(yr=lambda x: (x["date"]+pd.DateOffset(months=4)).dt.year)
  .get(["gvkey", "yr", "mktcap"])
  .rename(columns={"mktcap": "size"})
)

Now we can can calculate earnings and accurals for each timestep. Note that the paper only uses companies with fiscal year end in December because of annual portfolios

In [57]:
compustat_lag = (compustat_data
    .query("fyr == 12")
    .assign(accural_lag = lambda x: x.act + x.dlc + x.txp - x.che - x.lct)
    .assign(datadate = lambda x: x.datadate + pd.DateOffset(years=1))
    .assign(at_lag = lambda x: x['at'])
    .get(['gvkey', 'datadate', 'accural_lag', "at_lag"])
)
compustat_cleaned_data = (compustat_data
    .merge(compustat_lag, how = "inner", on=['gvkey', 'datadate'])
    .assign(yr = lambda x: x.datadate.dt.year)
    .assign(accurals = lambda x: x.act + x.dlc + x.txp - x.che - x.lct - x.accural_lag - x.dp)
    .dropna()
)
compustat_cleaned_data['at'] = compustat_cleaned_data['at'].apply(lambda x: np.nan if x <= 0 else x)
compustat_cleaned_data['act'] = compustat_cleaned_data['act'].apply(lambda x: np.nan if x <= 0 else x).dropna()

In [58]:
compustat_cleaned_data = (compustat_cleaned_data
    .assign(avg_total_assets = lambda x: (x['at'] + x.at_lag)/2.0)
    .assign(earnings = lambda x: x.ib/x.avg_total_assets)
    .assign(accurals = lambda x: x.accurals/x.avg_total_assets)
    .assign(cash_flows = lambda x: x.earnings - x.accurals)
    .assign(sorted_date = lambda x: x.datadate + pd.DateOffset(months=4))
    .dropna()
)

In [59]:
compustat_cleaned_data.describe()

Unnamed: 0,at,act,che,lct,dlc,txp,dp,ib,fyr,datadate,accural_lag,at_lag,yr,accurals,avg_total_assets,earnings,cash_flows,sorted_date
count,184691.0,184691.0,184691.0,184691.0,184691.0,184691.0,184691.0,184691.0,184691.0,184691,184691.0,184691.0,184691.0,184691.0,184691.0,184691.0,184691.0,184691
mean,3263.82424,897.131584,280.107712,715.60613,159.65334,30.812497,151.121827,130.672449,12.0,2000-01-19 09:37:30.099896576,92.88193,3073.996229,1999.052109,-0.05323,3168.910234,-0.636733,-0.583502,2000-05-18 15:49:13.332864128
min,0.001,0.001,-40.0,-0.002,-882.0,-66.0,-3.233,-44574.0,12.0,1961-12-31 00:00:00,-65997.265,0.0,1961.0,-1846.5,0.0005,-4206.0,-6401.714286,1962-04-30 00:00:00
25%,26.8025,12.8785,1.838,6.1455,0.134,0.0,0.766,-3.6675,12.0,1988-12-31 00:00:00,-0.838,22.9955,1988.0,-0.083959,25.726,-0.088489,-0.066112,1989-04-30 00:00:00
50%,162.912,68.941,13.3,32.477,2.472,0.024,5.565,1.814,12.0,2000-12-31 00:00:00,4.88,143.355,2000.0,-0.037903,155.136,0.026826,0.053915,2001-04-30 00:00:00
75%,1132.8395,341.052,82.178,202.2985,22.132,3.7445,44.972,34.759,12.0,2011-12-31 00:00:00,50.642,1029.627,2011.0,0.002904,1084.38925,0.068593,0.114071,2012-04-30 00:00:00
max,663580.829,264889.386,139649.0,227561.873,122114.135,17656.0,52892.0,76033.0,12.0,2023-12-31 00:00:00,76482.947,603595.692,2023.0,5245.571429,633588.2605,1230.0,1328.0,2024-04-30 00:00:00
std,15974.184073,4687.068776,1802.104665,4078.810591,1330.063199,215.806918,885.977856,1101.37012,0.0,,1310.956685,15288.795527,14.885178,16.291378,15588.147094,24.61328,28.14224,


In [60]:
compustat_cleaned_data[compustat_cleaned_data['earnings'] < -10000]

Unnamed: 0,at,act,che,lct,dlc,txp,dp,ib,gvkey,fyr,datadate,accural_lag,at_lag,yr,accurals,avg_total_assets,earnings,cash_flows,sorted_date


We can also assign portfolios

In [61]:
def assign_portfolio(data, sorting_variable, percentiles):
    """Assign portfolios to a bin according to a sorting variable."""
    
    breakpoints = (data
      .get(sorting_variable)
      .quantile(percentiles, interpolation="linear")
    )
    breakpoints.iloc[0] = -np.Inf
    breakpoints.iloc[breakpoints.size-1] = np.Inf
    
    assigned_portfolios = pd.cut(
      data[sorting_variable],
      bins=breakpoints,
      labels=pd.Series(range(1, breakpoints.size)),
      include_lowest=True,
      right=False
    )
    
    return assigned_portfolios
merged_data = (
    size
    .merge(compustat_cleaned_data, how = "inner", on = ["gvkey", "yr"])
)
merged_data = (merged_data
    .groupby('datadate')
    .apply(lambda x: x.assign(size_portfolios = assign_portfolio(merged_data, "size", [0, 0.5, 1]),
                              accrual_portfolios = assign_portfolio(merged_data,"accurals", np.linspace(0, 1, 11))), include_groups = True)
    .reset_index(drop=True)
)

  .apply(lambda x: x.assign(size_portfolios = assign_portfolio(merged_data, "size", [0, 0.5, 1]),


In [62]:
crsp_cleaned_data = (crsp_data
    .assign(sorted_date = lambda x: pd.to_datetime(
        x['date'].apply(lambda x: str(x.year) + "0430" if x.month >= 4 else str(x.year-1) + "0430")
        , format='%Y%m%d'))
)

In [63]:
merged_data = (crsp_cleaned_data
    .merge(merged_data, how = "inner", on = ['gvkey', 'sorted_date'])
    .dropna()
)

Sloan does adjustment based on the size factor. We can get this monthly from the ff3 database.

In [64]:
db = sqlite3.connect('../data/db.sqlite')

size_excess_return = pd.read_sql(
    sql = "SELECT smb,date from factors_ff5_monthly",
    con = db,
    parse_dates=['date']
)
size_excess_return = (size_excess_return
    .assign(sorted_date = lambda x: x['date'].apply(lambda x: pd.to_datetime(str(x.year) + "0430" if x.month >= 4 else str(x.year-1) + "0430", format='%Y%m%d')))
)
size_excess_return = size_excess_return.groupby('sorted_date').mean().reset_index().get(['sorted_date', 'smb'])

We adjust the returns for the size factor as Sloan does.

In [65]:
portfolios = (merged_data
    .get(["permno", "sorted_date", "accurals","earnings", "cash_flows", "size_portfolios", "accrual_portfolios", "ret_excess"])
    .merge(size_excess_return, how = "inner", on = ['sorted_date'])
    .assign(ret_excess = lambda x: x.apply(lambda x: ((x['ret_excess']-x["smb"]/12) if x["size_portfolios"] == 1 else x['ret_excess']), axis = 1))
)

In [66]:
portfolios.groupby(['size_portfolios', 'accrual_portfolios'])[['accurals','cash_flows','earnings']].mean()

  portfolios.groupby(['size_portfolios', 'accrual_portfolios'])[['accurals','cash_flows','earnings']].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,accurals,cash_flows,earnings
size_portfolios,accrual_portfolios,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,-0.249758,-0.035048,-0.284806
1,2,-0.113133,0.010938,-0.102195
1,3,-0.080107,0.012364,-0.067743
1,4,-0.059901,-0.005579,-0.06548
1,5,-0.044064,-0.002567,-0.046631
1,6,-0.030309,-0.016004,-0.046313
1,7,-0.016285,-0.024841,-0.041126
1,8,0.001937,-0.036337,-0.0344
1,9,0.033923,-0.068496,-0.034573
1,10,0.157927,-0.216056,-0.058129


We can see here that out adjusted returns based on size has done a good job in standardizing our values accross size portfolios.

In [67]:
portfolios.groupby(['accrual_portfolios'])[['accurals','cash_flows','earnings']].mean()

  portfolios.groupby(['accrual_portfolios'])[['accurals','cash_flows','earnings']].mean()


Unnamed: 0_level_0,accurals,cash_flows,earnings
accrual_portfolios,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.238651,0.005797,-0.232854
2,-0.11253,0.04849,-0.064039
3,-0.079992,0.051786,-0.028207
4,-0.059637,0.042823,-0.016814
5,-0.044076,0.043102,-0.000974
6,-0.030452,0.028857,-0.001595
7,-0.016552,0.012718,-0.003834
8,0.001671,-0.007655,-0.005985
9,0.033037,-0.045338,-0.012301
10,0.148252,-0.179076,-0.030824


In [68]:
portfolios.groupby(['accrual_portfolios'])[['accurals','cash_flows','earnings']].median()

  portfolios.groupby(['accrual_portfolios'])[['accurals','cash_flows','earnings']].median()


Unnamed: 0_level_0,accurals,cash_flows,earnings
accrual_portfolios,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.193194,0.116797,-0.091023
2,-0.110902,0.122303,0.01221
3,-0.079552,0.107213,0.02794
4,-0.059462,0.094867,0.035591
5,-0.044003,0.083323,0.039324
6,-0.03045,0.07111,0.04058
7,-0.016713,0.057494,0.040445
8,0.001271,0.042369,0.04425
9,0.031638,0.016658,0.049835
10,0.110753,-0.069365,0.052927


In [69]:
model = smf.ols("ret_excess ~  earnings", data=portfolios).fit()
prettify_result(model)

OLS Model:
ret_excess ~ earnings

Coefficients:
           Estimate  Std. Error  t-Statistic  p-Value
Intercept     0.009         0.0       52.737    0.000
earnings      0.001         0.0        3.169    0.002

Summary statistics:
- Number of observations: 1,204,107
- R-squared: 0.000, Adjusted R-squared: 0.000
- F-statistic: 10.043 on 1 and 1204105 DF, p-value: 0.002



In [70]:
model = smf.ols("ret_excess ~  accurals + cash_flows", data=portfolios).fit()
prettify_result(model)

OLS Model:
ret_excess ~ accurals + cash_flows

Coefficients:
            Estimate  Std. Error  t-Statistic  p-Value
Intercept      0.009       0.000       47.684      0.0
accurals      -0.011       0.002       -7.203      0.0
cash_flows     0.001       0.000        4.010      0.0

Summary statistics:
- Number of observations: 1,204,107
- R-squared: 0.000, Adjusted R-squared: 0.000
- F-statistic: 37.476 on 2 and 1204104 DF, p-value: 0.000



Much weaker effect than the paper.