# Cross-sectional Predictability

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import statsmodels.api as sm
from scipy import stats
from scipy.stats import zscore
# Render floats with a thousands separator and four decimal places
pd.set_option('display.float_format', '{:,.4f}'.format)

### Pull data from Dropbox

In [2]:
# Download the stock data and derive a monthly period key from the YYYYMMDD date column
df = pd.read_csv('https://www.dropbox.com/s/w2vb22u6kh7ysvn/stocks.csv?dl=1')
parsed_dates = pd.to_datetime(df['date'], format='%Y%m%d')
df['mdate'] = parsed_dates.dt.to_period('M')
df.head()

Unnamed: 0,ticker,date,ret,beta,mve,bm,mom12m,illiq,idiovol,mdate
0,GFGC,19880229,0.08,0.0519,8.7323,0.9222,-0.068,0.0,0.0337,1988-02
1,GFGC,19880331,-0.0763,0.0508,8.8093,0.9222,0.0708,0.0,0.0339,1988-03
2,GFGC,19880429,0.0306,0.047,8.7121,0.9222,0.1154,0.0,0.0336,1988-04
3,GFGC,19880531,0.0198,0.0363,8.7423,0.9222,0.0724,0.0,0.0332,1988-05
4,GFGC,19880630,-0.012,0.0243,8.7619,0.9222,0.1902,0.0,0.0334,1988-06


In [3]:
# Sample size, date coverage and the distribution of raw returns
n_obs = len(df)
first_month, last_month = df.mdate.min(), df.mdate.max()
print(f'Total number of obs:\t {n_obs: ,.0f}')
print(f'First month:\t\t {first_month}')
print(f'Last month:\t\t {last_month}')
df['ret'].describe().round(4)

Total number of obs:	  2,045,339
First month:		 1980-01
Last month:		 2022-12


count   2,045,339.0000
mean            0.0116
std             0.1894
min            -1.8708
25%            -0.0665
50%             0.0000
75%             0.0722
max            24.0000
Name: ret, dtype: float64

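Remove outliers: drop any observation that lies 5 or more standard deviations from the full-sample mean in any of the listed columns.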

In [4]:
# Keep only rows whose value in every listed column is strictly within
# 5 standard deviations of that column's full-sample mean.
cols_to_trim_on = ['ret', 'beta', 'mve', 'bm', 'mom12m', 'illiq','idiovol']
# `zscore` is called by name so this cell does not depend on the module-level
# name `stats`, which a later cell rebinds to a DataFrame (re-running this
# cell after that one would otherwise fail).
within_bounds = (np.abs(zscore(df[cols_to_trim_on])) < 5).all(axis=1)
# .copy() makes the trimmed frame independent of the original, so adding a
# column to it later does not trigger SettingWithCopyWarning.
df = df[within_bounds].copy()
df.ret.describe()

count   2,007,750.0000
mean            0.0069
std             0.1533
min            -0.9354
25%            -0.0657
50%             0.0000
75%             0.0712
max             0.9584
Name: ret, dtype: float64

## Cross-sectional Method #1: Sorting

In [5]:
# Sorting function
def cut_quintiles(x):
    """Assign each value of ``x`` to one of five equal-count bins.

    Parameters
    ----------
    x : pd.Series
        Values to sort (e.g. one month's cross-section of a characteristic).

    Returns
    -------
    pd.Series
        Indexed like ``x``. Holds the labels "Lo 20", "Qnt 2", "Qnt 3",
        "Qnt 4", "Hi 20" (lowest to highest quintile). If ``pd.qcut`` cannot
        form five distinct bins, an all-NaN Series is returned instead so the
        caller can drop those rows.
    """
    labels = ["Lo 20", "Qnt 2", "Qnt 3", "Qnt 4", "Hi 20"]
    try:
        return pd.qcut(x, 5, labels=labels)
    except ValueError:
        # qcut raises ValueError when the quantile edges are not unique
        # (too few distinct values). Only that failure is treated as
        # "unsortable"; any other error is allowed to surface.
        return pd.Series(np.nan, index=x.index)

In [6]:
# Characteristic to sort on
# CHAR = 'beta'
# CHAR = 'mve'
CHAR = 'bm'

# Sort stocks into quintiles of CHAR within each month.
# group_keys=False keeps the result on df's own row index; pandas >= 2.0
# would otherwise prepend `mdate` as an extra index level and the column
# assignment would no longer align with df's rows.
df["quintile"] = df.groupby("mdate", group_keys=False)[CHAR].apply(cut_quintiles)

In [7]:
# Sanity check: the five bins should hold roughly the same number of observations
df["quintile"].value_counts()

Lo 20    401756
Hi 20    401654
Qnt 2    401448
Qnt 4    401448
Qnt 3    401444
Name: quintile, dtype: int64

In [8]:
# Equal-weighted average return of each quintile portfolio, month by month.
# Rows that could not be assigned to a quintile are dropped first.
df = df.dropna(subset=["quintile"])
# When `quintile` is categorical (as produced by pd.qcut), observed=False pins
# the behaviour this notebook was written against: every label is kept for
# every month. Stating it explicitly avoids the FutureWarning on newer pandas
# and protects against the default changing.
rets = df.groupby(["mdate", "quintile"], observed=False)["ret"].mean()
rets.head(10).round(4)

mdate    quintile
1980-01  Lo 20       0.0789
         Qnt 2       0.0751
         Qnt 3       0.0791
         Qnt 4       0.0785
         Hi 20       0.1267
1980-02  Lo 20      -0.0157
         Qnt 2      -0.0254
         Qnt 3      -0.0363
         Qnt 4      -0.0332
         Hi 20      -0.0254
Name: ret, dtype: float64

In [9]:
# Pivot the innermost index level (the quintile label) into columns: one row per month
rets = rets.unstack(level=-1)
rets.head(2)

quintile,Lo 20,Qnt 2,Qnt 3,Qnt 4,Hi 20
mdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980-01,0.0789,0.0751,0.0791,0.0785,0.1267
1980-02,-0.0157,-0.0254,-0.0363,-0.0332,-0.0254


In [10]:
# Average each quintile portfolio's return over all months
mns = rets.mean(axis=0)
print(f'Sorting on {CHAR}:\nAverage monthly returns:\n{mns.round(4)}')

# Spread between the top and bottom quintile portfolios
hilo = mns.loc['Hi 20'] - mns.loc['Lo 20']
print(f'\nHi-Lo  average return is: {hilo: .2%} per month')

Sorting on bm:
Average monthly returns:
quintile
Lo 20   0.0012
Qnt 2   0.0070
Qnt 3   0.0094
Qnt 4   0.0099
Hi 20   0.0101
dtype: float64

Hi-Lo  average return is:  0.89% per month


## Cross-sectional Method #2: Regression

Example #1: Single characteristic

In [11]:
# Run a cross-sectional regression for each date
def _cross_section_params(month_df):
    """Regress one month's returns on CHAR plus an intercept and return the fitted parameters."""
    regressors = sm.add_constant(month_df[CHAR])
    return sm.OLS(month_df['ret'], regressors).fit().params

coefs = df.groupby('mdate').apply(_cross_section_params)
coefs

Unnamed: 0_level_0,const,bm
mdate,Unnamed: 1_level_1,Unnamed: 2_level_1
1980-01,0.0571,0.0248
1980-02,-0.0227,-0.0036
1980-03,-0.1464,-0.0054
1980-04,0.0714,-0.0120
1980-05,0.0779,-0.0030
...,...,...
2022-08,-0.0130,0.0045
2022-09,-0.1143,-0.0040
2022-10,0.0681,0.0115
2022-11,0.0237,-0.0144


In [12]:
# Regress the monthly CHAR coefficients on a constant: the intercept equals
# their time-series mean, and its t-statistic tests whether that mean is zero.
# (The original cell also evaluated `coefs.mean()` and discarded the result;
# that dead statement has been removed.)
T = len(coefs[CHAR])
tsreg = sm.OLS(coefs[CHAR], np.ones(T)).fit()

In [13]:
# Show the fit of the coefficient series regressed on a constant (`tsreg`)
print(tsreg.summary())

                            OLS Regression Results                            
Dep. Variable:                     bm   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                       nan
Date:                Thu, 23 Mar 2023   Prob (F-statistic):                nan
Time:                        17:05:08   Log-Likelihood:                 1321.0
No. Observations:                 516   AIC:                            -2640.
Df Residuals:                     515   BIC:                            -2636.
Df Model:                           0                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0040      0.001      4.798      0.0

In [14]:
# Plot coefficient on CHAR: monthly estimates and their 120-month rolling mean
dates = coefs.index.to_timestamp()
monthly = coefs[CHAR]
trace = go.Scatter(x=dates, y=monthly, mode="lines", name ='Monthly Coefficient')
trace_ma = go.Scatter(x=dates, y=monthly.rolling(120).mean(), mode="lines", name ='Rolling 10-yr Avg')
fig = go.Figure(data=[trace, trace_ma])
fig.update_xaxes(title='Date')
fig.update_yaxes(title=f'Coefficient on {CHAR}',tickformat=".4f")
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
fig.show()


Example #2: Multiple characteristics

In [15]:
CHARS = ['beta','mve', 'bm','mom12m']
# One cross-sectional OLS per month: returns on every characteristic in CHARS plus an intercept
monthly_groups = df.groupby('mdate')
coefs = monthly_groups.apply(
    lambda d: sm.OLS(d['ret'], sm.add_constant(d[CHARS])).fit().params
)

In [16]:
# Mean, t-statistic and p-value of each coefficient's monthly time series.
# NOTE: the name `stats` rebinds the `scipy.stats` module imported at the top
# of the notebook; it is kept because a later cell reads `stats.loc[...]`.
stats = pd.DataFrame(dtype=float,index=coefs.columns,columns=['mean','tstat','pval'])
# Size the constant regressor from the current `coefs` rather than from `T`,
# which an earlier cell computed from a different `coefs` frame.
ones = np.ones(len(coefs))
for coef in coefs.columns :
    # Regressing on a constant alone: the intercept is the series mean
    tsreg = sm.OLS(coefs[coef], ones).fit()

    stats.loc[coef,'mean'] = coefs[coef].mean()
    stats.loc[coef,'tstat']= tsreg.tvalues['const']
    stats.loc[coef,'pval'] = tsreg.pvalues['const']
stats.round(4)

Unnamed: 0,mean,tstat,pval
const,-0.0118,-2.57,0.0105
beta,-0.0028,-1.9697,0.0494
mve,0.0014,4.2606,0.0
bm,0.0046,6.7982,0.0
mom12m,0.0108,8.8704,0.0


In [17]:
# Plot a chosen characteristic's coefficients over time
# (CHAR is rebound here; cells that read CHAR afterwards see this value)
CHAR = 'mom12m'
dates = coefs.index.to_timestamp()
monthly = coefs[CHAR]
trace = go.Scatter(x=dates, y=monthly, mode="lines", name ='Monthly Coefficient')
trace_ma = go.Scatter(x=dates, y=monthly.rolling(120).mean(), mode="lines", name ='Rolling 10-yr Avg')
fig = go.Figure(data=[trace, trace_ma])
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Coefficient on '+CHAR,tickformat=".4f")
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
fig.show()

Interpreting the economic magnitude

In [18]:
# Scale the average coefficient on CHAR by the average (across months) of the
# cross-sectional standard deviation of CHAR
monthly_sd = df.groupby('mdate')[CHAR].std()
ts_avg_sd = monthly_sd.mean()
ts_avg_coef = stats.loc[CHAR, 'mean']
effect = ts_avg_coef * ts_avg_sd
print(f'A one-standard deviation change in {CHAR} results in: {effect: .2%} per month')

A one-standard deviation change in mom12m results in:  0.51% per month
