# Market Model Regression

In [1]:
import pandas as pd
import numpy as np
from pandas_datareader import DataReader as pdr
import statsmodels.api as sm
import plotly.graph_objects as go

# Download the Fama-French 48 industry portfolio returns (table 0 of the dataset)
ff48 = pdr("48_Industry_Portfolios", "famafrench", start=1900)[0]

# Missing observations are coded as -99.99: blank them out as NaN, then
# rescale every value by 1/100 (presumably percent -> decimal returns).
ff48 = ff48.mask(ff48.eq(-99.99)) / 100

# Pull the factor table, rescale it the same way, and attach the market excess
# return ('Mkt-RF') and the risk-free rate ('RF') to the industry returns.
ff3 = pdr('F-F_Research_Data_Factors', 'famafrench', start=1900)[0] / 100
df = ff48.join(ff3.loc[:, ['Mkt-RF', 'RF']])

# Keep 1970-01 onward only: there is missing data prior to 1970
df = df.loc['1970-01':].copy()

In [2]:
# Show the column labels of the merged frame. Several industry labels are padded
# with trailing spaces (e.g. 'Util '), so later lookups must use the padded spelling.
df.columns

Index(['Agric', 'Food ', 'Soda ', 'Beer ', 'Smoke', 'Toys ', 'Fun  ', 'Books',
       'Hshld', 'Clths', 'Hlth ', 'MedEq', 'Drugs', 'Chems', 'Rubbr', 'Txtls',
       'BldMt', 'Cnstr', 'Steel', 'FabPr', 'Mach ', 'ElcEq', 'Autos', 'Aero ',
       'Ships', 'Guns ', 'Gold ', 'Mines', 'Coal ', 'Oil  ', 'Util ', 'Telcm',
       'PerSv', 'BusSv', 'Comps', 'Chips', 'LabEq', 'Paper', 'Boxes', 'Trans',
       'Whlsl', 'Rtail', 'Meals', 'Banks', 'Insur', 'RlEst', 'Fin  ', 'Other',
       'Mkt-RF', 'RF'],
      dtype='object')

In [3]:
## Market model - Utilities
# Regress the industry's return in excess of RF on a constant and the market
# excess return; rows with missing values are dropped by the estimator.
varname = 'Util '  # label carries the trailing space used in df.columns
X = sm.add_constant(df['Mkt-RF'])
y = df[varname].sub(df['RF'])
mm = sm.OLS(endog=y, exog=X, missing='drop').fit()
print(mm.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.331
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     313.6
Date:                Mon, 20 Mar 2023   Prob (F-statistic):           2.46e-57
Time:                        21:04:26   Log-Likelihood:                 1254.6
No. Observations:                 637   AIC:                            -2505.
Df Residuals:                     635   BIC:                            -2496.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0025      0.001      1.856      0.0

In [4]:
## Market model - Autos
# Same specification as the utilities regression: excess industry return on a
# constant and the market excess return, dropping rows with missing values.
varname = 'Autos'
X = sm.add_constant(df['Mkt-RF'])
y = df[varname].sub(df['RF'])
mm = sm.OLS(endog=y, exog=X, missing='drop').fit()
print(mm.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.520
Model:                            OLS   Adj. R-squared:                  0.520
Method:                 Least Squares   F-statistic:                     689.2
Date:                Mon, 20 Mar 2023   Prob (F-statistic):          1.99e-103
Time:                        21:04:27   Log-Likelihood:                 940.08
No. Observations:                 637   AIC:                            -1876.
Df Residuals:                     635   BIC:                            -1867.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0005      0.002     -0.214      0.8

Let's define a function that estimates the alpha and beta for a given column, and then apply it to every industry.

In [5]:
def params(varname):
    """Estimate the market-model regression for one column of ``df``.

    The column's return in excess of ``df['RF']`` is regressed by OLS on a
    constant and ``df['Mkt-RF']``; rows with missing values are dropped.

    Parameters
    ----------
    varname : str
        Label of the return column in the module-level frame ``df``.

    Returns
    -------
    numpy.ndarray
        The fitted coefficients: the constant (alpha) followed by the
        ``Mkt-RF`` slope (beta).
    """
    excess_return = df[varname].sub(df['RF'])
    regressors = sm.add_constant(df['Mkt-RF'])
    fitted = sm.OLS(endog=excess_return, exog=regressors, missing='drop').fit()
    return fitted.params.to_numpy()
# Run the market-model regression for every industry and stack the results
# into one (alpha, beta) row per industry, labelled by the industry name.
df_beta = pd.DataFrame(
    np.vstack([params(industry) for industry in ff48.columns]),
    index=ff48.columns,
    columns=['alpha', 'beta'],
)
df_beta.head()

Unnamed: 0,alpha,beta
Agric,0.001589,0.853298
Food,0.003277,0.649457
Soda,0.002838,0.794496
Beer,0.00328,0.719799
Smoke,0.006392,0.6352


## Market Model and the Covariance Matrix

We need estimates of the betas, the individual industry volatilities, and the market return volatility.

In [6]:
# Inputs for the market-model covariance matrix
asset_list = df.columns[:-2]                   # every column except the trailing 'Mkt-RF' and 'RF'
n = len(asset_list)
betas = df_beta['beta'].to_numpy()             # market-model slope of each industry
sds = df.loc[:, asset_list].std().to_numpy()   # sample standard deviation of each industry return
mkt_std = df['Mkt-RF'].std()                   # sample standard deviation of the market excess return

Calculate the covariance matrix implied by the market model:

1. Variances $\sigma_j^2$ along the diagonal
2. Covariances $\beta_j \beta_k \sigma_{\text{mkt}}^2$ off the diagonal

In [7]:
# Market-model covariance matrix:
#   off-diagonal (j, k): beta_j * beta_k * sigma_mkt^2
#   diagonal (j, j):     sample variance of asset j (sds[j]**2)
# The outer product fills every (j, k) entry in one vectorized step, replacing
# the explicit double loop; the diagonal is then overwritten with the sample
# variances, which is what diag(sds) @ I @ diag(sds) used to supply.
cov = np.outer(betas, betas) * mkt_std**2
np.fill_diagonal(cov, sds**2)
cov

array([[0.0041656 , 0.00118615, 0.00145104, ..., 0.0022559 , 0.00223918,
        0.0020592 ],
       [0.00118615, 0.00201243, 0.00110441, ..., 0.001717  , 0.00170428,
        0.00156729],
       [0.00145104, 0.00110441, 0.00411775, ..., 0.00210045, 0.00208488,
        0.0019173 ],
       ...,
       [0.0022559 , 0.001717  , 0.00210045, ..., 0.0059591 , 0.00324131,
        0.00298078],
       [0.00223918, 0.00170428, 0.00208488, ..., 0.00324131, 0.00402793,
        0.00295869],
       [0.0020592 , 0.00156729, 0.0019173 , ..., 0.00298078, 0.00295869,
        0.00459477]])

In [8]:
# Label rows and columns of the covariance matrix with the industry names
df_cov = pd.DataFrame(data=cov, index=ff48.columns, columns=ff48.columns)

# Preview the top-left 10 x 10 corner only
df_cov.head(10).iloc[:, :10]

Unnamed: 0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths
Agric,0.004166,0.001186,0.001451,0.001315,0.00116,0.002156,0.002455,0.001986,0.001405,0.002038
Food,0.001186,0.002012,0.001104,0.001001,0.000883,0.001641,0.001868,0.001512,0.00107,0.001551
Soda,0.001451,0.001104,0.004118,0.001224,0.00108,0.002007,0.002286,0.001849,0.001308,0.001898
Beer,0.001315,0.001001,0.001224,0.0027,0.000979,0.001819,0.002071,0.001676,0.001185,0.001719
Smoke,0.00116,0.000883,0.00108,0.000979,0.003884,0.001605,0.001827,0.001479,0.001046,0.001517
Toys,0.002156,0.001641,0.002007,0.001819,0.001605,0.005488,0.003396,0.002748,0.001944,0.002819
Fun,0.002455,0.001868,0.002286,0.002071,0.001827,0.003396,0.006241,0.003129,0.002213,0.00321
Books,0.001986,0.001512,0.001849,0.001676,0.001479,0.002748,0.003129,0.003752,0.001791,0.002597
Hshld,0.001405,0.00107,0.001308,0.001185,0.001046,0.001944,0.002213,0.001791,0.002198,0.001838
Clths,0.002038,0.001551,0.001898,0.001719,0.001517,0.002819,0.00321,0.002597,0.001838,0.004565


## Let's split the sample in half and estimate betas (and alphas) on each half

In [9]:
# Market-model estimator that takes the data frame as an explicit argument
def params(frame, varname):
    """Estimate the market-model regression for one column of ``frame``.

    This definition replaces the earlier one-argument ``params(varname)``,
    which read the module-level ``df``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing ``varname`` together with the ``'RF'`` and
        ``'Mkt-RF'`` columns.
    varname : str
        Label of the return column to use as the dependent variable
        (after subtracting ``frame['RF']``).

    Returns
    -------
    numpy.ndarray
        The fitted OLS coefficients: the constant (alpha) followed by the
        ``Mkt-RF`` slope (beta). Rows with missing values are dropped
        before fitting.
    """
    excess_return = frame[varname].sub(frame['RF'])
    regressors = sm.add_constant(frame['Mkt-RF'])
    fitted = sm.OLS(endog=excess_return, exog=regressors, missing='drop').fit()
    return fitted.params.to_numpy()

# Locate the index label in the middle of the sample
T = df.shape[0]
halfway = df.index[T // 2]

# First half runs through the midpoint label (inclusive); the second half
# starts one step after it (halfway + 1).
df_first_half = df.loc[:halfway]
df_second_half = df.loc[halfway + 1:]

# One row per industry: (alpha, beta) estimated on the first half, followed by
# (alpha, beta) estimated on the second half.
df_beta = pd.DataFrame(
    np.vstack([
        np.concatenate((params(df_first_half, industry), params(df_second_half, industry)))
        for industry in ff48.columns
    ]),
    index=ff48.columns,
    columns=['alpha_0', 'beta_0', 'alpha_1', 'beta_1'],
)

In [10]:
# Inspect the last rows of the first half to see where the split falls
df_first_half.tail()

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other,Mkt-RF,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-03,-0.014,-0.0422,0.0744,0.019,-0.0768,-0.0047,0.0177,0.0092,0.0156,0.1149,...,0.0269,0.0825,0.0092,0.0314,-0.0252,0.0666,0.0233,0.0666,0.0073,0.0039
1996-04,0.0595,-0.0059,-0.0105,-0.0089,-0.016,0.0084,0.0305,0.028,0.0389,0.0541,...,0.0504,0.0436,0.026,-0.0131,-0.0128,0.0233,0.0054,0.0512,0.0206,0.0046
1996-05,-0.0188,0.0587,0.0494,0.0969,0.0548,0.041,0.0223,0.0323,0.0324,0.0805,...,0.0412,0.0511,0.0034,0.0202,0.0014,0.0412,0.0266,0.0536,0.0236,0.0042
1996-06,-0.0048,0.0246,0.0404,0.0654,0.0368,-0.0174,-0.0297,0.0046,0.0136,0.0012,...,-0.0405,-0.0108,-0.0053,-0.0007,0.015,0.0018,0.0112,-0.0281,-0.0114,0.004
1996-07,-0.0496,-0.0229,-0.0219,-0.0578,0.002,-0.1067,-0.1231,-0.0665,-0.0332,-0.0433,...,-0.0785,-0.0762,-0.0744,-0.0046,-0.0515,-0.0907,-0.0387,-0.1063,-0.0597,0.0045


In [11]:
# Inspect the first rows of the second half; they should begin right after the first half ends
df_second_half.head()

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other,Mkt-RF,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-08,0.031,-0.0107,0.094,0.0214,-0.1318,0.0187,-0.0035,0.0362,0.0169,0.0577,...,0.038,0.0689,0.0358,0.0444,0.0393,0.0758,0.034,0.0887,0.0277,0.0041
1996-09,0.0421,0.0906,0.0892,0.0073,0.0141,0.0703,0.0766,0.0279,0.0634,0.0715,...,0.0489,0.0329,0.0049,0.0644,0.0502,0.0282,0.0503,0.0512,0.0501,0.0044
1996-10,0.0585,0.0258,-0.0241,0.0078,0.0331,-0.0044,-0.0223,0.0121,0.0105,-0.0025,...,0.0012,-0.0196,-0.0365,0.072,0.0364,-0.0218,0.0289,0.0189,0.0086,0.0042
1996-11,0.0118,0.0621,0.0303,0.0274,0.1074,0.065,0.0909,0.054,0.054,-0.0049,...,0.0477,0.021,0.0391,0.0936,0.0721,-0.0031,0.0802,0.0369,0.0625,0.0041
1996-12,0.0038,-0.0265,0.0463,0.0052,0.093,-0.0785,-0.0575,-0.0168,0.0039,0.0265,...,-0.0016,-0.0431,-0.0298,-0.0345,0.0036,-0.0014,0.02,-0.0443,-0.017,0.0046


### How persistent is beta?

In [12]:
# Beta regression: second-half beta on a constant and the first-half beta,
# with one observation per industry
X = sm.add_constant(df_beta['beta_0'])
y = df_beta['beta_1']
results = sm.OLS(endog=y, exog=X, missing='drop').fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 beta_1   R-squared:                       0.245
Model:                            OLS   Adj. R-squared:                  0.229
Method:                 Least Squares   F-statistic:                     14.95
Date:                Mon, 20 Mar 2023   Prob (F-statistic):           0.000345
Time:                        21:04:28   Log-Likelihood:                -3.3085
No. Observations:                  48   AIC:                             10.62
Df Residuals:                      46   BIC:                             14.36
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0992      0.232      0.428      0.6

In [13]:
# Scatter plot: first-half beta (x) against second-half beta (y), one marker per industry

# The markers are betas, so the legend entry says so (it previously read 'Alphas')
trace = go.Scatter(x=df_beta['beta_0'], y=df_beta['beta_1'], mode="markers", name='Betas')
minval = np.min([df_beta['beta_1'].min(), df_beta['beta_0'].min()])
maxval = np.max([df_beta['beta_1'].max(), df_beta['beta_0'].max()])

# Shared x-grid for the two reference lines
grid = np.linspace(minval, maxval, 20)
trace_45 = go.Scatter(x=grid, y=grid, mode='lines', name='45-degree line')

# Fitted line from the beta regression held in `results`. The coefficients are
# read by position with .iloc, because plain [0]/[1] on a label-indexed Series
# is deprecated positional indexing in pandas.
ols_fit = results.params.iloc[0] + results.params.iloc[1]*grid
trace_ols = go.Scatter(x=grid, y=ols_fit, mode='lines', name='Predicted')

# Pad the axes by 10% of the data span on each side. Unlike scaling the
# endpoints by 0.9/1.1, this never clips points when minval is negative.
pad = 0.1*(maxval - minval)

fig = go.Figure()
fig.add_trace(trace)
fig.add_trace(trace_45)
fig.add_trace(trace_ols)
fig.update_xaxes(title='1st Half Beta', tickformat=".2f", range=[minval - pad, maxval + pad])
fig.update_yaxes(title='2nd Half Beta', tickformat=".2f", range=[minval - pad, maxval + pad])
fig.update_layout(title='Beta Persistence')
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
fig.show()

Let's compare the predictive power (in a mean-squared error sense) of two forecasts of the second-half beta:

- the first-half beta
- the first-half beta 'shrunk' towards 1, i.e. $0.67\,\beta_0 + 0.33 \times 1$

(Note: better predictors of beta will have a *lower* MSE)

In [14]:
# Shrunk beta: weight 0.67 on the first-half estimate and 0.33 on a beta of 1
df_beta['beta_adj'] = df_beta['beta_0'].mul(0.67).add(0.33)

In [15]:
# Forecast the second-half beta with the unadjusted first-half estimate
# and average the squared forecast errors across industries
mse_firsthalf = df_beta['beta_1'].sub(df_beta['beta_0']).pow(2).mean()
print(f'Mean-squared Error of Using Past Beta:\t {mse_firsthalf: .4f}')

Mean-squared Error of Using Past Beta:	  0.0799


In [16]:
# Forecast the second-half beta with the shrunk first-half estimate ('beta_adj')
# and average the squared forecast errors across industries
mse_shrink = df_beta['beta_1'].sub(df_beta['beta_adj']).pow(2).mean()
print(f'Mean-squared Error of Adjusted Beta:\t {mse_shrink: .4f}')

Mean-squared Error of Adjusted Beta:	  0.0739


In [17]:
# Use fitted regression (remember that we knew the outcomes in order to fit this!)
# `results` must still hold the beta-persistence regression here; the same name
# is reassigned later by the alpha regression.
# Coefficients are read by position with .iloc: plain [0]/[1] on the
# label-indexed params Series is deprecated positional indexing in pandas.
a = results.params.iloc[0]   # intercept
b = results.params.iloc[1]   # slope on beta_0
df_beta['beta_fitted'] = a + b*df_beta['beta_0']            # alternatively, use: results.fittedvalues
mse_fitted = ((df_beta.beta_1 - df_beta.beta_fitted)**2).mean()
print(f'Mean-squared Error of Fitted Beta:\t {mse_fitted: .4f}')
print('Note this has a look-ahead problem!')

Mean-squared Error of Fitted Beta:	  0.0672
Note this has a look-ahead problem!


### How persistent is alpha?

In [18]:
# Alpha regression: second-half alpha on a constant and the first-half alpha,
# with one observation per industry.
# (To fit without an intercept instead, pass df_beta['alpha_0'] alone as X.)
X = sm.add_constant(df_beta['alpha_0'])
y = df_beta['alpha_1']
results = sm.OLS(endog=y, exog=X, missing='drop').fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                alpha_1   R-squared:                       0.200
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     11.51
Date:                Mon, 20 Mar 2023   Prob (F-statistic):            0.00144
Time:                        21:04:29   Log-Likelihood:                 222.40
No. Observations:                  48   AIC:                            -440.8
Df Residuals:                      46   BIC:                            -437.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0012      0.000      3.335      0.0

In [19]:
# Scatter plot: first-half alpha (x) against second-half alpha (y), one marker per industry

# Name the marker trace so the legend does not fall back to a generic "trace 0"
trace = go.Scatter(x=df_beta['alpha_0'], y=df_beta['alpha_1'], mode="markers", name='Alphas')
minval = np.min([df_beta['alpha_1'].min(), df_beta['alpha_0'].min()])
maxval = np.max([df_beta['alpha_1'].max(), df_beta['alpha_0'].max()])

# Shared x-grid for the two reference lines
grid = np.linspace(minval, maxval, 20)
trace_45 = go.Scatter(x=grid, y=grid, mode='lines', name='45-degree line')

# Fitted line from the alpha regression held in `results` (intercept + slope).
# Coefficients are read by position with .iloc: plain [0]/[1] on the
# label-indexed params Series is deprecated positional indexing in pandas.
ols_fit = results.params.iloc[0] + results.params.iloc[1]*grid
trace_ols = go.Scatter(x=grid, y=ols_fit, mode='lines', name='Predicted')

# Pad the axes by 10% of the data span on each side. Scaling the lower bound
# by 1.1 only widens the view when minval is negative; this works for any sign.
pad = 0.1*(maxval - minval)

fig = go.Figure()
fig.add_trace(trace)
fig.add_trace(trace_45)
fig.add_trace(trace_ols)
fig.update_xaxes(title='1st Half Alpha', tickformat=".2%", range=[minval - pad, maxval + pad])
fig.update_yaxes(title='2nd Half Alpha', tickformat=".2%", range=[minval - pad, maxval + pad])
fig.update_layout(title='Alpha Persistence')
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
fig.show()