# APT model: Famma-Macbeth Regression

In [1]:
from rqdata_utils import *
import pandas
import numpy as np
import scipy as sp
import alphalens as al
%matplotlib inline

## Loading Data

In [2]:
price_df,instrument_df,equity_df = get_price_instrument_equity("cn_stock_price_2012_2018.csv","cn_instrument_info_2012_2018.csv","cn_equity_daily_2012_2018.csv","sectorCode")

In [3]:
equity_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,return,close,total_turnover,volume,week,month,report_quarter,market_cap,a_share_market_val_2,cash_received_from_sales_of_goods,pb_ratio,net_profit,ps_ratio,sectorCode
date,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-01-04,000001.XSHE,-0.027582,5.1224,227563700.0,40894428.0,0.5775,0.4331,,,,,,,,Financials
2012-01-04,000002.XSHE,-0.018742,6.0525,355989100.0,47432958.0,0.3711,0.403,2011q3,80594890000.0,70821200000.0,75167850000.0,1.5216,4106349000.0,0.8679,Financials
2012-01-04,000004.XSHE,-0.02225,7.91,3763833.0,465469.0,0.572,0.7506,2011q3,664255600.0,663454900.0,59499680.0,8.8175,4500363.0,37.5796,HealthCare
2012-01-04,000005.XSHE,0.0,3.86,0.0,0.0,0.0,0.0,2011q3,3529328000.0,3527048000.0,25658510.0,5.348,13656650.0,-347.2191,Industrials
2012-01-04,000006.XSHE,-0.009756,2.6766,7619286.0,2513811.0,0.1416,0.1667,2011q3,4015370000.0,3929464000.0,2531436000.0,1.4348,276391700.0,1.4139,Financials


In [4]:
healthcareUniverse = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values
len(healthcareUniverse)

164

In [5]:
def equity_universe_filtering(equity_df, universe):
    universeFilter = [book_id in set(universe) for book_id in equity_df.index.get_level_values(level=1).values]
    return equity_df[universeFilter]

In [6]:
healthcare_equity_df = equity_universe_filtering(equity_df, healthcareUniverse)
healthcare_equity_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,return,close,total_turnover,volume,week,month,report_quarter,market_cap,a_share_market_val_2,cash_received_from_sales_of_goods,pb_ratio,net_profit,ps_ratio,sectorCode
date,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2012-01-04,000004.XSHE,-0.02225,7.91,3763832.88,465469.0,0.572,0.7506,2011q3,664255600.0,663454900.0,59499680.0,8.8175,4500363.0,37.5796,HealthCare
2012-01-04,000028.XSHE,-0.045433,19.8422,9326924.28,450553.0,0.4201,0.2722,2011q3,5872485000.0,4753820000.0,10532980000.0,4.3493,248183400.0,0.3414,HealthCare
2012-01-04,000150.XSHE,-0.030295,3.1737,3109304.5,952600.0,0.346,0.361,2011q3,1036800000.0,1036800000.0,49132790.0,1.4763,3657858.0,7.8956,HealthCare
2012-01-04,000153.XSHE,-0.028053,5.77,9673054.49,1596020.0,0.683,2.4594,2011q3,1531454000.0,1360856000.0,1329425000.0,2.1169,15603970.0,0.7818,HealthCare
2012-01-04,000403.XSHE,0.0,3.1625,0.0,0.0,0.0,0.0,,,,,,,,HealthCare


In [7]:
print("universe ratio: {}%".format(len(healthcare_equity_df)/len(equity_df)*100))

universe ratio: 6.210331877919959%


### benchmark

In [27]:
benchmark_df = pd.read_csv("cn_SH_healthcare_index_2012_2018.csv",names=['date','value'])
benchmark_df = benchmark_df.set_index('date',drop=True)

In [33]:
benchmark_df['return'] = np.log(benchmark_df.shift(1)/benchmark_df).fillna(0)
benchmark_df.head()

Unnamed: 0_level_0,value,return
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-04,2891.462,0.0
2012-01-05,2766.955,0.044015
2012-01-06,2744.793,0.008042
2012-01-09,2833.219,-0.031708
2012-01-10,2929.594,-0.03345


## Factor Returns

In [8]:
def equity_factor_return(equity_df, factorColumn, nAllocations, longTop=True):
    equity_copy = equity_df.copy()
#     equity_copy["{}_rank".format(factorColumn)] = equity_copy.groupby(level='date')[factorColumn].rank()
#     equity_copy[equity_copy.groupby(level='date')[factorColumn].nlargest(nAllocations).index]["biggest_{}_{}".format(nAllocations,factorColumn)]=True
    largest = equity_copy[factorColumn].groupby(level='date').nlargest(nAllocations).reset_index(level=0,drop=True)
    smallest = equity_copy[factorColumn].groupby(level='date').nsmallest(nAllocations).reset_index(level=0,drop=True)
    r_largest = equity_copy.loc[largest.index,'return'].groupby(level='date').mean()
    r_smallest = equity_copy.loc[smallest.index,'return'].groupby(level='date').mean()
    LMS = r_largest - r_smallest
    if(longTop):
        return LMS
    else:
        return -LMS

In [9]:
SMB = equity_factor_return(healthcare_equity_df, 'market_cap', 20,longTop=False)
SMB.head()

date
2012-01-04    0.005983
2012-01-05   -0.009098
2012-01-06   -0.004155
2012-01-09    0.014615
2012-01-10    0.006728
Name: return, dtype: float64

In [10]:
HML = equity_factor_return(healthcare_equity_df, 'pb_ratio', 20,longTop=True)
HML.head()

date
2012-01-04    0.005302
2012-01-05   -0.007223
2012-01-06    0.006031
2012-01-09   -0.002597
2012-01-10   -0.010780
Name: return, dtype: float64

In [11]:
import itertools
import statsmodels.api as sm
from statsmodels import regression,stats
import scipy

data = healthcare_equity_df[['return']] # dataframe
data = data.set_index(healthcare_equity_df.index) # elimilate redundant index (whole universe)
asset_list_sizes = [group[1].size for group in data.groupby(level=0)]

# Spreading the factor portfolio data across all assets for each day
SMB_column = [[SMB.loc[group[0]]] * size for group, size \
              in zip(data.groupby(level=0), asset_list_sizes)]
data['SMB'] = list(itertools.chain(*SMB_column))

HML_column = [[HML.loc[group[0]]] * size for group, size \
              in zip(data.groupby(level=0), asset_list_sizes)]
data['HML'] = list(itertools.chain(*HML_column))
data = sm.add_constant(data.dropna())

In [12]:
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,const,return,SMB,HML
date,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-04,000004.XSHE,1.0,-0.02225,0.005983,0.005302
2012-01-04,000028.XSHE,1.0,-0.045433,0.005983,0.005302
2012-01-04,000150.XSHE,1.0,-0.030295,0.005983,0.005302
2012-01-04,000153.XSHE,1.0,-0.028053,0.005983,0.005302
2012-01-04,000403.XSHE,1.0,0.0,0.005983,0.005302


## Factor Exposures ($\beta$)

In [13]:
assets = data.index.levels[1].unique()
Y = [data.xs(asset,level=1)['return'] for asset in assets]
X = [data.xs(asset,level=1)[['SMB','HML','const']] for asset in assets]
reg_results = [regression.linear_model.OLS(y,x).fit().params for y,x in zip(Y,X) if not(x.empty or y.empty)]
indices = [asset for y, x, asset in zip(Y, X, assets) if not(x.empty or y.empty)]
betas = pd.DataFrame(reg_results, index=indices)

In [15]:
betas.head()

Unnamed: 0,SMB,HML,const
000004.XSHE,0.883906,0.048757,0.002002
000028.XSHE,-0.003029,-0.064295,0.001073
000150.XSHE,0.354122,0.066071,0.002031
000153.XSHE,0.620706,-0.082229,0.001405
000403.XSHE,2.032192,11.457418,-0.017412


## Factor Premium

In [36]:
betas = sm.add_constant(betas.drop('const', axis=1))

R = data['return'].mean(axis=0, level=1)

# Second regression step: estimating the risk premia
risk_free_rate = benchmark_df['return'].mean()

final_results = regression.linear_model.OLS(R - risk_free_rate, betas).fit()

final_results.summary()

0,1,2,3
Dep. Variable:,return,R-squared:,0.398
Model:,OLS,Adj. R-squared:,0.391
Method:,Least Squares,F-statistic:,53.26
Date:,"Sat, 05 May 2018",Prob (F-statistic):,1.77e-18
Time:,21:03:25,Log-Likelihood:,1012.1
No. Observations:,164,AIC:,-2018.0
Df Residuals:,161,BIC:,-2009.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0017,6.72e-05,24.956,0.000,0.002,0.002
SMB,-7.597e-05,0.000,-0.599,0.550,-0.000,0.000
HML,0.0005,4.81e-05,9.695,0.000,0.000,0.001

0,1,2,3
Omnibus:,39.154,Durbin-Watson:,1.906
Prob(Omnibus):,0.0,Jarque-Bera (JB):,78.545
Skew:,1.087,Prob(JB):,8.8e-18
Kurtosis:,5.601,Cond. No.,3.92


## Fama-Macbeth Test Conclusion: 
although our individual factors are significant, we have a very low  $R^2$ . What this may suggest is that there is a real link between our factors and the returns of our assets, but that there still remains a lot of unexplained noise!