In [1]:
import pandas as pd
import numpy as np
import datetime
from linearmodels import FamaMacBeth
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [2]:
price=pd.read_csv("price.csv",encoding="utf-8",index_col=0)
ptv=pd.read_csv("price2value.csv",encoding="utf-8",index_col=0)
regression=pd.read_csv("regression_data.csv",encoding="utf-8",index_col=0)

  mask |= (ar1 == a)


In [3]:
date_data=pd.Series(pd.date_range('1/1/2012','31/12/2019'))
date_data.rename("date")
date_data=pd.DataFrame(date_data.apply(lambda x:x.strftime("%Y-%m-%d").replace('-',"" )))
date_data.columns=["date"]
price["date"]=price.index
price["date"]=price["date"].apply(lambda x:str(x))
price=pd.merge(date_data,price,on=["date"],how="left")
price=price.fillna(method='ffill')

In [4]:
price.set_index(["date"], inplace=True)
price=(price.diff(periods=30)/price).dropna(axis=0,how='all')

In [5]:
price["date"]=price.index
price=price.melt(id_vars=['date'],var_name='股票代码',value_name='monthly_return')

In [6]:
price["date"]=price["date"].astype(float)

In [20]:
data=ptv[["date","股票代码","ptv"]].merge(regression,on=["date","股票代码"],how="left").merge(price,on=["date","股票代码"],how="left")
data["date"]=data["date"].astype(str)
data=data[data["date"].str.contains("....01")]
data["ptv"]=data.groupby("date")["ptv"].apply(lambda x:(x-x.mean())/x.std())
data["momentum"]=data.groupby("date")["momentum"].apply(lambda x:(x-x.mean())/x.std())
data["turnover"]=data.groupby("date")["turnover"].apply(lambda x:(x-x.mean())/x.std())
data["size"]=data.groupby("date")["size"].apply(lambda x:(x-x.mean())/x.std())
data["PE"]=data.groupby("date")["PE"].apply(lambda x:(x-x.mean())/x.std())

In [21]:
data.head()

Unnamed: 0,date,股票代码,ptv,momentum,turnover,size,PE,monthly_return
23,20120401,000001.SZ,-0.604679,-0.096204,-0.637656,2.29575,-0.047682,-0.112661
53,20120501,000001.SZ,-0.431631,0.102062,-0.631261,2.617353,-0.045938,0.051329
84,20120601,000001.SZ,-0.457851,-0.631884,-0.623073,2.557107,-0.050369,-0.063361
114,20120701,000001.SZ,-0.423741,0.456462,-0.384374,2.587571,-0.046025,-0.040894
145,20120801,000001.SZ,-0.419838,0.881372,-0.506721,2.611641,-0.044666,0.000661


In [22]:
data.index=range(len(data))

In [53]:
data["date"]=data["date"].astype(int)
data1=data.set_index(["股票代码","date"])

In [54]:
data1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ptv,momentum,turnover,size,PE,monthly_return
股票代码,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000001.SZ,20120401,-0.604679,-0.096204,-0.637656,2.29575,-0.047682,-0.112661
000001.SZ,20120501,-0.431631,0.102062,-0.631261,2.617353,-0.045938,0.051329
000001.SZ,20120601,-0.457851,-0.631884,-0.623073,2.557107,-0.050369,-0.063361
000001.SZ,20120701,-0.423741,0.456462,-0.384374,2.587571,-0.046025,-0.040894
000001.SZ,20120801,-0.419838,0.881372,-0.506721,2.611641,-0.044666,0.000661


In [50]:
smf.ols("monthly_return~ptv+momentum+turnover+size+PE",data1).fit().summary()

0,1,2,3
Dep. Variable:,monthly_return,R-squared:,0.345
Model:,OLS,Adj. R-squared:,0.345
Method:,Least Squares,F-statistic:,77250.0
Date:,"Tue, 31 Mar 2020",Prob (F-statistic):,0.0
Time:,20:57:35,Log-Likelihood:,544460.0
No. Observations:,732193,AIC:,-1089000.0
Df Residuals:,732187,BIC:,-1089000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0230,0.000,-170.931,0.000,-0.023,-0.023
ptv,-0.0005,0.000,-3.542,0.000,-0.001,-0.000
momentum,0.0847,0.000,610.450,0.000,0.084,0.085
turnover,-0.0056,0.000,-39.964,0.000,-0.006,-0.005
size,0.0014,0.000,9.851,0.000,0.001,0.002
PE,0.0002,0.000,1.749,0.080,-2.85e-05,0.000

0,1,2,3
Omnibus:,349254.81,Durbin-Watson:,0.396
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2945971.429
Skew:,-2.125,Prob(JB):,0.0
Kurtosis:,11.86,Cond. No.,1.36


# Fama-MacBeth regression
$r_{i,t+1}=\hat{\alpha}+\hat{\beta_{1,t}}*ptv_{i,t}+\hat{\beta_{2,t}}*momentum_{i,t}+\hat{\beta_{3,t}}*turnover_{i,t}+\hat{\beta_{4,t}}*size_{i,t}+\hat{\beta_{5,t}}*PE_{i,t}  \\
\hat{\beta_{j}}=\frac{1}{T}\sum_{t=0}^{T-1}\beta_{j,t}$

Looking the p-value of the $\beta s$, we can find whether the coefficients of each factor is significantly different form 0, which means if the factor is valid or not.

In [56]:
# Fama-MacBeth regression
fm=FamaMacBeth(dependent=data1['monthly_return'], exog=sm.add_constant(data1[["ptv","momentum","turnover","size" ,"PE"]]))
res_fm=fm.fit(debiased=False)
res_fm

0,1,2,3
Dep. Variable:,monthly_return,R-squared:,0.3445
Estimator:,FamaMacBeth,R-squared (Between):,0.6144
No. Observations:,732193,R-squared (Within):,0.3380
Date:,"Tue, Mar 31 2020",R-squared (Overall):,0.3445
Time:,21:48:55,Log-likelihood,5.44e+05
Cov. Estimator:,Fama-MacBeth Standard Cov,,
,,F-statistic:,7.697e+04
Entities:,2469,P-value,0.0000
Avg Obs:,296.55,Distribution:,"F(5,732187)"
Min Obs:,18.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-0.0192,0.0051,-3.7452,0.0002,-0.0293,-0.0092
ptv,-0.0002,9.814e-05,-2.4019,0.0163,-0.0004,-4.337e-05
momentum,0.0849,0.0015,57.446,0.0000,0.0820,0.0878
turnover,-0.0049,0.0003,-18.429,0.0000,-0.0054,-0.0044
size,8.88e-05,0.0003,0.2973,0.7662,-0.0005,0.0007
PE,0.0002,5.609e-05,4.0024,0.0001,0.0001,0.0003
