### Dynamics of Price
Based on the celebrated APT model and the Barra risk model, I propose the following stock forecasting model.

$$r_{i}(t) = \alpha_i(t) + \sum_{k=1}^K \beta_{ik}(t) F_{ik}(t)$$

Note that, unlike the APT and Barra models, we do not assume that the excess return or the factor returns are constant. The traditional ways to handle this are time-series calibration (APT) and cross-sectional regression (Barra). We implement both at the end of this part.


In [50]:
import pandas as pd
import statsmodels.api as sm
from statsmodels import regression,stats
import itertools
from pykalman import KalmanFilter
idx = pd.IndexSlice

class GeneralizedAPTModel:
    """Factor model of asset returns with three calibration methods.

    The model is ``r_i(t) = alpha_i(t) + sum_k beta_ik(t) * F(t)`` where the
    exposures ``beta`` are the per-date z-scores of the columns named in
    ``factor_list``.  It can be calibrated by per-asset time-series
    regression, per-date cross-section regression, or a Kalman filter.

    ``equity_df`` is expected to be indexed by a two-level MultiIndex whose
    first level is named ``'date'`` and whose second level holds the asset
    identifier, and to contain the factor columns plus ``'return'`` (and
    ``'close'`` for the Kalman output).

    Attributes set by the constructor:
        price_df, equity_df, benchmark_df, factor_list, universe: the
            constructor arguments, stored unchanged.
        subset: default asset subset for ``calculate_subset_df`` (None).
        factor_zscore: per-date z-scores of the factor columns for the
            assets in ``universe``.
        valid_universe: set of assets present on every date of
            ``factor_zscore``.
        valid_factor_zscore: ``factor_zscore`` restricted to
            ``valid_universe``.

    Attributes set by ``cross_section_regression``: ``F``, ``X``, ``V``.
    Attribute set by ``kalman_filter_calibration``: ``kf_df``.
    """

    def __init__(self, price_df, equity_df, benchmark_df, factor_list, universe):
        self.price_df = price_df
        self.equity_df = equity_df
        self.benchmark_df = benchmark_df
        self.factor_list = factor_list
        self.universe = universe
        self.valid_universe = None
        self.subset = None

        # z-score in universe: standardise each factor within each date.
        universe_df = self.calculate_subset_df(self.equity_df, self.universe)
        raw_factors = universe_df[list(factor_list)].fillna(0)  # TODO: fill or drop??
        by_date = raw_factors.groupby(level='date')
        # transform() broadcasts the per-date statistics back onto the
        # original rows, so the arithmetic needs no index alignment.
        self.factor_zscore = (
            (raw_factors - by_date.transform('mean')) / by_date.transform('std')
        )

        # valid_universe: assets that have a row on every date.
        self.valid_universe = self.calculate_valid_universe()
        self.valid_factor_zscore = self._select_assets(
            self.factor_zscore, self.valid_universe
        )

    @staticmethod
    def _select_assets(data, assets):
        """Return the rows of ``data`` whose second index level is in ``assets``.

        A boolean mask is used instead of ``.loc`` so that ``assets`` may be
        a set, list or array, and so that the original row order is kept.
        """
        wanted = list(assets)
        return data[data.index.get_level_values(1).isin(wanted)]

    def calculate_valid_universe(self):
        """Return the set of universe assets present on every date.

        Starts from ``self.universe`` and intersects it with the assets that
        appear in ``self.factor_zscore`` on each date in turn.
        """
        valid_universe = set(self.universe)
        for _, group in self.factor_zscore.groupby(level=0):
            valid_universe &= set(group.index.get_level_values(1))
        return valid_universe

    def calculate_subset_df(self, df, subset=None):
        """Restrict ``df`` to the rows of the given assets.

        Args:
            df: frame indexed by (date, asset).
            subset: iterable of asset identifiers; falls back to
                ``self.subset`` when None.

        Returns:
            The filtered frame, or ``df`` unchanged when neither ``subset``
            nor ``self.subset`` is set.
        """
        # ``is None`` rather than ``!= None``: the latter is evaluated
        # elementwise on NumPy arrays and cannot be used as a condition.
        chosen = self.subset if subset is None else subset
        if chosen is None:
            return df
        return self._select_assets(df, chosen)

    def calculate_portfolio_risk(self, hp):
        """Compute portfolio risk and marginal contributions for holdings ``hp``.

        Runs ``cross_section_regression`` first if no covariance matrix
        ``self.V`` has been computed yet.

        Args:
            hp: holdings vector with one entry per asset of ``self.V``.

        Returns:
            Tuple ``(V, sigmap, MCTR)`` where ``sigmap = sqrt(hp' V hp)`` and
            ``MCTR = V hp / sigmap``.
        """
        if getattr(self, 'V', None) is None:
            self.cross_section_regression()
        sigmap = np.sqrt(np.dot(np.dot(hp.T, self.V), hp))
        MCTR = np.dot(self.V, hp) / sigmap
        return self.V, sigmap, MCTR

    def time_series_regression(self):
        """Regress each asset's return on its own factor z-scores over time.

        Returns:
            DataFrame with one row per asset of ``valid_universe`` and one
            column per factor plus ``'const'``, holding the OLS coefficients.
            Assets with no observations are skipped.
        """
        exog_all = sm.add_constant(self.factor_zscore)
        columns = list(self.factor_list) + ['const']

        assets = []
        params = []
        for asset in self.valid_universe:
            y = self.equity_df.xs(asset, level=1)['return']
            x = exog_all.xs(asset, level=1)[columns]
            if x.empty or y.empty:
                continue
            assets.append(asset)
            params.append(regression.linear_model.OLS(y, x).fit().params)
        return pd.DataFrame(params, index=assets)

    def cross_section_regression(self, ndays=22, latest=True):
        """Regress returns on factor z-scores across assets, date by date.

        Also stores the factor-return covariance ``self.F``, the time-averaged
        exposures ``self.X`` and the asset covariance ``self.V = X F X'``.

        Args:
            ndays: number of dates to use.
            latest: take the last ``ndays`` dates when True, the first
                ``ndays`` otherwise.

        Returns:
            DataFrame indexed by date with the OLS coefficients (constant
            first, then one column per factor).
        """
        exposures = self._select_assets(self.factor_zscore, self.valid_universe)
        all_dates = exposures.index.get_level_values(0).unique()
        dates = all_dates[-ndays:] if latest else all_dates[:ndays]
        window = exposures[exposures.index.get_level_values(0).isin(dates)]

        result_list = []
        date_list = []
        for date, group in window.groupby(level=0):
            X = sm.add_constant(group.loc[:, self.factor_list])
            # Look returns up by the exposure rows' own index so that y and X
            # are guaranteed to be in the same asset order.
            y = self.equity_df.loc[group.index, 'return'].values
            result_list.append(sm.OLS(y, X).fit().params)
            date_list.append(date)

        cs_result_df = pd.DataFrame(result_list, index=date_list)
        # Column 0 is the constant; the remaining columns are factor returns.
        F = np.cov(cs_result_df.iloc[:, 1:].T)
        X = self.valid_factor_zscore.groupby(level=1).mean()
        V = np.inner(np.inner(X, F), X)
        self.F = F
        self.X = X
        self.V = V

        return cs_result_df

    def kalman_filter_calibration(self):
        """Estimate per-date alphas and factor returns with a Kalman filter.

        The hidden state is ``[alpha_1..alpha_N, F_1..F_K]`` following a
        random walk; on each date the observation matrix is
        ``[I_N | exposures]`` and the observation is the vector of asset
        returns.

        Returns:
            DataFrame on the index of ``valid_factor_zscore`` with the factor
            z-scores, ``<factor>_F`` filtered factor returns, ``alpha``,
            ``return`` and ``close``.  Also stored as ``self.kf_df``.
        """
        asset_size = len(self.valid_universe)
        factor_size = len(self.factor_list)
        nstate = factor_size + asset_size
        nobs = asset_size

        # One observation matrix per date: identity (alphas) next to that
        # date's exposure matrix (factor returns).
        observation_matrices = [
            np.concatenate((np.eye(asset_size), group.values), axis=1).tolist()
            for _, group in self.valid_factor_zscore.groupby(level=0)
        ]

        valid_equity = self._select_assets(self.equity_df, self.valid_universe)
        return_df = valid_equity['return']
        close_df = valid_equity['close']
        observation_list = [
            group.values.tolist() for _, group in return_df.groupby(level=0)
        ]

        state_transition_matrix = np.eye(nstate)
        state_covariance_matrix = np.eye(nstate) * 0.01
        observation_covariance_matrix = np.eye(nobs) * 0.0
        initial_state_mean = np.zeros(nstate)
        initial_state_covariance = np.eye(nstate) * 0.01

        kf = KalmanFilter(transition_matrices=state_transition_matrix,
                          transition_covariance=state_covariance_matrix,
                          observation_matrices=observation_matrices,
                          observation_covariance=observation_covariance_matrix,
                          initial_state_mean=initial_state_mean,
                          initial_state_covariance=initial_state_covariance,
                          n_dim_state=nstate,
                          n_dim_obs=nobs)
        filtered_state_means = np.asarray(kf.filter(observation_list)[0])

        # a and F: split every filtered state into alphas and factor returns.
        # Rows of valid_factor_zscore are assumed to be ordered date-major,
        # matching the order in which the dates were fed to the filter.
        row_index = self.valid_factor_zscore.index
        alpha_values = filtered_state_means[:, :asset_size].ravel()
        a_df = pd.Series(alpha_values, index=row_index, name='alpha')

        # Every asset on a date shares that date's factor returns.
        factor_values = np.repeat(
            filtered_state_means[:, asset_size:], asset_size, axis=0
        )
        F_df = pd.DataFrame(
            factor_values,
            columns=["{}_F".format(f) for f in self.factor_list],
            index=row_index,
        )

        kf_df = pd.concat(
            [self.valid_factor_zscore, F_df, a_df, return_df, close_df], axis=1
        )
        self.kf_df = kf_df
        return kf_df

        

## Pull Data

In [2]:
from rqdata_utils import *
import pandas as pd
import numpy as np
import scipy as sp
import alphalens as al
from pykalman import KalmanFilter


# Load the price, instrument and daily-equity tables from local CSV files.
# NOTE(review): get_price_instrument_equity comes from rqdata_utils; the
# meaning of the trailing "sectorCode" argument is not visible here — confirm.
price_df,instrument_df,equity_df = get_price_instrument_equity("cn_stock_price_2012_2018.csv","cn_instrument_info_2012_2018.csv","cn_equity_daily_2012_2018.csv","sectorCode")
# Index labels of the instruments whose sectorCode is 'HealthCare'.
healthcare_universe = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values
benchmark_df = benchmark_reader("cn_SH_healthcare_index_2012_2018.csv")
# Columns of equity_df used as factors by the model.
factor_list = ['market_cap', 'pb_ratio', 'ps_ratio']
# Same selection expression as healthcare_universe above; this is the array
# passed to the model as its universe.
universe_list = instrument_df.index[instrument_df.sectorCode=='HealthCare'].values

## Inject Data

In [51]:
# Build the model on the HealthCare universe; the constructor computes the
# per-date factor z-scores and the valid universe.
myModel = GeneralizedAPTModel(price_df, equity_df, benchmark_df, factor_list, universe_list)



## Time-series Regression

Similar to the APT framework

In [26]:
# One OLS fit per asset: returns a frame of coefficients indexed by asset.
myModel.time_series_regression()

Unnamed: 0,market_cap,pb_ratio,ps_ratio,const
000150.XSHE,-0.005197,0.008237,-0.000381,-0.000115
600851.XSHG,0.013333,-0.004070,0.000373,0.000198
000766.XSHE,-0.000493,0.000843,0.000807,-0.000387
000919.XSHE,0.053181,-0.016566,0.000339,0.019037
601607.XSHG,0.005867,-0.005305,0.000520,-0.016594
600252.XSHG,0.006719,-0.004957,0.001596,-0.003176
600721.XSHG,-0.008367,0.000223,0.000256,-0.005176
000739.XSHE,0.009936,0.002415,0.004234,0.008364
600055.XSHG,-0.003259,0.003696,0.002861,-0.000134
600056.XSHG,0.006356,0.000898,-0.001715,-0.001246


## Cross-section Regression (BARRA Risk Model)


In [44]:
# One OLS fit per date (defaults: the latest 22 dates); returns the fitted
# coefficients indexed by date and stores F, X and V on the model.
exposure_df = myModel.cross_section_regression()

In [45]:
# Preview the first rows of the per-date regression coefficients.
exposure_df.head()

Unnamed: 0,const,market_cap,pb_ratio,ps_ratio
2018-03-27,0.020801,-0.0056,-0.000886,-0.000948
2018-03-28,-0.006505,-0.004983,-0.000603,0.001667
2018-03-29,0.002383,-0.003,-0.000925,0.001257
2018-03-30,0.014809,0.004416,0.00053,0.001303
2018-04-02,-0.000119,-0.00392,-0.000729,0.000986


In [53]:
# Holdings vector of ones, one entry per asset in the model's valid universe.
hp = np.ones(len(myModel.valid_universe))
# Returns, in order: the covariance matrix V, sigmap = sqrt(hp' V hp), and
# MCTR = V hp / sigmap.
V,sigma, MCTR = myModel.calculate_portfolio_risk(hp)
print("V:\n{}\nsigma\n{}\nMCTR:\n{}".format(V,sigma,MCTR))

V:
[[  3.19844460e-05  -7.42713103e-06   5.18342510e-06 ...,   2.25855232e-06
   -1.12620455e-07  -2.46779972e-05]
 [ -7.42713103e-06   1.60948004e-05  -1.76671909e-05 ...,  -9.67456183e-06
   -8.91835400e-08   1.85188600e-05]
 [  5.18342510e-06  -1.76671909e-05   1.98030463e-05 ...,   1.16950009e-05
   -4.77268788e-07  -1.67809236e-05]
 ..., 
 [  2.25855232e-06  -9.67456183e-06   1.16950009e-05 ...,   1.30630599e-05
   -4.88037744e-06   5.75765299e-06]
 [ -1.12620455e-07  -8.91835400e-08  -4.77268788e-07 ...,  -4.88037744e-06
    3.45766356e-06  -1.09477448e-05]
 [ -2.46779972e-05   1.85188600e-05  -1.67809236e-05 ...,   5.75765299e-06
   -1.09477448e-05   6.49979346e-05]]
sigma
0.03377233630871653
MCTR:
[ 0.00078128  0.00349245 -0.00401757  0.00233878  0.00041072  0.0004083
  0.00222235 -0.00621658  0.00352282 -0.00051361 -0.00095923  0.0025695
 -0.0020911   0.00181597  0.00121508  0.00254621  0.00259181 -0.00777772
  0.0022341   0.0001535  -0.00194782 -0.00624887  0.00143136  0.0021

## Kalman Filter

A powerful tool that addresses the problems of both traditional regressions.

In [28]:
# Run the Kalman filter; returns (and stores as myModel.kf_df) a frame with
# the factor z-scores, filtered factor returns (*_F), alpha, return and close.
myModel.kalman_filter_calibration()

Unnamed: 0_level_0,Unnamed: 1_level_0,market_cap,pb_ratio,ps_ratio,market_cap_F,pb_ratio_F,ps_ratio_F,alpha,return,close
date,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2012-01-04,000004.XSHE,-0.784393,-0.115692,4.830427,-0.001727,0.007182,0.003632,-0.040317,-0.022250,7.9100
2012-01-04,000028.XSHE,-0.093343,-0.124286,-0.672507,-0.001727,0.007182,0.003632,-0.042259,-0.045433,19.8422
2012-01-04,000150.XSHE,-0.734962,-0.129812,0.443827,-0.001727,0.007182,0.003632,-0.032244,-0.030295,3.1737
2012-01-04,000153.XSHE,-0.669329,-0.128580,-0.607426,-0.001727,0.007182,0.003632,-0.026079,-0.028053,5.7700
2012-01-04,000403.XSHE,-0.872529,-0.132652,-0.722958,-0.001727,0.007182,0.003632,0.002072,0.000000,3.1625
2012-01-04,000423.XSHE,2.685384,-0.117587,0.574595,-0.001727,0.007182,0.003632,-0.042006,-0.045402,37.5863
2012-01-04,000513.XSHE,-0.112104,-0.128775,-0.487653,-0.001727,0.007182,0.003632,-0.050283,-0.052786,10.7564
2012-01-04,000518.XSHE,-0.346596,-0.121920,1.369648,-0.001727,0.007182,0.003632,-0.009864,-0.005167,3.8500
2012-01-04,000538.XSHE,3.825501,-0.120392,-0.306006,-0.001727,0.007182,0.003632,-0.029154,-0.037736,32.6934
2012-01-04,000566.XSHE,-0.238486,-0.125659,0.127319,-0.001727,0.007182,0.003632,-0.048765,-0.048793,4.7021
