# Factor model applied to the Chinese stock market

In [2]:
from arctic import Arctic

  from pandas import DataFrame, Series, Panel


In [3]:
import pandas as pd

In [4]:
from sklearn.linear_model import LinearRegression

In [5]:
# Open the Arctic connection; the host name is fixed to this machine.
ARCTIC_HOST = 'localhost'
a = Arctic(ARCTIC_HOST)

In [6]:
# Handles to the three Arctic libraries read by this notebook
# (stock data, stock basics, index data), looked up by library name.
lib_stock, lib_basic, lib_index = (
    a[lib_name] for lib_name in ('stock', 'stock_basics', 'index')
)

In [7]:
def get_sw_industry_class(path='./refData/SwClass.xls', encoding='GBK'):
    """
    Return the Shen-Wan industry classification table.

    The file is parsed with ``pd.read_html``, so despite the ``.xls``
    extension it is expected to hold an HTML table. Only the first table
    found in the file is used.

    Parameters
    ----------
    path : str, optional
        Location of the classification file. Defaults to the copy kept
        under ``./refData``.
    encoding : str, optional
        Text encoding of the file. Defaults to ``'GBK'``.

    Returns
    -------
    pandas.DataFrame
        The table with its Chinese headers renamed to ``industry``,
        ``symbol``, ``name``, ``start_date`` and ``end_date``. ``symbol``
        is a string left-padded with zeros to six characters.
    """
    IND_RENAME = {
        '行业名称': 'industry',
        '股票代码': 'symbol',
        '股票名称': 'name',
        '起始日期': 'start_date',
        '结束日期': 'end_date',
    }

    industry = pd.read_html(path, encoding=encoding)[0]
    industry = industry.rename(columns=IND_RENAME)

    # Codes can come back as integers with their leading zeros stripped;
    # restore the fixed six-character form used as the lookup key.
    industry['symbol'] = industry['symbol'].astype(str).str.zfill(6)

    return industry

In [8]:
# Load the Shen-Wan classification table once for use by the cells below.
industry = get_sw_industry_class()

In [9]:
# Lookup from six-character stock code to industry name.
# A code that appears on several rows keeps the industry of its last row.
symbol_2_ind = dict(zip(industry['symbol'], industry['industry']))

In [10]:
# Industry name -> industry index code, kept in the original listing order
# (the order matters: the codes are later passed on as a list).
ind_2_index = dict([
    ('农林牧渔', '801010'),
    ('采掘', '801020'),
    ('化工', '801030'),
    ('钢铁', '801040'),
    ('有色金属', '801050'),
    ('建筑建材', '801060'),
    ('机械设备', '801890'),
    ('电子', '801080'),
    ('交运设备', '801090'),
    ('信息设备', '801100'),
    ('家用电器', '801110'),
    ('食品饮料', '801120'),
    ('纺织服装', '801130'),
    ('轻工制造', '801140'),
    ('医药生物', '801150'),
    ('公用事业', '801160'),
    ('交通运输', '801170'),
    ('房地产', '801180'),
    ('金融服务', '801190'),
    ('商业贸易', '801200'),
    ('休闲服务', '801210'),
    ('信息服务', '801220'),
    ('综合', '801230'),
    ('建筑材料', '801710'),
    ('建筑装饰', '801720'),
    ('电气设备', '801730'),
    ('国防军工', '801740'),
    ('计算机', '801750'),
    ('传媒', '801760'),
    ('通信', '801770'),
    ('银行', '801780'),
    ('非银金融', '801790'),
    ('汽车', '801880'),
])

In [11]:
# Estimation window: every business day from 2018-10-23 to 2020-10-23 inclusive.
est_window = pd.bdate_range(start='2018-10-23', end='2020-10-23')

In [12]:
# Every symbol held in the stock library; the regression loop walks this list.
all_stocks = lib_stock.list_symbols()

In [13]:
from tqdm import tqdm

In [14]:
# Output containers for the regression loop:
#   residual - one column per stock, rows aligned to the estimation window
#   beta     - stock code -> fitted slope against its industry index
residual = pd.DataFrame(index=est_window)
beta = dict()

In [15]:
# Fetch all industry indices named in ind_2_index over the estimation
# window with a single read.
sw_index_codes = list(ind_2_index.values())
df_index_full = lib_index.read(sw_index_codes, chunk_range=est_window)

In [54]:
# Regress each stock's daily return on the return of its industry index
# (single factor, no intercept). The slope is stored in `beta` and the
# regression residual in `residual`.

# A stock is skipped when it has fewer than this share of the index's rows.
MIN_COVERAGE = 0.6

# Residual series are collected here and turned into `residual` in one step
# after the loop; inserting thousands of columns one by one fragments the
# DataFrame and triggers pandas' PerformanceWarning.
residual_by_stock = {}

for ts_code in tqdm(all_stocks):
    # Symbols look like '<code>.<suffix>'; the classification is keyed by code.
    stock = ts_code.split('.')[0]
    if stock not in symbol_2_ind:
        continue

    # Skip instead of raising KeyError when the industry has no index code
    # or that index was not loaded.
    sw_symbol = ind_2_index.get(symbol_2_ind[stock])
    if sw_symbol is None or sw_symbol not in df_index_full:
        continue

    df_stock = lib_stock.read(ts_code, chunk_range=est_window)
    df_index = df_index_full[sw_symbol]

    if len(df_stock) < MIN_COVERAGE * len(df_index):
        continue

    # pct_chg is divided by 100 (presumably percent -> decimal return);
    # the index is aligned to the stock's own dates.
    y = df_stock['pct_chg'] / 100.
    X = df_index[['pct_chg']].reindex(y.index) / 100.

    # Alignment can leave gaps and LinearRegression refuses NaN input, so
    # keep only the dates where both returns are present.
    observed = y.notna() & X['pct_chg'].notna()
    y, X = y[observed], X[observed]
    if y.empty:
        continue

    lr = LinearRegression(fit_intercept=False)
    lr.fit(X, y)
    beta[stock] = lr.coef_[0]
    y_hat = lr.predict(X)
    residual_by_stock[stock] = y - y_hat

# One column per fitted stock, rows aligned to the estimation window.
residual = pd.DataFrame(residual_by_stock, index=est_window)

100%|██████████████████████████████████████████████████████████████████████████████| 4040/4040 [05:22<00:00, 12.53it/s]


In [56]:
# Number of stocks for which a beta was stored by the regression loop.
len(beta)

3629

In [62]:
# NOTE(review): `y` is the variable left behind by the regression loop, i.e.
# the return series of the last stock that reached the fit; this cell only
# works after that loop has run.
y.tail(20)

date
2020-09-18   -0.011686
2020-09-21   -0.028882
2020-09-22   -0.040519
2020-09-23    0.032453
2020-09-24   -0.014910
2020-09-25    0.010841
2020-09-28   -0.042898
2020-09-29    0.031078
2020-09-30    0.009842
2020-10-09    0.049746
2020-10-12    0.063830
2020-10-13    0.002182
2020-10-14   -0.006531
2020-10-15   -0.006939
2020-10-16   -0.011769
2020-10-19   -0.000744
2020-10-20    0.005773
2020-10-21    0.029439
2020-10-22    0.026439
2020-10-23   -0.018749
Name: pct_chg, dtype: float64

In [19]:
# Stack the per-index frames into one long frame.
df = pd.concat([index_frame for index_frame in df_index_full.values()])

In [31]:
# Wide table of index returns: one row per date, one column per index name,
# with pct_chg divided by 100.
returns = (
    df.reset_index()
      .pivot_table(values='pct_chg', index='date', columns='name')
      .div(100.)
)

In [32]:
# Pairwise correlation between the return columns, shaded with a colour gradient.
returns.corr().style.background_gradient()

name,交通运输,传媒,公用事业,农林牧渔,化工,医药生物,商业贸易,国防军工,家用电器,建筑材料,建筑装饰,房地产,有色金属,机械设备,汽车,电子元器件,电气设备,纺织服装,综合,计算机,轻工制造,通信,采掘,钢铁,银行,非银金融,食品饮料,餐饮旅游
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
交通运输,1.0,0.801342,0.87325,0.590211,0.859061,0.700212,0.849208,0.66023,0.766801,0.775888,0.845714,0.804621,0.727866,0.86652,0.829125,0.720264,0.775429,0.792192,0.810045,0.741148,0.868943,0.711458,0.837422,0.769435,0.681727,0.818427,0.695275,0.649165
传媒,0.801342,1.0,0.853344,0.607192,0.856406,0.725937,0.843747,0.714471,0.663063,0.701327,0.783848,0.670324,0.745442,0.896803,0.815949,0.836638,0.860994,0.807195,0.873976,0.91155,0.885494,0.835803,0.749969,0.697095,0.509341,0.732173,0.574672,0.578782
公用事业,0.87325,0.853344,1.0,0.624211,0.877266,0.660503,0.839407,0.70551,0.694782,0.795268,0.888247,0.773785,0.778456,0.91332,0.828187,0.736292,0.825761,0.833413,0.854827,0.795333,0.8946,0.760155,0.863849,0.804488,0.619057,0.772321,0.601726,0.553895
农林牧渔,0.590211,0.607192,0.624211,1.0,0.647698,0.557866,0.629376,0.525814,0.462723,0.556116,0.56359,0.483137,0.587839,0.643784,0.590569,0.551114,0.613864,0.57966,0.630716,0.575776,0.649454,0.537459,0.565919,0.512424,0.32367,0.454181,0.510782,0.424948
化工,0.859061,0.856406,0.877266,0.647698,1.0,0.750709,0.860546,0.755258,0.741499,0.828878,0.851517,0.725928,0.830951,0.943407,0.884489,0.815593,0.879432,0.859845,0.876912,0.832998,0.927539,0.793615,0.851934,0.79826,0.586964,0.750641,0.651223,0.611146
医药生物,0.700212,0.725937,0.660503,0.557866,0.750709,1.0,0.732294,0.533404,0.671584,0.625598,0.586784,0.508117,0.586157,0.72796,0.675808,0.649202,0.697203,0.719719,0.737256,0.675776,0.75714,0.598885,0.560237,0.522035,0.399722,0.570193,0.713975,0.588377
商业贸易,0.849208,0.843747,0.839407,0.629376,0.860546,0.732294,1.0,0.692704,0.684881,0.75194,0.812116,0.737475,0.731125,0.8789,0.803899,0.731283,0.801637,0.827112,0.860486,0.787762,0.880469,0.739602,0.761759,0.722852,0.544272,0.742615,0.644074,0.600993
国防军工,0.66023,0.714471,0.70551,0.525814,0.755258,0.533404,0.692704,1.0,0.559215,0.62491,0.702655,0.561056,0.742755,0.783891,0.712965,0.6935,0.746288,0.668134,0.723061,0.751769,0.745946,0.708662,0.675254,0.632281,0.425492,0.620648,0.42114,0.466092
家用电器,0.766801,0.663063,0.694782,0.462723,0.741499,0.671584,0.684881,0.559215,1.0,0.72382,0.705664,0.709397,0.610681,0.743113,0.737857,0.64675,0.685966,0.657843,0.654017,0.611031,0.755084,0.594934,0.676778,0.631086,0.655231,0.71755,0.723746,0.642203
建筑材料,0.775888,0.701327,0.795268,0.556116,0.828878,0.625598,0.75194,0.62491,0.72382,1.0,0.859585,0.763007,0.721364,0.846658,0.749403,0.632849,0.718571,0.704877,0.73117,0.66873,0.815054,0.647555,0.776067,0.769211,0.621841,0.721224,0.630747,0.559085


In [38]:
# Wide table of closing prices: one row per date, one column per index symbol.
close_prices = (
    df.reset_index()
      .pivot_table(values='close', index='date', columns='symbol')
)

In [41]:
# Rolling pairwise correlations over a 250-row window, reshaped so each date
# is a single row, then rows containing any missing value are dropped.
rolling = (
    returns
    .rolling(window=250)
    .corr()
    .unstack(level=1)
    .dropna()
)

In [49]:
import matplotlib
# Select the font family used for figure text.
matplotlib.rcParams['font.family'] = 'MicroSoft YaHei'