
---

### 因子实战 第十集 
## 因子回测 Backtesting 系列 代码习题课

### 🎬 主讲：大导演哈罗德
- 学历背景：香港中文大学本科学位 金融工程专业
- 下一步学业：即将前往美国纽约进修金融工程硕士（已获得录取）
- 🌐 [关注我的Bilibili，看所有人都能听得懂的量化学习内容](https://space.bilibili.com/629573485)
- 🌐 [点击这里关注我的YouTube](https://www.youtube.com/@BD_Harold)

🌟🌟🌟 我有一个梦想：让量化不再是束之高阁的灵丹妙药，而是成为散户投资者认识市场风险的最佳工具 @哈罗德的量化频道 🌟🌟🌟

---

In [3]:
"""
这个单元格的代码用于导入数据，并定义因子回测所用的函数。
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from 因子回测包 import factor_correlation as FactorIC
from 因子回测包 import factor_group as FactorGroup

# ---- Load the pickled inputs -------------------------------------------
# NOTE(review): the file names suggest index-component membership and
# backward-adjusted close/open prices — confirm against the data source.
sw_ind = pd.read_pickle('数据/IndexComponent_SWN_I.txt')
stock_close = pd.read_pickle('数据/StockQuote_ClosePrice_BackwardAdj.txt')
stock_open = pd.read_pickle('数据/StockQuote_OpenPrice_BackwardAdj.txt')
monthly_trading_day = pd.read_pickle('数据/monthly_trading_day.pkl')

# The price tables are looked up by date below, so their row labels are
# converted to datetimes first.
stock_close.index = pd.to_datetime(stock_close.index)
stock_open.index = pd.to_datetime(stock_open.index)

# The calendar stores its period boundaries as YYYYMMDD values.
monthly_trading_day['start_date'] = pd.to_datetime(
    monthly_trading_day['start_date'], format='%Y%m%d'
)
monthly_trading_day['end_date'] = pd.to_datetime(
    monthly_trading_day['end_date'], format='%Y%m%d'
)

# ---- Restrict the calendar to the backtest window (both ends inclusive) --
start_date = pd.to_datetime('20120101', format='%Y%m%d')
end_date = pd.to_datetime('20231231', format='%Y%m%d')
filtered_trading_days = monthly_trading_day[
    monthly_trading_day['end_date'].between(start_date, end_date)
]

# ---- Period-over-period returns sampled at each period's end_date -------
# Close-to-close: percentage change between consecutive sampled closes.
stock_ret_monthly = stock_close.reindex(
    index=filtered_trading_days['end_date']
).pct_change()
# Open-to-open: shift(-1) moves every row up by one, so each end_date row
# carries the following row's open before sampling (presumably the next
# trading day's open, assuming one row per trading day).
stock_ret_monthly_nextopen = stock_open.shift(-1).reindex(
    index=filtered_trading_days['end_date']
).pct_change()

def simple_factor_test(factor, use_data='this_close'):
    """Run the IC and group-return test for a factor table.

    Args:
        factor: Factor values passed unchanged to ``FactorIC`` and
            ``FactorGroup``. Presumably a date x stock DataFrame aligned
            with the module-level return tables — confirm against those
            helpers.
        use_data: ``'this_close'`` selects ``stock_ret_monthly``; any other
            value selects ``stock_ret_monthly_nextopen``.

    Returns:
        A tuple ``(ic, rankic, group_ret)``. ``ic`` and ``rankic`` are
        whatever ``FactorIC`` returns. ``group_ret`` is the mean ``ret``
        per ``(date, group_id)``, with one column per group id.
    """
    if use_data == 'this_close':
        period_returns = stock_ret_monthly
    else:
        period_returns = stock_ret_monthly_nextopen
    # shift(-1) pulls the following period's return onto the current row,
    # so a factor value is paired with the return that comes after it.
    this_ret_data = period_returns.shift(-1)

    ic, rankic = FactorIC(factor, this_ret_data)

    # Flatten both wide tables to long form and keep only the
    # (stock, date) pairs that have both a group id and a return.
    long_group_ids = FactorGroup(factor).unstack()
    long_returns = this_ret_data.unstack()
    merged = pd.concat([long_group_ids, long_returns], axis=1)
    merged = merged.dropna().reset_index()
    merged.columns = ['stockcode', 'date', 'group_id', 'ret']

    mean_ret = merged.groupby(['date', 'group_id'])['ret'].mean()
    group_ret = mean_ret.unstack()
    return ic, rankic, group_ret

# Backtest window: 2012-01-01 to 2023-12-31.
pe = pd.read_pickle('数据/StockQuote_PEttm.txt')

# EP (TTM) is the reciprocal of PE (TTM); a PE of zero yields +/-inf,
# which is treated as missing.
ep = (1 / pe).replace([np.inf, -np.inf], np.nan)

# The dEP factor is the 60-row difference of EP (a 60-day delta, assuming
# one row per trading day).
dEP = ep.diff(60)

# Convert the row labels to datetimes, then sample the factor at each
# period's end_date.
dEP.index = pd.to_datetime(dEP.index)
dEP_monthly = dEP.reindex(index=filtered_trading_days['end_date'])

ic, rankic, group_ret = simple_factor_test(dEP_monthly)

In [4]:
dEP_monthly

Unnamed: 0_level_0,000001.SZ,000002.SZ,000003.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,...,873570.BJ,873576.BJ,873593.BJ,873665.BJ,873679.BJ,873693.BJ,873703.BJ,873726.BJ,873833.BJ,T00018.SH
end_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-31,0.002481,0.002968,,-0.005549,-0.000836,0.003023,0.001183,0.000250,0.003496,0.000000,...,,,,,,,,,,
2012-02-29,-0.008819,-0.014278,,0.000517,0.000000,-0.048998,-0.006231,0.000168,0.006528,0.000000,...,,,,,,,,,,
2012-03-30,0.009866,0.011612,,-0.000039,0.000000,-0.075154,-0.010442,,0.002593,0.000081,...,,,,,,,,,,
2012-04-27,0.024902,0.009117,,0.000090,-0.011260,-0.026038,-0.008362,,-0.005099,0.003222,...,,,,,,,,,,
2012-05-31,0.035999,0.013286,,0.000321,-0.011072,-0.006224,-0.000973,,-0.006213,0.003018,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-31,0.000320,-0.006076,,,,-0.005207,,,-0.001222,,...,,,-0.002515,,,,,,,
2023-08-31,0.011596,-0.010884,,,,-0.030556,,,0.005729,,...,,,0.009349,,,,,,,
2023-09-28,0.016771,0.002706,,,,-0.021728,,,0.004374,,...,,0.011647,0.005304,,,,,,,
2023-10-31,0.028188,0.009378,,,,-0.012639,,,0.002719,,...,,0.012465,0.004526,,,,,,,


In [10]:
dEP_monthly.stack()

end_date             
2012-01-31  000001.SZ    0.002481
            000002.SZ    0.002968
            000004.SZ   -0.005549
            000005.SZ   -0.000836
            000006.SZ    0.003023
                           ...   
2023-11-30  873305.BJ   -0.023443
            873339.BJ   -0.028873
            873527.BJ   -0.040824
            873576.BJ   -0.006952
            873593.BJ   -0.008262
Length: 404456, dtype: float64

In [13]:
# Reshape the wide (date x asset) factor table into a long frame with one
# row per (date, asset) pair and a single 'factor' column.
data = dEP_monthly.stack().to_frame('factor')
data.index = data.index.set_names(['date', 'asset'])
# Keep only the pairs that actually have a factor value.
data = data.dropna(subset=['factor'])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,factor
date,asset,Unnamed: 2_level_1
2012-01-31,000001.SZ,0.002481
2012-01-31,000002.SZ,0.002968
2012-01-31,000004.SZ,-0.005549
2012-01-31,000005.SZ,-0.000836
2012-01-31,000006.SZ,0.003023
...,...,...
2023-11-30,873305.BJ,-0.023443
2023-11-30,873339.BJ,-0.028873
2023-11-30,873527.BJ,-0.040824
2023-11-30,873576.BJ,-0.006952


In [17]:
# Sketch of the goal: def func(data) --> group column,
# where the groups come from factor percentiles.

# Pull the factor column out as a plain NumPy array so that the rows can be
# addressed by integer position.
data_factor_array = data['factor'].to_numpy()
data_factor_array


array([ 0.00248113,  0.00296799, -0.00554869, ..., -0.04082391,
       -0.00695192, -0.00826239])

In [16]:
# Grouping key: the 'date' level of the MultiIndex, one entry per row of
# `data`, wrapped in a list.
grouper = [data.index.get_level_values('date')]
print(grouper)


[DatetimeIndex(['2012-01-31', '2012-01-31', '2012-01-31', '2012-01-31',
               '2012-01-31', '2012-01-31', '2012-01-31', '2012-01-31',
               '2012-01-31', '2012-01-31',
               ...
               '2023-11-30', '2023-11-30', '2023-11-30', '2023-11-30',
               '2023-11-30', '2023-11-30', '2023-11-30', '2023-11-30',
               '2023-11-30', '2023-11-30'],
              dtype='datetime64[ns]', name='date', length=404456, freq=None)]


In [18]:
# Output buffer for the group ids: one float slot per row of the factor
# array, initialised to NaN (meaning "no group assigned yet").
data_final_split = np.empty(len(data_factor_array), dtype=float)
data_final_split.fill(np.nan)
data_final_split


array([nan, nan, nan, ..., nan, nan, nan])

In [27]:
# Group the long-format frame by the date key held in `grouper`.
data_groupby = data.groupby(grouper)
# Peek at the first group only: print its key and its rows, then stop.
for i in data_groupby.groups:
    # NOTE(review): in the recorded output the key is a single Timestamp;
    # other pandas versions may yield a tuple key for a list grouper —
    # confirm against the installed version.
    print(i)
    print(data_groupby.get_group(i))
    break

2012-01-31 00:00:00
                        factor
date       asset              
2012-01-31 000001.SZ  0.002481
           000002.SZ  0.002968
           000004.SZ -0.005549
           000005.SZ -0.000836
           000006.SZ  0.003023
...                        ...
           601991.SH -0.003206
           601992.SH  0.032230
           601996.SH  0.010854
           601998.SH  0.004654
           601999.SH  0.004056

[2076 rows x 1 columns]


In [29]:
# `GroupBy.indices` maps each group key to the integer row positions of its
# members; only the position arrays are kept, one per group.
data_groupby_indices = list(data_groupby.indices.values())
data_groupby_indices


[array([   0,    1,    2, ..., 2073, 2074, 2075]),
 array([2076, 2077, 2078, ..., 4160, 4161, 4162]),
 array([4163, 4164, 4165, ..., 6248, 6249, 6250]),
 array([6251, 6252, 6253, ..., 8258, 8259, 8260]),
 array([ 8261,  8262,  8263, ..., 10283, 10284, 10285]),
 array([10286, 10287, 10288, ..., 12350, 12351, 12352]),
 array([12353, 12354, 12355, ..., 14453, 14454, 14455]),
 array([14456, 14457, 14458, ..., 16509, 16510, 16511]),
 array([16512, 16513, 16514, ..., 18588, 18589, 18590]),
 array([18591, 18592, 18593, ..., 20633, 20634, 20635]),
 array([20636, 20637, 20638, ..., 22699, 22700, 22701]),
 array([22702, 22703, 22704, ..., 24777, 24778, 24779]),
 array([24780, 24781, 24782, ..., 26886, 26887, 26888]),
 array([26889, 26890, 26891, ..., 28978, 28979, 28980]),
 array([28981, 28982, 28983, ..., 31060, 31061, 31062]),
 array([31063, 31064, 31065, ..., 33087, 33088, 33089]),
 array([33090, 33091, 33092, ..., 35128, 35129, 35130]),
 array([35131, 35132, 35133, ..., 37187, 37188, 37189])

In [34]:
# Assign every row a quantile group within its own date: group 0 holds the
# lowest factor values and group n_groups - 1 the highest.
n_groups = 5

# The percentile grid is the same for every date, so it is built once:
# 0, 20, 40, 60, 80, 100 for five groups.
split_percentile = np.linspace(0, 100, n_groups + 1)

for this_indice_place in data_groupby_indices:
    # Factor values of the rows belonging to this date.
    this_factor_array = data_factor_array[this_indice_place]

    # Bin edges for this date's cross-section.
    split_edges = np.nanpercentile(this_factor_array, split_percentile)
    # Bins are half-open (lower, upper], so the lowest edge is opened up to
    # make the minimum land in group 0. Infinite outer edges are used rather
    # than "edge -/+ 1", which has no effect once the float spacing of the
    # values exceeds 1.
    split_edges[0] = -np.inf
    split_edges[-1] = np.inf

    # Start from a fresh NaN buffer so that a re-run never inherits stale
    # group ids from a previous execution.
    this_split_result = np.full(this_factor_array.shape, np.nan)
    for group_id in range(n_groups):
        in_bin = (
            (this_factor_array > split_edges[group_id])
            & (this_factor_array <= split_edges[group_id + 1])
        )
        this_split_result[in_bin] = group_id

    # Write this date's group ids back once, after all bins are filled.
    data_final_split[this_indice_place] = this_split_result

# Attach the finished group ids to the frame once, after every date has
# been processed.
data.loc[:, 'group'] = data_final_split

    

In [35]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,factor,group
date,asset,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-31,000001.SZ,0.002481,1.0
2012-01-31,000002.SZ,0.002968,1.0
2012-01-31,000004.SZ,-0.005549,0.0
2012-01-31,000005.SZ,-0.000836,0.0
2012-01-31,000006.SZ,0.003023,1.0
...,...,...,...
2023-11-30,873305.BJ,-0.023443,0.0
2023-11-30,873339.BJ,-0.028873,0.0
2023-11-30,873527.BJ,-0.040824,0.0
2023-11-30,873576.BJ,-0.006952,0.0


In [36]:
# Pivot the long 'group' column back to wide form: the innermost index
# level ('asset') becomes the columns, leaving one row per date.
final_data = data['group'].unstack()

In [37]:
final_data

asset,000001.SZ,000002.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,000011.SZ,...,301418.SZ,301421.SZ,301469.SZ,301507.SZ,301529.SZ,603075.SH,603270.SH,603275.SH,688591.SH,872953.BJ
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-31,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,...,,,,,,,,,,
2012-02-29,0.0,0.0,1.0,1.0,0.0,0.0,1.0,4.0,1.0,0.0,...,,,,,,,,,,
2012-03-30,4.0,4.0,3.0,3.0,0.0,0.0,,3.0,3.0,0.0,...,,,,,,,,,,
2012-04-27,4.0,4.0,3.0,1.0,0.0,1.0,,2.0,4.0,,...,,,,,,,,,,
2012-05-31,4.0,4.0,2.0,0.0,1.0,2.0,,1.0,3.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-31,3.0,0.0,,,0.0,,,1.0,,0.0,...,,,,,,,,,,
2023-08-31,4.0,0.0,,,0.0,,,3.0,,4.0,...,,,,,,,,,,
2023-09-28,4.0,3.0,,,0.0,,,3.0,,4.0,...,,,,,,,,,,
2023-10-31,4.0,4.0,,,0.0,,,3.0,,3.0,...,,,,,,,,,,


In [38]:
# Re-align the group table to the row and column labels of the original
# factor table; labels missing from final_data come back as NaN. The result
# is only displayed here, not assigned.
final_data.reindex( index=dEP_monthly.index,columns=dEP_monthly.columns  )

Unnamed: 0_level_0,000001.SZ,000002.SZ,000003.SZ,000004.SZ,000005.SZ,000006.SZ,000007.SZ,000008.SZ,000009.SZ,000010.SZ,...,873570.BJ,873576.BJ,873593.BJ,873665.BJ,873679.BJ,873693.BJ,873703.BJ,873726.BJ,873833.BJ,T00018.SH
end_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-31,1.0,1.0,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
2012-02-29,0.0,0.0,,1.0,1.0,0.0,0.0,1.0,4.0,1.0,...,,,,,,,,,,
2012-03-30,4.0,4.0,,3.0,3.0,0.0,0.0,,3.0,3.0,...,,,,,,,,,,
2012-04-27,4.0,4.0,,3.0,1.0,0.0,1.0,,2.0,4.0,...,,,,,,,,,,
2012-05-31,4.0,4.0,,2.0,0.0,1.0,2.0,,1.0,3.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-31,3.0,0.0,,,,0.0,,,1.0,,...,,,0.0,,,,,,,
2023-08-31,4.0,0.0,,,,0.0,,,3.0,,...,,,4.0,,,,,,,
2023-09-28,4.0,3.0,,,,0.0,,,3.0,,...,,4.0,3.0,,,,,,,
2023-10-31,4.0,4.0,,,,0.0,,,3.0,,...,,4.0,3.0,,,,,,,
