In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import warnings
from datetime import datetime
from dateutil.relativedelta import relativedelta

warnings.filterwarnings("ignore")

In [2]:
# Data storage locations
data_path = '/project/model_share/share_1'
fac_path = f'{data_path}/factor_data'
fac_name = 'fac20250212'
label_path = f'{data_path}/label_data'
label_name = 'label1'
liquid_path = f'{data_path}/label_data'
liquid_name = 'can_trade_amt1'

# Load the full factor set, the tradable-amount table and the return table
fac_data = pd.read_feather(f'{fac_path}/{fac_name}/{fac_name}.fea')
liquid_data = pd.read_feather(f'{liquid_path}/{liquid_name}.fea').set_index('index')
ret_data = pd.read_feather(f'{label_path}/{label_name}.fea').set_index('index')
season_list = ['2023q1', '2023q2', '2023q3', '2023q4', '2024q1', '2024q2']
# Keep only the dates present in all three tables, in ascending order
date_list = sorted(
    x for x in fac_data['date'].unique()
    if x in ret_data.index and x in liquid_data.index
)
fac_data = fac_data.set_index('date').sort_index()

In [3]:
fac_data.head(5)  # factor data

Unnamed: 0_level_0,Code,fac0001,fac0002,fac0003,fac0004,fac0005,fac0006,fac0007,fac0008,fac0009,...,fac2781,fac2782,fac2783,fac2784,fac2785,fac2786,fac2787,fac2788,fac2789,fac2790
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20210104,1,0.103297,0.508136,-0.276862,-4.331658,0.261553,-5.413435,0.634146,-4.36574,0.288427,...,0.0,-0.077179,-0.03481,-0.036515,-0.040032,-0.043767,-0.041647,-0.030119,-0.026458,-0.007071
20210104,2,0.041108,0.692095,-0.148505,-3.625367,0.389545,3.513462,1.249417,-3.622822,0.421044,...,0.0,-0.20778,-0.084853,-0.077782,-0.077782,-0.077782,-0.007071,-0.007071,-0.007071,0.0
20210104,4,0.061697,0.900875,-0.046951,-1.091614,0.458014,0.0,1.0,-1.03902,0.502059,...,0.0,-0.015275,-0.01,-0.01,-0.01,0.0,0.0,0.0,0.0,0.0
20210104,5,0.0,0.932106,-0.026149,0.495133,0.558257,-2.639419,0.459459,0.573825,0.595144,...,0.0,-0.077889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20210104,6,0.032552,0.958341,-0.025876,-1.673883,0.331534,-3.014057,0.388889,-1.567781,0.368831,...,0.0,-0.023452,-0.019235,-0.017078,-0.017078,-0.017078,-0.01,-0.007071,-0.007071,0.0


In [4]:
liquid_data.head(5)  # liquidity data

Unnamed: 0_level_0,000001,000002,000004,000005,000006,000008,000009,000010,000011,000012,...,000693,002070,002018,002680,600747,600074,601558,002604,300028,000939
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20210104,46229640.0,68918280.0,465525.0,453882.0,306252.0,1938528.0,4311216.0,8793435.0,311100.0,17735579.1,...,,,,,,,,,,
20210105,30227520.0,66872016.0,1109856.0,230580.0,217800.0,639576.0,7080840.0,2001174.0,203832.0,11240400.0,...,,,,,,,,,,
20210106,31249128.48,55561080.0,161694.0,327522.0,1069908.0,4378452.0,4643352.0,45177.0,1929420.0,8316432.0,...,,,,,,,,,,
20210107,80536923.84,100000000.0,858000.0,545760.0,599292.0,2321298.0,20267940.0,541008.0,1501200.0,5770224.0,...,,,,,,,,,,
20210108,65926710.0,66604734.0,800100.0,75756.0,342297.0,1662276.0,18134212.8,288222.0,3250089.0,4466880.0,...,,,,,,,,,,


In [5]:
ret_data.head(5)  # return data

Unnamed: 0_level_0,000001,000002,000004,000005,000006,000008,000009,000010,000011,000012,...,000693,002070,002018,002680,600747,600074,601558,002604,300028,000939
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20210104,-0.01516,-0.008873,-0.003585,-0.010165,-0.012514,-0.00306,0.001227,-0.010241,-0.01116,-0.001628,...,,,,,,,,,,
20210105,0.011209,0.005584,-0.010914,-0.010923,-0.005763,0.011299,-0.008579,0.002542,-0.002238,-0.011399,...,,,,,,,,,,
20210106,0.020182,0.007172,-0.016159,-0.007238,-0.007331,0.003986,-0.005613,-0.003882,-0.013385,-0.004789,...,,,,,,,,,,
20210107,0.002479,0.001442,-0.008702,-0.003402,-0.006117,-0.01064,-0.009734,-0.011196,-0.012402,-0.006997,...,,,,,,,,,,
20210108,0.007354,0.008051,-0.006289,-0.003402,0.000611,-0.001361,-0.008165,-0.003515,0.00528,-0.008581,...,,,,,,,,,,


In [6]:
# Per-day factor backtest metrics, meant to be computed in parallel by DataLoader workers
class FacMetric(Dataset):
    """Dataset whose item ``i`` is the per-factor metrics table for ``date[i]``.

    Parameters
    ----------
    data : DataFrame indexed by date with a ``'Code'`` column and one column per factor.
    date : sequence of dates to evaluate; ``len(self)`` equals its length.
    ret : optional DataFrame of returns (rows = dates, columns = codes).
        Defaults to the module-level ``ret_data``.
    liquid : optional DataFrame of tradable amounts (rows = dates, columns = codes).
        Defaults to the module-level ``liquid_data``.

    Each item is a DataFrame indexed by factor name with the columns
    ``date, ic, rank_ic, head10p, tail10p, head1.5e9, tail1.5e9``.
    """

    # Money budget of the liquidity-weighted head/tail baskets; the string is
    # also used verbatim in the output column names.
    _MONEY = '1.5e9'

    def __init__(self, data, date, ret=None, liquid=None):
        self.fac_data = data
        self.date_list = date
        # Fall back to the notebook globals so existing two-argument calls keep working.
        self.ret_data = ret_data if ret is None else ret
        self.liquid_data = liquid_data if liquid is None else liquid

    @staticmethod
    def _amt_weighted_ret(amt_ret, code_list, tot_amt):
        """Amount-weighted return of the leading codes whose cumulative amount is <= tot_amt.

        Codes missing from ``amt_ret`` get amount 0 and return 0. Returns NaN
        when the selected basket has zero total amount.
        """
        ordered = amt_ret.reindex(code_list).fillna(0)
        ordered['cum_amt'] = ordered['amt'].cumsum()
        basket = ordered.loc[ordered['cum_amt'] <= tot_amt]
        total = basket['amt'].sum()
        if total == 0:
            return np.nan
        return basket['amt_ret'].sum() / total

    def __getitem__(self, index):
        # Take the factor values of this date and evaluate them
        date = self.date_list[index]
        fac_td = self.fac_data.loc[date].set_index('Code')
        fac_rank = fac_td.rank(pct=True, method='dense')
        ret = self.ret_data.loc[date].dropna()
        ret_rank = ret.rank(pct=True, method='dense')
        amt = self.liquid_data.loc[date].dropna()
        amt_ret = pd.concat([amt, ret], axis=1, keys=['amt', 'ret']).fillna(0)
        amt_ret['amt_ret'] = amt_ret['amt'] * amt_ret['ret']
        fac_list = fac_td.columns.tolist()
        fac_info = pd.DataFrame(index=fac_list)

        # IC and rank IC
        fac_info['ic'] = fac_td.corrwith(ret)
        fac_info['rank_ic'] = fac_rank.corrwith(ret_rank)

        # Mean return of the top / bottom 10% by factor rank.
        # Codes without a return contribute 0 to the sum but are still counted
        # in the denominator. A factor with no code in the group gets NaN.
        ret_filled = ret.reindex(fac_rank.index).fillna(0)
        groups = (
            ('head10p', (fac_rank > 0.9).T),
            ('tail10p', (fac_rank < 0.1).T),
        )
        for col, mask in groups:
            counts = mask.sum(axis=1)
            fac_info[col] = mask.dot(ret_filled) / counts.where(counts != 0, np.nan)

        # Return of buying the top / bottom names by available liquidity up to
        # the money budget (same logic as the model-side label_ret).
        money = self._MONEY
        tot_amt = float(money)
        htamt = {}
        for name in fac_list:
            scores = fac_td[name]
            code_head = scores.sort_values(ascending=False).dropna().index.tolist()
            code_tail = scores.sort_values(ascending=True).dropna().index.tolist()
            htamt[name] = {
                f'head{money}': self._amt_weighted_ret(amt_ret, code_head, tot_amt),
                f'tail{money}': self._amt_weighted_ret(amt_ret, code_tail, tot_amt),
            }

        fac_info = pd.concat([fac_info, pd.DataFrame(htamt).T], axis=1)
        fac_info.insert(0, 'date', date)
        return fac_info

    def __len__(self):
        return len(self.date_list)

In [7]:
# Wrap FacMetric in a DataLoader so the per-date metrics are computed by 64 worker
# processes; the identity collate_fn returns each batch unchanged instead of stacking it.
cal_res = DataLoader(FacMetric(fac_data, date_list), collate_fn=lambda x: x, num_workers=64)

In [12]:
# Merge the per-day factor metrics into fac_info_all (res[0] is the first item of each batch)
fac_info_all = pd.concat([res[0] for res in tqdm(cal_res)])

100%|██████████| 844/844 [05:22<00:00,  2.62it/s] 


In [11]:
# Preview the merged metrics (still indexed by factor name at this point)
fac_info_all.head(10)

Unnamed: 0,date,ic,rank_ic,head10p,tail10p,head1.5e9,tail1.5e9
fac0001,20210104,0.150862,0.157892,0.000168,-0.00503,0.001478,-0.003458
fac0002,20210104,-0.087368,-0.11838,-0.004509,-0.001908,0.00067,-0.00806
fac0003,20210104,0.056323,0.026862,0.001001,-0.003129,0.005138,-0.007018
fac0004,20210104,0.090456,0.099856,-0.000987,-0.003732,0.001639,-0.000556
fac0005,20210104,0.132126,0.125647,0.000782,-0.004153,0.005253,-0.003549
fac0006,20210104,0.06175,0.09534,-0.000739,-0.002924,0.000767,-0.001501
fac0007,20210104,0.033139,0.140205,-0.002531,-0.005582,-0.000509,-0.004257
fac0008,20210104,0.089119,0.098309,-0.001004,-0.003768,0.002113,-0.001708
fac0009,20210104,0.119489,0.124735,0.000516,-0.003989,0.004956,-0.004551
fac0010,20210104,0.126413,0.125057,0.000361,-0.004063,0.005208,-0.005475


In [13]:
# Move the factor-name index into a regular 'fac_name' column, then collect the
# distinct factor names in order of first appearance.
fac_info_all = fac_info_all.reset_index(drop=False).rename(columns={'index': 'fac_name'})
fac_name_all = fac_info_all['fac_name'].unique()

In [15]:
# Preview the metrics after the factor name became a column
fac_info_all.head(10)

Unnamed: 0,fac_name,date,ic,rank_ic,head10p,tail10p,head1.5e9,tail1.5e9
0,fac0001,20210104,0.150862,0.157892,0.000168,-0.00503,0.001478,-0.003458
1,fac0002,20210104,-0.087368,-0.11838,-0.004509,-0.001908,0.00067,-0.00806
2,fac0003,20210104,0.056323,0.026862,0.001001,-0.003129,0.005138,-0.007018
3,fac0004,20210104,0.090456,0.099856,-0.000987,-0.003732,0.001639,-0.000556
4,fac0005,20210104,0.132126,0.125647,0.000782,-0.004153,0.005253,-0.003549
5,fac0006,20210104,0.06175,0.09534,-0.000739,-0.002924,0.000767,-0.001501
6,fac0007,20210104,0.033139,0.140205,-0.002531,-0.005582,-0.000509,-0.004257
7,fac0008,20210104,0.089119,0.098309,-0.001004,-0.003768,0.002113,-0.001708
8,fac0009,20210104,0.119489,0.124735,0.000516,-0.003989,0.004956,-0.004551
9,fac0010,20210104,0.126413,0.125057,0.000361,-0.004063,0.005208,-0.005475


In [16]:
# First ten distinct factor names
fac_name_all[:10]

array(['fac0001', 'fac0002', 'fac0003', 'fac0004', 'fac0005', 'fac0006',
       'fac0007', 'fac0008', 'fac0009', 'fac0010'], dtype=object)

In [None]:
# For the test set of quarter `season`, collect the dates of the `month` months
# before the quarter starts (before test_start); factors are evaluated on these dates.
def get_eval_date(all_date, season, month):
    """Return the evaluation dates preceding a test quarter.

    Parameters
    ----------
    all_date : iterable of date strings that sort chronologically (e.g. ``'20210104'``).
    season : quarter label such as ``'2023q1'``.
    month : number of months to look back from the first month of the quarter.

    Returns
    -------
    (dates, first, last) : the sorted evaluation dates and their first / last element.

    Raises
    ------
    IndexError
        If no date remains after the gap and the exclusion window are applied.
    """
    quarter = int(season.split("q")[1])
    test_period = pd.Period(year=int(season[:4]), month=quarter * 3 - 2, freq="M")
    test_start = test_period.strftime("%Y%m")
    train_start = (test_period - month).strftime("%Y%m")
    # Sort first so that the gap below always removes the *latest* 10 dates;
    # the gap guards against leaking future data (same as in model training).
    window = sorted(x for x in all_date if train_start <= x < test_start)[:-10]
    # Skip the extreme period 2024-02-01 .. 2024-02-23 (same as in model training)
    train_date_list = [x for x in window if not ("202402" <= x <= "20240223")]
    if not train_date_list:
        raise IndexError(
            f"no evaluation dates for season {season!r} with a {month}-month look-back"
        )
    return train_date_list, train_date_list[0], train_date_list[-1]


# Re-label the long / short legs according to the sign of IC.
# "head" is the group with large factor values, "tail" the group with small ones.
# IC > 0: head is treated as the long leg and tail as the short leg.
# IC < 0: head is treated as the short leg and tail as the long leg.
def adjust_sign(info_in):
    """Return a float copy of ``info_in`` with head/tail columns swapped where ``ic < 0``.

    Columns whose name contains ``'head'`` are paired positionally with the
    columns whose name contains ``'tail'``; the input frame is not modified.
    """
    info = info_in.astype('float')
    head_cols = [col for col in info.columns if 'head' in col]
    tail_cols = [col for col in info.columns if 'tail' in col]
    flip = np.sign(info['ic']) < 0
    # Snapshot both sides before writing so the swap does not read overwritten values
    head_vals = info.loc[flip, head_cols].to_numpy(copy=True)
    tail_vals = info.loc[flip, tail_cols].to_numpy(copy=True)
    info.loc[flip, head_cols] = tail_vals
    info.loc[flip, tail_cols] = head_vals
    return info

# Build the factor list for each quarter.
# sel_fac_season maps every quarter to the factors that passed the screen.
sel_fac_season = dict()
for season in season_list:
    # Evaluate on the two years (24 months) before the quarter
    _, eval_start, eval_end = get_eval_date(date_list, season, 24)
    fac_info = fac_info_all.loc[fac_info_all['date'].between(eval_start, eval_end)]
    # Mean of every metric column (from 'ic' onward) per factor over that window.
    # One vectorised groupby replaces the per-factor loop, and no loop variable
    # overwrites the module-level `fac_name` setting any more. Rows follow the
    # order of fac_name_all; a factor with no rows in the window gets NaN and
    # therefore fails both thresholds below.
    metric_cols = fac_info.loc[:, 'ic':].columns.tolist()
    fac_info_mean = (
        fac_info.groupby('fac_name')[metric_cols]
        .mean()
        .reindex(fac_name_all)
        .rename_axis('fac_name')
    )
    # Screen: head1.5e9 large enough (strong long leg) or tail1.5e9 small enough (strong short leg)
    fac_info_mean = adjust_sign(fac_info_mean)
    sel_fac_long = fac_info_mean[fac_info_mean['head1.5e9'] > 3 / 10000].index.to_list()  # a quantile cut would also work
    sel_fac_short = fac_info_mean[fac_info_mean['tail1.5e9'] < - 4 / 10000].index.to_list()
    long_set = set(sel_fac_long)
    sel_fac_list = sel_fac_long + [x for x in sel_fac_short if x not in long_set]
    sel_fac_season[season] = sel_fac_list[:]  # record the screening result of this quarter

In [21]:
# Show the selected factor names per quarter
sel_fac_season

{'2023q1': ['fac0001',
  'fac0002',
  'fac0003',
  'fac0004',
  'fac0005',
  'fac0006',
  'fac0007',
  'fac0008',
  'fac0009',
  'fac0010',
  'fac0011',
  'fac0012',
  'fac0013',
  'fac0015',
  'fac0020',
  'fac0022',
  'fac0023',
  'fac0024',
  'fac0025',
  'fac0026',
  'fac0027',
  'fac0028',
  'fac0029',
  'fac0030',
  'fac0031',
  'fac0032',
  'fac0033',
  'fac0034',
  'fac0035',
  'fac0036',
  'fac0037',
  'fac0038',
  'fac0039',
  'fac0045',
  'fac0046',
  'fac0047',
  'fac0049',
  'fac0050',
  'fac0051',
  'fac0053',
  'fac0055',
  'fac0059',
  'fac0066',
  'fac0074',
  'fac0075',
  'fac0076',
  'fac0078',
  'fac0079',
  'fac0080',
  'fac0081',
  'fac0082',
  'fac0086',
  'fac0087',
  'fac0089',
  'fac0091',
  'fac0094',
  'fac0099',
  'fac0102',
  'fac0104',
  'fac0105',
  'fac0108',
  'fac0112',
  'fac0114',
  'fac0115',
  'fac0116',
  'fac0117',
  'fac0118',
  'fac0119',
  'fac0120',
  'fac0121',
  'fac0122',
  'fac0124',
  'fac0128',
  'fac0129',
  'fac0132',
  'fac0136',
  

## 相关性处理

In [None]:
# After the initial screen, additionally cap the correlation between the kept factors
def filter_correlated_factors(fac_list, max_corr=0.8, factor_values=None, factor_stats=None):
    """Greedily pick a low-correlation subset of ``fac_list``.

    Factors are visited in descending order of ``|ic|`` (ties keep their order
    in ``fac_list``). A factor is kept when its absolute correlation with every
    factor kept so far is ``<= max_corr``; the first factor is always kept.

    Parameters
    ----------
    fac_list : list of factor names from the initial screen (duplicates are ignored).
    max_corr : largest absolute pairwise correlation allowed between kept factors.
    factor_values : optional DataFrame with one column per factor, used to
        compute correlations. Defaults to the module-level ``fac_data``.
    factor_stats : optional DataFrame indexed by factor name with an ``'ic'``
        column. Defaults to the module-level ``fac_info_mean``.

    Returns
    -------
    list of kept factor names, in the order they were accepted.

    NOTE(review): with the default ``factor_values`` the correlation is pooled
    over every row of ``fac_data``. If that frame also covers the test quarter
    this is look-ahead; consider passing a date-restricted slice. Confirm intent.
    """
    if factor_values is None:
        factor_values = fac_data
    if factor_stats is None:
        factor_stats = fac_info_mean

    candidates = list(dict.fromkeys(fac_list))
    if not candidates:
        return []

    # Absolute correlations: a strong negative correlation is as redundant as a positive one
    abs_corr = factor_values.loc[:, candidates].corr().abs()

    # Importance = |mean IC|; built-in abs() because .loc[row, col] yields a scalar
    importance = {name: abs(float(factor_stats.loc[name, 'ic'])) for name in candidates}
    ranked = sorted(candidates, key=importance.__getitem__, reverse=True)

    selected_factors = []
    for name in ranked:
        if selected_factors:
            strongest = abs_corr.loc[name, selected_factors].max()
            # `not (x <= t)` also rejects NaN, i.e. factors whose correlation
            # with every kept factor is undefined.
            if not strongest <= max_corr:
                continue
        selected_factors.append(name)

    return selected_factors

In [None]:
# Apply the correlation filter after the per-quarter factor screen
for season in season_list:
    # Same screening logic as before: evaluate on the 24 months before the quarter
    _, eval_start, eval_end = get_eval_date(date_list, season, 24)
    fac_info = fac_info_all.loc[fac_info_all['date'].between(eval_start, eval_end)]

    # Mean of every metric column (from 'ic' onward) per factor over that window.
    # One vectorised groupby replaces the per-factor loop, and no loop variable
    # overwrites the module-level `fac_name` setting any more. Rows follow the
    # order of fac_name_all; a factor with no rows in the window gets NaN and
    # therefore fails both thresholds below.
    metric_cols = fac_info.loc[:, 'ic':].columns.tolist()
    fac_info_mean = (
        fac_info.groupby('fac_name')[metric_cols]
        .mean()
        .reindex(fac_name_all)
        .rename_axis('fac_name')
    )

    # Screen: head1.5e9 large enough (strong long leg) or tail1.5e9 small enough (strong short leg)
    fac_info_mean = adjust_sign(fac_info_mean)
    sel_fac_long = fac_info_mean[fac_info_mean['head1.5e9'] > 3 / 10000].index.to_list()
    sel_fac_short = fac_info_mean[fac_info_mean['tail1.5e9'] < - 4 / 10000].index.to_list()
    long_set = set(sel_fac_long)
    sel_fac_list = sel_fac_long + [x for x in sel_fac_short if x not in long_set]

    # Correlation filter; it ranks factors using the `fac_info_mean` assigned above
    filtered_factors = filter_correlated_factors(sel_fac_list, max_corr=0.8)

    # Store the result, replacing this quarter's entry in sel_fac_season
    sel_fac_season[season] = filtered_factors[:]