In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from scipy.stats import spearmanr
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Load the full factor panel from disk.
total_factor = pd.read_pickle('/home/datamake134/data/haris/dataset_new/total_factor.pkl')
# Keep only the rows dated 2020-07-01 or later (the comparison runs on the
# column as stored, i.e. before the datetime conversion below).
total_factor = total_factor.loc[total_factor['date'] >= '2020-07-01']
# Normalise dtypes: real timestamps for 'date', categorical for 'Code'.
total_factor['date'] = pd.to_datetime(total_factor['date'])
total_factor['Code'] = total_factor['Code'].astype('category')

In [3]:
# Preview the first rows of the filtered panel as a quick sanity check.
total_factor.head()

Unnamed: 0,date,Code,0,1,2,3,4,5,6,7,...,2786,2787,2788,2789,label,label_,qcut,weight,qid,amount
424421,2020-07-01,1,0.032755,0.84308,-0.074178,-3.458246,0.447829,-7.749803,0.534989,-3.410864,...,70.0,1.0,0.1,0.0,0.031135,0.031135,0,0,0,3718053.0
424422,2020-07-01,2,0.086861,0.737642,-0.144779,-0.831296,0.356911,-3.153102,0.800866,-0.782118,...,63.0,1.0,0.099923,0.0,0.065511,0.065511,0,0,0,8181773.6
424423,2020-07-01,4,0.63258,0.284831,-0.511711,-1.30861,0.095793,-4.276766,0.350877,-1.302909,...,63.0,0.0,0.100135,0.0,-0.030649,-0.030649,0,0,0,888040.0
424424,2020-07-01,5,0.0,0.976718,-0.009933,0.996359,0.529879,-2.759158,0.36,1.02134,...,38.0,0.0,0.100386,0.0,0.003909,0.003909,0,0,0,18410.0
424425,2020-07-01,6,0.108898,0.5335,-0.257871,-1.45285,0.265997,2.785404,1.40566,-1.43477,...,61.0,1.0,0.100152,0.0,0.099182,0.099182,0,0,0,1713309.0


In [4]:
def _slim_date_groups(data, factor, date_positions):
    """Build ``{date: frame}`` holding only the ``factor`` and 'label' columns."""
    pair = data[[factor, 'label']]
    return {day: pair.iloc[rows] for day, rows in date_positions.items()}


def optimized_factor_selection(total_factor, train_periods, top_n=1500, m_percent=0.1,
                               n_workers=None, n_factors=2790):
    """
    Select the best-scoring factors for each training period.

    For every period the rows with ``start_date <= date <= end_date`` are taken,
    each factor column is scored in a worker process by
    ``calculate_factor_score``, and the ``top_n`` highest-scoring factor ids are
    kept.

    Parameters:
        total_factor: DataFrame with a 'date' column, a 'label' column and one
            column per factor named '0' .. str(n_factors - 1).
        train_periods: sequence of (start_date, end_date) pairs, one per round.
        top_n: number of factors to keep per round.
        m_percent: long/short fraction forwarded to ``calculate_factor_score``.
        n_workers: number of worker processes (defaults to ``os.cpu_count()``,
            or 4 when that is unavailable).
        n_factors: number of factor columns to evaluate (default 2790).

    Returns:
        DataFrame with one column per round ('Round 1', 'Round 2', ...) listing
        the selected factor ids, best first. If rounds end up with different
        numbers of factors (e.g. because some factors failed), shorter columns
        are padded with NaN instead of raising.
    """
    # Local import: the notebook's import cell only brings in as_completed.
    from concurrent.futures import FIRST_COMPLETED, wait

    if n_workers is None:
        n_workers = os.cpu_count() or 4
    # Cap the number of queued tasks so their per-task data slices do not all
    # sit in memory at once.
    max_in_flight = 2 * n_workers
    factor_names = [str(idx) for idx in range(n_factors)]

    def _rank_key(item):
        # Best score first; NaN scores sink to the bottom; ties are broken by
        # factor id so the selection does not depend on completion order.
        name, score = item
        if np.isnan(score):
            score = -np.inf
        return (-score, int(name))

    scheme_results = {}
    for round_no in trange(1, len(train_periods) + 1, desc="Training Periods"):
        start_date, end_date = train_periods[round_no - 1][0], train_periods[round_no - 1][1]

        # 1. Slice the training window. Boolean indexing already returns a new
        #    frame and nothing below mutates it, so no extra copy is needed.
        mask = (total_factor['date'] >= start_date) & (total_factor['date'] <= end_date)
        data = total_factor[mask]
        # Row positions per date, computed once and reused for every factor.
        date_positions = data.groupby('date').indices

        # 2. Score the factors in parallel. Each task receives only its own
        #    factor column plus 'label'; sending the full-width window with
        #    every task would pickle the whole dataset once per factor.
        results = []
        with ProcessPoolExecutor(max_workers=n_workers) as executor, tqdm(
            total=len(factor_names),
            desc=f'Processing {len(factor_names)} Factors',
            unit='factor',
            bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]'
        ) as pbar:
            pending = {}  # future -> factor name
            remaining_factors = iter(factor_names)
            while True:
                # Top up the in-flight window.
                for factor in remaining_factors:
                    try:
                        groups = _slim_date_groups(data, factor, date_positions)
                        future = executor.submit(calculate_factor_score, factor, groups, m_percent)
                    except Exception as exc:
                        # Best effort, as before: report and move on.
                        print(f"\nFactor {factor} calculation error: {exc}")
                        pbar.update(1)
                        continue
                    pending[future] = factor
                    if len(pending) >= max_in_flight:
                        break
                if not pending:
                    break
                done, _ = wait(pending, return_when=FIRST_COMPLETED)
                for future in done:
                    factor = pending.pop(future)
                    try:
                        results.append(future.result())
                    except Exception as exc:
                        print(f"\nFactor {factor} calculation error: {exc}")
                    finally:
                        pbar.update(1)

        # 3. Rank and keep the top factors.
        results.sort(key=_rank_key)
        scheme_results[f'Round {round_no}'] = [int(item[0]) for item in results[:top_n]]

    if len({len(chosen) for chosen in scheme_results.values()}) <= 1:
        return pd.DataFrame(scheme_results)
    # Ragged rounds: pad with NaN rather than letting the constructor raise.
    return pd.DataFrame(
        {name: pd.Series(chosen, dtype='float64') for name, chosen in scheme_results.items()}
    )

def calculate_factor_score(factor, date_groups, m_percent):
    """
    Score a single factor over a set of per-date cross-sections.

    Designed to be run in a worker process. For every date with at least five
    rows that have both the factor and 'label' present, three numbers are
    computed:

    * Rank IC: Spearman correlation between factor ranks and label ranks.
    * Long ratio: share of that date's rows that fall in the top-``m`` group
      by factor AND in the top-``m`` group by label.
    * Short ratio: the same for the bottom-``m`` groups.

    ``m = max(1, int(n_rows * m_percent))``. Group membership is decided by a
    value threshold, so rows tied with the threshold value are included.

    Parameters:
        factor: name of the factor column.
        date_groups: mapping of date -> DataFrame containing at least the
            ``factor`` and 'label' columns.
        m_percent: fraction of each cross-section forming the long/short group.

    Returns:
        ``(factor, total_score)`` where ``total_score`` is the mean of the
        three per-date averages, or 0 for each component with no usable dates.
    """
    ic_scores = []
    long_ratios = []
    short_ratios = []
    for group in date_groups.values():
        data = group.dropna(subset=[factor, 'label'])
        n_obs = len(data)
        if n_obs < 5:  # too few observations for a meaningful cross-section
            continue
        factor_values = data[factor]
        label_values = data['label']

        # 1. Rank IC. Ranks are descending: rank 1 is the largest value.
        factor_rank = factor_values.rank(method='first', ascending=False)
        label_rank = label_values.rank(method='first', ascending=False)
        ic_scores.append(spearmanr(factor_rank, label_rank).correlation)

        m = max(1, int(n_obs * m_percent))

        # 2. Long side: with descending ranks the top-m rows are ranks 1..m,
        #    so the entry threshold is the smallest value among them.
        #    (Selecting ranks > n - m here would pick the bottom group, whose
        #    minimum is the global minimum and makes the ratio always 1.)
        factor_top = factor_values[factor_rank <= m].min()
        label_top = label_values[label_rank <= m].min()
        in_long = (factor_values >= factor_top) & (label_values >= label_top)
        long_ratios.append(in_long.mean())

        # 3. Short side: the bottom-m rows are ranks n-m+1..n, so the entry
        #    threshold is the largest value among them.
        factor_bottom = factor_values[factor_rank > n_obs - m].max()
        label_bottom = label_values[label_rank > n_obs - m].max()
        in_short = (factor_values <= factor_bottom) & (label_values <= label_bottom)
        short_ratios.append(in_short.mean())

    # Average each component over dates, ignoring NaN; 0 when no date qualified.
    ic_score = np.nanmean(ic_scores) if ic_scores else 0
    long_score = np.nanmean(long_ratios) if long_ratios else 0
    short_score = np.nanmean(short_ratios) if short_ratios else 0
    total_score = (ic_score + long_score + short_score) / 3
    return (factor, total_score)

In [None]:
# Rolling two-year training windows, one (start, end) pair per round.
# NOTE(review): Round 1 -> Round 2 advances six months while every later round
# advances three; confirm that no ('2020-10-01', '2022-10-01') window was intended.
train_periods = [
    ('2020-07-01', '2022-07-01'),  # Round 1
    ('2021-01-01', '2023-01-01'),  # Round 2
    ('2021-04-01', '2023-04-01'),  # Round 3
    ('2021-07-01', '2023-07-01'),  # Round 4
    ('2021-10-01', '2023-10-01'),  # Round 5
    ('2022-01-01', '2024-01-01'),  # Round 6
    ('2022-04-01', '2024-04-01'),  # Round 7
    ('2022-07-01', '2024-07-01'),  # Round 8
    ('2022-10-01', '2024-10-01'),  # Round 9
]

# Run the selection; n_workers is left at its default (None), which makes the
# function size the pool from the machine's CPU count.
result_df = optimized_factor_selection(
    total_factor,
    train_periods,
    top_n=1500,
    m_percent=0.1,
)

# Persist the per-round selections.
result_df.to_feather('/home/datamake134/data/haris/dataset_new/scheme1_selected_factors.fea')

Training Periods:   0%|          | 0/9 [00:00<?, ?it/s]

Processing 2790 Factors:   0%|          | 0/2790 [elapsed: 00:00 remaining: ?]

Process ForkProcess-83:
Process ForkProcess-105:
