In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from scipy.stats import spearmanr
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Display the stored factor-selection table from the feather file.
# NOTE(review): this path is under datamake117, while the save cell at the end
# of the notebook writes under datamake134 -- confirm both point at the same file.
pd.read_feather(path='/home/datamake117/data/haris/dataset_new/scheme1_selected_factors.fea')

Unnamed: 0,Round1,Round2,Round3,Round4,Round5,Round6,Round7,Round8,Round9
0,970,572,572,572,572,572,854,854,854
1,969,970,970,854,854,854,572,855,855
2,968,969,854,970,855,855,855,572,572
3,971,968,969,969,970,970,970,970,970
4,967,971,968,855,969,968,968,969,969
...,...,...,...,...,...,...,...,...,...
1795,602,2511,1569,1621,1984,1529,1908,2197,2526
1796,1589,2106,2067,874,496,1177,450,379,496
1797,2226,2730,997,1385,2361,1569,114,2520,111
1798,601,986,1054,159,490,2680,1011,1855,2025


In [None]:
# Load the full factor panel.
# NOTE: unpickling can execute arbitrary code -- only read files from a trusted source.
total_factor = pd.read_pickle('/home/datamake134/data/haris/dataset_new/total_factor.pkl')
# Parse the dates BEFORE filtering so the cutoff is compared chronologically
# rather than against whatever raw representation the pickle stores.
total_factor['date'] = pd.to_datetime(total_factor['date'])
# Keep only observations on or after the start of the first training window.
total_factor = total_factor[total_factor['date'] >= '2020-07-01']
# Store the stock codes as a categorical column.
total_factor['Code'] = total_factor['Code'].astype('category')

In [None]:
# Show the first five rows as a quick sanity check of the loaded panel.
total_factor.head(n=5)

In [None]:
def optimized_factor_selection(total_factor, train_periods, top_n=1500, m_percent=0.1, n_workers=None,
                               n_factors=2790):
    """
    Score every factor on each training window and keep the best ones.

    Parameters:
        total_factor: DataFrame with a 'date' column, a stock 'Code' column,
            one column per factor named '0', '1', ..., and a 'label' column.
        train_periods: list of (start_date, end_date) pairs; both bounds are inclusive.
        top_n: number of factors kept per training window.
        m_percent: fraction of the daily cross-section used for the long and short legs.
        n_workers: number of worker processes (defaults to all CPU cores, or 4
            if the core count cannot be determined).
        n_factors: number of factor columns to evaluate; columns are looked up
            as str(0) ... str(n_factors - 1).

    Returns:
        DataFrame with one column per window ('Round 1', 'Round 2', ...) holding
        the selected factor ids as integers, best score first. If windows end up
        with different numbers of scored factors, shorter columns are padded
        with <NA> (nullable Int64) instead of raising.
    """
    # Worker-process count.
    if n_workers is None:
        n_workers = os.cpu_count() or 4
    factor_names = [str(idx) for idx in range(n_factors)]
    # Selected factor ids per round.
    scheme_results = {}
    # Iterate over the training windows (1-based so the counter doubles as the round label).
    for round_no in trange(1, len(train_periods) + 1, desc="Training Periods"):
        period = train_periods[round_no - 1]
        start_date, end_date = period[0], period[1]
        # 1. Restrict to the window. Boolean indexing already returns a new
        #    frame and it is only read below, so no extra copy is needed.
        mask = (total_factor['date'] >= start_date) & (total_factor['date'] <= end_date)
        data = total_factor[mask]
        # Pre-group by date once; every factor reuses these groups.
        date_groups = {k: v for k, v in data.groupby('date')}
        # 2. Score the factors in parallel.
        results = []
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = []
            for name in factor_names:
                # Submitted arguments are pickled per task, so send only the two
                # columns the scorer reads instead of the full frame for every factor.
                try:
                    factor_groups = {day: grp[[name, 'label']] for day, grp in date_groups.items()}
                except KeyError as e:
                    # Same best-effort policy as worker failures: report and skip.
                    print(f"\nFactor calculation error: {str(e)}")
                    continue
                futures.append(executor.submit(calculate_factor_score, name, factor_groups, m_percent))
            with tqdm(
                total=len(futures),
                desc=f'Processing {len(futures)} Factors',
                unit='factor',
                bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]'
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        results.append(future.result())
                    except Exception as e:
                        # A failing factor is reported and left out of the ranking.
                        print(f"\nFactor calculation error: {str(e)}")
                    finally:
                        pbar.update(1)
        # 3. Rank by score (descending) and keep the top factors.
        results.sort(key=lambda x: x[1], reverse=True)
        selected = [int(f[0]) for f in results[:top_n]]
        scheme_results[f'Round {round_no}'] = selected
    # Skipped/failed factors can leave rounds with different lengths, which the
    # plain dict-of-lists constructor rejects; pad those with <NA> instead.
    if len({len(ids) for ids in scheme_results.values()}) > 1:
        return pd.DataFrame({k: pd.Series(v, dtype='Int64') for k, v in scheme_results.items()})
    return pd.DataFrame(scheme_results)

def calculate_factor_score(factor, date_groups, m_percent):
    """
    Score a single factor over a set of daily cross-sections (run in a worker process).

    For every date with at least 5 rows where both the factor and 'label' are
    present, three quantities are computed:
      * Rank IC: Spearman correlation between the factor ranks and the label ranks.
      * Long ratio: share of rows that are in the top-m by factor AND the top-m by label.
      * Short ratio: share of rows that are in the bottom-m by factor AND the bottom-m by label.
    where m = max(1, int(rows * m_percent)).

    Parameters:
        factor: name of the factor column.
        date_groups: mapping date -> DataFrame containing `factor` and 'label'.
        m_percent: fraction of each cross-section used for the long/short legs.

    Returns:
        (factor, total_score) where total_score is the mean of the three
        per-date averages; each average is 0 when no date qualified.
    """
    ic_scores = []
    long_ratios = []
    short_ratios = []
    for group in date_groups.values():
        # Use only rows where both the factor and the label are observed.
        data = group.dropna(subset=[factor, 'label'])
        n_obs = len(data)
        if n_obs < 5:  # too few observations for this date
            continue
        # 1. Rank IC. Ranks are descending (rank 1 = largest value) and ties
        #    are broken by row order (method='first').
        factor_rank = data[factor].rank(method='first', ascending=False)
        label_rank = data['label'].rank(method='first', ascending=False)
        ic = spearmanr(factor_rank, label_rank).correlation
        ic_scores.append(ic)
        # 2. Long leg: the m LARGEST values are the ranks 1..m; the smallest of
        #    them is the entry threshold. (Selecting the high ranks here would
        #    make the threshold the global minimum and the ratio always 1.)
        m = max(1, int(n_obs * m_percent))
        factor_thresh = data[factor][factor_rank <= m].min()
        label_thresh = data['label'][label_rank <= m].min()
        long_ratio = ((data[factor] >= factor_thresh) & (data['label'] >= label_thresh)).mean()
        long_ratios.append(long_ratio)
        # 3. Short leg: the m SMALLEST values are the ranks n-m+1..n; the
        #    largest of them is the entry threshold.
        factor_bottom = data[factor][factor_rank > n_obs - m].max()
        label_bottom = data['label'][label_rank > n_obs - m].max()
        short_ratio = ((data[factor] <= factor_bottom) & (data['label'] <= label_bottom)).mean()
        short_ratios.append(short_ratio)
    # Combine the three averages (an empty list contributes 0).
    ic_score = np.nanmean(ic_scores) if ic_scores else 0
    long_score = np.nanmean(long_ratios) if long_ratios else 0
    short_score = np.nanmean(short_ratios) if short_ratios else 0
    total_score = (ic_score + long_score + short_score) / 3
    return (factor, total_score)

In [None]:
# 完整训练周期定义
# Training windows: every round starts on the listed date and ends on the same
# calendar day two years later, giving (start_date, end_date) string pairs.
train_periods = [
    (window_start, f'{int(window_start[:4]) + 2}{window_start[4:]}')
    for window_start in (
        '2020-07-01',  # Round 1
        '2021-01-01',  # Round 2
        '2021-04-01',  # Round 3
        '2021-07-01',  # Round 4
        '2021-10-01',  # Round 5
        '2022-01-01',  # Round 6
        '2022-04-01',  # Round 7
        '2022-07-01',  # Round 8
        '2022-10-01',  # Round 9
    )
]

# Run the factor selection over all windows.
result_df = optimized_factor_selection(
    total_factor,
    train_periods,
    top_n=1500,
    m_percent=0.1,
    n_workers=None,  # adjust to the number of CPU cores actually available
)

# Persist the selected factors.
result_df.to_feather('/home/datamake134/data/haris/dataset_new/scheme1_selected_factors.fea')