In [None]:
import sys
import os

STOCKA_BASE_DIR = '/Users/zhaozhiyu/Projects/stocka'
sys.path.insert(0, STOCKA_BASE_DIR)

from quant_framework import (
    DataHandler,
    FactorMetrics,
    MultiFactorAnalysis,
    Alpha158
)

import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
import multiprocessing as mp
from functools import partial

def _process_single_stock(code, data_path, min_data_points, date, save_dir):
    """
    处理单个股票的因子计算（工作函数）
    
    Args:
        code: 股票代码
        data_path: 数据路径
        min_data_points: 最小数据点数
        date: 计算日期
        save_dir: 保存目录
    
    Returns:
        tuple: (code, factor_df) 或 (code, None) 如果失败
    """
    try:
        # 在每个进程中创建新的 DataHandler 实例
        from quant_framework import DataHandler, Alpha158
        
        data_handler = DataHandler(
            data_path=data_path,
            min_data_points=min_data_points
        )
        
        # 获取该股票在指定日期之前的所有历史数据
        df = data_handler.get_data_before(code, date)

        if len(df) > 0 and all(col in df.columns for col in ['open', 'high', 'low', 'close']):
            # 确保数据类型正确
            df['open'] = pd.to_numeric(df['open'], errors='coerce')
            df['high'] = pd.to_numeric(df['high'], errors='coerce')
            df['low'] = pd.to_numeric(df['low'], errors='coerce')
            df['close'] = pd.to_numeric(df['close'], errors='coerce')
            df['volume'] = pd.to_numeric(df['volume'], errors='coerce')

            # 计算因子
            alpha158 = Alpha158()
            factor_df = alpha158.calculate(df)
            
            # 如果指定了保存目录，保存到文件
            if save_dir is not None:
                save_path = os.path.join(save_dir, f"{code}.csv")
                factor_df.to_csv(save_path)
                return (code, None)  # 保存到文件，不返回数据
            
            return (code, factor_df)
        
        return (code, None)
    
    except Exception as e:
        # 静默处理错误
        return (code, None)


def calculate_factor_values(data_handler, codes, dates, save_dir=None, num_workers=None):
    """
    计算指定因子的值（支持多核并行）

    Args:
        data_handler: 数据处理器
        codes: 股票代码列表
        dates: 日期列表
        save_dir: 保存路径
        num_workers: cpu数（默认使用所有可用核心）

    Returns:
        Dict[str, DataFrame]: key为股票代码, value为DataFrame
    """
    # 创建保存目录
    if save_dir is not None and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    # 获取计算日期
    date = dates[-1]
    
    # 确定使用的CPU核心数
    if num_workers is None:
        num_workers = mp.cpu_count()
    
    print(f"使用 {num_workers} 个 CPU 核心进行并行计算...")
    print(f"待计算股票数量: {len(codes)}")
    
    # 准备工作函数的固定参数
    worker_func = partial(
        _process_single_stock,
        data_path=data_handler.data_path,
        min_data_points=data_handler.min_data_points,
        date=date,
        save_dir=save_dir
    )
    
    # 使用进程池并行计算
    factor_dfs = {}
    
    # 使用 imap_unordered 以获得流式结果
    with mp.Pool(processes=num_workers) as pool:
        # 使用 tqdm 显示进度
        results = list(tqdm(
            pool.imap_unordered(worker_func, codes),
            total=len(codes),
            desc="计算因子进度"
        ))
    
    # 整理结果
    success_count = 0
    for code, factor_df in results:
        if factor_df is not None:
            factor_dfs[code] = factor_df
            success_count += 1
    
    print(f"计算完成！成功: {success_count}/{len(codes)}")
    
    return factor_dfs

In [3]:
# # 测试50只股票
# stock_list = [
#     '000001', '000002', '000063', '000069', '000100', '000157', '000166', '000333', '000338', '000581',
#     '000651', '000725', '000768', '000776', '000858', '000876', '000895', '000938', '000999', '001979',
#     '002001', '002008', '002027', '002032', '002044', '002050', '002142', '002304', '002415', '002456',
#     '002475', '002594', '600000', '600036', '600519', '600900', '601012', '601066', '601138', '601166',
#     '601288', '601318', '601398', '601601', '601628', '601766', '601857', '601988', '603259', '688981'
# ]

# 测试1只股票
stock_list = [
    '000001',
]

data_handler = DataHandler(
    data_path=os.path.join(STOCKA_BASE_DIR, "data/stock/kline/day"),
    min_data_points=100,
    # stock_whitelist=stock_list
)

try:
    data_handler.load_data(
        start_date="2015-01-05",
        end_date="2025-12-31"
    )
    print(f"数据加载成功，股票数量: {len(data_handler.get_all_codes())}")
    print()

except Exception as e:
    print(f"加载数据时出错：{e}")

开始加载数据，共找到 3063 个CSV文件...
数据加载完成！
  - 股票数量: 3047
  - 日期范围: 2015-01-05 至 2025-12-31
  - 总数据点: 6853187
数据加载成功，股票数量: 3047



In [4]:
# 获取交易日期
dates = data_handler.get_available_dates()
dates = [d for d in dates if pd.to_datetime('2015-01-05').to_pydatetime().date() <= d <= pd.to_datetime('2025-12-31').to_pydatetime().date()]

print(f"分析时间范围: {dates[0]} 至 {dates[-1]}")
print(f"交易日数: {len(dates)}")

分析时间范围: 2015-01-05 至 2025-12-31
交易日数: 2674


In [5]:
# 计算所有因子
print(f"计算Alpha158因子...")
# factor_dfs = calculate_factor_values(data_handler, stock_list, dates)
factor_dfs = calculate_factor_values(
    data_handler, 
    data_handler.get_all_codes(), 
    dates, 
    os.path.join(STOCKA_BASE_DIR, 'data/factor/day/alpha158'),
    num_workers=10,
)

print()
print("所有因子计算完成！")
print()

计算Alpha158因子...
使用 10 个 CPU 核心进行并行计算...
待计算股票数量: 3047


计算因子进度: 100%|██████████| 3047/3047 [00:01<00:00, 2251.80it/s]

计算完成！成功: 0/3047

所有因子计算完成！






In [11]:
factor_dfs['000001'].head()
factor_dfs['000001'].to_csv('./000001.csv')

In [14]:
factor_dfs['000001'].shape[1] - 11

151

In [7]:
# 计算未来收益率
returns_df = pd.DataFrame(index=dates[1:], columns=stock_list)

for i in range(len(dates) - 1):
    current_date = dates[i]
    next_date = dates[i + 1]

    for code in stock_list:
        try:
            current_data = data_handler.get_daily_data(current_date)
            next_data = data_handler.get_daily_data(next_date)

            if code in current_data.index and code in next_data.index:
                current_price = current_data.loc[code, 'close']
                next_price = next_data.loc[code, 'close']
                returns_df.loc[next_date, code] = (next_price - current_price) / current_price
        except:
            pass

print(f"收益率计算完成，形状: {returns_df.shape}")
print()

收益率计算完成，形状: (242, 50)



In [8]:
returns_df.head()

Unnamed: 0,000001,000002,000063,000069,000100,000157,000166,000333,000338,000581,...,601288,601318,601398,601601,601628,601766,601857,601988,603259,688981
2020-01-03,0.017704,-0.013571,0.030179,-0.003376,0.015311,-0.003424,0.002714,-0.022844,0.014578,-0.004884,...,-0.001953,0.000853,0.003417,0.016361,0.000767,0.001178,0.00927,-0.001669,-0.025689,
2020-01-06,-0.006171,-0.014567,0.003981,0.002258,-0.00754,-0.021918,-0.008119,-0.016603,-0.067857,-0.010461,...,-0.005871,-0.006394,-0.003405,-0.022188,-0.008427,-0.007059,0.032147,-0.005017,-0.029632,
2020-01-07,0.004516,0.006842,-0.002643,0.007814,-0.007597,0.003545,0.008186,0.013566,0.005896,0.015531,...,0.005906,0.005899,0.005695,0.010901,0.012619,0.0,-0.013348,0.006723,0.015022,
2020-01-08,-0.027541,-0.002175,-0.02367,-0.023297,0.028708,-0.015297,-0.023004,0.003608,-0.030611,-0.001671,...,-0.005871,-0.012262,-0.012458,-0.012984,-0.025687,-0.009479,0.013529,-0.006678,-0.006227,
2020-01-09,0.007513,0.014169,0.035687,0.014769,0.011163,0.011947,0.00554,0.018354,0.012172,0.005278,...,-0.001969,0.000432,0.0,0.008919,0.012529,0.004785,-0.01891,0.0,0.044922,


In [9]:
# 分析每个因子
factor_reports = {}

for factor_name, factor_df in factor_dfs.items():
    print(f"\n【{factor_name} 因子分析】")

    try:
        report = FactorMetrics.generate_report(
            factor_df,
            returns_df,
            factor_name,
            periods=1
        )
        FactorMetrics.print_report(report)
        factor_reports[factor_name] = report
    except Exception as e:
        print(f"  分析出错: {e}")


【KMID 因子分析】
因子分析报告: KMID

IC指标（Pearson相关系数）:
  IC均值: -0.0130
  IC标准差: 0.2470
  ICIR: -0.8354
  IC>0占比: 46.47%
  IC绝对值均值: 0.2005
  最大IC: 0.6441
  最小IC: -0.6358

Rank IC指标（Spearman秩相关系数）:
  Rank IC均值: -0.0171
  Rank IC标准差: 0.2281
  Rank ICIR: -1.1916
  Rank IC>0占比: 46.89%

其他信息:
  预测期数: 1
  样本数: 241


【KLEN 因子分析】
因子分析报告: KLEN

IC指标（Pearson相关系数）:
  IC均值: 0.0254
  IC标准差: 0.2820
  ICIR: 1.4279
  IC>0占比: 54.36%
  IC绝对值均值: 0.2341
  最大IC: 0.6437
  最小IC: -0.6585

Rank IC指标（Spearman秩相关系数）:
  Rank IC均值: -0.0047
  Rank IC标准差: 0.2748
  Rank ICIR: -0.2701
  Rank IC>0占比: 48.96%

其他信息:
  预测期数: 1
  样本数: 241


【KMID2 因子分析】
因子分析报告: KMID2

IC指标（Pearson相关系数）:
  IC均值: -0.0114
  IC标准差: 0.2143
  ICIR: -0.8475
  IC>0占比: 51.04%
  IC绝对值均值: 0.1710
  最大IC: 0.5359
  最小IC: -0.5511

Rank IC指标（Spearman秩相关系数）:
  Rank IC均值: -0.0166
  Rank IC标准差: 0.2178
  Rank ICIR: -1.2109
  Rank IC>0占比: 46.47%

其他信息:
  预测期数: 1
  样本数: 241


【KUP 因子分析】
因子分析报告: KUP

IC指标（Pearson相关系数）:
  IC均值: 0.0117
  IC标准差: 0.2168
  ICIR: 0.8563
  IC>0占