In [1]:
import pandas as pd
import numpy as np
from math import sqrt

In [2]:
# ic 計算
def daily_ic(group):
    return group['factor'].corr(group['ret_fwd_1d'], method='spearman')

In [3]:
# 向量化計算 daily spearman ic
def daily_spearman_ic_vectorized(df, factor_col, ret_col='ret_fwd_1d'):
    tmp = df[['date', 'ticker', factor_col, ret_col]].dropna().copy()
    g = tmp['date']

    # Spearman = corr(rank(x), rank(y))
    x = tmp.groupby('date')[factor_col].rank(method='average')
    y = tmp.groupby('date')[ret_col].rank(method='average')

    dx = x - x.groupby(g).transform('mean')
    dy = y - y.groupby(g).transform('mean')

    num = (dx * dy).groupby(g).sum()
    den = np.sqrt((dx * dx).groupby(g).sum() * (dy * dy).groupby(g).sum())

    ic_ts = num / den
    ic_ts.name = factor_col
    return ic_ts

In [4]:
def preprocess_factors(df, factor_cols):

    df = df.copy()
    
    # 缺失值填補 (Imputation) - 全市場中位數
    print("Step 1: 正在進行缺失值填補 (全市場中位數)...")
    
    def fill_median(x):
        return x.fillna(x.median())

    # 針對每一天 (level='Date')，對指定的因子欄位進行中位數填補
    for col in factor_cols:
        df[col] = df.groupby(level='date')[col].transform(fill_median).fillna(0)

    # 去極值 (Winsorization) - 使用百分位法 (1% ~ 99%)
    print("正在進行去極值 (Winsorization 1% - 99%)...")
    
    def clip_outliers(x):
        # 計算當天的 1% 和 99% 分位點
        lower = x.quantile(0.01)
        upper = x.quantile(0.99)
        return x.clip(lower, upper)

    for col in factor_cols:
        df[col] = df.groupby(level='date')[col].transform(clip_outliers)
    return df

def preprocess_factors_zcore(df, factor_cols):
    df = df.copy()
    print("正在進行標準化 (Z-score)...")
    def z_score(x):
        # 避免標準差為 0 的情況 (例如當天所有股票數值都一樣)
        std = x.std()
        if std == 0:
            return 0
        return (x - x.mean()) / std

    for col in factor_cols:
        df[col] = df.groupby(level='date')[col].transform(z_score)

    return df

In [5]:
factors_data = pd.read_pickle('data/factors_data_log60.pkl')

In [6]:
N_DAYS = 5

factors_data['ret_future'] = factors_data.groupby('ticker')['close'].transform(
    lambda x: x.shift(-N_DAYS) / x - 1
)

factors_data['label'] = factors_data.groupby('date')['ret_future'].transform(
    lambda x: x.rank(pct=True)
)

factors_data['label'] = factors_data['label'] - 0.5
factors_data = factors_data.loc[:, ~factors_data.columns.duplicated()].copy()

# date 欄位保證存在且為 datetime
if 'date' not in factors_data.columns:
    factors_data = factors_data.sort_index()
    factors_data['date'] = factors_data.index
factors_data['date'] = pd.to_datetime(factors_data['date']).dt.normalize()

base_cols = {
    'date','ticker','open','high','low','close','volume','returns',
    'ret_fwd_1d',   
}

factor_cols = [c for c in factors_data.columns if c not in base_cols]
factor_cols.sort()

  factors_data['ret_future'] = factors_data.groupby('ticker')['close'].transform(
  factors_data['label'] = factors_data.groupby('date')['ret_future'].transform(


In [8]:
results = {}
summary_rows = []

for col in factor_cols:
    ic_ts = daily_spearman_ic_vectorized(factors_data, col, ret_col='ret_future')
    results[col] = ic_ts

    vals = ic_ts.to_numpy(dtype='float64')
    vals = vals[~np.isnan(vals)]

    if vals.size == 0:
        ic_mean = ic_std = ic_ir = pct_pos = np.nan
        n_days = 0
    else:
        ic_mean = vals.mean()
        ic_std  = vals.std(ddof=1) if vals.size > 1 else np.nan
        ic_ir   = ic_mean / ic_std if (ic_std is not None and ic_std > 0) else np.nan
        pct_pos = (vals > 0).mean()
        n_days  = vals.size

    summary_rows.append({
        'factor': col,
        'ic_mean': ic_mean,
        'ic_std': ic_std,
        'ic_ir': ic_ir,
        'pct_positive': pct_pos,
        'n_days': n_days,
    })

ic_summary = pd.DataFrame(summary_rows).sort_values('ic_ir', ascending=False)
ic_summary.to_csv('ic_summary_rank.csv', index=False)

In [9]:
ic_summary = pd.read_csv('ic_summary_rank.csv')
ic_summary.head(20)

Unnamed: 0,factor,ic_mean,ic_std,ic_ir,pct_positive,n_days
0,price3,0.06124,0.089407,0.684958,0.790803,2653
1,min5,0.068287,0.104851,0.651276,0.754994,2653
2,alpha_013,0.042363,0.069527,0.609307,0.731523,2652
3,alpha_016,0.043416,0.076637,0.566514,0.720588,2652
4,min10,0.059609,0.108867,0.547544,0.710139,2653
5,ep_ttm_rank,0.049981,0.092242,0.541843,0.742857,2555
6,ep_ttm,0.049981,0.092242,0.541843,0.742857,2555
7,alpha_055,0.034616,0.064501,0.536673,0.719728,2651
8,qtld5,0.055076,0.104336,0.527869,0.724359,2652
9,alpha_015,0.032631,0.062939,0.518457,0.692308,2652


In [26]:
factor_cols.remove('effective_date')
factor_cols.remove('report_date')

ValueError: list.remove(x): x not in list

In [12]:
df = factors_data.drop(columns=['open', 'high', 'low', 'volume', 'returns', 'ret_fwd_1d','effective_date','report_date'])
df1 = df.set_index(['date', 'ticker'])

In [None]:
#(13m48s)
processed_df = preprocess_factors(df1, factor_cols)

Step 1: 正在進行缺失值填補 (全市場中位數)...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

正在進行去極值 (Winsorization 1% - 99%)...


In [14]:
processed_df.to_pickle("data/processed_df.pkl")

In [8]:
processed_df1 = pd.read_pickle("data/processed_df.pkl")

In [9]:
processed_df1

Unnamed: 0_level_0,Unnamed: 1_level_0,close,AccountsPayable_x,AccountsPayable_per,AccountsReceivableNet,AccountsReceivableNet_per,CapitalSurplus,CapitalSurplus_per,CashAndCashEquivalents,CashAndCashEquivalents_per,CurrentAssets,...,pretax_margin,cfo_margin,accruals,cash_conversion,dep_rate,de_ratio,current_ratio,quick_ratio,cash_ratio,label
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-06-01,1101.TW,18.344522,6.927581e+09,2.39,7.211579e+09,2.49,1.222523e+10,4.22,3.140622e+10,10.84,9.247110e+10,...,0.068557,0.298419,-0.018716,6.565441,0.005906,2.308939,1.328992,0.555014,0.451369,-0.389791
2015-06-02,1101.TW,18.032085,6.927581e+09,2.39,7.211579e+09,2.49,1.222523e+10,4.22,3.140622e+10,10.84,9.247110e+10,...,0.068557,0.298419,-0.018716,6.565441,0.005906,2.308939,1.328992,0.555014,0.451369,0.047564
2015-06-03,1101.TW,17.920500,6.927581e+09,2.39,7.211579e+09,2.49,1.222523e+10,4.22,3.140622e+10,10.84,9.247110e+10,...,0.068557,0.298419,-0.018716,6.565441,0.005906,2.308939,1.328992,0.555014,0.451369,-0.252028
2015-06-04,1101.TW,17.317945,6.927581e+09,2.39,7.211579e+09,2.49,1.222523e+10,4.22,3.140622e+10,10.84,9.247110e+10,...,0.068557,0.298419,-0.018716,6.565441,0.005906,2.308939,1.328992,0.555014,0.451369,0.256663
2015-06-05,1101.TW,17.384897,6.927581e+09,2.39,7.211579e+09,2.49,1.222523e+10,4.22,3.140622e+10,10.84,9.247110e+10,...,0.068557,0.298419,-0.018716,6.565441,0.005906,2.308939,1.328992,0.555014,0.451369,0.421205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-05-25,9958.TW,7.179392,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.287703
2015-05-26,9958.TW,7.123982,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.261021
2015-05-27,9958.TW,7.084403,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.307425
2015-05-28,9958.TW,7.195221,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,0.00,0.000000e+00,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.278422


In [27]:
# 標準化 （ram 不夠用 只能分開寫）
processed_df2 = preprocess_factors_zcore(processed_df1, factor_cols)

正在進行標準化 (Z-score)...


In [29]:
processed_df3 = processed_df2.fillna(0)
processed_df3.isna().sum()

close                        0
AccountsPayable_x            0
AccountsPayable_per          0
AccountsReceivableNet        0
AccountsReceivableNet_per    0
                            ..
de_ratio                     0
current_ratio                0
quick_ratio                  0
cash_ratio                   0
label                        0
Length: 345, dtype: int64

In [30]:
df_long = processed_df3.reset_index()

In [31]:
df_long.sort_values(['ticker', 'date']).reset_index(drop=True).to_pickle('data/processed_factors_data_log60.pkl')

: 

In [14]:
df_long.sort_values(['ticker', 'date']).reset_index(drop=True).to_csv('data/processed_factors_data_log60.csv', index=False)