In [None]:
## Config
MARKET_DATA_PATH="../../resources/data/csi300_stock_feats.csv"
NEWS_DATA_DIR="../../resources/data/CSI300news_chunked_summarized_senti"

In [None]:
## 初始化alpha158
import polars as pl
import sys; sys.path.append("../..")
from utils.models import *
from utils.alpha import *

def transform_data(df: pl.DataFrame):
    df = build_alpha158(df)
    df = build_label(df)
    return df

df = (pl
    .read_csv(MARKET_DATA_PATH)
    .filter(pl.col('datetime').str.strptime(pl.Datetime).dt.year() == 2019)
    .group_by('instrument')
    .map_groups(transform_data)
    .to_pandas()
)

df = df.assign(
    full_instrument=df['instrument'],
    datetime=pd.to_datetime(df['datetime']),
    instrument=df['instrument'].str.slice(start=2)
).set_index(['instrument', 'datetime'])


In [None]:
import pandas as pd
import sys; sys.path.append("../..")
from utils.alpha import *
import os


df_news = pd.concat([(pd
    .read_json(os.path.join(NEWS_DATA_DIR, filename))
    .assign(instrument=filename.split('.')[0],datetime=lambda x: x['date'].dt.date)
    )
    for filename in os.listdir(NEWS_DATA_DIR)
    if filename.endswith(".json")
], ignore_index=True)

df_senti = build_senti_alpha(df_news,method="标签众数")
df_senti = (df_senti
    .assign(datetime=pd.to_datetime(df_senti['datetime']))
    .fillna({'SENTI': 0})
    .set_index(['instrument', 'datetime'])
)
    

In [4]:
## 特征拼接
df_tot=pd.merge(df, df_senti, left_index=True, right_index=True, how='left')
df_tot.fillna({'SENTI':0}, inplace=True)

In [5]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
label_cols = ['LABEL0']
feature_cols = [col for col in df_tot.columns if col.isupper() and col not in label_cols] # 非label的大写字段视为特征字段
X = df_tot[feature_cols] # 将特征列转换为 DataFrame（X）
X = add_constant(X) # 添加常数列，因为 VIF 计算需要常数项
X = X[~np.isinf(X).any(axis=1)].dropna()

# 计算 VIF
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# 显示结果
print(vif_data)


  return 1 - self.ssr/self.centered_tss


     feature           VIF
0      const  0.000000e+00
1       KMID  2.692671e+10
2       KLEN  3.410880e+03
3      KMID2  6.724601e+10
4        KUP  3.845024e+09
..       ...           ...
155  VSUMD60  7.503811e+09
156   BETA60  2.215295e+00
157   RSQR60  1.981874e+00
158   RESI60  1.023132e+00
159    SENTI  1.012107e+00

[160 rows x 2 columns]


In [8]:
# 定义 VIF 区间
bins = [0, 5, 10, np.inf]
labels = ["<5", "5-10", ">=10"]

# 添加 VIF 区间列
vif_data["VIF_Range"] = pd.cut(vif_data["VIF"], bins=bins, labels=labels, right=False)

# 统计各区间的特征数量
vif_summary = vif_data["VIF_Range"].value_counts().sort_index()

# 显示结果
print(vif_summary)

VIF_Range
<5       27
5-10      7
>=10    126
Name: count, dtype: int64


In [10]:
vif_data

Unnamed: 0,feature,VIF,VIF_Range
0,const,0.000000e+00,<5
1,KMID,2.692671e+10,>=10
2,KLEN,3.410880e+03,>=10
3,KMID2,6.724601e+10,>=10
4,KUP,3.845024e+09,>=10
...,...,...,...
155,VSUMD60,7.503811e+09,>=10
156,BETA60,2.215295e+00,<5
157,RSQR60,1.981874e+00,<5
158,RESI60,1.023132e+00,<5


In [6]:
vif_data.query('feature == "SENTI"')

Unnamed: 0,feature,VIF
159,SENTI,1.012107
