In [20]:
## Config
# CSV with `instrument` / `datetime` columns; read with polars when the dataset is built.
MARKET_DATA_PATH = "../resources/data/csi300_stock_feats.csv"
# Directory scanned for `*.json` news files (the text before the first dot is used as the instrument).
NEWS_DATA_DIR = "../resources/data/CSI300news_chunked_summarized_senti"
# Destination handed to `model.save` once training is done.
MODEL_SAVE_DIR = "../resources/ckpts/lgb_v2"

In [21]:
## 初始化数据集
import os
import sys

import lightgbm as lgb
import pandas as pd
import polars as pl

sys.path.append("..")  # make the project-local `utils` package importable
from utils.models import *
from utils.alpha import *

def transform_data(df: pl.DataFrame):
    """Run the feature builder and then the label builder over one frame.

    Used as the callback of ``group_by('instrument').map_groups`` below, so each
    call receives the rows of a single instrument.

    Args:
        df: polars frame to transform.

    Returns:
        The value of ``build_label(build_alpha158(df))``. Both builders come
        from the project's star imports and are not defined in this notebook.
    """
    return build_label(build_alpha158(df))

## 1. Market data: keep 2019 rows, build features/labels per instrument, index by (instrument, datetime)
in_2019 = pl.col('datetime').str.strptime(pl.Datetime).dt.year() == 2019

df = (
    pl.read_csv(MARKET_DATA_PATH)
    .filter(in_2019)
    .group_by('instrument')
    .map_groups(transform_data)
    .to_pandas()
    .pipe(lambda frame: frame.assign(
        # Preserve the unmodified instrument code before it is shortened below.
        full_instrument=frame['instrument'],
        datetime=pd.to_datetime(frame['datetime']),
        # Drop the first two characters (presumably an exchange prefix -- confirm).
        instrument=frame['instrument'].str.slice(start=2),
    ))
    .set_index(['instrument', 'datetime'])
)

## 2. News sentiment: load every JSON file in NEWS_DATA_DIR and derive the SENTI factor
def _read_news_file(filename):
    """Load one news JSON file and tag its rows with instrument and calendar date."""
    return (
        pd.read_json(os.path.join(NEWS_DATA_DIR, filename))
        .assign(
            instrument=filename.split('.')[0],            # text before the first dot
            datetime=lambda news: news['date'].dt.date,   # calendar date of each item
        )
    )

news_files = [name for name in os.listdir(NEWS_DATA_DIR) if name.endswith(".json")]
df_news = pd.concat([_read_news_file(name) for name in news_files], ignore_index=True)

# `build_senti_alpha` is project code; its result is used here as a frame with
# `instrument`, `datetime` and `SENTI` columns. Missing SENTI values become 0.
df_senti = (
    build_senti_alpha(df_news, method="概率均值")
    .pipe(lambda senti: senti.assign(datetime=pd.to_datetime(senti['datetime'])))
    .fillna({'SENTI': 0})
    .set_index(['instrument', 'datetime'])
)

## 3. Join sentiment onto the market features and split by calendar month
# Left join keeps every market row; rows without a sentiment match get SENTI = 0.
df_tot = (
    pd.merge(df, df_senti, left_index=True, right_index=True, how='left')
    .fillna({'SENTI': 0})
)

# Month numbers are 1-12, so the training window is written as 1..6 explicitly
# (the earlier range(0, 7) selected the same rows but suggested a month 0).
sample_month = df_tot.index.get_level_values('datetime').month
df_train = df_tot[sample_month.isin(range(1, 7))].copy()    # train: Jan-Jun
df_eval = df_tot[sample_month.isin(range(7, 10))].copy()    # val:   Jul-Sep

In [22]:
## Initialize the model
train_params = {
    "objective": "mse",
    "colsample_bytree": 0.8879,
    "learning_rate": 0.0421,
    # NOTE(review): in LightGBM, `subsample` (bagging_fraction) only takes effect
    # when bagging_freq > 0 -- confirm whether the model wrapper sets it.
    "subsample": 0.8789,
    "lambda_l1": 4,    # L1 regularization
    "lambda_l2": 10,   # L2 regularization
    "max_depth": 8,
    "num_leaves": 210,
    "num_threads": 20,
    "verbosity": -1,
}

# Look up the 'lgb' implementation registered on BaseModel, then instantiate it.
model_cls = BaseModel.use_subclass('lgb')
model = model_cls(train_params)

In [23]:
## Training
label_cols = ['LABEL0']


def _is_feature_col(name):
    """Return True for upper-case column names that are not label columns."""
    return name.isupper() and name not in label_cols


alpha158_cols = [name for name in df.columns if _is_feature_col(name)]
feature_cols = [name for name in df_tot.columns if _is_feature_col(name)]  # alpha158 + senti

# This run trains on alpha158_cols only (the plain "lgb" variant).
# NOTE(review): the previous comment listed alpha158_cols for the "lgb+" variant as
# well, yet feature_cols is never used -- presumably "lgb+" should pass feature_cols; confirm.
model.train(df_train, df_eval, alpha158_cols, label_cols)



[20]	train's l2: 0.00063685	valid's l2: 0.000376133
[40]	train's l2: 0.000612871	valid's l2: 0.00038019
[60]	train's l2: 0.000594415	valid's l2: 0.000382492
[80]	train's l2: 0.000580866	valid's l2: 0.000384473
[100]	train's l2: 0.00056987	valid's l2: 0.000386824
[120]	train's l2: 0.000560556	valid's l2: 0.000388863
[140]	train's l2: 0.000553588	valid's l2: 0.000390901
[160]	train's l2: 0.000548103	valid's l2: 0.000392438
[180]	train's l2: 0.000543248	valid's l2: 0.000394083
[200]	train's l2: 0.000539269	valid's l2: 0.000396025
[220]	train's l2: 0.000535573	valid's l2: 0.000397358
[240]	train's l2: 0.000532404	valid's l2: 0.000398634
[260]	train's l2: 0.00052977	valid's l2: 0.000399811
[280]	train's l2: 0.000528321	valid's l2: 0.000400859
[300]	train's l2: 0.000527715	valid's l2: 0.000401244
[320]	train's l2: 0.000527715	valid's l2: 0.000401244
[340]	train's l2: 0.000527715	valid's l2: 0.000401244
[360]	train's l2: 0.000527715	valid's l2: 0.000401244
[380]	train's l2: 0.000527715	valid'

In [24]:
# Feature importance of the trained model, measured by gain
# (`importance_type` accepts 'split' or 'gain').
importance_gain = model.model.feature_importance(importance_type='gain')
# Legacy alias: despite its name this has always held gain values; prefer `importance_gain`.
importance_split = importance_gain

# Tabulate name/importance pairs for inspection.
# NOTE(review): the names come out as Column_N in the displayed table, so the
# model presumably was fitted without column names -- verify before mapping
# rows back to alpha158_cols by position.
importance_df = pd.DataFrame({
    'Feature': model.model.feature_name(),
    'Importance': importance_gain,
})

importance_df


Unnamed: 0,Feature,Importance
0,Column_0,0.454468
1,Column_1,0.378571
2,Column_2,0.125012
3,Column_3,0.130249
4,Column_4,0.337288
...,...,...
153,Column_153,0.025186
154,Column_154,0.994600
155,Column_155,0.086094
156,Column_156,0.127637


In [25]:
## Save: hand the trained model to its own `save` method, targeting MODEL_SAVE_DIR
model.save(MODEL_SAVE_DIR)