In [None]:
import pandas as pd
import numpy as np
import altair as alt
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score

In [101]:
# Load the sample table: the first CSV column becomes the index and the
# 't' column is converted to pandas datetimes.
df = pd.read_csv("samples.csv", index_col=0).assign(
    t=lambda frame: pd.to_datetime(frame['t'])
)

# Visualization setup: remove Altair's cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [35]:
# Preview five randomly chosen rows. No random_state is passed, so the rows
# shown differ from run to run.
df.sample(5)

Unnamed: 0,segment,period,t,rv,past_rv,past_price_range,past_ac_log_return,past_up_ratio,past_weighted_bid_ask_price,past_weighted_mid_bid_ask_price_range,...,past_total_value,past_signed_volume,past_signed_volume_var,past_tick_log_return,past_tick_weighted_bid_ask_price,past_tick_weighted_bid_ask_price_diff,past_tick_weighted_bid_ask_volume_diff,past_tick_volume,past_tick_value,past_tick_signed_volume
71744,231,2,2025-09-26 11:11:19,-0.189334,-0.190141,0.000371,0.0,0.115385,26950.923556,1.995975,...,862540,6.0,3.270222,0.0,26951.540942,-27.878525,0.22975,1,13475,-1.0
20207,57,1,2025-09-23 10:00:30,-0.188781,-0.186494,0.000369,0.0,0.09205,27082.039154,1.804568,...,3033180,-122.0,3.599473,0.0,27081.360639,-26.531428,0.100292,0,0,-0.0
82320,258,4,2025-09-26 22:42:08,-0.190144,-0.190774,0.000371,0.000371,0.033473,26973.529323,6.556863,...,3236430,-78.0,6.025412,0.0,26973.670972,-21.978057,0.049891,0,0,0.0
23908,65,3,2025-09-23 13:31:18,-0.185376,-0.183215,0.000369,0.0,0.118644,27074.970633,3.915207,...,5035425,-83.0,7.525423,0.0,27075.255964,-20.859607,0.225702,0,0,0.0
23331,60,2,2025-09-23 11:13:46,-0.188776,-0.188777,0.001108,-0.000369,0.042194,27061.130634,16.060067,...,7483415,-221.0,20.78791,0.0,27058.406589,-27.370953,0.396226,0,0,0.0


In [109]:
# Feature selection.
# Every lagged ("past_*") predictor considered in this notebook.
all_features = [
    'past_rv',
    'past_price_range',
    'past_ac_log_return',
    'past_up_ratio',
    'past_weighted_bid_ask_price',
    'past_weighted_mid_bid_ask_price_range',
    'past_weighted_bid_ask_price_diff',
    'past_weighted_bid_ask_volume_diff',
    'past_total_volume',
    'past_total_value',
    'past_signed_volume',
]

# Candidate subset actually fed to the models. It starts out as an independent
# copy of the full list, so trimming it later leaves all_features untouched.
cand_features = list(all_features)


In [114]:
# Chronological train/test split, so the test set never precedes the training set
# (no look-ahead bias).
def split(df_sample, features=None, time=pd.to_datetime('2025-09-25 23:59:59')):
    """Split ``df_sample`` into train and test parts around a cutoff timestamp.

    Rows with ``t <= time`` go to the training set and rows with ``t > time``
    go to the test set. Rows whose ``t`` is missing satisfy neither comparison
    and are therefore left out of both parts.

    Parameters
    ----------
    df_sample : pandas.DataFrame
        Frame to split. It must contain a ``'t'`` column comparable to ``time``,
        an ``'rv'`` target column, and every column named in ``features``.
    features : list of str, optional
        Predictor columns. When omitted, the module-level ``cand_features``
        list is used, looked up at call time so that re-running the
        feature-selection cell takes effect without redefining this function.
    time : pandas.Timestamp
        Last timestamp (inclusive) that belongs to the training set.

    Returns
    -------
    tuple
        ``(train_df, test_df, train_X, train_y, test_X, test_y)``.
    """
    if features is None:
        features = cand_features

    # Filter the frame that was passed in; the previous version silently
    # ignored the argument and always split the global ``df``.
    timestamps = df_sample['t']
    train_df = df_sample[timestamps <= time]
    test_df = df_sample[timestamps > time]

    train_X, train_y = train_df[features], train_df['rv']
    test_X, test_y = test_df[features], test_df['rv']
    return train_df, test_df, train_X, train_y, test_X, test_y

# Split the full table at split()'s default cutoff using every feature in
# all_features; the second return value (the test frame) is discarded into `_`.
df_train, _, X_train, y_train, X_test, y_test = split(df, features=all_features)

In [None]:
# Distribution of the target and of each feature (training set only).

# Variables offered in the dropdown. The target 'rv' is listed together with
# the features: it is part of the melted data and of the chart title, but was
# previously missing from the dropdown and so could never be displayed.
hist_variables = ['rv'] + all_features

# Long format: one row per (variable, value) pair, so that a single chart can
# be filtered down to one variable at a time.
df_long = df_train[hist_variables].melt(var_name='variable', value_name='value')

# Dropdown-bound point selection on the 'variable' field; the first feature is
# pre-selected when the chart loads.
selector = alt.selection_point(
    fields=['variable'],
    bind=alt.binding_select(options=hist_variables, name='选择变量 '),
    value=all_features[0],
)

diagram = (
    alt.Chart(df_long)
    .transform_filter(selector)  # keep only rows of the selected variable
    .mark_bar(color='gray')
    .encode(
        x=alt.X('value:Q', bin=alt.Bin(maxbins=30), title=None),
        y=alt.Y('count():Q', title='频数'),
        tooltip=['variable:N', 'count()'],
    )
    .add_params(selector)
    .properties(width=300, height=200, title='目标和特征分布频率直方图')
)

diagram.save('diagrams/histofvariable.html')

In [118]:
# Pearson correlation (IC) of each feature with the target, plus one
# single-feature OLS regression per feature.
ic_by_feature = X_train.corrwith(y_train, method='pearson').sort_values(ascending=False)
df_pearson = ic_by_feature.reset_index()
df_pearson.columns = ['feature', 'IC']

# One OLS fit of rv on [const, feature] per feature, in the same order as
# df_pearson; rows with missing values are dropped by statsmodels.
univariate_fits = [
    sm.OLS(df_train['rv'], sm.add_constant(df_train[[name]].copy()), missing='drop').fit()
    for name in df_pearson['feature']
]
df_pearson['p_value'] = [
    fit.pvalues[name] for name, fit in zip(df_pearson['feature'], univariate_fits)
]
df_pearson['R2'] = [fit.rsquared for fit in univariate_fits]

# Bar chart of IC per feature; p-value and R2 are only shown in the tooltip.
ic_tooltip = [
    alt.Tooltip('feature:N', title='特征'),
    alt.Tooltip('IC:Q', format='.4f', title='IC'),
    alt.Tooltip('p_value:Q', format='.2e', title='p值'),
    alt.Tooltip('R2:Q', format='.4f', title='R2'),
]
diagram = (
    alt.Chart(df_pearson)
    .mark_bar(color='gray')
    .encode(
        x=alt.X('feature:N', sort='-y', title=None),
        y=alt.Y('IC:Q', title=None),
        tooltip=ic_tooltip,
    )
    .properties(width=300, height=150, title='单变量回归结果')
)

diagram.save('diagrams/pearson.html')

In [155]:
# Baseline model: use the previous minute's volatility as the forecast for the
# next minute's volatility. The 'past_rv' column of the test features is taken
# as the prediction without any fitting.
y_pred_baseline = X_test['past_rv']

In [None]:
# LASSO regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, Lasso, lasso_path
from sklearn.model_selection import TimeSeriesSplit

# Standardize the features. The scaler is fitted on the training set only and
# then applied unchanged to the test set.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
# The final model uses this fraction of the cross-validated penalty.
penalty_ratio = 0.1

# Cross-validation with time-ordered folds (TimeSeriesSplit), so each
# validation fold comes after the data it was trained on.
model_lasso_cv = LassoCV(cv=TimeSeriesSplit(n_splits=4), max_iter=1000).fit(X_train_std, y_train)
coef_feature_cv = model_lasso_cv.coef_
coef_penalty_cv = model_lasso_cv.alpha_

# LASSO coefficient path, evaluated exactly at the penalties offered in the
# dropdown. Previously lasso_path ran on its own default alpha grid, and the
# chart filtered on an 'alpha_str' column that did not exist, so the chart was
# always empty.
coef_penaltys = coef_penalty_cv * np.array([0.05, 0.1, 0.25, 0.5, 0.75, 1])
penaltys, coefs, _ = lasso_path(X_train_std, y_train, alphas=coef_penaltys)
coef_df = pd.DataFrame(coefs.T, columns=X_train.columns)
coef_df['penalty'] = penaltys
# String label per penalty: the dropdown and the data must compare equal, and
# exact string matching is more reliable than matching raw floats.
coef_df['alpha_str'] = [f'{alpha:.4e}' for alpha in penaltys]
coef_long = coef_df.melt(id_vars=['penalty', 'alpha_str'], var_name='feature', value_name='coef')

# Same formatting as above, in ascending order for the dropdown.
alpha_options = [f'{alpha:.4e}' for alpha in coef_penaltys]
selector = alt.selection_point(
    fields=['alpha_str'],
    bind=alt.binding_select(options=alpha_options, name='选择 alpha: '),
    value=alpha_options[len(alpha_options) // 2],
)

diagram = (
    alt.Chart(coef_long)
    .transform_filter(selector)
    .transform_filter("abs(datum.coef) > 1e-6")  # optional: show non-zero coefficients only
    .mark_bar()
    .encode(
        x=alt.X('feature:N', sort='-y', title='特征'),
        y=alt.Y('coef:Q', title='系数'),
    )
    .add_params(selector)
    .properties(width=500, height=300, title='不同惩罚系数下的LASSO回归系数')
)

# Final model: refit with a penalty of penalty_ratio times the CV-selected alpha.
model_lasso = Lasso(alpha=penalty_ratio*coef_penalty_cv)
model_lasso.fit(X_train_std, y_train)

In [229]:
# Final LASSO coefficients keyed by feature name.
{name: coef for name, coef in zip(X_train.columns, model_lasso.coef_)}

{'past_rv': 0.001486681085992657,
 'past_price_range': -0.0,
 'past_ac_log_return': -0.0,
 'past_up_ratio': -7.920494412675866e-05,
 'past_weighted_bid_ask_price': -0.0,
 'past_weighted_mid_bid_ask_price_range': 0.0,
 'past_weighted_bid_ask_price_diff': 0.0,
 'past_weighted_bid_ask_volume_diff': 5.286415221214764e-05,
 'past_total_volume': 0.00030414562365366904,
 'past_total_value': 0.0,
 'past_signed_volume': -0.0}

In [220]:
# XGBoost
import xgboost

# Gradient-boosted regression trees, fitted on the raw (unstandardized) features.
model_xgb = xgboost.XGBRegressor(
    # ensemble size and tree shape
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    # row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # loss, evaluation metric and tree construction algorithm
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
)

model_xgb.fit(X_train, y_train)

In [228]:
# Per-feature importance scores from the fitted booster, using the 'gain'
# importance type.
model_xgb.get_booster().get_score(importance_type='gain')

{'past_rv': 0.0039412458427250385,
 'past_price_range': 0.00011840234219562262,
 'past_ac_log_return': 0.000115728318633046,
 'past_up_ratio': 0.00019044213695451617,
 'past_weighted_bid_ask_price': 0.0001911463332362473,
 'past_weighted_mid_bid_ask_price_range': 0.00011531556083355099,
 'past_weighted_bid_ask_price_diff': 0.0001266521867364645,
 'past_weighted_bid_ask_volume_diff': 0.0001782698673196137,
 'past_total_volume': 0.0004714334791060537,
 'past_total_value': 0.0008975757518783212,
 'past_signed_volume': 0.0001275961403734982}

In [221]:
# Model evaluation on the held-out test set.
def _evaluate(y_true, y_pred):
    """Return ``(rmse, mae, r2)`` of ``y_pred`` against ``y_true``."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = np.mean(np.abs(y_true - y_pred))
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2


# LASSO was trained on standardized features, XGBoost on the raw ones.
y_pred_lasso = model_lasso.predict(X_test_std)
y_pred_xgb = model_xgb.predict(X_test)

# The individual metric names are kept as they were.
rmse_baseline, mae_baseline, r2_baseline = _evaluate(y_test, y_pred_baseline)
rmse_lasso, mae_lasso, r2_lasso = _evaluate(y_test, y_pred_lasso)
rmse_xgb, mae_xgb, r2_xgb = _evaluate(y_test, y_pred_xgb)

# Side-by-side summary. The metrics were previously computed but never
# displayed, so the notebook showed no evaluation result at all.
df_metrics = pd.DataFrame(
    {
        'RMSE': [rmse_baseline, rmse_lasso, rmse_xgb],
        'MAE': [mae_baseline, mae_lasso, mae_xgb],
        'R2': [r2_baseline, r2_lasso, r2_xgb],
    },
    index=['baseline', 'lasso', 'xgb'],
)
df_metrics

In [194]:
# Display of the prediction results.

# 'time' is the row index of X_test and serves as the numeric x axis.
# 't' is the actual timestamp of each test row, looked up in df by that index.
# Previously the tooltip formatted the integer index as a date, which produced
# meaningless epoch-1970 timestamps.
# NOTE(review): assumes df's index is unique so the lookup aligns one-to-one - confirm.
df_result = pd.DataFrame({'time': X_test.index, 't': df.loc[X_test.index, 't'], 'y_test': y_test,
                        'baseline': y_pred_baseline, 'lasso': y_pred_lasso, 'xgb': y_pred_xgb})

# Keep every 10th row to lighten the chart, then reshape to long format with
# one row per (time, series) pair.
df_result_plot = df_result[::10].melt(id_vars=['time', 't'], value_name='value', var_name='series')

# Initial view window of the chart; the x axis can be panned/zoomed interactively.
x_domain = [66613, 67400]   # range of row-index values shown at load
y_domain = [-0.20, -0.17]   # range of rv values shown

zoomer = alt.selection_interval(bind='scales', encodings=['x'])

diagram = (
    alt.Chart(df_result_plot)
    .mark_line()
    .encode(
        x=alt.X('time:Q', title=None, scale=alt.Scale(domain=x_domain)),
        y=alt.Y('value:Q', title='预测值', scale=alt.Scale(domain=y_domain)),
        color=alt.Color('series:N', title='模型', scale=alt.Scale(scheme='category10')),
        tooltip=[
            alt.Tooltip('t:T', format='%Y-%m-%d %H:%M:%S', title='时间'),
            alt.Tooltip('series:N', title='模型'),
            alt.Tooltip('value:Q', format='.3f', title='预测值')
        ]
    )
    .add_params(zoomer)
    .properties(width=400, height=200, title='预测结果')
)

diagram.save('diagrams/predict.html')