In [1]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from tqdm import tqdm
import os
import pandas as pd
import numpy as np

In [2]:
### HF factors
def get_realvar(df, lookback_len, lookback_shift):
    return df['return'].shift(lookback_shift).rolling(lookback_len).var().fillna(0)

def get_realskew(df, lookback_len, lookback_shift):
    return df['return'].shift(lookback_shift).rolling(lookback_len).skew().fillna(0)

def get_realkurtosis(df, lookback_len, lookback_shift):
    return df['return'].shift(lookback_shift).rolling(lookback_len).kurt().fillna(0)

def get_realupvar(df, lookback_len, lookback_shift):
    df['return_up'] = df['return'][df['return'] > 0]
    df['return_up'] = df['return_up'].fillna(0)
    return df['return_up'].shift(lookback_shift).rolling(lookback_len).var().fillna(0)

def get_realdownvar(df, lookback_len, lookback_shift):
    df['return_down'] = df['return'][df['return'] < 0]
    df['return_down'] = df['return_down'].fillna(0)
    return df['return_down'].shift(lookback_shift).rolling(lookback_len).var().fillna(0)

def get_ratio_upvar(df, lookback_len, lookback_shift):
    return get_realupvar(df, lookback_len, lookback_shift) / get_realvar(df, lookback_len, lookback_shift)

def get_ratio_downvar(df, lookback_len, lookback_shift):
    return get_realdownvar(df, lookback_len, lookback_shift) / get_realvar(df, lookback_len, lookback_shift)

def get_trendratio(df, lookback_len, lookback_shift):
    abs_price_diff = abs(df['price'].diff()).fillna(0)
    abs_price_diff_sum = abs_price_diff.shift(lookback_shift).rolling(lookback_len).sum().fillna(0)
    trend_ratio = (df['price']-df['price'].shift(lookback_len)).shift(lookback_shift) / abs_price_diff_sum
    return trend_ratio.replace(np.inf, 0).fillna(0)

def get_windowreturn(df, lookback_len, lookback_shift):
    return np.exp((np.log(df['return']+1)).shift(lookback_shift).rolling(lookback_len).sum())-1

def get_minreturn(df, lookback_len, lookback_shift):
    return df['return'].shift(lookback_shift).rolling(lookback_len).min().fillna(0)

def calculate_mdd(series):
    max_price = np.maximum.accumulate(series)
    drawdown = (max_price - series) / max_price
    return np.max(drawdown)

def get_mdd(df, lookback_len, lookback_shift):
    return df['price'].shift(lookback_shift).rolling(lookback_len).apply(lambda x: calculate_mdd(x), raw=True)

def get_corrVP_price(df, lookback_len, lookback_shift):
    return df['price'].shift(lookback_shift).rolling(lookback_len).corr(df['volume'].shift(lookback_shift)).fillna(0)

# def get_corrVP_avg(df, lookback_len, lookback_shift):
#     return df['price'].fillna(method='ffill').shift(lookback_shift).rolling(lookback_len).corr(df['volume'].shift(lookback_shift)).fillna(0)

def get_corrVP_mid(df, lookback_len, lookback_shift):
    return df['mid'].shift(lookback_shift).rolling(lookback_len).corr(df['volume'].shift(lookback_shift)).fillna(0)

def get_corrVR(df, lookback_len, lookback_shift):
    return df['return'].shift(lookback_shift).rolling(lookback_len).corr(df['volume'].shift(lookback_shift)).fillna(0)

def get_Amihud(df, lookback_len, lookback_shift):
    abs_return = abs(df['return'].diff()).fillna(0)
    sum_abs_return = abs_return.shift(lookback_shift).rolling(lookback_len).sum()
    return (1 / (lookback_len) * sum_abs_return / df['amount'].shift(lookback_shift)).fillna(0)

def get_BAspread(df, lookback_len, lookback_shift):
    bidsum = df["b1"]*df["b1_v"]+0.8*df["b2"]*df["b2_v"]+0.6*df["b3"]*df["b3_v"]+0.4*df["b4"]*df["b4_v"]+0.2*df["b5"]*df["b5_v"]
    asksum = df["a1"]*df["a1_v"]+0.8*df["a2"]*df["a2_v"]+0.6*df["a3"]*df["a3_v"]+0.4*df["a4"]*df["a4_v"]+0.2*df["a5"]*df["a5_v"]
    df["spread"] = (bidsum - asksum) / (bidsum + asksum)
    return df["spread"].shift(lookback_shift).rolling(lookback_len).mean().fillna(0)

def delta_V_A(a1, a1_v):
    # a1 and a1_v are ndarrays
    diff = a1[-1] - a1[0]
    if diff < 0:
        return a1_v[-1]
    elif diff == 0:
        return a1_v[-1] - a1_v[0]
    else:
        return 0

def delta_V_B(b1, b1_v):
    # b1 and b1_v are ndarrays
    diff = b1[-1] - b1[0]
    if diff < 0:
        return 0
    elif diff == 0:
        return b1_v[-1] - b1_v[0]
    else:
        return b1_v[-1]
    
def get_VOI(df, lookback_len, lookback_shift):
    delta_Va = np.zeros_like(df['a1_v'])
    for i in range(1, len(df)):
        a1_slice = df['a1'].values[i-1:i+1]
        a1_v_slice = df['a1_v'].values[i-1:i+1]
        delta_Va[i] = delta_V_A(a1_slice, a1_v_slice)
    df['delta_Va'] = delta_Va

    delta_Vb = np.zeros_like(df['b1_v'])
    for i in range(1, len(df)):
        a1_slice = df['b1'].values[i-1:i+1]
        a1_v_slice = df['b1_v'].values[i-1:i+1]
        delta_Va[i] = delta_V_A(a1_slice, a1_v_slice)
    df['delta_Vb'] = delta_Vb

    df['ori_VOI'] = df['delta_Vb'] - df['delta_Va']
    df['ori_VOI'] = df['ori_VOI'].fillna(0)
    df = df.drop(columns=['delta_Va'])
    df = df.drop(columns=['delta_Vb'])

    mean = df['ori_VOI'].shift(lookback_shift).rolling(lookback_len).mean()
    std = df['ori_VOI'].shift(lookback_shift).rolling(lookback_len).std()
    return ((df['ori_VOI'] - mean)/std).replace([np.inf, -np.inf], 0).fillna(0)

def get_BAspread_1_mean(df, lookback_len, lookback_shift):
    return (df['b1'] - df['a1']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_BAspread_2_mean(df, lookback_len, lookback_shift):
    return (df['b2'] - df['a2']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_BAspread_3_mean(df, lookback_len, lookback_shift):
    return (df['b3'] - df['a3']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_BAspread_4_mean(df, lookback_len, lookback_shift):
    return (df['b4'] - df['a4']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_BAspread_5_mean(df, lookback_len, lookback_shift):
    return (df['b5'] - df['a5']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_midprice_1_mean(df, lookback_len, lookback_shift):
    return ((df['b1']+df['a1'])/2).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_midprice_2_mean(df, lookback_len, lookback_shift):
    return ((df['b2']+df['a2'])/2).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_midprice_3_mean(df, lookback_len, lookback_shift):
    return ((df['b3']+df['a3'])/2).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_midprice_4_mean(df, lookback_len, lookback_shift):
    return ((df['b4']+df['a4'])/2).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_midprice_5_mean(df, lookback_len, lookback_shift):
    return ((df['b5']+df['a5'])/2).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_ap_diff_1_mean(df, lookback_len, lookback_shift):
    return (df['a2'] - df['a1']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_ap_diff_2_mean(df, lookback_len, lookback_shift):
    return (df['a3'] - df['a1']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_ap_diff_3_mean(df, lookback_len, lookback_shift):
    return (df['a4'] - df['a1']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_ap_diff_4_mean(df, lookback_len, lookback_shift):
    return (df['a5'] - df['a1']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_bp_diff_1_mean(df, lookback_len, lookback_shift):
    return (df['b1'] - df['b2']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_bp_diff_2_mean(df, lookback_len, lookback_shift):
    return (df['b1'] - df['b3']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_bp_diff_3_mean(df, lookback_len, lookback_shift):
    return (df['b1'] - df['b4']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_bp_diff_4_mean(df, lookback_len, lookback_shift):
    return (df['b1'] - df['b5']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_ap_diff_1_mean(df, lookback_len, lookback_shift):
    return (abs(df['a2'] - df['a1'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_ap_diff_2_mean(df, lookback_len, lookback_shift):
    return (abs(df['a3'] - df['a1'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_ap_diff_3_mean(df, lookback_len, lookback_shift):
    return (abs(df['a4'] - df['a1'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_ap_diff_4_mean(df, lookback_len, lookback_shift):
    return (abs(df['a5'] - df['a1'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_bp_diff_1_mean(df, lookback_len, lookback_shift):
    return (abs(df['b1'] - df['b2'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_bp_diff_2_mean(df, lookback_len, lookback_shift):
    return (abs(df['b1'] - df['b3'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_bp_diff_3_mean(df, lookback_len, lookback_shift):
    return (abs(df['b1'] - df['b4'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_abs_bp_diff_4_mean(df, lookback_len, lookback_shift):
    return (abs(df['b1'] - df['b5'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_ap_sum_mean(df, lookback_len, lookback_shift):
    return (1/5 * (df['a1'] + df['a2'] + df['a3'] + df['a4'] + df['a5'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_bp_sum_mean(df, lookback_len, lookback_shift):
    return (1/5 * (df['b1'] + df['b2'] + df['b3'] + df['b4'] + df['b5'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_av_sum_mean(df, lookback_len, lookback_shift):
    return (1/5 * (df['a1_v'] + df['a2_v'] + df['a3_v'] + df['a4_v'] + df['a5_v'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_bv_sum_mean(df, lookback_len, lookback_shift):
    return (1/5 * (df['b1_v'] + df['b2_v'] + df['b3_v'] + df['b4_v'] + df['b5_v'])).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_sum_ap_diff_mean(df, lookback_len, lookback_shift):
    return get_ap_diff_1_mean(df, lookback_len, lookback_shift) + get_ap_diff_2_mean(df, lookback_len, lookback_shift) + get_ap_diff_3_mean(df, lookback_len, lookback_shift) + get_ap_diff_4_mean(df, lookback_len, lookback_shift)
def get_sum_bp_diff_mean(df, lookback_len, lookback_shift):
    return get_bp_diff_1_mean(df, lookback_len, lookback_shift) + get_bp_diff_2_mean(df, lookback_len, lookback_shift) + get_bp_diff_3_mean(df, lookback_len, lookback_shift) + get_bp_diff_4_mean(df, lookback_len, lookback_shift)

def get_deriv_ap_1_mean(df, lookback_len, lookback_shift):
    return (df['a1'] - df['a1'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_ap_2_mean(df, lookback_len, lookback_shift):
    return (df['a2'] - df['a2'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_ap_3_mean(df, lookback_len, lookback_shift):
    return (df['a3'] - df['a3'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_ap_4_mean(df, lookback_len, lookback_shift):
    return (df['a4'] - df['a4'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_ap_5_mean(df, lookback_len, lookback_shift):
    return (df['a5'] - df['a5'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bp_1_mean(df, lookback_len, lookback_shift):
    return (df['b1'] - df['b1'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bp_2_mean(df, lookback_len, lookback_shift):
    return (df['b2'] - df['b2'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bp_3_mean(df, lookback_len, lookback_shift):
    return (df['b3'] - df['b3'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bp_4_mean(df, lookback_len, lookback_shift):
    return (df['b4'] - df['b4'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bp_5_mean(df, lookback_len, lookback_shift):
    return (df['b5'] - df['b5'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_av_1_mean(df, lookback_len, lookback_shift):
    return (df['a1_v'] - df['a1_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_av_2_mean(df, lookback_len, lookback_shift):
    return (df['a2_v'] - df['a2_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_av_3_mean(df, lookback_len, lookback_shift):
    return (df['a3_v'] - df['a3_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_av_4_mean(df, lookback_len, lookback_shift):
    return (df['a4_v'] - df['a4_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_av_5_mean(df, lookback_len, lookback_shift):
    return (df['a5_v'] - df['a5_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_1_mean(df, lookback_len, lookback_shift):
    return (df['b1_v'] - df['b1_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_2_mean(df, lookback_len, lookback_shift):
    return (df['b2_v'] - df['b2_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_3_mean(df, lookback_len, lookback_shift):
    return (df['b3_v'] - df['b3_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_4_mean(df, lookback_len, lookback_shift):
    return (df['b4_v'] - df['b4_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
def get_deriv_bv_5_mean(df, lookback_len, lookback_shift):
    return (df['b5_v'] - df['b5_v'].shift(2)).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)

def get_depth_price_range(df, lookback_len, lookback_shift):
    return (df['a1'].shift(lookback_shift).rolling(lookback_len).max() / df['a1'].shift(lookback_shift).rolling(lookback_len).min() - 1).fillna(0)

import numba as nb

@nb.jit(nopython=True)
def age(prices):
    last_value = prices[-1]
    age = 0
    for i in range(2, len(prices)):
        if prices[-i] != last_value:
            return age
        age += 1
    return age

def get_BAage(df, lookback_len, lookback_shift):
    return df['b1'].shift(lookback_shift).rolling(lookback_len).apply(age, engine='numba', raw=True).fillna(0)

def get_cofi(df, lookback_len, lookback_shift):
    a = df['b1_v']*np.where(df['b1'].diff()>=0, 1, 0)
    b = df['b1_v'].shift()*np.where(df['b1'].diff()<=0, 1, 0)
    c = df['a1_v']*np.where(df['a1'].diff()>=0, 1, 0)
    d = df['a1_v'].shift()*np.where(df['a1'].diff()<=0, 1, 0)
    return (a-b-c+d).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)

def get_bp_rank(df, lookback_len, lookback_shift):
    return ((df['b1'].shift(lookback_shift).rolling(lookback_len).rank()) / lookback_len*2 - 1).fillna(0)

def get_ap_rank(df, lookback_len, lookback_shift):
    return ((df['a1'].shift(lookback_shift).rolling(lookback_len).rank()) / lookback_len*2 - 1).fillna(0)

@nb.jit(nopython=True)
def first_location_of_maximum(x):
    max_value = max(x)
    for loc in range(len(x)):
        if x[loc] == max_value:
            return loc + 1
        
def get_price_idxmax(df, lookback_len, lookback_shift):
    return df['a1'].shift(lookback_shift).rolling(lookback_len).apply(first_location_of_maximum, engine='numba', raw=True).fillna(0)

@nb.jit(nopython=True)
def mean_second_derivative_centra(x):
    sum_value = 0
    for i in range(len(x)-5):
        sum_value += (x[i+5]-2*x[i+3]+x[i])/2
    return sum_value/(2*(len(x)-5))

def get_center_deri_two(df, lookback_len, lookback_shift):
    return df['a1'].shift(lookback_shift).rolling(lookback_len).apply(mean_second_derivative_centra, engine='numba', raw=True).fillna(0)

def get_quasi(df, lookback_len, lookback_shift):
    return df['a1'].diff(1).abs().shift(lookback_shift).rolling(lookback_len).sum().fillna(0)

def get_weighted_price_to_mid(df, lookback_len, lookback_shift):
    avs = df[['a1_v', 'a2_v', 'a3_v', 'a4_v', 'a5_v']].values
    bvs = df[['b1_v', 'b2_v', 'b3_v', 'b4_v', 'b5_v']].values
    aps = df[['a1', 'a2', 'a3', 'a4', 'a5']].values
    bps = df[['b1', 'b2', 'b3', 'b4', 'b5']].values
    return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)

In [3]:
lookback_len = [1, 2, 4, 8, 16, 32, 64, 128, 256]
lookback_shift = 0

for i, length in enumerate(lookback_len):
    if i == 0:
        functions = {f'{k}_{lookback_shift}_{length}': (v, length, lookback_shift) for k, v in globals().items() if callable(v) and k.startswith('get_')}
    else:
        functions.update({f'{k}_{lookback_shift}_{length}': (v, length, lookback_shift) for k, v in globals().items() if callable(v) and k.startswith('get_')})
        

print('numbers of factors:', len(functions))

numbers of factors: 711


In [4]:
def preprocess(df):
    df = df.rename(columns={'Unnamed: 0': 'tick'})
    df['lastPx'] = df['lastPx'].fillna(method='ffill')
    df['BP1'] = df['BP1'].replace(0, np.nan).fillna(method='ffill')
    df['SP1'] = df['SP1'].replace(0, np.nan).fillna(method='ffill')
    # assert df['avg_price'].isna().sum() == 0
    df = df[df['BP1'] != 0]
    df = df[df['SP1'] != 0]
    df['mid'] = (df['BP1'] + df['SP1']) / 2
    df['diff_v'] = (df['volume'] - df['volume'].shift(1)).fillna(0)
    df['return'] = (df['mid'] / df['mid'].shift(1) - 1).fillna(0)
    df = df.rename(columns={'lastPx': 'price', 'BP1':'b1', 'BP2':'b2', 'BP3':'b3', 'BP4':'b4', 'BP5':'b5', 
                            'SP1':'a1', 'SP2':'a2', 'SP3':'a3', 'SP4':'a4', 'SP5':'a5',
                            'BV1':'b1_v', 'BV2':'b2_v', 'BV3':'b3_v', 'BV4':'b4_v', 'BV5':'b5_v',
                            'SV1':'a1_v', 'SV2':'a2_v', 'SV3':'a3_v', 'SV4':'a4_v', 'SV5':'a5_v',
                            'volume': 'volume_sum', 'diff_v': 'volume'})
    df['amount'] = df['price'] * df['volume']
    return df

In [5]:
filename = os.listdir('./2330/')
n = len(filename)
new_filename = []
for i in range(n):
    if filename[i][0:4] == '2330':
        new_filename.append(filename[i])
month = []
for file in new_filename:
    month.append(int(file[8:14]))
month.sort()

In [6]:
from sklearn.metrics import r2_score
import joblib

In [7]:

m = 202308
df = pd.read_csv('./2330/2330_md_'+str(m)+'_'+str(m)+'.csv')
date_list = df['date'].unique().tolist()

for i, date in enumerate(date_list):
    if i == 0:
        df_prc = preprocess(df[df['date'] == date])
    else:
        df_prc = pd.concat([df_prc, preprocess(df[df['date'] == date])])
for name, (func, *args) in functions.items():
    if 'get_ipython' in name:
        continue
    # print(name)
    result = func(df_prc, *args)
    var_name = name.replace('get_', '')
    df_prc = pd.concat([df_prc, result.rename(var_name)], axis=1)
#     df_prc.to_csv('./factors_2330_'+file[8:14]+'.csv')

window_size = 5
date_list = df['date'].unique().tolist()

# 定义 LightGBM 模型的参数
params = {
    'objective': 'regression',
    'metric': 'mse',
    'learning_rate': 0.0001,
    'num_leaves': 31,
    'max_depth': -1,
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'silent': True
}

# 定义 Adam 优化器
optimizer = lgb.LGBMRegressor(boosting_type='gbdt', **params)

# 定义用于存储每个 tick 预测结果的列表
ic_list = []
pred_value = pd.DataFrame(index=df_prc.index, columns=['date', 'return'])
pred_value['date'] = df_prc['date']

# importance_name = pd.DataFrame(index=date_list[window_size:], columns=range(10))
# importance_value = pd.DataFrame(index=date_list[window_size:], columns=range(10))
# 对于每个滚动窗口
cor = []
r_square = []
for i in range(window_size, len(date_list)):
    # 选择训练数据和目标值
    X_train = df_prc.loc[(df_prc['date'] < date_list[i]) & (df_prc['date'] >= date_list[i-window_size]), :]
    X_train = X_train.drop(['tick', 'date', 'time'], axis=1)

    y_train = X_train['return'].shift(-1).fillna(0)
    X_train = X_train.drop(['return'], axis=1)
    X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
    X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

    y_test = X_test['return'].shift(-1).fillna(0)
    X_test = X_test.drop(['return'], axis=1)
    for col in X_train.columns:
        if col != 'price':
            del X_train[col]
            del X_test[col]
        if col == 'price':
            break
    feature_cols = X_train.columns

    # 选择测试数据
#         X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
#         X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

#         y_test = X_test['return'].shift(-1).fillna(0)
#         X_test = X_test.drop(['return'], axis=1)

    # 训练模型
    optimizer.fit(X_train, y_train)

    # 进行预测
    y_pred = optimizer.predict(X_test)
    pred_value.loc[X_test.index[0]:X_test.index[-1], 'return'] = y_pred
    # 计算预测值和真实值之间的皮尔逊相关系数
    corr, _ = pearsonr(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r_square.append(r2)
    # 将预测结果添加到列表中
    ic_list.append(corr)
    # fi = pd.DataFrame(index=feature_cols, columns=['value'])
    # fi['value'] = optimizer.feature_importances_
    # fi = fi[fi.value != 0]
    # fi = fi.sort_values(by='value', ascending=False)
    # fi = fi.iloc[:10]
    # importance_name.iloc[i-window_size, :len(fi)] = fi.index.tolist()
    # importance_value.iloc[i-window_size, :len(fi)] = fi['value'].tolist()
# 计算最后一天内所有 tick 预测的平均 IC 值
ic = np.mean(ic_list)

# 输出结果
print('皮尔逊相关系数:', corr)
print('平均 IC 值:', ic)
print('平均r方:', np.mean(r_square))
joblib.dump(optimizer, '2330_lgbm.pkl')

pred_value.to_csv('./lgbm_pred_2330_'+str(m)+'.csv')
# importance_name.to_csv('./lgbm_importance_name_'+file[0:5]+file[8:14]+'.csv')
# importance_value.to_csv('./lgbm_importance_value_'+file[0:5]+file[8:14]+'.csv')

  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.157304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73881
[LightGBM] [Info] Number of data points in the train set: 123828, number of used features: 665
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.164478 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73921
[LightGBM] [Info] Number of data points in the train set: 125543, number of used features: 656
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.187251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72091
[LightGBM] [Info] Number of data points in the train set: 120321, number of used features: 656
[LightGBM] [Info

In [8]:
filename = os.listdir('./0050/')
n = len(filename)
new_filename = []
for i in range(n):
    if filename[i][0:4] == '0050':
        new_filename.append(filename[i])
month = []
for file in new_filename:
    month.append(int(file[8:14]))
month.sort()

In [9]:

m = 202308
df = pd.read_csv('./0050/0050_md_'+str(m)+'_'+str(m)+'.csv')
date_list = df['date'].unique().tolist()

for i, date in enumerate(date_list):
    if i == 0:
        df_prc = preprocess(df[df['date'] == date])
    else:
        df_prc = pd.concat([df_prc, preprocess(df[df['date'] == date])])
for name, (func, *args) in functions.items():
    if 'get_ipython' in name:
        continue
    # print(name)
    result = func(df_prc, *args)
    var_name = name.replace('get_', '')
    df_prc = pd.concat([df_prc, result.rename(var_name)], axis=1)
#     df_prc.to_csv('./factors_2330_'+file[8:14]+'.csv')

window_size = 5
date_list = df['date'].unique().tolist()

# 定义 LightGBM 模型的参数
params = {
    'objective': 'regression',
    'metric': 'mse',
    'learning_rate': 0.0001,
    'num_leaves': 31,
    'max_depth': -1,
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'silent': True
}

# 定义 Adam 优化器
optimizer = lgb.LGBMRegressor(boosting_type='gbdt', **params)

# 定义用于存储每个 tick 预测结果的列表
ic_list = []
pred_value = pd.DataFrame(index=df_prc.index, columns=['date', 'return'])
pred_value['date'] = df_prc['date']

# importance_name = pd.DataFrame(index=date_list[window_size:], columns=range(10))
# importance_value = pd.DataFrame(index=date_list[window_size:], columns=range(10))
# 对于每个滚动窗口
cor = []
r_square = []
for i in range(window_size, len(date_list)):
    # 选择训练数据和目标值
    X_train = df_prc.loc[(df_prc['date'] < date_list[i]) & (df_prc['date'] >= date_list[i-window_size]), :]
    X_train = X_train.drop(['tick', 'date', 'time'], axis=1)

    y_train = X_train['return'].shift(-1).fillna(0)
    X_train = X_train.drop(['return'], axis=1)
    X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
    X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

    y_test = X_test['return'].shift(-1).fillna(0)
    X_test = X_test.drop(['return'], axis=1)
    for col in X_train.columns:
        if col != 'price':
            del X_train[col]
            del X_test[col]
        if col == 'price':
            break
    feature_cols = X_train.columns

    # 选择测试数据
#         X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
#         X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

#         y_test = X_test['return'].shift(-1).fillna(0)
#         X_test = X_test.drop(['return'], axis=1)

    # 训练模型
    optimizer.fit(X_train, y_train)

    # 进行预测
    y_pred = optimizer.predict(X_test)
    pred_value.loc[X_test.index[0]:X_test.index[-1], 'return'] = y_pred
    # 计算预测值和真实值之间的皮尔逊相关系数
    corr, _ = pearsonr(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r_square.append(r2)
    # 将预测结果添加到列表中
    ic_list.append(corr)
# 计算最后一天内所有 tick 预测的平均 IC 值
ic = np.mean(ic_list)

# 输出结果
print('皮尔逊相关系数:', corr)
print('平均 IC 值:', ic)
print('平均r方:', np.mean(r_square))
joblib.dump(optimizer, '0050_lgbm.pkl')

pred_value.to_csv('./lgbm_pred_0050_'+str(m)+'.csv')

  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.300645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121528
[LightGBM] [Info] Number of data points in the train set: 126594, number of used features: 710
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.308926 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 120395
[LightGBM] [Info] Number of data points in the train set: 126240, number of used features: 710
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.302647 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 116337
[LightGBM] [Info] Number of data points in the train set: 121824, number of used features: 710
[LightGBM] [I

In [10]:
filename = os.listdir('./2603/')
n = len(filename)
new_filename = []
for i in range(n):
    if filename[i][0:4] == '2603':
        new_filename.append(filename[i])
month = []
for file in new_filename:
    month.append(int(file[8:14]))
month.sort()

In [11]:

m = 202308
df = pd.read_csv('./2603/2603_md_'+str(m)+'_'+str(m)+'.csv')
date_list = df['date'].unique().tolist()

for i, date in enumerate(date_list):
    if i == 0:
        df_prc = preprocess(df[df['date'] == date])
    else:
        df_prc = pd.concat([df_prc, preprocess(df[df['date'] == date])])
for name, (func, *args) in functions.items():
    if 'get_ipython' in name:
        continue
    # print(name)
    result = func(df_prc, *args)
    var_name = name.replace('get_', '')
    df_prc = pd.concat([df_prc, result.rename(var_name)], axis=1)
#     df_prc.to_csv('./factors_2330_'+file[8:14]+'.csv')

window_size = 5
date_list = df['date'].unique().tolist()

# 定义 LightGBM 模型的参数
params = {
    'objective': 'regression',
    'metric': 'mse',
    'learning_rate': 0.0001,
    'num_leaves': 31,
    'max_depth': -1,
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
    'silent': True
}

# 定义 Adam 优化器
optimizer = lgb.LGBMRegressor(boosting_type='gbdt', **params)

# 定义用于存储每个 tick 预测结果的列表
ic_list = []
pred_value = pd.DataFrame(index=df_prc.index, columns=['date', 'return'])
pred_value['date'] = df_prc['date']

# importance_name = pd.DataFrame(index=date_list[window_size:], columns=range(10))
# importance_value = pd.DataFrame(index=date_list[window_size:], columns=range(10))
# 对于每个滚动窗口
cor = []
r_square = []
for i in range(window_size, len(date_list)):
    # 选择训练数据和目标值
    X_train = df_prc.loc[(df_prc['date'] < date_list[i]) & (df_prc['date'] >= date_list[i-window_size]), :]
    X_train = X_train.drop(['tick', 'date', 'time'], axis=1)

    y_train = X_train['return'].shift(-1).fillna(0)
    X_train = X_train.drop(['return'], axis=1)
    X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
    X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

    y_test = X_test['return'].shift(-1).fillna(0)
    X_test = X_test.drop(['return'], axis=1)
    for col in X_train.columns:
        if col != 'price':
            del X_train[col]
            del X_test[col]
        if col == 'price':
            break
    feature_cols = X_train.columns

    # 选择测试数据
#         X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
#         X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

#         y_test = X_test['return'].shift(-1).fillna(0)
#         X_test = X_test.drop(['return'], axis=1)

    # 训练模型
    optimizer.fit(X_train, y_train)

    # 进行预测
    y_pred = optimizer.predict(X_test)
    pred_value.loc[X_test.index[0]:X_test.index[-1], 'return'] = y_pred
    # 计算预测值和真实值之间的皮尔逊相关系数
    corr, _ = pearsonr(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    r_square.append(r2)
    # 将预测结果添加到列表中
    ic_list.append(corr)
# 计算最后一天内所有 tick 预测的平均 IC 值
ic = np.mean(ic_list)

# 输出结果
print('皮尔逊相关系数:', corr)
print('平均 IC 值:', ic)
print('平均r方:', np.mean(r_square))
joblib.dump(optimizer, '2603_lgbm.pkl')

pred_value.to_csv('./lgbm_pred_2603_'+str(m)+'.csv')

  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_len).mean().fillna(0)
  return ((avs * aps + bvs * bps).sum(axis=1) / (avs + bvs).sum(axis=1) - df['mid']).shift(lookback_shift).rolling(lookback_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.276021 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77756
[LightGBM] [Info] Number of data points in the train set: 155076, number of used features: 692
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.242476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77598
[LightGBM] [Info] Number of data points in the train set: 156319, number of used features: 692
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.274354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77488
[LightGBM] [Info] Number of data points in the train set: 159323, number of used features: 692
[LightGBM] [Info] 

In [None]:
filename = os.listdir('./2603/')
n = len(filename)
new_filename = []
for i in range(n):
    if filename[i][0:4] == '2603':
        new_filename.append(filename[i])

In [None]:
for file in tqdm(new_filename):
    df = pd.read_csv('./2603/'+file)
    date_list = df['date'].unique().tolist()

    for i, date in enumerate(date_list):
        if i == 0:
            df_prc = preprocess(df[df['date'] == date])
        else:
            df_prc = pd.concat([df_prc, preprocess(df[df['date'] == date])])
    for name, (func, *args) in functions.items():
        if 'get_ipython' in name:
            continue
        # print(name)
        result = func(df_prc, *args)
        var_name = name.replace('get_', '')
        df_prc = pd.concat([df_prc, result.rename(var_name)], axis=1)
#     df_prc.to_csv('./factors_2330_'+file[8:14]+'.csv')
    
    window_size = 5
    date_list = df['date'].unique().tolist()

    # 定义 LightGBM 模型的参数
    params = {
        'objective': 'regression',
        'metric': 'mse',
        'learning_rate': 0.0001,
        'num_leaves': 31,
        'max_depth': -1,
        'n_estimators': 100,
        'random_state': 42,
        'n_jobs': -1,
        'silent': True
    }

    # 定义 Adam 优化器
    optimizer = lgb.LGBMRegressor(boosting_type='gbdt', **params)

    # 定义用于存储每个 tick 预测结果的列表
    ic_list = []
    importance_name = pd.DataFrame(index=date_list[window_size:], columns=range(10))
    importance_value = pd.DataFrame(index=date_list[window_size:], columns=range(10))
    # 对于每个滚动窗口
    cor = []
    r_square = []
    for i in range(window_size, len(date_list)):
        # 选择训练数据和目标值
        X_train = df_prc.loc[(df_prc['date'] < date_list[i]) & (df_prc['date'] >= date_list[i-window_size]), :]
        X_train = X_train.drop(['tick', 'date', 'time'], axis=1)

        y_train = X_train['return'].shift(-1).fillna(0)
        X_train = X_train.drop(['return'], axis=1)
        X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
        X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

        y_test = X_test['return'].shift(-1).fillna(0)
        X_test = X_test.drop(['return'], axis=1)
        for col in X_train.columns:
            if col != 'price':
                del X_train[col]
                del X_test[col]
            if col == 'price':
                break
        # for col in X_test.columns:
        #     if col != 'price':
        #         del X_test[col]
        #         # del X_test[col]
        #     if col == 'price':
        #         break
        feature_cols = X_train.columns
        

        # 选择测试数据
#         X_test = df_prc.loc[(df_prc['date'] == date_list[i]), :]
#         X_test = X_test.drop(['tick', 'date', 'time'], axis=1)

#         y_test = X_test['return'].shift(-1).fillna(0)
#         X_test = X_test.drop(['return'], axis=1)

        # 训练模型
        optimizer.fit(X_train, y_train)

        # 进行预测
        y_pred = optimizer.predict(X_test)

        # 计算预测值和真实值之间的皮尔逊相关系数
        corr, _ = pearsonr(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        r_square.append(r2)
        # 将预测结果添加到列表中
        ic_list.append(corr)
        fi = pd.DataFrame(index=feature_cols, columns=['value'])
        fi['value'] = optimizer.feature_importances_
        fi = fi[fi.value != 0]
        fi = fi.sort_values(by='value', ascending=False)
        fi = fi.iloc[:10]
        importance_name.iloc[i-window_size, :len(fi)] = fi.index.tolist()
        importance_value.iloc[i-window_size, :len(fi)] = fi['value'].tolist()
    # 计算最后一天内所有 tick 预测的平均 IC 值
    ic = np.mean(ic_list)

    # 输出结果
    print('皮尔逊相关系数:', corr)
    print('平均 IC 值:', ic)
    print('平均r方:', np.mean(r_square))

     
    importance_name.to_csv('./lgbm_importance_name_'+file[0:5]+file[8:14]+'.csv')
    importance_value.to_csv('./lgbm_importance_value_'+file[0:5]+file[8:14]+'.csv')