In [1]:
#!pip uninstall -y lightgbm

In [2]:
# Install LightGBM, passing USE_GPU=ON as a CMake define to request a GPU build.
# NOTE(review): the version is unpinned, so re-runs may pick up a different
# LightGBM release — consider pinning. The --config-settings flag presumably
# only takes effect when pip builds from source; confirm that a GPU-enabled
# build is what actually ends up installed.
!pip install lightgbm --config-settings=cmake.define.USE_GPU=ON



In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from itertools import combinations
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from warnings import simplefilter
import joblib
import os

# Silence all warnings for the session; the pandas PerformanceWarning filter
# is registered explicitly as well.
warnings.simplefilter("ignore")
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)

# Run switches read by the later cells.
is_train = True  # fit and save one model per fold
is_infer = True  # run the prediction loop
N_Folds = 6      # number of KFold splits (and of saved fold models)



In [4]:
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')

# Per-stock summary statistics of order-book size. Each "*_sizes" Series is
# indexed by stock_id and holds the bid_size statistic plus the matching
# ask_size statistic. They are computed on the full file, before rows with a
# missing target are dropped below.
_by_stock = train.groupby('stock_id')
_bid_by_stock = _by_stock['bid_size']
_ask_by_stock = _by_stock['ask_size']

median_sizes = _bid_by_stock.median() + _ask_by_stock.median()
std_sizes = _bid_by_stock.std() + _ask_by_stock.std()
max_sizes = _bid_by_stock.max() + _ask_by_stock.max()
min_sizes = _bid_by_stock.min() + _ask_by_stock.min()
mean_sizes = _bid_by_stock.mean() + _ask_by_stock.mean()
first_sizes = _bid_by_stock.first() + _ask_by_stock.first()
last_sizes = _bid_by_stock.last() + _ask_by_stock.last()

# Deciles of bid_size only (ask_size is not added here), keyed "quantile0.1"
# through "quantile0.9".
quantile_dic = {}
for q in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
    quantile_dic["quantile" + str(q)] = _bid_by_stock.quantile(q)

# Author's note: the same statistics could also be built per date_id
# (it did not look like the date column gets dropped).
#date_median_sizes = train.groupby('date_id')['bid_size'].median() + train.groupby('date_id')['ask_size'].median()
#date_std_sizes = train.groupby('date_id')['bid_size'].std() + train.groupby('date_id')['ask_size'].std()
#date_max_sizes = train.groupby('date_id')['bid_size'].max() + train.groupby('date_id')['ask_size'].max()
#date_min_sizes = train.groupby('date_id')['bid_size'].min() + train.groupby('date_id')['ask_size'].min()
#date_mean_sizes = train.groupby('date_id')['bid_size'].mean() + train.groupby('date_id')['ask_size'].mean()
#date_quantile_dic = {}
#for q in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
#    date_quantile_dic["quantile" + str(q)] = train.groupby('date_id')['bid_size'].quantile(q)

# Rows without a target cannot be used for supervised training.
train = train.dropna(subset=['target'])

In [5]:
def feature_eng(df):
    """Build the model feature frame from a raw order-book frame.

    The identifier columns ``row_id``, ``date_id`` and ``time_id`` are removed
    (when present) and the engineered columns are appended in a fixed order:
    size ratios and sums, the bid/ask mid price, per-stock size statistics
    looked up by ``stock_id``, a high-volume flag, pairwise price differences
    and imbalances, and three-way price imbalances.

    Relies on the module-level per-stock lookups ``median_sizes``,
    ``std_sizes``, ``max_sizes``, ``min_sizes``, ``mean_sizes``,
    ``first_sizes``, ``last_sizes`` and ``quantile_dic``.

    Args:
        df: DataFrame containing at least ``stock_id``, ``imbalance_size``,
            ``matched_size``, ``bid_size``, ``ask_size`` and the six price
            columns listed in ``prices`` below.

    Returns:
        A new DataFrame with the original non-identifier columns followed by
        the engineered ones. The input frame is not modified.

    Notes:
        None of the divisions is guarded, so a zero denominator produces
        inf/NaN in the corresponding feature.
    """
    # drop() hands back a fresh frame, so the column assignments below never
    # write into a slice of the caller's frame (no chained-assignment warning).
    df = df.drop(columns=['row_id', 'date_id', 'time_id'], errors='ignore')

    # Ratio of unmatched (imbalance) size to matched size.
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    # Difference between ask-side and bid-side size.
    df['bid_ask_volume_diff'] = df['ask_size'] - df['bid_size']
    # Total size on both sides of the book.
    df['bid_plus_ask_sizes'] = df['bid_size'] + df['ask_size']

    # Mean of the ask and bid prices.
    df['mid_price'] = (df['ask_price'] + df['bid_price']) / 2

    # Per-stock statistics, looked up by stock_id; ids missing from a lookup
    # map to NaN.
    stock_ids = df['stock_id']
    per_stock_stats = [
        ('median_size', median_sizes),
        ('std_size', std_sizes),
        ('max_size', max_sizes),
        ('min_size', min_sizes),
        ('mean_size', mean_sizes),
        ('first_size', first_sizes),
        ('last_size', last_sizes),
    ]
    for col_name, stat in per_stock_stats:
        df[col_name] = stock_ids.map(stat)
    for q in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]:
        key = "quantile" + str(q)
        df[key] = stock_ids.map(quantile_dic[key])
    # TODO (author idea): date-level equivalents (date_median_size, ...) could
    # be mapped from date_id. date_id is removed at the top of this function,
    # so that lookup would have to happen before the drop.

    # 1 when the row's total size exceeds the stock's median-based size.
    df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['median_size'], 1, 0)

    prices = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    # Every price pair: raw difference (float32) and (a - b) / (a + b).
    for c in combinations(prices, 2):
        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]} - {c[1]})/({c[0]} + {c[1]})')

    # Every price triple: (max - mid) / (mid - min), where mid is the value
    # left after removing the row-wise max and min from the row sum.
    for c in combinations(prices, 3):
        trio = df[list(c)]
        max_ = trio.max(axis=1)
        min_ = trio.min(axis=1)
        mid_ = trio.sum(axis=1) - min_ - max_

        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_-mid_)/(mid_-min_)

    gc.collect()

    return df

In [6]:
# Target vector and engineered feature matrix for training.
y = train['target'].values
X = feature_eng(train.drop(columns='target'))

# Observed target range; the inference cell clips its predictions to it.
y_min = np.min(y)
y_max = np.max(y)

# Earlier parameter set, kept for reference only. It used to live in a bare
# triple-quoted string, which the notebook echoed back as cell output.
# params = {
#     'learning_rate': 0.009,#0.009,#0.018,
#     'max_depth': 13,#10,#9,
#     'n_estimators': 1400,#600,
#     'num_leaves': 500,#440,
#     'objective': 'mae',
#     'random_state': 43,
#     'reg_alpha': 0.01,
#     'reg_lambda': 0.01,
#     #'device': 'gpu'
# }

"\nparams = {\n    'learning_rate': 0.009,#0.009,#0.018,\n    'max_depth': 13,#10,#9,\n    'n_estimators': 1400,#600,\n    'num_leaves': 500,#440,\n    'objective': 'mae',\n    'random_state': 43,\n    'reg_alpha': 0.01,\n    'reg_lambda': 0.01,\n    #'device': 'gpu'\n}\n"

In [7]:
# LightGBM training parameters. The trailing notes list other values left by
# the author (presumably earlier trials).
params = {
    "learning_rate": 0.009,   # 0.009, 0.018
    "max_depth": 13,          # 10, 9
    "n_estimators": 900,      # 600
    "num_leaves": 500,        # 440
    "objective": "mae",
    "random_state": 43,
    "reg_alpha": 0.01,
    "reg_lambda": 0.01,
    # GPU settings: platform 0, device 0.
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
}

In [8]:
# Shuffled K-fold split over all rows, with a fixed seed so folds are repeatable.
kf = KFold(n_splits=N_Folds, shuffle=True, random_state=100)
mae_scores = []

if is_train:
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid)

        # Early stopping and periodic logging go through callbacks: the
        # `early_stopping_rounds=` / `verbose_eval=` keyword arguments were
        # removed from lgb.train in LightGBM 4.0, while the callbacks work on
        # both 3.3.x and 4.x.
        m = lgb.train(
            params,
            train_data,
            valid_sets=[train_data, valid_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=50),
            ],
        )
        print(f"Fold {fold+1} Trainning finished.")

        # Persist the fold model; the inference cell loads it from this path.
        model_filename = f"/kaggle/working/model_fold_{fold+1}.pkl"
        joblib.dump(m, model_filename)
        y_pred_valid = m.predict(X_valid)

        # Replace any NaN with 0 before scoring so the MAE stays finite.
        y_pred_valid = np.nan_to_num(y_pred_valid)
        y_valid = np.nan_to_num(y_valid)
        mae = mean_absolute_error(y_valid, y_pred_valid)
        print("############mae##############:",mae)
        mae_scores.append(mae)

    # Average validation MAE across all N_Folds folds.
    average_mae = np.mean(mae_scores)
    print(f"{N_Folds} fold MAE: {average_mae}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19778
[LightGBM] [Info] Number of data points in the train set: 4364910, number of used features: 84
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 84 dense feature groups (349.67 MB) transferred to GPU in 0.231481 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.060201
Training until validation scores don't improve for 50 rounds
[50]	training's l1: 6.32876	valid_1's l1: 6.34572
[100]	training's l1: 6.29033	valid_1's l1: 6.31147
[150]	training's l1: 6.26858	valid_1's l1: 6.2937
[200]	training's l1: 6.25413	valid_1's l1: 6.28325
[250]	training's l1: 6.24319	valid_1's l1: 6.27621
[300]	training's l1: 6.23362	valid_1's l1:

In [9]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so that the returned values sum to zero.

    The total of ``prices`` is spread across the elements in proportion to
    ``sqrt(volumes)`` and subtracted, so larger-volume entries absorb more of
    the adjustment.

    Args:
        prices: array-like of values to re-centre.
        volumes: array-like of non-negative weights, same length as ``prices``.

    Returns:
        ``prices`` minus each element's sqrt-volume share of ``sum(prices)``.
    """
    weights = np.sqrt(volumes)
    # Adjustment per unit of weight.
    per_weight = np.sum(prices) / np.sum(weights)
    return prices - weights * per_weight

if is_infer:
    # Competition API module; imported here so the rest of the notebook can
    # run without it when is_infer is False.
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    predictions = []

    # Load every fold model once, instead of re-reading all of them from disk
    # for each test batch.
    fold_models = [
        joblib.load(f"/kaggle/working/model_fold_{fold+1}.pkl")
        for fold in range(0, N_Folds)
    ]
    # Column names, in the order the boosters were trained on.
    # NOTE(review): the original call passed predict_disable_shape_check=True,
    # which suggests the live frame carries columns the training frame lacked.
    # LightGBM reads columns by position, so an extra column would shift every
    # later feature. Selecting by name keeps each feature in its trained slot
    # and raises KeyError if an expected column is missing.
    feature_names = fold_models[0].feature_name()

    for (test, revealed_targets, sample_prediction) in iter_test:
        feat = feature_eng(test)[feature_names]

        # Average the fold predictions.
        fold_prediction = 0
        for m in fold_models:
            fold_prediction += m.predict(feat)
        fold_prediction /= N_Folds

        # Re-centre so the batch sums to zero, weighting by sqrt of book size,
        # then clip to the target range seen in training.
        fold_prediction = zero_sum(fold_prediction, test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
        clipped_predictions = np.clip(fold_prediction, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
