In [1]:
!pip install pandas
!pip install scikit-learn
!pip install seaborn
!pip install lightgbm[scikit-learn]
!pip install plotly
!pip install xgboost
!pip install catboost
!pip install joblib
!pip install kaggle

zsh:1: no matches found: lightgbm[scikit-learn]


In [4]:
#imports
# Data Analytic Tools
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Computing Tools 
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
import numpy as np 
import joblib 
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data/__init__.py
data/public_timeseries_testing_util.py
data/train.csv
data/models/{modelname}_{i}.model
data/example_test_files/revealed_targets.csv
data/example_test_files/test.csv
data/example_test_files/sample_submission.csv
data/optiver2023/__init__.py
data/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
data/optiver2023/__pycache__/__init__.cpython-311.pyc
data/.ipynb_checkpoints/public_timeseries_testing_util-checkpoint.py
data/catboost_info/learn_error.tsv
data/catboost_info/test_error.tsv
data/catboost_info/time_left.tsv
data/catboost_info/catboost_training.json
data/catboost_info/learn/events.out.tfevents
data/catboost_info/test/events.out.tfevents


In [5]:
train = pd.read_csv('data/train.csv')

train.head()

train.isnull().sum()

train = train.dropna(subset=['target'])
targets = train['target'].values
train = train.drop('time_id', axis=1)
train = train.drop('row_id', axis=1)
# Drop the index inplace 
train.reset_index(drop=True, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(train, targets, train_size=0.7, random_state=3)

In [6]:
from numba import njit, prange

def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
               'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2'
               ]

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            if i>j:
                df[f'{a}_{b}_imb'] = df.eval(f'({a}-{b})/({a}+{b})')
                features.append(f'{a}_{b}_imb')    
                    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            for k,c in enumerate(prices):
                if i>j and j>k:
                    max_ = df[[a,b,c]].max(axis=1)
                    min_ = df[[a,b,c]].min(axis=1)
                    mid_ = df[[a,b,c]].sum(axis=1)-min_-max_

                    df[f'{a}_{b}_{c}_imb2'] = (max_-mid_)/(mid_-min_)
                    features.append(f'{a}_{b}_{c}_imb2')
    
    
    return df[features]
    

In [7]:
TRAINING = True
if TRAINING:
   df_ = generate_features(X_train)

In [23]:
model_path ='data/RKCP0219'
os.makedirs('models', exist_ok=True)

N_fold = 4

if TRAINING:
    X = df_.values
    Y = y_train
    X = X[np.isfinite(Y)]
    Y = Y[np.isfinite(Y)]
    index = np.arange(len(X))
    
models = []
scores = pd.DataFrame(columns=['name', 'mean absolute error'])

def train_and_test(model_dict, modelname='lgb'):
    if TRAINING:
        model = model_dict[modelname]
        if modelname=='lgb':
            model.fit(X[index%N_fold!=i], Y[index%N_fold!=i], 
                    eval_set=[(X[index%N_fold==i], Y[index%N_fold==i])])
        else:
            model.fit(X[index%N_fold!=i], Y[index%N_fold!=i], 
                    eval_set=[(X[index%N_fold==i], Y[index%N_fold==i])], 
                    verbose=10, 
                    early_stopping_rounds=100
                    )
        
        models.append(model)
        joblib.dump(model, f'data/models/{modelname}_{i}.model')
        
        feat = generate_features(X_test) 
        # Make predictions using the trained models and compute the mean
        test_preds = model.predict(feat)
        test_score = mean_absolute_error(y_test, test_preds)
        scores.loc[len(scores)]=[modelname, test_score] 

    else:
        models.append(joblib.load(f'{model_path}/{modelname}_{i}.model'))
    return 

model_dict = {
    'lgb': lgb.LGBMRegressor(objective='regression_l1', n_estimators=500),
    'xgb': xgb.XGBRegressor(tree_method='hist', objective='reg:absoluteerror', n_estimators=500),
    'cbt': cbt.CatBoostRegressor(objective='MAE', iterations=10),
}

for i in range(N_fold):
    train_and_test(model_dict, 'lgb')
    train_and_test(model_dict, 'xgb')
    train_and_test(model_dict, 'cbt')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3118
[LightGBM] [Info] Number of data points in the train set: 2749893, number of used features: 14
[LightGBM] [Info] Start training from score -0.060201




[0]	validation_0-mae:6.38074
[10]	validation_0-mae:6.32218
[20]	validation_0-mae:6.30817
[30]	validation_0-mae:6.29795
[40]	validation_0-mae:6.29174
[50]	validation_0-mae:6.28731
[60]	validation_0-mae:6.28374
[70]	validation_0-mae:6.28069
[80]	validation_0-mae:6.27830
[90]	validation_0-mae:6.27637
[100]	validation_0-mae:6.27502
[110]	validation_0-mae:6.27323
[120]	validation_0-mae:6.27207
[130]	validation_0-mae:6.27062
[140]	validation_0-mae:6.26938
[150]	validation_0-mae:6.26821
[160]	validation_0-mae:6.26747
[170]	validation_0-mae:6.26672
[180]	validation_0-mae:6.26606
[190]	validation_0-mae:6.26514
[200]	validation_0-mae:6.26466
[210]	validation_0-mae:6.26417
[220]	validation_0-mae:6.26357
[230]	validation_0-mae:6.26281
[240]	validation_0-mae:6.26219
[250]	validation_0-mae:6.26122
[260]	validation_0-mae:6.26074
[270]	validation_0-mae:6.26002
[280]	validation_0-mae:6.25962
[290]	validation_0-mae:6.25913
[300]	validation_0-mae:6.25864
[310]	validation_0-mae:6.25808
[320]	validation_0-



[0]	validation_0-mae:6.37639
[10]	validation_0-mae:6.32048
[20]	validation_0-mae:6.30514
[30]	validation_0-mae:6.29528
[40]	validation_0-mae:6.28985
[50]	validation_0-mae:6.28552
[60]	validation_0-mae:6.28204
[70]	validation_0-mae:6.27930
[80]	validation_0-mae:6.27699
[90]	validation_0-mae:6.27558
[100]	validation_0-mae:6.27378
[110]	validation_0-mae:6.27263
[120]	validation_0-mae:6.27138
[130]	validation_0-mae:6.27019
[140]	validation_0-mae:6.26909
[150]	validation_0-mae:6.26781
[160]	validation_0-mae:6.26683
[170]	validation_0-mae:6.26569
[180]	validation_0-mae:6.26487
[190]	validation_0-mae:6.26416
[200]	validation_0-mae:6.26334
[210]	validation_0-mae:6.26255
[220]	validation_0-mae:6.26179
[230]	validation_0-mae:6.26139
[240]	validation_0-mae:6.26095
[250]	validation_0-mae:6.26041
[260]	validation_0-mae:6.25969
[270]	validation_0-mae:6.25924
[280]	validation_0-mae:6.25886
[290]	validation_0-mae:6.25859
[300]	validation_0-mae:6.25803
[310]	validation_0-mae:6.25785
[320]	validation_0-



[0]	validation_0-mae:6.37251
[10]	validation_0-mae:6.31349
[20]	validation_0-mae:6.29910
[30]	validation_0-mae:6.28860
[40]	validation_0-mae:6.28250
[50]	validation_0-mae:6.27871
[60]	validation_0-mae:6.27499
[70]	validation_0-mae:6.27251
[80]	validation_0-mae:6.26998
[90]	validation_0-mae:6.26802
[100]	validation_0-mae:6.26655
[110]	validation_0-mae:6.26477
[120]	validation_0-mae:6.26362
[130]	validation_0-mae:6.26238
[140]	validation_0-mae:6.26109
[150]	validation_0-mae:6.25997
[160]	validation_0-mae:6.25892
[170]	validation_0-mae:6.25767
[180]	validation_0-mae:6.25684
[190]	validation_0-mae:6.25614
[200]	validation_0-mae:6.25557
[210]	validation_0-mae:6.25529
[220]	validation_0-mae:6.25448
[230]	validation_0-mae:6.25390
[240]	validation_0-mae:6.25339
[250]	validation_0-mae:6.25268
[260]	validation_0-mae:6.25198
[270]	validation_0-mae:6.25167
[280]	validation_0-mae:6.25093
[290]	validation_0-mae:6.25054
[300]	validation_0-mae:6.24997
[310]	validation_0-mae:6.24966
[320]	validation_0-



[0]	validation_0-mae:6.38605
[10]	validation_0-mae:6.32888
[20]	validation_0-mae:6.31523
[30]	validation_0-mae:6.30468
[40]	validation_0-mae:6.29830
[50]	validation_0-mae:6.29473
[60]	validation_0-mae:6.29101
[70]	validation_0-mae:6.28848
[80]	validation_0-mae:6.28631
[90]	validation_0-mae:6.28424
[100]	validation_0-mae:6.28267
[110]	validation_0-mae:6.28128
[120]	validation_0-mae:6.27967
[130]	validation_0-mae:6.27855
[140]	validation_0-mae:6.27717
[150]	validation_0-mae:6.27601
[160]	validation_0-mae:6.27517
[170]	validation_0-mae:6.27427
[180]	validation_0-mae:6.27338
[190]	validation_0-mae:6.27285
[200]	validation_0-mae:6.27217
[210]	validation_0-mae:6.27109
[220]	validation_0-mae:6.27037
[230]	validation_0-mae:6.26956
[240]	validation_0-mae:6.26885
[250]	validation_0-mae:6.26803
[260]	validation_0-mae:6.26735
[270]	validation_0-mae:6.26692
[280]	validation_0-mae:6.26621
[290]	validation_0-mae:6.26567
[300]	validation_0-mae:6.26507
[310]	validation_0-mae:6.26445
[320]	validation_0-

In [24]:
#Testing
scores

Unnamed: 0,name,mean absolute error
0,lgb,6.265238
1,xgb,6.248527
2,cbt,6.379824
3,lgb,6.264602
4,xgb,6.248145
5,cbt,6.379606
6,lgb,6.264306
7,xgb,6.248612
8,cbt,6.379726
9,lgb,6.265185


In [25]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test() 

ModuleNotFoundError: No module named 'optiver2023'

In [26]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    feat = generate_features(test)

    sample_prediction['target'] = np.mean([model.predict(feat) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1

NameError: name 'iter_test' is not defined