In [1]:
!pip install pandas
!pip install scikit-learn
!pip install seaborn
!pip install lightgbm[scikit-learn]
!pip install plotly
!pip install xgboost
!pip install catboost
!pip install joblib
!pip install kaggle

zsh:1: no matches found: lightgbm[scikit-learn]


In [4]:
#imports
# Data Analytic Tools
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Computing Tools 
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
import numpy as np 
import joblib 
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data/__init__.py
data/public_timeseries_testing_util.py
data/train.csv
data/models/{modelname}_{i}.model
data/example_test_files/revealed_targets.csv
data/example_test_files/test.csv
data/example_test_files/sample_submission.csv
data/optiver2023/__init__.py
data/optiver2023/competition.cpython-310-x86_64-linux-gnu.so
data/optiver2023/__pycache__/__init__.cpython-311.pyc
data/.ipynb_checkpoints/public_timeseries_testing_util-checkpoint.py
data/catboost_info/learn_error.tsv
data/catboost_info/test_error.tsv
data/catboost_info/time_left.tsv
data/catboost_info/catboost_training.json
data/catboost_info/learn/events.out.tfevents
data/catboost_info/test/events.out.tfevents


In [5]:
train = pd.read_csv('data/train.csv')

train.head()

train.isnull().sum()

train = train.dropna(subset=['target'])
targets = train['target'].values
train = train.drop('time_id', axis=1)
train = train.drop('row_id', axis=1)
# Drop the index inplace 
train.reset_index(drop=True, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(train, targets, train_size=0.7, random_state=3)

In [6]:
from numba import njit, prange

def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
               'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2'
               ]

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    return df[features]
    

In [7]:
TRAINING = True
if TRAINING:
   df_ = generate_features(X_train)


In [8]:
model_path ='data/RKCP0219'
os.makedirs('models', exist_ok=True)

N_fold = 2

if TRAINING:
    X = df_.values
    Y = y_train
    X = X[np.isfinite(Y)]
    Y = Y[np.isfinite(Y)]
    index = np.arange(len(X))
    
models = []
scores = pd.DataFrame(columns=['name', 'score'])

def train_and_test(model_dict, modelname='lgb'):
    if TRAINING:
        model = model_dict[modelname]
        model.fit(X[index%N_fold!=i], Y[index%N_fold!=i], 
                    eval_set=[(X[index%N_fold==i], Y[index%N_fold==i])], 
                    # verbose=10, 
                    # early_stopping_rounds=100
                    )
        models.append(model)
        joblib.dump(model, f'data/models/{modelname}_{i}.model')
        feat = generate_features(X_test) 
        # Make predictions using the trained models and compute the mean
        train_preds = model.predict(X_train)
        train_score = mean_absolute_error(y_train, train_preds)
        scores.loc[len(scores)]=[modelname, train_score] 

    else:
        models.append(joblib.load(f'{model_path}/{modelname}_{i}.model'))
    return 

model_dict = {
    'lgb': lgb.LGBMRegressor(objective='regression_l1', n_estimators=500),
    'xgb': xgb.XGBRegressor(tree_method='hist', objective='reg:absoluteerror', n_estimators=500),
    'cbt': cbt.CatBoostRegressor(objective='MAE', iterations=10),
}

for i in range(N_fold):
    # train_and_test(model_dict, 'lgb')
    # train_and_test(model_dict, 'xgb')
    train_and_test(model_dict, 'cbt')

0:	learn: 6.4081870	test: 6.4037484	best: 6.4037484 (0)	total: 160ms	remaining: 1.44s
1:	learn: 6.4052824	test: 6.4008356	best: 6.4008356 (1)	total: 239ms	remaining: 958ms
2:	learn: 6.4026341	test: 6.3981684	best: 6.3981684 (2)	total: 318ms	remaining: 743ms
3:	learn: 6.4001768	test: 6.3956939	best: 6.3956939 (3)	total: 396ms	remaining: 594ms
4:	learn: 6.3977608	test: 6.3932777	best: 6.3932777 (4)	total: 475ms	remaining: 475ms
5:	learn: 6.3954941	test: 6.3909847	best: 6.3909847 (5)	total: 553ms	remaining: 369ms
6:	learn: 6.3932770	test: 6.3887697	best: 6.3887697 (6)	total: 629ms	remaining: 269ms
7:	learn: 6.3911895	test: 6.3866636	best: 6.3866636 (7)	total: 705ms	remaining: 176ms
8:	learn: 6.3892143	test: 6.3846931	best: 6.3846931 (8)	total: 783ms	remaining: 87ms
9:	learn: 6.3873532	test: 6.3828202	best: 6.3828202 (9)	total: 861ms	remaining: 0us

bestTest = 6.382820243
bestIteration = 9
0:	learn: 6.4036671	test: 6.4081266	best: 6.4081266 (0)	total: 126ms	remaining: 1.13s
1:	learn: 6.400

In [9]:
#Testing
scores

Unnamed: 0,name,score
0,cbt,6.420514
1,cbt,6.418807


In [10]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test() 

ModuleNotFoundError: No module named 'optiver2023'

In [None]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    feat = generate_features(test)

    sample_prediction['target'] = np.mean([model.predict(feat) for model in models], 0)
    env.predict(sample_prediction)
    counter += 1