In [6]:
import numpy as np 
import pandas as pd
import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(folder, entry))

/kaggle/input/jsbaselinezyz/cbt_0.model
/kaggle/input/jsbaselinezyz/lgb_3.model
/kaggle/input/jsbaselinezyz/cbt_2.model
/kaggle/input/jsbaselinezyz/lgb_0.model
/kaggle/input/jsbaselinezyz/lgb_4.model
/kaggle/input/jsbaselinezyz/xgb_1.model
/kaggle/input/jsbaselinezyz/xgb_2.model
/kaggle/input/jsbaselinezyz/cbt_3.model
/kaggle/input/jsbaselinezyz/xgb_3.model
/kaggle/input/jsbaselinezyz/lgb_1.model
/kaggle/input/jsbaselinezyz/xgb_4.model
/kaggle/input/jsbaselinezyz/xgb_0.model
/kaggle/input/jsbaselinezyz/lgb_2.model
/kaggle/input/jsbaselinezyz/cbt_1.model
/kaggle/input/jsbaselinezyz/cbt_4.model
/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.

In [7]:
import pandas as pd
import numpy as np
import os
import joblib
import polars as pl
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from joblib import Parallel, delayed
import kaggle_evaluation.jane_street_inference_server

In [8]:
def reduce_mem_usage(df, float16_as32=True):
    """Downcast the numeric columns of ``df`` in place and report the memory saved.

    The first parameter used to be called ``self`` while the body read a
    module-level ``df``; it is now the frame that is actually processed, so the
    function no longer depends on notebook state. Positional callers are
    unaffected.

    Args:
        df: pandas DataFrame to shrink. Columns are reassigned on this object.
        float16_as32: When True, float columns whose values fit in float16 are
            stored as float32 instead of float16.

    Returns:
        The same DataFrame object that was passed in.

    Notes:
        * Signed integer columns are cast to the first of int8/int16/int32/int64
          whose range strictly contains the column's min and max.
        * Float columns are cast to float16/float32 (see ``float16_as32``),
          float32, or float64 using the same strict range test.
        * Object, category and every other dtype (bool, unsigned, datetime, ...)
          are left untouched; previously non-int dtypes fell through to the
          float branch and were converted to floats.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage (START): {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)
        if col_type == object or type_name == 'category':
            continue

        if type_name[:3] == 'int':
            c_min, c_max = df[col].min(), df[col].max()
            # Pick the narrowest signed integer type whose range strictly
            # contains the observed values; leave the column alone if none does.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif type_name[:5] == 'float':
            c_min, c_max = df[col].min(), df[col].max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float32 if float16_as32 else np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached when min/max are NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage (END): {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero starting footprint cannot raise ZeroDivisionError.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))
    return df
    
            

In [9]:
# Prefer a local copy of the competition data when one sits next to the
# notebook; otherwise fall back to the Kaggle input mount.
if os.path.exists('./jane-street-real-time-market-data-forecasting'):
    input_path = './jane-street-real-time-market-data-forecasting/'
else:
    input_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/'

# Model input columns: feature_00 ... feature_78.
feature_names = [f"feature_{i:02d}" for i in range(79)]

num_valid_dates = 100  # trailing dates held out as the validation split
skip_dates = 500       # rows with date_id below this value are dropped
N_fold = 5             # number of folds the training dates are split into


In [10]:
# True: read the training data and fit models. False: skip loading the
# training data (train() then loads saved models instead of fitting).
TRAINING = False

if TRAINING:
    df = pd.read_parquet(f'{input_path}/train.parquet')
    df = reduce_mem_usage(df, False)
    # Keep only rows at or after `skip_dates`, then renumber the index.
    df = df.loc[df['date_id'] >= skip_dates].reset_index(drop=True)
    dates = df['date_id'].unique()
    # The last `num_valid_dates` unique dates validate; everything before trains.
    train_dates, valid_dates = dates[:-num_valid_dates], dates[-num_valid_dates:]
    print(df.tail())

In [13]:
# Directory that holds the saved per-fold model files.
model_path = '/kaggle/input/jsbaselinezyz'

if TRAINING:
    # Validation features, target and sample weights, all restricted to the
    # rows whose date_id is one of `valid_dates`.
    X_valid, y_valid, w_valid = (
        df[columns].loc[df['date_id'].isin(valid_dates)]
        for columns in (feature_names, 'responder_6', 'weight')
    )

# Filled by train(): one model is appended per call.
models = []

def train(model_dict, model_name='lgb', fold=None):
    """Fit (or load) the model for one fold and append it to the global ``models``.

    Args:
        model_dict: Mapping from model name to an estimator used as the
            template for fitting. Only read when ``TRAINING`` is true.
        model_name: Key into ``model_dict``; also the file-name prefix of the
            saved model. ``'lgb'`` and ``'cbt'`` get dedicated ``fit`` calls,
            any other name uses the remaining (XGBoost-style) ``fit`` call.
        fold: Index of the fold. When omitted, the notebook-level loop
            variable ``i`` is used, which is how the function originally
            obtained the fold index.

    Returns:
        None. The fitted or loaded model is appended to ``models``.

    Behaviour:
        * ``TRAINING`` false: loads ``{model_path}/{model_name}_{fold}.model``.
        * ``TRAINING`` true: fits on the training dates whose position modulo
          ``N_fold`` differs from ``fold``, validates on the global
          ``X_valid``/``y_valid``/``w_valid``, and writes the model to
          ``./models/{model_name}_{fold}.model``.
    """
    if fold is None:
        # Backward compatibility with callers that rely on the global loop index.
        fold = i

    if not TRAINING:
        # NOTE(security): joblib.load unpickles arbitrary objects; only point
        # model_path at files you trust.
        models.append(joblib.load(f'{model_path}/{model_name}_{fold}.model'))
        return

    import copy
    import gc

    if N_fold > 1:
        selected_dates = [date for ii, date in enumerate(train_dates) if ii % N_fold != fold]
    else:
        # NOTE(review): `dates` also contains the validation dates — confirm
        # that training on them is intended in the single-fold case.
        selected_dates = dates

    # Fit a private copy of the template. Re-fitting the shared estimator made
    # every entry of `models` alias the same object, so after the last fold the
    # in-memory ensemble consisted of N copies of the final fit.
    model = copy.deepcopy(model_dict[model_name])

    train_rows = df['date_id'].isin(selected_dates)
    X_train = df[feature_names].loc[train_rows]
    y_train = df['responder_6'].loc[train_rows]
    w_train = df['weight'].loc[train_rows]

    if model_name == 'lgb':
        model.fit(X_train, y_train, w_train,
                  eval_metric=[r2_lgb],
                  eval_set=[(X_valid, y_valid, w_valid)],
                  callbacks=[
                      lgb.early_stopping(100),
                      lgb.log_evaluation(10)
                  ])
    elif model_name == 'cbt':
        evalset = cbt.Pool(X_valid, y_valid, weight=w_valid)
        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_set=[evalset],
                  verbose=10,
                  early_stopping_rounds=150)
    else:
        model.fit(X_train, y_train, sample_weight=w_train,
                  eval_set=[(X_valid, y_valid)],
                  sample_weight_eval_set=[w_valid],
                  verbose=10,
                  early_stopping_rounds=100)

    models.append(model)

    # joblib.dump does not create missing directories.
    os.makedirs('./models', exist_ok=True)
    joblib.dump(model, f'./models/{model_name}_{fold}.model')

    # Release the per-fold training slices before the next fold is built.
    del X_train, y_train, w_train, train_rows
    gc.collect()

    return

# Custom R2 metric for XGBoost
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Return the negated weighted R2 (zero-mean form) of ``y_pred`` against ``y_true``.

    R2 is computed as ``1 - avg(w * (y_pred - y_true)^2) / avg(w * y_true^2)``;
    the value is negated so that smaller is better.

    Args:
        y_true: Array of targets.
        y_pred: Array of predictions, same shape as ``y_true``.
        sample_weight: Optional per-sample weights. ``None`` (the default)
            gives every sample equal weight, so the metric can also be called
            with only ``(y_true, y_pred)`` — e.g. for an evaluation set that
            carries no weights — instead of raising ``TypeError``.

    Returns:
        ``-R2`` as a float.
    """
    weighted_sq_error = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    # The tiny constant keeps the denominator non-zero for all-zero targets.
    weighted_sq_target = np.average((y_true) ** 2, weights=sample_weight) + 1e-38
    r2 = 1 - weighted_sq_error / weighted_sq_target
    return -r2

# Custom R2 metric for LightGBM
def r2_lgb(y_true, y_pred, sample_weight):
    """Weighted R2 (zero-mean form) as a LightGBM-style evaluation result.

    Args:
        y_true: Array of targets.
        y_pred: Array of predictions, same shape as ``y_true``.
        sample_weight: Per-sample weights forwarded to ``np.average``.

    Returns:
        Tuple ``('r2', value, True)``; the trailing ``True`` marks the metric
        as higher-is-better.
    """
    squared_error = (y_pred - y_true) ** 2
    numerator = np.average(squared_error, weights=sample_weight)
    # The tiny constant keeps the denominator non-zero for all-zero targets.
    denominator = np.average((y_true) ** 2, weights=sample_weight) + 1e-38
    return 'r2', 1 - numerator / denominator, True

# Custom R2 metric for CatBoost
class r2_cbt(object):
    """Custom weighted R2 metric object (zero-mean form) for CatBoost."""

    def is_max_optimal(self):
        """Report that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted squared error and weighted squared target.

        Args:
            approxes: Sequence holding exactly one sequence of predictions.
            target: Sequence of targets, same length as the predictions.
            weight: Per-sample weights, or None for a weight of 1.0 each.

        Returns:
            Tuple ``(sum of w * (pred - target)^2, sum of w * target^2)``.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        predictions = approxes[0]
        squared_error_total = 0.0
        squared_target_total = 0.0

        # Plain index loop: only len() and item access are required of the inputs.
        for idx in range(len(predictions)):
            if weight is None:
                w = 1.0
            else:
                w = weight[idx]
            squared_target_total += w * (target[idx] ** 2)
            squared_error_total += w * ((predictions[idx] - target[idx]) ** 2)

        return squared_error_total, squared_target_total

    def get_final_error(self, error, weight):
        """Turn the two sums returned by ``evaluate`` into the R2 value."""
        denominator = weight + 1e-38  # tiny constant keeps it non-zero
        return 1 - error / denominator

# Dictionary to store different models with their configurations
# model_dict = {
#     'lgb': lgb.LGBMRegressor(n_estimators=500, device='gpu', gpu_use_dp=True, objective='l2'),
#     'xgb': xgb.XGBRegressor(n_estimators=2000, learning_rate=0.1, max_depth=6, tree_method='hist', device="cuda", objective='reg:squarederror', eval_metric=r2_xgb, disable_default_eval_metric=True),
#     'cbt': cbt.CatBoostRegressor(iterations=1000, learning_rate=0.05, task_type='GPU', loss_function='RMSE', eval_metric=r2_cbt()),
# }
# Active configuration: a single LightGBM regressor with an L2 objective.
model_dict = {
    'lgb': lgb.LGBMRegressor(n_estimators=500,objective='l2')
}

# One train() call per fold. train() reads this module-level loop variable `i`
# as the fold index, so the loop variable must keep this name.
for i in range(N_fold):
    train(model_dict, 'lgb')
    # train(model_dict, 'xgb')
    # train(model_dict, 'cbt')

In [14]:
# Show the models collected by the train() calls above.
print(models)

[LGBMRegressor(device='gpu', gpu_use_dp=True, n_estimators=500, objective='l2'), LGBMRegressor(device='gpu', gpu_use_dp=True, n_estimators=500, objective='l2'), LGBMRegressor(device='gpu', gpu_use_dp=True, n_estimators=500, objective='l2'), LGBMRegressor(device='gpu', gpu_use_dp=True, n_estimators=500, objective='l2'), LGBMRegressor(device='gpu', gpu_use_dp=True, n_estimators=500, objective='l2')]


In [15]:
import polars as pl
# Most recent non-None `lags` frame handed to predict().
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch by averaging all loaded models.

    Args:
        test: Batch to score; must contain ``row_id`` and every column listed
            in ``feature_names``.
        lags: Optional lag frame. When not None it is stored in the global
            ``lags_``; it is not otherwise used here.

    Returns:
        A polars DataFrame with exactly the columns ``row_id`` and
        ``responder_6`` and one row per row of ``test``.
    """
    global lags_
    if lags is not None:
        lags_ = lags

    # Start from row_id plus a placeholder responder_6 column.
    predictions = test.select(
        'row_id',
        pl.lit(0.0).alias('responder_6'),
    )

    feature_matrix = test[feature_names].to_numpy()

    # Equal-weight average over the per-model predictions.
    per_model = [model.predict(feature_matrix) for model in models]
    averaged = np.mean(per_model, axis=0)

    # Overwrite the placeholder with the averaged predictions.
    predictions = predictions.with_columns(pl.Series('responder_6', averaged.ravel()))

    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions

In [16]:
import kaggle_evaluation.jane_street_inference_server
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set, serve requests; otherwise run the
# local gateway against the test and lags parquet files.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    inference_server.serve()