In [1]:
#necessary
import polars as pl#similar to pandas, but with better performance when dealing with large datasets.
import pandas as pd#read csv,parquet
import numpy as np#for scientific computation of matrices
#model
from sklearn.linear_model import Ridge
import os#Libraries that interact with the operating system
import gc
import warnings#avoid some negligible errors
#The filterwarnings () method is used to set warning filters, which can control the output method and level of warning information.
warnings.filterwarnings('ignore')
#environment provided by competition hoster
import kaggle_evaluation.jane_street_inference_server

import random#provides some functions for generating random numbers
#Set the random seeds so that results can be reproduced.
def seed_everything(seed):
    """Seed NumPy's global RNG and Python's built-in `random` module.

    Args:
        seed: Value handed unchanged to `np.random.seed` and `random.seed`.
    """
    # NumPy first, then the standard library, each with the same seed.
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(seed)
seed_everything(seed=2025)

In [2]:
def custom_metric(y_true,y_pred,weight):
    """Weighted, zero-mean R^2: 1 - sum(w*(y - y_hat)^2) / sum(w*y^2).

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predictions, broadcastable against `y_true`.
        weight: Array-like of per-sample weights, broadcastable against `y_true`.

    Returns:
        The score as a NumPy float64 scalar. If the weighted sum of squared
        targets is zero, NumPy division semantics apply (nan or -inf).
    """
    # Coerce so that plain Python lists work as well as ndarrays/Series values.
    y_true=np.asarray(y_true)
    y_pred=np.asarray(y_pred)
    weight=np.asarray(weight)
    # Accumulate in float64 so low-precision inputs (e.g. float32) do not lose
    # accuracy when summed over many rows.
    residual_ss=np.sum(weight*(y_true-y_pred)**2,dtype=np.float64)
    total_ss=np.sum(weight*y_true**2,dtype=np.float64)
    weighted_r2=1-residual_ss/total_ss
    return weighted_r2
    
print("< read parquet >")
# Load partitions 6..9, keep a reproducible 82% random sample of each, and
# collect the sample weights separately from the feature/target frame.
partition_frames=[]
partition_weights=[]
for i in range(6,10):
    part=pl.read_parquet(f"/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet")
    part=part.to_pandas().sample(frac=0.82, random_state=2025)
    # Keep the weights as a compact NumPy array (copied so it does not pin the
    # frame's memory) instead of a Python list with one boxed scalar per row.
    partition_weights.append(part['weight'].to_numpy(copy=True))
    part=part.drop(columns=['weight'])
    partition_frames.append(part)
train=pd.concat(partition_frames)
# Same row order as `train`, so positional slicing below stays aligned.
weights=np.concatenate(partition_weights)
del partition_frames,partition_weights,part
gc.collect()
print(f"train.shape:{train.shape}")

print("< get X,y >")
# feature_00 .. feature_78
cols=[f'feature_{i:02d}' for i in range(79)]
# Missing feature values are imputed with the constant 3 (predict() does the same).
X=train[cols].fillna(3).values
y=train['responder_6'].values
del train
gc.collect()

print("< train test split >")
split=200000#around 1%
# Hold out the last `split` rows for evaluation; everything before is training data.
train_X,test_X=X[:-split],X[-split:]
train_y,test_y=y[:-split],y[-split:]
train_weight,test_weight=weights[:-split],weights[-split:]
print(f"train_X.shape:{train_X.shape},test_X.shape:{test_X.shape}")

print("< fit and predict >")
# NOTE(review): the fit is unweighted while the evaluation metric is weighted —
# confirm this is intended.
model=Ridge()
model.fit(train_X,train_y)
train_pred=model.predict(train_X)
test_pred=model.predict(test_X)
print(f"train weighted_r2:{custom_metric(train_y,train_pred,weight=train_weight)}")
print(f"test weighted_r2:{custom_metric(test_y,test_pred,weight=test_weight)}")

< read parquet >
train.shape:(20462339, 91)
< get X,y >
< train test split >
train_X.shape:(20262339, 79),test_X.shape:(200000, 79)
< fit and predict >
train weighted_r2:0.006252706050872803
test weighted_r2:0.0024012327194213867


In [3]:
def predict(test,lags):
    """Score one batch of test rows with the fitted global `model`.

    Args:
        test: Polars DataFrame containing `row_id` and the columns
            feature_00 .. feature_78.
        lags: Accepted because the inference server passes it; not used here.

    Returns:
        Polars DataFrame with columns `row_id` and `responder_6`.
    """
    feature_names=[f'feature_{i:02d}' for i in range(79)]
    # Same preprocessing as at training time: missing values become 3.
    feature_frame=test.to_pandas()[feature_names].fillna(3)
    batch_preds=model.predict(feature_frame.values)
    # Pair every row_id with its prediction, flattened to one dimension.
    responder=pl.Series('responder_6', batch_preds.ravel())
    return test.select('row_id').with_columns(responder)

In [4]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# The branch is chosen by the KAGGLE_IS_COMPETITION_RERUN environment variable:
# when it is set (non-empty) the server is started with serve(); otherwise the
# local gateway is run against the test/lags parquet files listed below.
local_gateway_paths = (
    '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
    '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
)
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(local_gateway_paths)
else:
    inference_server.serve()