In [1]:
import os
import joblib
import polars as pl
import xgboost as xgb
import numpy as np
import pandas as pd
import kaggle_evaluation.jane_street_inference_server

In [2]:
# Paths and constants
input_path = '/kaggle/input/jane-street-real-time-market-data-forecasting'


def read_selected_data(input_path, partition_ids=range(1)):
    """Load selected training partitions into a single polars DataFrame.

    Parameters
    ----------
    input_path : str
        Competition data directory; it must contain ``train.parquet``.
    partition_ids : iterable of int, optional
        Values of ``partition_id`` to load. Defaults to ``range(1)``, i.e. only
        partition 0, which is exactly what the previously hard-coded selection read.

    Returns
    -------
    pl.DataFrame
        The requested partitions concatenated in the order given.

    Raises
    ------
    ValueError
        If ``partition_ids`` is empty.
    """
    frames = [
        pl.scan_parquet(
            f'{input_path}/train.parquet/partition_id={pid}/part-0.parquet'
        ).collect()
        for pid in partition_ids
    ]
    if not frames:
        raise ValueError("partition_ids must contain at least one partition id")

    # Concatenate all partitions into a single dataframe
    return pl.concat(frames)

In [3]:
# Load the training partition(s) selected by read_selected_data.
df = read_selected_data(input_path)
# Forward-fill nulls down each column in row order.
# NOTE(review): the fill is not grouped by symbol_id, so a value may be carried over
# from a row that belongs to a different symbol, and predict() applies no such fill
# at inference time — confirm this is intended.
df = df.fill_null(strategy='forward')

# Prepare feature names
feature_names = [f"feature_{i:02d}" for i in range(79)]

# Hold out the most recent dates for validation
num_valid_dates = 100
# Series.unique() does not promise sorted output, so sort explicitly before slicing;
# otherwise "the last num_valid_dates" depends on incidental ordering.
dates = df['date_id'].unique().sort().to_numpy()
assert len(dates) > num_valid_dates, "not enough distinct dates to leave a training split"
valid_dates = dates[-num_valid_dates:]
train_dates = dates[:-num_valid_dates]


In [4]:
def _extract_xyw(frame, date_ids):
    """Return (features, target, weights) as NumPy arrays for rows whose date_id is in date_ids.

    The frame is filtered once and the three arrays are taken from that single
    subset, instead of re-running the same filter for every array.
    """
    subset = frame.filter(pl.col('date_id').is_in(date_ids))
    features = subset.select(feature_names).to_numpy()
    target = subset.select('responder_6').to_numpy().ravel()
    weights = subset.select('weight').to_numpy().ravel()
    return features, target, weights


# Extract features, target, and weights for validation and training sets
X_valid, y_valid, w_valid = _extract_xyw(df, valid_dates)
X_train, y_train, w_train = _extract_xyw(df, train_dates)

### Define Customized Evaluation Method
This is the weighted R² metric specified by Jane Street.

In [5]:
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Negated, sample-weighted R² measured against a zero baseline.

    Computes ``1 - avg_w((y_pred - y_true)^2) / (avg_w(y_true^2) + 1e-38)`` and
    returns its negative, so a better fit yields a smaller value.

    Parameters
    ----------
    y_true : array-like
        Observed targets.
    y_pred : array-like
        Predicted targets.
    sample_weight : array-like, optional
        Per-row weights; when omitted every row is weighted equally.

    Returns
    -------
    float
        ``-R²``.
    """
    weights = np.ones_like(y_true) if sample_weight is None else sample_weight
    residual_power = np.average((y_pred - y_true) ** 2, weights=weights)
    # The tiny constant keeps the division defined when every target is zero.
    target_power = np.average((y_true) ** 2, weights=weights) + 1e-38
    score = 1 - residual_power / target_power
    return -score


### Train Models

In [6]:
# Configure the XGBoost regressor (fitting happens in the next cell).
xgb_params = dict(
    objective='reg:squarederror',
    tree_method='hist',  # add device="cuda" here to train on a GPU
    max_depth=6,
    learning_rate=0.1,
    n_estimators=2000,
    # r2_xgb returns the negated R², so a lower metric value means a better fit.
    eval_metric=r2_xgb,
    disable_default_eval_metric=True,
    early_stopping_rounds=2,
)
model = xgb.XGBRegressor(**xgb_params)


In [7]:
# Fit on the training split; the validation split drives the evaluation metric.
validation_sets = [(X_valid, y_valid)]
validation_weights = [w_valid]
model.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    eval_set=validation_sets,
    sample_weight_eval_set=validation_weights,
    verbose=2,  # log the validation metric every 2 boosting rounds
)

[0]	validation_0-r2_xgb:-0.00187
[2]	validation_0-r2_xgb:-0.00485
[4]	validation_0-r2_xgb:-0.00652
[6]	validation_0-r2_xgb:-0.00754
[8]	validation_0-r2_xgb:-0.00886
[10]	validation_0-r2_xgb:-0.00959
[12]	validation_0-r2_xgb:-0.00989
[14]	validation_0-r2_xgb:-0.01035
[16]	validation_0-r2_xgb:-0.01059
[18]	validation_0-r2_xgb:-0.01106
[19]	validation_0-r2_xgb:-0.01102


## Save Model to Output

In [8]:
# Create the output directory if needed; exist_ok avoids the separate
# exists-then-mkdir check and makes the cell safe to re-run.
os.makedirs("./model_save", exist_ok=True)
# Persist the fitted model as JSON.
model.save_model('./model_save/xgboost_model_baseline.json')

### Load Models

In [9]:
# Reload the model from the same relative path the save cell writes to,
# rather than a hard-coded absolute working directory.
model_loaded = xgb.XGBRegressor()
model_loaded.load_model('./model_save/xgboost_model_baseline.json')

### Load Test Dataset

In [10]:
# Read the sample test partition and hand it to pandas for inspection.
test = (
    pl.scan_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet")
    .collect()
    .to_pandas()
)

In [11]:
# Preview the first rows of the sample test partition.
test.head()

Unnamed: 0,row_id,date_id,time_id,symbol_id,weight,is_scored,feature_00,feature_01,feature_02,feature_03,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
0,0,0,0,0,3.169998,True,0.0,0.0,0.0,0.0,...,-0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
1,1,0,0,1,2.165993,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,0.0,0.0
2,2,0,0,2,3.06555,True,0.0,-0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
3,3,0,0,3,2.698642,True,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,,,0.0,0.0,-0.0,-0.0
4,4,0,0,4,1.80333,True,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,-0.0,,,0.0,0.0,0.0,0.0


In [12]:
# Keep only the model's feature columns, as a NumPy array.
# NOTE(review): this rebinds `test` from a DataFrame to an ndarray, so re-running
# this cell without first re-running the load cell will fail.
test = test[feature_names].values

In [13]:
# Sanity check: score the sample test rows with the reloaded model.
predictions = model_loaded.predict(test)

In [14]:
# Display the raw predictions.
# NOTE(review): the recorded output is a single repeated value; the test.head()
# preview shows only 0.0/NaN feature values, which presumably explains it — confirm.
predictions

array([0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994, 0.08834994,
       0.08834994, 0.08834994, 0.08834994, 0.08834994], dtype=float32)

## Debug the Submission

In [15]:
# Dry run of the submission path on the sample test partition.
test = (
    pl.scan_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet")
    .collect()
    .to_pandas()
)

# Feature matrix in the column order the model was trained on.
test_df = test[feature_names].values
predictions = model_loaded.predict(test_df)

# Pair each row_id with its predicted responder_6.
submission_columns = {"row_id": test['row_id'], "responder_6": predictions}
output_df = pd.DataFrame(submission_columns)

In [16]:
# Preview the submission-shaped frame (row_id, responder_6).
output_df.head()

Unnamed: 0,row_id,responder_6
0,0,0.08835
1,1,0.08835
2,2,0.08835
3,3,0.08835
4,4,0.08835


# Submission API

In [17]:
# Global lags storage
lags_: pl.DataFrame | None = None


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame:
    """Score one batch of test rows for the inference server.

    Parameters
    ----------
    test : pl.DataFrame
        Batch to score; must contain ``row_id`` and every column in ``feature_names``.
    lags : pl.DataFrame | None
        Lag data supplied with the batch, or ``None``. When provided it is stored
        in the module-level ``lags_``; the prediction itself does not use it.

    Returns
    -------
    pl.DataFrame
        Two columns: ``row_id`` (taken from ``test``) and ``responder_6``
        (the model's prediction for each row).
    """
    # Only lags_ is rebound here; model_loaded is merely read, so it needs no `global`.
    global lags_

    # Logic for saving or loading lags
    if lags is not None:
        lags_ = lags

    # Stay in polars end to end: the pandas round-trip added conversion work to
    # every call without changing the result.
    features = test.select(feature_names).to_numpy()
    predictions = model_loaded.predict(features)

    return test.select("row_id").with_columns(pl.Series("responder_6", predictions))

In [18]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# Outside a competition rerun, exercise the server against the local sample files;
# during a rerun, serve requests from the real gateway.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    local_data_paths = (
        '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
        '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
    )
    inference_server.run_local_gateway(local_data_paths)
else:
    inference_server.serve()