In [1]:
import os
import polars as pl
import xgboost as xgb
import numpy as np
import pandas as pd
import sys

sys.path.append("/home/pqian/data/kaggle/js-2024")
import kaggle_evaluation.jane_street_inference_server

# Data processing

In [2]:
PATH = "/home/pqian/data/kaggle/js-2024/"


def read_selected_data(input_path, num_partitions=1):
    """Load the first ``num_partitions`` training partitions into one frame.

    Partitions are read from
    ``<input_path>/train.parquet/partition_id=<i>/part-0.parquet`` for
    ``i`` in ``range(num_partitions)`` and concatenated in that order.

    Args:
        input_path: Directory that contains the ``train.parquet`` folder.
        num_partitions: How many partitions to read, starting at partition 0.
            Defaults to 1 (only ``partition_id=0``).

    Returns:
        A single ``pl.DataFrame`` holding the rows of every loaded partition.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    frames = []
    for partition_id in range(num_partitions):
        # os.path.join avoids the doubled separator produced by plain string
        # concatenation when input_path already ends with "/".
        file_path = os.path.join(
            input_path,
            "train.parquet",
            f"partition_id={partition_id}",
            "part-0.parquet",
        )
        frames.append(pl.scan_parquet(file_path).collect())

    # Concatenate all partitions into a single dataframe
    return pl.concat(frames)

In [3]:
df = read_selected_data(PATH)
# NOTE(review): a forward fill follows the frame's row order, so a null takes
# the value of whichever row precedes it. Confirm that neighbouring rows are
# the intended source (e.g. same instrument) before relying on this.
df = df.fill_null(strategy="forward")

# Prepare feature names
feature_names = [f"feature_{i:02d}" for i in range(79)]

# Prepare training and validation data
num_valid_dates = 100
# Series.unique() does not promise any ordering by default, so sort explicitly:
# the slices below only hold out the *latest* dates when `dates` is ascending.
dates = df["date_id"].unique().sort().to_numpy()
assert len(dates) > num_valid_dates, "not enough distinct dates to leave a training split"
valid_dates = dates[-num_valid_dates:]
train_dates = dates[:-num_valid_dates]

In [4]:
# Extract features, target, and weights for validation and training sets
def _split_arrays(frame, date_ids):
    """Return ``(features, target, weight)`` NumPy arrays for the given dates.

    The frame is filtered once to rows whose ``date_id`` is in ``date_ids``;
    the feature matrix, the ``responder_6`` target and the ``weight`` column
    are then all taken from that single filtered frame.
    """
    subset = frame.filter(pl.col("date_id").is_in(date_ids))
    features = subset.select(feature_names).to_numpy()
    # Single-column selections come back 2-D; flatten them to 1-D vectors.
    target = subset.select("responder_6").to_numpy().ravel()
    weight = subset.select("weight").to_numpy().ravel()
    return features, target, weight


# Extract features, target, and weights for validation and training sets
X_valid, y_valid, w_valid = _split_arrays(df, valid_dates)
X_train, y_train, w_train = _split_arrays(df, train_dates)

In [5]:
def r2_xgb(y_true, y_pred, sample_weight=None):
    """Negated weighted zero-mean R² between targets and predictions.

    The score is ``1 - avg_w((y_pred - y_true)^2) / avg_w(y_true^2)``; its
    negation is returned so that a smaller value means a better fit.

    Args:
        y_true: Array of target values.
        y_pred: Array of predicted values.
        sample_weight: Optional per-sample weights; uniform when omitted.

    Returns:
        The negative of the weighted R² score.
    """
    weights = np.ones_like(y_true) if sample_weight is None else sample_weight
    residual_power = np.average((y_pred - y_true) ** 2, weights=weights)
    # Tiny epsilon keeps the division defined when every target is zero.
    target_power = np.average((y_true) ** 2, weights=weights) + 1e-38
    score = 1 - residual_power / target_power
    return -score

# train model

In [6]:
# Train the XGBoost model
# Hyper-parameters for the XGBoost regressor, collected in one place.
xgb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.1,
    "max_depth": 6,
    "tree_method": "hist",
    "device": "cuda",
    "objective": "reg:squarederror",
    # Custom metric: r2_xgb returns the negated R², so lower is better.
    "eval_metric": r2_xgb,
    "disable_default_eval_metric": True,
    # NOTE(review): a patience of 2 rounds stops training almost as soon as the
    # validation metric stalls once - confirm this is intentional.
    "early_stopping_rounds": 2,
}

# Build the XGBoost model
model = xgb.XGBRegressor(**xgb_params)

In [7]:
# Validation split and its weights, used for the metric and early stopping.
validation_split = [(X_valid, y_valid)]
validation_weights = [w_valid]

# Fit on the weighted training rows; the metric is logged every 2 rounds.
model.fit(
    X_train,
    y_train,
    sample_weight=w_train,
    eval_set=validation_split,
    sample_weight_eval_set=validation_weights,
    verbose=2,
)

[0]	validation_0-r2_xgb:-0.00187
[2]	validation_0-r2_xgb:-0.00485
[4]	validation_0-r2_xgb:-0.00651
[6]	validation_0-r2_xgb:-0.00752
[8]	validation_0-r2_xgb:-0.00884
[10]	validation_0-r2_xgb:-0.00957
[12]	validation_0-r2_xgb:-0.00988
[14]	validation_0-r2_xgb:-0.01034
[16]	validation_0-r2_xgb:-0.01059
[18]	validation_0-r2_xgb:-0.01105
[20]	validation_0-r2_xgb:-0.01097


# save model

In [8]:
# Create the output directory if it does not exist yet. exist_ok=True makes
# this a single call with no separate existence check before the creation.
os.makedirs("./model_save", exist_ok=True)
model.save_model("./model_save/js_xgboost_model_baseline.json")

# load model

In [9]:
# Restore the saved booster into a fresh regressor instance.
saved_model_file = "./model_save/js_xgboost_model_baseline.json"
model_loaded = xgb.XGBRegressor()
model_loaded.load_model(saved_model_file)

# load test data

In [10]:
# Read the sample test partition lazily, materialise it, and hand it to pandas.
test = (
    pl.scan_parquet(
        "/home/pqian/data/kaggle/js-2024/test.parquet/date_id=0/part-0.parquet"
    )
    .collect()
    .to_pandas()
)

In [11]:
# Feature matrix for the sample test rows, in the training column order.
test_df = test[feature_names].to_numpy()
predictions = model_loaded.predict(test_df)

# Pair every prediction with its row identifier.
output_columns = {"row_id": test["row_id"], "responder_6": predictions}
output_df = pd.DataFrame(output_columns)

# submission API

In [12]:
lags_: pl.DataFrame | None = None


# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(
    test: pl.DataFrame, lags: pl.DataFrame | None
) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch with the reloaded XGBoost model.

    Args:
        test: Batch of rows to score; must contain ``row_id`` and every column
            listed in ``feature_names``.
        lags: Lagged responders for the batch, or ``None``. When provided it is
            stored in the module-level ``lags_`` for later calls.

    Returns:
        A Polars frame with exactly the columns ``row_id`` and ``responder_6``
        and one row per row of ``test``.

    Raises:
        TypeError: If the assembled predictions are not a DataFrame.
        AssertionError: If the output columns or row count are wrong.
    """
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    global lags_
    if lags is not None:
        lags_ = lags

    # Score the batch with the trained model instead of returning a constant.
    # NOTE(review): nulls are passed through as missing values here, whereas
    # the training frame was forward-filled - confirm this mismatch is acceptable.
    features = test.select(feature_names).to_numpy()
    scores = np.asarray(model_loaded.predict(features), dtype=np.float64).ravel()
    predictions = test.select("row_id").with_columns(
        pl.Series("responder_6", scores)
    )

    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ["row_id", "responder_6"]
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ["row_id", "responder_6"]).all()
    else:
        raise TypeError("The predict function must return a DataFrame")
    # Confirm the output has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [13]:
# Local copies of the test and lag data, used when running outside a rerun.
local_data_paths = (
    "/home/pqian/data/kaggle/js-2024/test.parquet",
    "/home/pqian/data/kaggle/js-2024/lags.parquet",
)

inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(
    predict
)

# Serve when KAGGLE_IS_COMPETITION_RERUN is set; otherwise replay the local
# files through the local gateway.
running_competition_rerun = os.getenv("KAGGLE_IS_COMPETITION_RERUN")
if not running_competition_rerun:
    inference_server.run_local_gateway(local_data_paths)
else:
    inference_server.serve()