In [1]:
import polars as pl
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split

# Score evaluation
def weighted_r2_score(y_true, y_pred, weights):
    """Return the sample-weighted, zero-baseline R² of a set of predictions.

    Computes ``1 - sum(w * (y_true - y_pred)**2) / sum(w * y_true**2)``.
    The denominator uses the raw squared targets, not their deviation from
    the mean, so a model that always predicts 0 scores exactly 0.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, broadcastable against ``y_true``.
        weights: Array-like of per-sample weights, broadcastable against ``y_true``.

    Returns:
        The score as a numpy float64. If the weighted sum of squared targets
        is zero the division yields ``nan`` or ``-inf`` (numpy emits a
        RuntimeWarning) rather than raising.
    """
    # Coerce to float64 arrays so plain Python sequences are accepted and the
    # sums are accumulated in double precision regardless of the input dtype.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)
    return 1 - (numerator / denominator)

# Read the parquet data at the given path into one polars DataFrame.
print("Loading dataset...")
dataset_path = "/home/jupyter/data/XGFeatures_partitioned/"
data = pl.read_parquet(dataset_path)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): train_test_split shuffles rows by default and the frame carries
# date_id/time_id columns — confirm a random (non-chronological) split is intended.
print("Splitting dataset...")
split_frames = train_test_split(data, test_size=0.2, random_state=42)
train_data, test_data = split_frames

# Columns that are never fed to the model as inputs: the label (responder_6),
# date_id, time_id, and the per-row weight used as the sample weight.
# NOTE(review): every other column stays a feature, including the non-lagged
# responder_* columns and the '__index_level_0__' / 'partition_id' columns that
# appear in the printed top-feature list — confirm these are valid inputs and
# are not leaking information about the label.
non_feature_cols = ["responder_6", "date_id", "time_id", "weight"]

# Training partition: feature frame, label series and sample weights.
X_train = train_data.select(pl.exclude(non_feature_cols))
y_train, train_weights = train_data["responder_6"], train_data["weight"]

# Test partition, separated the same way.
X_test = test_data.select(pl.exclude(non_feature_cols))
y_test, test_weights = test_data["responder_6"], test_data["weight"]

print("Sampling a subset of the training data for feature importance calculation...")
# Tag every training row with its position so the matching labels and weights
# can be looked up after sampling.
X_train_with_idx = X_train.with_row_index(name="row_idx")

# Draw a reproducible 10% sample of the tagged rows.
sampled_rows = X_train_with_idx.sample(fraction=0.1, seed=42)
sampled_indices = sampled_rows["row_idx"]

# The helper index column is not a feature, so remove it from the sample.
X_train_sample = sampled_rows.drop("row_idx")

# Pick the labels and weights that belong to the sampled rows.
y_train_sample = y_train[sampled_indices]
train_weights_sample = train_weights[sampled_indices]

# Fit a small, shallow booster on the sampled rows; it is only used afterwards
# to rank the features by importance.
print("Training a temporary model to determine feature importance...")
dtrain_sample = xgb.DMatrix(
    X_train_sample.to_numpy(),
    label=y_train_sample.to_numpy(),
    weight=train_weights_sample.to_numpy(),
)
params_temp = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    learning_rate=0.1,
    max_depth=4,
    seed=42,
)
model_temp = xgb.train(
    params=params_temp,
    dtrain=dtrain_sample,
    num_boost_round=50,
)

# Rank the features by how often the temporary model split on them and keep
# the best N_TOP_FEATURES of them.
N_TOP_FEATURES = 23  # single place to change how many features are kept
print(f"Extracting top {N_TOP_FEATURES} features...")
importance = model_temp.get_score(importance_type="weight")
sorted_features = sorted(importance.items(), key=lambda x: x[1], reverse=True)

# The temporary DMatrix was built from a bare numpy array (no feature names),
# so XGBoost reports features under its default names "f<column position>".
# Strip the leading "f" to recover the position, then map it back to the
# column name in X_train.
top_feature_indices = [int(f[1:]) for f, _ in sorted_features[:N_TOP_FEATURES]]
top_features = [X_train.columns[i] for i in top_feature_indices]

# get_score() leaves out features that were never used in a split, so fewer
# than N_TOP_FEATURES names may come back; report the actual count.
print(f"Top {len(top_features)} features: {top_features}")

# Restrict both partitions to the selected feature columns.
X_train_top, X_test_top = X_train.select(top_features), X_test.select(top_features)

# Fit the final booster on the full training partition, using only the
# selected features and the per-row sample weights.
print("Training the final model...")
dtrain_top = xgb.DMatrix(
    X_train_top.to_numpy(),
    label=y_train.to_numpy(),
    weight=train_weights.to_numpy(),
)
# The test matrix carries no label or weight; it is only used for prediction.
dtest_top = xgb.DMatrix(X_test_top.to_numpy())
params_final = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    learning_rate=0.1,
    max_depth=6,
    seed=42,
)
final_model = xgb.train(
    params=params_final,
    dtrain=dtrain_top,
    num_boost_round=100,
)

# Predict on the held-out partition and score the predictions with the
# weighted R² helper, weighting each row by its test weight.
print("Testing the final model on unseen data...")
y_pred = final_model.predict(dtest_top)
test_r2 = weighted_r2_score(
    y_true=y_test.to_numpy(),
    y_pred=y_pred,
    weights=test_weights.to_numpy(),
)

print(f"Test Weighted R² Score: {test_r2:.4f}")

Loading dataset...
Splitting dataset...
Sampling a subset of the training data for feature importance calculation...
Training a temporary model to determine feature importance...
Extracting top 23 features...
Top 23 features: ['responder_0', 'responder_3', 'responder_3_lag_1', 'responder_7', 'responder_8', 'responder_0_lag_1', 'responder_5', 'responder_4', 'feature_61', 'responder_3_lag_diff_1', 'feature_24', 'feature_24_lag_1', 'responder_3_lag_ratio_1', '__index_level_0__', 'responder_7_lag_1', 'responder_4_lag_1', 'responder_0_lag_diff_1', 'responder_5_lag_1', 'responder_8_lag_1', 'partition_id', 'feature_47_lag_1', 'feature_61_lag_1', '__index_level_0___lag_1']
Training the final model...
Testing the final model on unseen data...
Test Weighted R² Score: 0.8705
