In [1]:
import polars as pl
import xgboost as xgb
import numpy as np

# Weighted R² evaluation metric
def weighted_r2_score(y_true, y_pred, weights):
    """Return the sample-weighted, zero-mean R² of predictions against targets.

    The score is ``1 - sum(w * (y_true - y_pred)**2) / sum(w * y_true**2)``.
    The denominator uses the raw targets, not their deviation from the mean,
    so a model that always predicts 0 scores exactly 0.

    Args:
        y_true: Target values. Any array-like accepted by ``np.asarray``
            (NumPy array, list, tuple, ...).
        y_pred: Predicted values, broadcastable against ``y_true``.
        weights: Per-sample weights, broadcastable against ``y_true``.

    Returns:
        The score as a NumPy floating scalar. When the weighted sum of squared
        targets is zero (e.g. empty input or all-zero targets/weights) the
        division follows NumPy semantics and the result is ``nan`` or ``-inf``.
    """
    # Coerce up front so plain Python sequences work; without this,
    # ``list - list`` raises TypeError before NumPy is ever involved.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    weights = np.asarray(weights)

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)
    r2 = 1 - (numerator / denominator)
    return r2

# ---------------------------------------------------------------------------
# Feature pre-selection pipeline
#
# Trains a small, throw-away XGBoost model on a 20% random sample of the data,
# ranks the features by how often the model split on them, keeps the top half
# of the ranked features and writes the narrowed dataset back to parquet.
# ---------------------------------------------------------------------------

# Load dataset (the path is a directory of parquet files)
print("Loading dataset...")
data = pl.read_parquet("/home/jupyter/data/XGFeatures_partitioned/")

# Sample 20% of the dataset for feature importance calculation.
# The fixed seed keeps the sample (and therefore the ranking) reproducible.
print("Sampling 20% of the dataset for feature importance calculation...")
sample_fraction = 0.2
sampled_data = data.sample(fraction=sample_fraction, seed=42)

# Split sampled data into training and test sets chronologically: rows whose
# date_id is below the 80th-percentile date_id train the model, the rest are
# held out. Only the training side is used further down in this cell.
print("Splitting sampled dataset...")
split_date = sampled_data["date_id"].quantile(0.8)  # Returns a float directly
train_sample = sampled_data.filter(pl.col("date_id") < split_date)
test_sample = sampled_data.filter(pl.col("date_id") >= split_date)

# Separate features and labels in the sampled dataset.
# NOTE(review): only the target, the two id columns and the weight are
# excluded, so every other column in the data (including any other
# responder_* columns, if present) is treated as a feature -- confirm that
# this is not target leakage.
X_train_sample = train_sample.select(pl.exclude(["responder_6", "date_id", "time_id", "weight"]))
y_train_sample = train_sample["responder_6"]
train_weights_sample = train_sample["weight"]

# Train a temporary model for feature importance on the sampled data.
# The DMatrix is built from bare NumPy arrays, so it carries no column names.
print("Training temporary model on sampled data for feature importance...")
dtrain_temp = xgb.DMatrix(X_train_sample.to_numpy(), label=y_train_sample.to_numpy(), weight=train_weights_sample.to_numpy())
params_temp = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.1,
    "max_depth": 4,
    "tree_method": "hist",
    "seed": 42,
}
temp_model = xgb.train(params=params_temp, dtrain=dtrain_temp, num_boost_round=50)

# Extract top 50% features, ranked by split count ("weight" importance),
# highest first.
print("Extracting top 50% features...")
feature_importance = temp_model.get_score(importance_type="weight")
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

# Calculate the number of top features to retain (50% of the features that
# received a score). Keep at least one: with a single scored feature the
# floor division would otherwise yield 0 and the saved dataset would contain
# no feature columns at all.
num_features_to_keep = max(1, len(sorted_features) // 2)
# Importance keys are expected to look like "f<column index>" because the
# DMatrix has no feature names; strip the "f" and map the index back to the
# polars column name.
top_features = [X_train_sample.columns[int(f[1:])] for f, _ in sorted_features[:num_features_to_keep]]
print(f"Top 50% Features: {top_features}")

# Filter the entire (unsampled) dataset down to the selected features plus the
# target, weight and id columns needed downstream.
print("Filtering the entire dataset...")
filtered_data = data.select(top_features + ["responder_6", "weight", "date_id", "time_id"])

# Save the filtered dataset
print("Saving filtered dataset with top 50% features...")
filtered_data.write_parquet("/home/jupyter/data/final_filtered_top_half_data.parquet")
print("Filtered dataset saved!")


Loading dataset...
Sampling 20% of the dataset for feature importance calculation...
Splitting sampled dataset...
Training temporary model on sampled data for feature importance...
Extracting top 30 features...
Initial Top 30 Features: ['responder_0', 'responder_3', 'responder_3_lag_1', 'responder_7', 'responder_0_lag_1', 'responder_8', 'feature_24_lag_1', 'responder_3_lag_ratio_1', 'responder_4', '__index_level_0___lag_ratio_1', 'feature_24', 'responder_3_lag_diff_1', 'feature_61', 'responder_5', 'feature_59', 'feature_61_lag_1', 'responder_7_lag_1', 'responder_0_lag_diff_1', 'feature_47_lag_1', 'responder_4_lag_1', 'feature_36', 'feature_57', 'responder_8_lag_diff_1', 'responder_0_lag_ratio_1', 'feature_62', 'feature_36_lag_1', 'feature_39', 'feature_47', 'feature_50_lag_1', 'responder_5_lag_1']
Calculating correlation matrix...
Highly Correlated Features to Drop: ['responder_4', 'feature_24', 'feature_61_lag_1', 'responder_7_lag_1', 'responder_4_lag_1', 'feature_47']
Filtered Featur