In [1]:
import polars as pl
import xgboost as xgb
import numpy as np

# Location of the partitioned parquet dataset (presumably the engineered
# XGBoost feature table, judging by the directory name — TODO confirm).
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
partitioned_dir = "/home/jupyter/data/XGFeatures_partitioned/"
# Read the dataset into a polars DataFrame, delegating parsing to pyarrow.
data = pl.read_parquet(partitioned_dir, use_pyarrow=True)

#Competition scoring
def weighted_r2_score(y_true, y_pred, weights):
    """Return the sample-weighted, zero-mean R² used as the competition score.

    Computes ``1 - sum(w * (y_true - y_pred)**2) / sum(w * y_true**2)``.
    The denominator uses the raw second moment of ``y_true`` (no mean is
    subtracted), so a model that always predicts 0 scores exactly 0.

    Parameters
    ----------
    y_true, y_pred, weights : array-like of equal length
        Targets, predictions and per-sample weights. Lists, NumPy arrays and
        pandas Series are accepted; values are paired by position, so a
        Series index is ignored.

    Returns
    -------
    float
        The weighted R². The result is inf/nan when the weighted sum of
        squared targets is zero.
    """
    # Coerce to float64 arrays: plain lists would otherwise fail on `-`, and
    # pandas Series would be aligned by index instead of by position.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.asarray(weights, dtype=float)

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)
    return 1 - (numerator / denominator)

# Work on a seeded random subsample of the rows (35%, per sample_fraction)
sample_fraction = 0.35
data = data.sample(fraction=sample_fraction, seed=42)

# Hold-out split on date_id: rows dated before the 0.8 quantile of date_id
# go to training, rows on or after it go to validation.
date_col = pl.col("date_id")
split_date = data.select(date_col.quantile(0.8)).to_numpy().item()
train_data = data.filter(date_col < split_date)
valid_data = data.filter(date_col >= split_date)

# Columns that are never model inputs: the target, the time keys and the
# per-row sample weight. Everything else is treated as a feature.
# NOTE(review): the feature-selection output later in this notebook lists
# contemporaneous responder_* columns among the features; confirm they are
# available at prediction time, otherwise they leak the target.
non_feature_columns = ["responder_6", "date_id", "time_id", "weight"]
feature_selector = pl.exclude(non_feature_columns)

# Training split -> pandas objects -> weighted DMatrix
X_train = train_data.select(feature_selector).to_pandas()
y_train = train_data.get_column("responder_6").to_pandas()
train_weights = train_data.get_column("weight").to_pandas()
dtrain = xgb.DMatrix(X_train, label=y_train, weight=train_weights)

# Validation split -> pandas objects -> weighted DMatrix
X_valid = valid_data.select(feature_selector).to_pandas()
y_valid = valid_data.get_column("responder_6").to_pandas()
valid_weights = valid_data.get_column("weight").to_pandas()
dvalid = xgb.DMatrix(X_valid, label=y_valid, weight=valid_weights)


# Baseline booster configuration shared by every xgb.train / xgb.cv call below:
# squared-error regression monitored with RMSE, fixed seed for repeatability.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    learning_rate=0.1,
    max_depth=6,
    seed=42,
)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate feature selection: for each k in k_features, keep the k columns
# with the highest f_regression score, retrain XGBoost on just those columns
# and report the weighted R² on the validation split.
k_features = [23]  # 23 is top (author's note — presumably the best-scoring k; verify)

# The selector's inputs do not depend on k, so convert them once up front
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

for k in k_features:
    # Rank features by univariate F-statistic against the target
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X_train_np, y_train_np)

    # Map the selected column positions back to column names
    top_feature_indices = selector.get_support(indices=True)
    top_features = X_train.columns[top_feature_indices]
    print(top_features)

    # Rebuild weighted DMatrices restricted to the selected columns
    dtrain_top = xgb.DMatrix(X_train[top_features].to_numpy(), label=y_train, weight=train_weights)
    dvalid_top = xgb.DMatrix(X_valid[top_features].to_numpy(), label=y_valid, weight=valid_weights)

    model_top = xgb.train(
        params=params,
        dtrain=dtrain_top,
        num_boost_round=100,
        evals=[(dtrain_top, "train"), (dvalid_top, "valid")],
        early_stopping_rounds=10,
    )

    # xgb.train returns the booster from the last round, not the best one;
    # pin prediction to the best early-stopping iteration so the score does
    # not depend on the installed xgboost version's default.
    y_pred_top = model_top.predict(
        dvalid_top, iteration_range=(0, model_top.best_iteration + 1)
    )
    r2_score_top = weighted_r2_score(y_valid.to_numpy(), y_pred_top, valid_weights.to_numpy())
    print(f"Top {k} Features Model Weighted R² Score: {r2_score_top:.4f}")

Index(['feature_04', 'feature_06', 'feature_07', 'responder_0', 'responder_1',
       'responder_2', 'responder_3', 'responder_4', 'responder_5',
       'responder_7', 'responder_8', 'responder_0_lag_1', 'responder_1_lag_1',
       'responder_2_lag_1', 'responder_3_lag_1', 'responder_4_lag_1',
       'responder_5_lag_1', 'responder_7_lag_1', 'responder_8_lag_1',
       'responder_3_lag_diff_1', 'responder_5_lag_diff_1',
       'responder_7_lag_diff_1', 'responder_8_lag_diff_1'],
      dtype='object')
[0]	train-rmse:0.79647	valid-rmse:0.70635
[1]	train-rmse:0.73464	valid-rmse:0.65294
[2]	train-rmse:0.68022	valid-rmse:0.60692
[3]	train-rmse:0.63231	valid-rmse:0.56718
[4]	train-rmse:0.59038	valid-rmse:0.53353
[5]	train-rmse:0.55324	valid-rmse:0.50426


In [None]:
# Baseline model trained on the full feature set.
# NOTE: despite the earlier "no longer needed" remark, `model` is still
# consumed by the plot_importance cell further down, so this cell must run.
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=[(dtrain, "train"), (dvalid, "valid")],
    early_stopping_rounds=10,
)

# Predict on the validation set with the best early-stopping iteration;
# xgb.train hands back the last-round booster, so the range is made explicit.
y_pred = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))

# Calculate weighted R² score (display is intentionally left disabled)
r2_score = weighted_r2_score(y_valid.to_numpy(), y_pred, valid_weights.to_numpy())
#print(f"Weighted R² Score: {r2_score:.4f}")


In [None]:
# Correlation-based feature selection: keep the features whose absolute
# Pearson correlation with the target exceeds correlation_threshold, then
# retrain XGBoost on those columns and report the weighted R².

X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
correlation_threshold = 0.3

# Zero-variance columns have an undefined correlation (0/0). Identify them so
# they can be excluded explicitly instead of relying on NaN comparisons.
non_constant_mask = (X_train.std(axis=0) > 0).to_numpy()
non_constant_columns = X_train.columns[non_constant_mask]
# Kept for parity with the original cell; the selection below uses the mask.
X_train_filtered = X_train[non_constant_columns]

# The last row/column of the correlation matrix belongs to the target, so
# [:-1, -1] is each feature's correlation with it. Constant columns produce
# NaN here; the floating-point warnings are silenced because those columns
# are masked out explicitly on the next line.
with np.errstate(divide="ignore", invalid="ignore"):
    correlations = np.abs(np.corrcoef(X_train_np.T, y_train_np, rowvar=True)[:-1, -1])
    high_corr_mask = non_constant_mask & (correlations > correlation_threshold)

# Select features with correlation above the threshold
high_corr_indices = np.where(high_corr_mask)[0]
high_corr_features = X_train.columns[high_corr_indices]
if len(high_corr_features) == 0:
    # Fail with a clear message rather than handing XGBoost a zero-column matrix
    raise ValueError(
        f"No feature has |correlation| > {correlation_threshold} with the target; "
        "lower correlation_threshold."
    )

# Train and evaluate the model with high-correlation features
dtrain_corr = xgb.DMatrix(X_train[high_corr_features].to_numpy(), label=y_train, weight=train_weights)
dvalid_corr = xgb.DMatrix(X_valid[high_corr_features].to_numpy(), label=y_valid, weight=valid_weights)

model_corr = xgb.train(
    params=params,
    dtrain=dtrain_corr,
    num_boost_round=100,
    evals=[(dtrain_corr, "train"), (dvalid_corr, "valid")],
    early_stopping_rounds=10,
)

# Score with the best early-stopping iteration; xgb.train returns the
# last-round booster, so the iteration range is made explicit.
y_pred_corr = model_corr.predict(
    dvalid_corr, iteration_range=(0, model_corr.best_iteration + 1)
)
r2_score_corr = weighted_r2_score(y_valid.to_numpy(), y_pred_corr, valid_weights.to_numpy())
print(f"Low-Correlation Filtered Model Weighted R² Score: {r2_score_corr:.4f}")


In [None]:
# 5-fold cross-validation of the baseline parameters on the training matrix,
# tracking RMSE for up to 50 rounds with early stopping after 10 stale rounds.
# NOTE(review): the folds come from xgb.cv's own splitter, not from date_id —
# confirm that is acceptable for this time-indexed data.
cv_settings = {
    "params": params,
    "dtrain": dtrain,
    "num_boost_round": 50,
    "nfold": 5,
    "metrics": "rmse",
    "early_stopping_rounds": 10,
    "seed": 42,
}
cv_results = xgb.cv(**cv_settings)

# Lowest mean held-out RMSE across the recorded boosting rounds
best_cv_rmse = cv_results["test-rmse-mean"].min()
print(f"Best RMSE: {best_cv_rmse}")

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Feature-importance chart for the baseline `model` trained earlier in the
# notebook; "weight" is one of several importance types ('gain' and 'cover'
# are the alternatives noted by the author).
importance_ax = plot_importance(model, importance_type="weight")
plt.show()

In [None]:
# Hyper-parameter search in two stages: a broad randomized search followed by
# a small exhaustive grid. GridSearchCV is imported together with
# RandomizedSearchCV; the cell previously imported only the latter and failed
# with NameError on reaching the grid-search step.
# NOTE(review): both searches fit without sample weights and use the
# estimator's default scorer, unlike the weighted DMatrix training and
# weighted R² used elsewhere in this notebook — confirm that is intended.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Step 1: Random Search
param_distributions = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": [3, 5, 7, 9],
    "n_estimators": [50, 100, 150, 200],
    "subsample": [0.6, 0.8, 1.0],
}

random_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(),
    param_distributions=param_distributions,
    n_iter=30,  # Number of random combinations to test
    cv=3,  # Cross-validation folds
    verbose=1,
    random_state=42,  # fix which combinations are drawn so reruns match
)
random_search.fit(X_train, y_train)
best_params_random = random_search.best_params_

# Step 2: Grid Search over a fixed, hand-written grid (it is not derived
# from best_params_random).
param_grid = {
    "learning_rate": [0.05, 0.1],
    "max_depth": [5, 7],
    "n_estimators": [100, 150],
}

grid_search = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid,
    cv=3,  # Cross-validation folds
    verbose=1,
)
grid_search.fit(X_train, y_train)
best_params_grid = grid_search.best_params_


In [None]:
import optuna

def objective(trial):
    """Optuna objective: fit an XGBRegressor with trial-sampled settings.

    Samples learning_rate, max_depth, n_estimators, colsample_bytree and
    subsample from the trial, fits on the notebook-level X_train / y_train,
    and returns the unweighted RMSE of the predictions on X_valid / y_valid
    (lower is better).
    """
    search_space = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
    }
    regressor = xgb.XGBRegressor(**search_space, objective="reg:squarederror", seed=42)
    regressor.fit(X_train, y_train)

    # Root of the mean squared residual on the validation split
    residuals = regressor.predict(X_valid) - y_valid
    return np.sqrt(np.mean(residuals ** 2))

# Minimise the validation RMSE returned by `objective` over 50 trials.
# The sampler is seeded so a rerun proposes the same sequence of
# hyper-parameters; TPESampler is Optuna's default sampler, so only the
# seeding differs from an unconfigured study.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print(f"Best Parameters: {study.best_params}")
