In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score # Using F1 score
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
import all_metrics as all_metrics # Assuming your custom metrics function is available




# Class-imbalance weight for XGBoost: the ratio of negative (label 0) to
# positive (label 1) samples in the training labels. It is offered to the
# classifier as a `scale_pos_weight` candidate in the search space.
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)
if pos_count == 0:
    # No positive samples at all: fall back to the neutral weight instead of
    # dividing by zero.
    scale_pos_weight_value = 1.0
else:
    scale_pos_weight_value = neg_count / pos_count
print(f"Target scale_pos_weight for XGBoost: {scale_pos_weight_value:.2f}")
# ---

# 1. Define the pipeline: scaling -> optional resampling -> XGBoost classifier.
# The step names ('scaler', 'resampler', 'classifier') are the prefixes used by
# the search-space keys (e.g. 'classifier__max_depth'), so keep them in sync.
pipeline_xgb = Pipeline([
    ('scaler', StandardScaler()),  # Kept for consistency (per the original note)
    ('resampler', None),           # Placeholder; the search swaps in a resampler or leaves it as None
    ('classifier', XGBClassifier(
        random_state=42,
        objective='binary:logistic',  # Binary classification objective
        eval_metric='logloss',        # Evaluation metric used during training
        # `use_label_encoder` is deliberately NOT passed: it is deprecated since
        # XGBoost 1.6 and, in 2.x, is an unrecognised parameter that triggers a
        # "Parameters ... are not used" warning on every fit.
    ))
])

# 2. Search space for the XGBoost pipeline

# Candidate values for the classifier hyperparameters. These are shared by
# every configuration (with or without resampling) assembled below.
xgb_param_ranges = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],   # number of boosted trees
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7, 9],                   # tree depth
    'classifier__subsample': [0.7, 0.8, 0.9, 1.0],           # row sampling
    'classifier__colsample_bytree': [0.7, 0.8, 0.9, 1.0],    # column sampling per tree
    'classifier__gamma': [0, 0.1, 0.5, 1, 2],                # min loss reduction for a split
    'classifier__reg_alpha': [0, 0.01, 0.1, 1],              # L1 regularization
    'classifier__reg_lambda': [0.1, 1, 5, 10],               # L2 regularization
}

# Candidate values for the resampler-specific parameters.
smote_k_neighbors = [3, 5, 10]
adasyn_n_neighbors = [3, 5, 10]
# Target minority/majority ratios after resampling.
# NOTE(review): a ratio the training data already exceeds (ROS) or cannot
# reach (RUS) makes the resampler raise during fit — confirm these values
# against the actual class balance.
rus_sampling_strategy = [0.2, 0.35, 0.5]
ros_sampling_strategy = [0.5, 0.7, 1.0]

# One entry per resampling method: the resampler instance to plug into the
# 'resampler' step plus the values to try for its own parameters (if any).
resampling_options_list = [
    {'resampler': [SMOTE(random_state=42)],
     'resampler__k_neighbors': smote_k_neighbors},
    {'resampler': [ADASYN(random_state=42)],
     'resampler__n_neighbors': adasyn_n_neighbors},
    {'resampler': [RandomOverSampler(random_state=42)],
     'resampler__sampling_strategy': ros_sampling_strategy},
    {'resampler': [RandomUnderSampler(random_state=42)],
     'resampler__sampling_strategy': rus_sampling_strategy},
    {'resampler': [TomekLinks(sampling_strategy='auto')]},
    {'resampler': [NearMiss(version=1)]},
]

# Full search space as a list of dicts:
#   * first entry  -> no resampling; try both the default weight (1) and the
#                     computed class-imbalance weight;
#   * other entries -> one per resampling method, with the class weight fixed
#                     at 1 (no extra weighting on top of resampling).
param_grid_xgb = [
    {
        'resampler': [None],
        **xgb_param_ranges,
        'classifier__scale_pos_weight': [1, scale_pos_weight_value],
    },
    *(
        {
            **resampler_options,
            **xgb_param_ranges,
            'classifier__scale_pos_weight': [1],
        }
        for resampler_options in resampling_options_list
    ),
]

# 3. Set up the randomized hyperparameter search
scoring_metric = 'f1'      # optimise the F1 score directly
n_iterations_xgb = 75      # number of sampled candidates; tune to time/resources (e.g. 50-150)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search_xgb = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_grid_xgb,
    n_iter=n_iterations_xgb,
    scoring=scoring_metric,
    refit=scoring_metric,  # refit the best candidate, selected by the F1 score
    cv=cv_strategy,
    n_jobs=-1,             # use all CPU cores
    verbose=2,             # detailed progress output
    random_state=42,
)

# 4. Announce the upcoming run (the search itself is fitted in the next cell)
print(
    f"\nStarting RandomizedSearchCV for XGBoost with n_iter={n_iterations_xgb}...",
    f"Optimizing for '{scoring_metric}' score.",
    "Best Logistic Regression Test F1 was: 0.6951",  # baseline to beat
    sep="\n",
)



In [None]:

# Run the search on the training data.
random_search_xgb.fit(X_train, y_train)

# 5. Report the winning parameter combination and its cross-validated score.
print(
    "\nBest XGBoost Parameters Found:",
    random_search_xgb.best_params_,
    sep="\n",
)
print(
    f"\nBest CV {scoring_metric.upper()} Score:",
    random_search_xgb.best_score_,
    sep="\n",
)

In [None]:

print("\n--- Evaluating Best XGBoost Model on Test Set ---")
# Pipeline refitted on the full training set with the best parameters found.
best_pipeline_xgb = random_search_xgb.best_estimator_
# NOTE(review): assumes calculate_model_metrics returns a dict-like object that
# may contain an 'F1-score' entry — confirm against the all_metrics module.
results_xgb = all_metrics.calculate_model_metrics(
    best_pipeline_xgb, X_test, y_test, 'XGBoost (RandomizedSearch Best)'
)

# Compare the key metric against the logistic-regression baseline.
# The score is formatted only when it is present: applying ':.4f' directly to
# the 'N/A' fallback string raises ValueError.
xgb_test_f1 = results_xgb.get('F1-score')
xgb_test_f1_text = 'N/A' if xgb_test_f1 is None else f"{xgb_test_f1:.4f}"
print("\nComparison of Test F1-Scores:")
print(f"  Best XGBoost Test F1: {xgb_test_f1_text}")
print("  Best Logistic Regression Test F1: 0.6951")

In [None]:
# Baseline: XGBoost with library-default hyperparameters, trained on
# Xsmote / ysmote (presumably the SMOTE-resampled training set built in an
# earlier cell — verify).
model_xgb = XGBClassifier().fit(Xsmote, ysmote)
y_pred_xgb = model_xgb.predict(X_test)

# Score the baseline on the held-out test set with the shared metrics helper.
xgb_results = all_metrics.calculate_model_metrics(
    model_xgb, X_test, y_test, 'XGBoost'
)