# XGBoost Classification on HIGGS Dataset

This notebook demonstrates binary classification using XGBoost on the HIGGS dataset with hyperparameter tuning and comprehensive visualization.

## Load Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn: data splitting, preprocessing, and metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# XGBoost model
from xgboost import XGBClassifier

## Load and Prepare Data

In [None]:
# Read the HIGGS sample from the current working directory.
# header=None: the file is parsed without a header row, so pandas assigns
# integer column labels (0, 1, 2, ...), which later cells rely on.
df = pd.read_csv(filepath_or_buffer="HIGGS_8K.csv", header=None)

FileNotFoundError: [Errno 2] No such file or directory: '/content/HIGGS_8K.csv'

### Data Cleaning
Column 17 occasionally contains non-numeric values (e.g. stray strings). Coerce the column to numeric so those values become NaN, then drop every row that contains a NaN.

In [None]:
# errors='coerce' turns any value in column 17 that cannot be parsed as a number into NaN.
df[17] = pd.to_numeric(df[17], errors='coerce')

# Remove every row holding at least one NaN (this covers the values coerced above
# as well as NaNs in any other column).
df = df.dropna(axis=0, how='any')

### Column Naming and Data Shuffling
Name columns: first is label (0/1), rest are feature_1 to feature_28

In [None]:
# Column 0 is the class label; the remaining 28 columns are named feature_1..feature_28.
# Assigning the list raises ValueError if the frame does not have exactly 29 columns.
df.columns = ['label'] + [f'feature_{i}' for i in range(1, 29)]

# Shuffle dataset to randomize instance order (important if source data is sorted);
# the fixed seed keeps the permutation reproducible across runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split features and labels.
X = df.drop(columns='label')

# The label column is parsed as float (0.0 / 1.0). Validate it and cast to int so
# that downstream reports show classes as 0 / 1 rather than 0.0 / 1.0.
assert df['label'].isin([0, 1]).all(), "label column must contain only 0/1 values"
y = df['label'].astype(int)

## Preprocessing

### Feature Standardization
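Standardize features to mean 0 and standard deviation 1. Tree-based XGBoost is largely insensitive to feature scaling; the standardized features mainly benefit the logistic-regression model used later for the decision-boundary plot.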

In [None]:
# Learn per-feature mean and standard deviation from X, then apply them to X.
# NOTE: this cell runs before the train/test split, so the statistics are
# computed over every row, including rows that later land in the test set.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

### Train-Test Split
Split data into train/test with stratified sampling to preserve label distribution

In [None]:
# Split the *unscaled* features first, stratified on y to preserve the label ratio.
# The same random_state / test_size / stratify settings select the same rows as
# splitting X_scaled would, so the train/test partition itself is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and apply it to both partitions, so
# that no test-set statistics leak into preprocessing. X_scaled (standardized
# with statistics from all rows) is intentionally not used here.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)  # NumPy array, as before
X_test = scaler.transform(X_test_raw)

## Hyperparameter Tuning with GridSearchCV

### Define Parameter Grid
Define parameter grid for coarse search

In [None]:
# Search space handed to GridSearchCV: 2 x 2 x 2 x 1 x 1 = 8 candidate settings.
param_grid = dict(
    max_depth=[4, 6],             # tree depth controls model complexity
    learning_rate=[0.03, 0.05],   # step size shrinkage
    n_estimators=[200, 300],      # number of trees
    subsample=[0.8],              # row subsampling
    colsample_bytree=[0.8],       # feature subsampling per tree
)

### Initialize XGBoost Model
Initialize XGBoost with AUC as eval metric and disable label encoder warning

In [None]:
# Base estimator for the grid search; the tuned hyperparameters come from param_grid.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and may
# only trigger an "unused parameter" warning there -- confirm against the installed version.
xgb = XGBClassifier(
    random_state=42,          # reproducible row/column subsampling
    eval_metric='auc',        # metric reported for any eval_set passed to fit()
    use_label_encoder=False,
)

### Perform Grid Search
Perform 3-fold cross-validation using ROC AUC as scoring metric

In [None]:
# Exhaustive search over param_grid, scored by ROC AUC with 3-fold cross-validation.
# n_jobs=-1 asks for all available cores; verbose=0 keeps the search silent.
grid_search = GridSearchCV(xgb, param_grid, scoring='roc_auc', cv=3, n_jobs=-1, verbose=0)

# fit() returns the fitted search object, so the best estimator can be read off
# directly. Only the training partition is used for the search.
best_model = grid_search.fit(X_train, y_train).best_estimator_

## Final Training

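### Refit Model with Test-Set Monitoring
Refit the best model on the full training set while tracking test-set AUC via `eval_set`. No `early_stopping_rounds` is configured, so no early stopping actually takes place; the test-set AUC is recorded for information only and does not influence training or tuning.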

In [None]:
# Refit the selected estimator on the whole training partition.
# The test partition is passed as eval_set so its metric is recorded during
# training; no early-stopping argument is passed in this call. verbose=False
# suppresses the per-iteration log.
best_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

## Model Evaluation

### Generate Predictions

In [None]:
# Column 1 of predict_proba holds the score for class 1.
y_proba = best_model.predict_proba(X_test)[:, 1]

# Hard class labels as produced by the model's own predict().
y_pred = best_model.predict(X_test)

### Display Results

In [None]:
# Winning hyperparameter combination found by the grid search.
print("\nBest Hyperparameters:", grid_search.best_params_, sep="\n")

# Per-class precision / recall / F1 on the held-out test partition.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Raw counts: rows are true classes, columns are predicted classes.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# ROC AUC is computed from the class-1 probabilities, not the hard labels;
# roc_auc is reused by the plotting cells further down.
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

## Visualization Dashboard (2x2 Grid)

In [None]:
# 2x2 dashboard: ROC curve, predicted-probability histograms, top feature
# importances, and an annotated confusion matrix with summary metrics.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('XGBoost Classification Model Performance Dashboard', fontsize=16, fontweight='bold')
ax_roc, ax_hist = axes[0]
ax_imp, ax_cm = axes[1]

# Plain NumPy copy of the labels so boolean masks can index y_proba positionally,
# independent of the pandas index carried by y_test.
y_true = np.asarray(y_test)

# (1) ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
ax_roc.plot(fpr, tpr, linewidth=2, label=f'ROC Curve (AUC = {roc_auc:.3f})', color='darkorange')
ax_roc.plot([0, 1], [0, 1], linestyle='--', color='gray', alpha=0.7)  # chance diagonal
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# (2) Probability Distribution Histogram, one density histogram per true class
for cls, colour in ((0, 'blue'), (1, 'red')):
    ax_hist.hist(y_proba[y_true == cls], bins=50, alpha=0.7, color=colour,
                 label=f'Class {cls} (Actual)', density=True)
ax_hist.axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
ax_hist.set_xlabel('Predicted Probability')
ax_hist.set_ylabel('Density')
ax_hist.set_title('Prediction Probability Distribution')
ax_hist.legend()
ax_hist.grid(True, alpha=0.3)

# (3) Feature Importance (top 10, or fewer if the model has fewer features)
importances = best_model.feature_importances_
sorted_idx = np.argsort(importances)[::-1]  # indices in descending importance
top_n = min(10, len(importances))
top_idx = sorted_idx[:top_n]
y_pos = np.arange(top_n)

ax_imp.barh(y_pos, importances[top_idx], color='skyblue', alpha=0.8)
ax_imp.set_yticks(y_pos)
# Importance index i is zero-based; the columns were named feature_1..feature_28.
ax_imp.set_yticklabels([f'feature_{i+1}' for i in top_idx])
ax_imp.set_xlabel('Importance Score')
ax_imp.set_title(f'Top {top_n} Feature Importances')
ax_imp.invert_yaxis()  # most important feature at the top
ax_imp.grid(True, alpha=0.3, axis='x')

# (4) Confusion Matrix + Additional Metrics
cm = confusion_matrix(y_test, y_pred)
ax_cm.imshow(cm, interpolation='nearest', cmap='Blues')

# Annotate each cell with its count; switch to white text on dark cells.
thresh = cm.max() / 2.
for (i, j), count in np.ndenumerate(cm):
    ax_cm.text(j, i, format(count, 'd'),
               ha="center", va="center",
               color="white" if count > thresh else "black",
               fontsize=14, fontweight='bold')

ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('True Label')
ax_cm.set_xlabel('Predicted Label')
ax_cm.set_xticks([0, 1])
ax_cm.set_yticks([0, 1])
ax_cm.set_xticklabels(['Class 0', 'Class 1'])
ax_cm.set_yticklabels(['Class 0', 'Class 1'])

# Summary statistics shown in a text box to the right of the matrix
# (x=1.3 is in axes coordinates, i.e. outside the axes frame).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

metrics_text = f'Accuracy: {accuracy:.3f}\nPrecision: {precision:.3f}\nRecall: {recall:.3f}\nF1-Score: {f1:.3f}\nAUC: {roc_auc:.3f}'
ax_cm.text(1.3, 0.5, metrics_text, transform=ax_cm.transAxes,
           fontsize=10, verticalalignment='center',
           bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8))

plt.tight_layout()
plt.show()

## Decision Boundary Visualization using Logistic Regression (2D Projection)

### Import Additional Libraries and Setup 2D Projection
Use logistic regression for clean 2D decision boundary visualization

In [None]:
# Select the 2 most important features for the 2D projection, most important first.
# np.argsort sorts ascending, so the order is reversed before slicing; taking the
# last two entries as-is would put the runner-up at position 0, while the plot
# labels position 0 as "Most Important Feature".
top_2_idx = np.argsort(importances)[::-1][:2]

# Keep only those two columns of the train and test matrices.
X_train_2d = X_train[:, top_2_idx]
X_test_2d = X_test[:, top_2_idx]

# Importance indices are zero-based; the columns were named feature_1, feature_2, ...
feature_names = [f'feature_{i+1}' for i in range(X_train.shape[1])]
top_2_features = [feature_names[i] for i in top_2_idx]

### Train Logistic Regression Model
Train logistic regression model on reduced 2D space

In [None]:
# Two-feature logistic regression used only to draw a decision boundary.
# fit() returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_2d, y_train)

### Create Decision Boundary Plot

In [None]:
# Build a regular grid covering the test points, padded by 1 on every side.
h = 0.05  # grid step
x_min, x_max = X_test_2d[:, 0].min() - 1, X_test_2d[:, 0].max() + 1
y_min, y_max = X_test_2d[:, 1].min() - 1, X_test_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Class-1 probability of the logistic regression at every grid node.
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = lr_model.predict_proba(grid_points)[:, 1].reshape(xx.shape)

# Shade the two predicted regions and draw the p = 0.5 boundary.
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, levels=[0, 0.5, 1], colors=['lightblue', 'lightcoral'], alpha=0.4)
plt.contour(xx, yy, Z, levels=[0.5], colors='black', linestyles='-', linewidths=2)

# Overlay at most 1000 test points. A seeded generator keeps the plotted sample
# identical across runs (the global np.random state is not touched).
n_sample = min(1000, len(X_test_2d))
rng = np.random.default_rng(42)
sample_idx = rng.choice(len(X_test_2d), size=n_sample, replace=False)
sample_points = X_test_2d[sample_idx]

# Positional label lookup through a NumPy array, so the boolean masks are plain
# arrays aligned with sample_points regardless of y_test's index.
sample_labels = np.asarray(y_test)[sample_idx]
class_0_mask = sample_labels == 0
class_1_mask = sample_labels == 1

for mask, colour, label in ((class_0_mask, 'blue', 'Class 0'),
                            (class_1_mask, 'red', 'Class 1')):
    plt.scatter(sample_points[mask, 0], sample_points[mask, 1],
                c=colour, alpha=0.6, s=30, label=label, edgecolors='white', linewidth=0.5)

# Labels and legend
plt.xlabel(f'{top_2_features[0]} (Most Important Feature)')
plt.ylabel(f'{top_2_features[1]} (Second Most Important Feature)')
plt.title('Decision Boundary Visualization (Logistic Regression on Top 2 Features)\n'
          f'XGBoost Model Performance: ROC AUC = {roc_auc:.4f}')
plt.legend(loc='best', frameon=True, fancybox=True, shadow=True)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Final Diagnostic Information

In [None]:
# Summarise what the decision-boundary figure was built from: the two selected
# features, their importance scores, and how many test points were drawn.
diagnostic_lines = (
    "\nVisualization Info:",
    f"Top 2 most important features used: {top_2_features}",
    f"Feature importance scores: {importances[top_2_idx]}",
    f"Total points plotted: {n_sample} (sampled from {len(X_test_2d)} test points)",
)
for diagnostic_line in diagnostic_lines:
    print(diagnostic_line)