In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import gc
import os

In [3]:
# Ensure the folder that will hold the serialized models is present
os.makedirs('ensemble_models', exist_ok=True)

# Read each split from disk and separate the label column from the features.
# 'Header' and 'Position' are removed from the feature matrices along with 'target'.
print("Loading training data...")
train_data = pd.read_csv('split_data/train_data.csv')
y_train = train_data['target']
X_train = train_data.drop(columns=['Header', 'Position', 'target'])
del train_data  # release the raw frame before reading the next split
gc.collect()

print("Loading validation data...")
val_data = pd.read_csv('split_data/val_data.csv')
y_val = val_data['target']
X_val = val_data.drop(columns=['Header', 'Position', 'target'])
del val_data
gc.collect()

print("Loading test data...")
test_data = pd.read_csv('split_data/test_data.csv')
# The identifier columns are kept aside so predictions can be labelled later
id_test = test_data[['Header', 'Position']]
y_test = test_data['target']
X_test = test_data.drop(columns=['Header', 'Position', 'target'])

# Stack train on top of validation for the final ensemble fits
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

Loading training data...
Loading validation data...
Loading test data...


In [4]:
# Initialize base models
# All three boosters use learning rate 0.1, depth 6, 500 boosting rounds and
# seed 42, and are configured to train on the GPU.
print("Initializing models...")

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric=['logloss', 'auc'],
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    device='cuda',  # Use GPU
    n_estimators=500,
    random_state=42
)

lgb_model = lgb.LGBMClassifier(
    objective='binary',
    metric=['binary_logloss'],  # List of evaluation metrics
    learning_rate=0.1,
    max_depth=6,
    num_leaves=64,
    subsample=0.8,
    # LightGBM only performs bagging when the bagging frequency is non-zero;
    # with the default subsample_freq=0 the subsample=0.8 above is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    device='gpu',  # Use GPU
    n_estimators=500,
    random_state=42,
    verbosity=-1,
)


# CatBoost model
cb_model = cb.CatBoostClassifier(
    objective='Logloss',
    learning_rate=0.1,
    depth=6,
    # `subsample` is only accepted with the Poisson, Bernoulli or MVS bootstrap
    # types; the GPU default (Bayesian) rejects it, so select Bernoulli explicitly.
    bootstrap_type='Bernoulli',
    subsample=0.8,
    # NOTE: colsample_bylevel (rsm) is intentionally not set here. Per the
    # CatBoost documentation it is supported on GPU only for pairwise ranking
    # objectives, so passing it with Logloss makes training fail.
    task_type='GPU',  # Use GPU
    devices='0',      # Specify GPU device ID
    iterations=500,   # Number of boosting iterations
    random_seed=42,
    verbose=100
)

Initializing models...


In [5]:
# Fit XGBoost on the training split and persist it to disk.
# The validation split is supplied as eval_set and progress is reported every
# 50 rounds; early stopping is not enabled for this model.
print("Training XGBoost model...")
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=50)
xgb_model.save_model('ensemble_models/xgb_model.json')
print("XGBoost model saved")

Training XGBoost model...
[0]	validation_0-logloss:0.67147	validation_0-auc:0.76759
[50]	validation_0-logloss:0.51124	validation_0-auc:0.83099
[100]	validation_0-logloss:0.50037	validation_0-auc:0.83713
[150]	validation_0-logloss:0.49706	validation_0-auc:0.83901
[200]	validation_0-logloss:0.49485	validation_0-auc:0.84035
[250]	validation_0-logloss:0.49314	validation_0-auc:0.84143
[300]	validation_0-logloss:0.49149	validation_0-auc:0.84248
[350]	validation_0-logloss:0.49057	validation_0-auc:0.84304
[400]	validation_0-logloss:0.48963	validation_0-auc:0.84364
[450]	validation_0-logloss:0.48944	validation_0-auc:0.84378
[499]	validation_0-logloss:0.48860	validation_0-auc:0.84437
XGBoost model saved


In [None]:
# Fit LightGBM on the training split, monitoring the validation split and
# stopping early with a patience of 50 rounds.
print("Training LightGBM model...")
lgb_stopper = lgb.early_stopping(stopping_rounds=50, verbose=True)
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb_stopper])
# Persist the underlying booster to disk
lgb_model.booster_.save_model('ensemble_models/lgb_model.txt')
print("LightGBM model saved")

Training LightGBM model...


In [None]:
# Fit CatBoost on the training split, monitoring the validation split with a
# 50-round early-stopping patience and logging every 100 iterations.
print("Training CatBoost model...")
cb_fit_options = dict(early_stopping_rounds=50, verbose=100)
cb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], **cb_fit_options)
cb_model.save_model('ensemble_models/cb_model.cbm')
print("CatBoost model saved")

In [None]:
# The per-split frames are not used again below; drop them to release memory
del X_train, y_train, X_val, y_val
gc.collect()

# Hard voting: the ensemble's predicted class is the majority vote of its members
print("Creating ensemble for hard voting...")
hard_members = [('xgb', xgb_model), ('lgb', lgb_model), ('cb', cb_model)]
ensemble_hard = VotingClassifier(estimators=hard_members, voting='hard')

# Fit the ensemble on train + validation combined
print("Training hard voting ensemble...")
ensemble_hard.fit(X_train_full, y_train_full)

In [None]:
# Release the combined training data before building the next ensemble
del y_train_full, X_train_full
gc.collect()

# Soft voting: the ensemble averages the members' predicted probabilities
print("Creating ensemble for soft voting...")
soft_members = [('xgb', xgb_model), ('lgb', lgb_model), ('cb', cb_model)]
ensemble_soft = VotingClassifier(estimators=soft_members, voting='soft')

In [None]:
# Rebuild the combined train + validation matrices from the split files
print("Loading full training data for soft voting ensemble...")
train_data = pd.read_csv('split_data/train_data.csv')
val_data = pd.read_csv('split_data/val_data.csv')

# Features: every column except the identifiers and the label
X_train_full = pd.concat(
    [frame.drop(columns=['Header', 'Position', 'target']) for frame in (train_data, val_data)]
)
# Labels, in the same train-then-validation order
y_train_full = pd.concat([frame['target'] for frame in (train_data, val_data)])

del train_data, val_data
gc.collect()

In [None]:
# Fit the soft voting ensemble on train + validation combined
# (fit returns the estimator itself, so the name keeps pointing at the same object)
print("Training soft voting ensemble...")
ensemble_soft = ensemble_soft.fit(X_train_full, y_train_full)

# The combined training data is not needed after this point
del y_train_full, X_train_full
gc.collect()

In [None]:
# Evaluate individual models and ensembles
# Produces, for each candidate, a hard 0/1 prediction and a positive-class
# probability on the test features; the metrics are computed in a later cell.
print("Evaluating models on test data...")

results = {}

# XGBoost predictions (class 1 when the probability exceeds 0.5)
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = (xgb_pred_proba > 0.5).astype(int)

# LightGBM predictions
lgb_pred_proba = lgb_model.predict_proba(X_test)[:, 1]
lgb_pred = (lgb_pred_proba > 0.5).astype(int)

# CatBoost predictions
cb_pred_proba = cb_model.predict_proba(X_test)[:, 1]
cb_pred = (cb_pred_proba > 0.5).astype(int)

# Hard voting ensemble predictions
hard_pred = ensemble_hard.predict(X_test)
# A hard-voting VotingClassifier exposes no predict_proba, so a score for ROC AUC
# is built by averaging its members' probabilities. The members must be the
# ensemble's own fitted estimators (`estimators_`): VotingClassifier fits clones
# on the combined train + validation data, whereas xgb_model / lgb_model /
# cb_model above were fit on the training split only. Averaging the standalone
# models would score a different set of models than the one that made hard_pred.
hard_pred_proba = np.mean(
    [member.predict_proba(X_test)[:, 1] for member in ensemble_hard.estimators_],
    axis=0,
)

# Soft voting ensemble predictions
soft_pred_proba = ensemble_soft.predict_proba(X_test)[:, 1]
soft_pred = (soft_pred_proba > 0.5).astype(int)

In [None]:
# Score every candidate against the test labels.
# Each entry maps a display name to (hard predictions, positive-class scores).
models = {
    "XGBoost": (xgb_pred, xgb_pred_proba),
    "LightGBM": (lgb_pred, lgb_pred_proba),
    "CatBoost": (cb_pred, cb_pred_proba),
    "Hard Voting Ensemble": (hard_pred, hard_pred_proba),
    "Soft Voting Ensemble": (soft_pred, soft_pred_proba)
}

# (label printed to stdout, key stored in `results`)
report_rows = [
    ("Accuracy", "Accuracy"),
    ("Precision", "Precision"),
    ("Recall", "Recall"),
    ("F1 Score", "F1"),
    ("ROC AUC", "ROC_AUC"),
]

for model_name, (y_pred, y_pred_proba) in models.items():
    # Threshold-based metrics use the hard predictions; ROC AUC uses the scores
    scores = {
        "Accuracy": float(accuracy_score(y_test, y_pred)),
        "Precision": float(precision_score(y_test, y_pred)),
        "Recall": float(recall_score(y_test, y_pred)),
        "F1": float(f1_score(y_test, y_pred)),
        "ROC_AUC": float(roc_auc_score(y_test, y_pred_proba)),
    }
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"\n{model_name} Metrics:")
    for label, key in report_rows:
        print(f"{label}: {scores[key]:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")

    # Store plain Python types so the results can be serialized to JSON
    results[model_name] = {**scores, "Confusion_Matrix": conf_matrix.tolist()}

In [None]:
# Save results
# Writes the metrics dictionary to JSON, the per-row test predictions to CSV,
# and (best effort) a grouped bar chart comparing the models to PNG.
import json

with open('ensemble_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("\nAll results saved to ensemble_results.json")

# Save predictions for analysis
predictions_df = pd.DataFrame({
    'Header': id_test['Header'],
    'Position': id_test['Position'],
    'True_Label': y_test,
    'XGB_Prob': xgb_pred_proba,
    'LGB_Prob': lgb_pred_proba,
    'CB_Prob': cb_pred_proba,
    'Hard_Ensemble_Pred': hard_pred,
    'Soft_Ensemble_Prob': soft_pred_proba
})
predictions_df.to_csv('ensemble_predictions.csv', index=False)
print("Predictions saved to ensemble_predictions.csv")

# Optional: Create a comparative bar chart
# The chart is a convenience; any failure here (e.g. matplotlib missing) is
# reported and the rest of the outputs above remain valid.
try:
    # Only matplotlib is needed; importing an unused plotting library here
    # would make the chart depend on a package it never calls.
    import matplotlib.pyplot as plt

    metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC_AUC']
    metrics_data = {model: [results[model][metric] for metric in metrics] for model in results}
    n_models = len(metrics_data)

    fig, ax = plt.subplots(figsize=(15, 8))
    bar_width = 0.15
    index = np.arange(len(metrics))

    # One bar per model inside each metric group, shifted by its position
    for i, (model, values) in enumerate(metrics_data.items()):
        ax.bar(index + i * bar_width, values, bar_width, label=model)

    ax.set_xlabel('Metrics')
    ax.set_ylabel('Score')
    ax.set_title('Comparison of Model Performance')
    # Centre each tick under its group for however many models were evaluated
    ax.set_xticks(index + bar_width * max(n_models - 1, 0) / 2)
    ax.set_xticklabels(metrics)
    ax.legend()
    fig.tight_layout()
    fig.savefig('ensemble_comparison.png')
    print("Performance comparison chart saved to ensemble_comparison.png")
except Exception as e:
    print(f"Could not create chart: {e}")