In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


In [6]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)

# ----------- Load Test Data -------------
# The evaluation frame must contain the columns the saved model pipelines
# expect (raw or preprocessed, whichever they were fitted on).
# NOTE(review): this reads the analytical base table itself -- confirm it is a
# held-out split and not the data the models were trained on.
# The path is built with os.path.join so the separator is right on every OS;
# a literal backslash in the file name only resolves on Windows.
test_df = pd.read_csv(os.path.join("resources", "analytical_base_table.csv"))
X_test = test_df.drop(columns=["Exited"])  # features: every column except the label
y_test = test_df["Exited"]  # label column

# ----------- Paths and Setup -------------
# Folder that is scanned for pickled models (files ending in ".pkl").
models_folder = r'C:\Users\dell\OneDrive\Desktop\bank churn\models'
# os.listdir returns entries in arbitrary order; sort them so the evaluation
# order (and therefore plot legends) is the same on every run.
model_files = sorted(f for f in os.listdir(models_folder) if f.endswith(".pkl"))

# ----------- Storage for plotting and comparison -------------
results = []   # one dict of scalar metrics per model file
roc_data = []  # (file name, fpr, tpr) per model file
pr_data = []   # (file name, precisions, recalls) per model file

for file in model_files:
    model_path = os.path.join(models_folder, file)

    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only load model files produced by a trusted training run.
    with open(model_path, "rb") as f:
        try:
            data = pickle.load(f)
        except ModuleNotFoundError as err:
            # Pickle stores classes by import path, so a missing module makes
            # the whole load fail. Re-raise the same exception type, but say
            # which file triggered it so the broken artefact can be found.
            raise ModuleNotFoundError(
                f"Could not unpickle {model_path!r}: {err}. "
                "The module that defined the pickled objects must be "
                "importable in this environment.",
                name=err.name,
            ) from err

    # Each pickle is a dict holding the fitted estimator and the decision
    # threshold that was saved with it.
    model = data['model']
    threshold = data['threshold']

    # Predict probabilities (column 1 of predict_proba) and binarise them
    # with the model's own stored threshold.
    y_probs = model.predict_proba(X_test)[:, 1]
    y_pred = (y_probs >= threshold).astype(int)

    # Metrics: AUC is threshold-free (uses the probabilities); the others
    # are computed on the thresholded predictions.
    auc = roc_auc_score(y_test, y_probs)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": file,
        "Threshold": threshold,
        "AUC": auc,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    })

    # Store curves for the comparison plots.
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    precs, recalls, _ = precision_recall_curve(y_test, y_probs)
    roc_data.append((file, fpr, tpr))
    pr_data.append((file, precs, recalls))

# ----------- Print Tabular Results -------------
print("\nModel Evaluation Summary:\n")
df_results = pd.DataFrame(results)
if df_results.empty:
    # With no evaluated models the frame has no "F1" column, and sorting by
    # it would raise KeyError; report the situation instead.
    print("No models were evaluated.")
else:
    # Best F1 first.
    print(df_results.sort_values(by="F1", ascending=False))

# ----------- Plot ROC Curve -------------
# One ROC curve per entry in roc_data, drawn on a shared axes, plus a dashed
# black diagonal from (0, 0) to (1, 1). The figure is saved to
# "roc_comparison.png" and then displayed.
fig, ax = plt.subplots(figsize=(8, 6))
for label, false_pos_rate, true_pos_rate in roc_data:
    ax.plot(false_pos_rate, true_pos_rate, label=label)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title("ROC Curve")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("roc_comparison.png")
plt.show()

# ----------- Plot Precision-Recall Curve -------------
# One precision-recall curve per entry in pr_data, with recall on the x-axis
# and precision on the y-axis. The figure is saved to "pr_comparison.png"
# and then displayed.
fig, ax = plt.subplots(figsize=(8, 6))
for label, precision_values, recall_values in pr_data:
    ax.plot(recall_values, precision_values, label=label)
ax.set_title("Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("pr_comparison.png")
plt.show()


ModuleNotFoundError: No module named 'Pipeline'