# 02 - Model Evaluation: Final Test Set Assessment

**Objective**: Evaluate trained models on the held-out test set.

**Important**: This is the ONLY time we touch the test set.

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix
)

# Resolve the project root: when the working directory is named 'notebooks'
# its parent is used, otherwise the working directory itself is the root.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    project_root = _cwd.parent
else:
    project_root = _cwd

# Prepend the root to sys.path so it is searched first by later imports
# (the `src.*` modules are imported in a following cell).
sys.path.insert(0, str(project_root))
print('Libraries loaded')

In [None]:
from src.preprocessing import prepare_data
from src.train import create_splits

# Raw churn dataset, located relative to the project root.
data_path = project_root.joinpath('data', 'raw', 'telco_customer_churn.csv')

# prepare_data takes the path as a string and yields features, target and a
# schema object.
X, y, schema = prepare_data(str(data_path))

# NOTE(review): assumes create_splits is deterministic (e.g. fixed seed) so
# the test partition here matches the one held out during training - confirm.
_splits = create_splits(X, y)
X_train, X_val, X_test, y_train, y_val, y_test = _splits
print(f'Test set: {len(X_test)} samples')

In [None]:
# Load the persisted pipeline from the project's artifacts directory.
# joblib.load unpickles the file, so only load artifacts produced by this
# project (never files from an untrusted source).
model_path = project_root.joinpath('artifacts', 'model.joblib')
pipeline = joblib.load(model_path)

# Report the class name of the pipeline step registered as "classifier".
print(f'Model: {type(pipeline.named_steps["classifier"]).__name__}')

## Test Set Metrics

In [None]:
# Hard class predictions for the test set.
y_pred = pipeline.predict(X_test)
# Scores taken from column 1 of predict_proba.
# NOTE(review): column 1 is presumably the positive (churn) class - confirm
# against the classifier's `classes_` ordering.
y_prob = pipeline.predict_proba(X_test)[:, 1]

print('Test Set Metrics:')

# Metrics derived from the hard predictions; each is computed and printed in
# turn so earlier lines are still shown if a later metric raises.
test_accuracy = accuracy_score(y_test, y_pred)
print(f'  Accuracy:  {test_accuracy:.4f}')

test_precision = precision_score(y_test, y_pred)
print(f'  Precision: {test_precision:.4f}')

test_recall = recall_score(y_test, y_pred)
print(f'  Recall:    {test_recall:.4f}')

test_f1 = f1_score(y_test, y_pred)
print(f'  F1 Score:  {test_f1:.4f}')

# ROC-AUC is scored on the predicted probabilities rather than the labels.
test_roc_auc = roc_auc_score(y_test, y_prob)
print(f'  ROC-AUC:   {test_roc_auc:.4f}')

In [None]:
# Confusion matrix of actual vs. predicted labels on the test set.
cm = confusion_matrix(y_test, y_pred)

# Render the counts as an annotated heatmap; fmt='d' prints them as integers.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)

# Label the axes and title in a single call.
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()