In [None]:
# -*- coding: utf-8 -*-
"""Pertemuan12_Evaluasi_Model_Klasifikasi.ipynb

Automatically generated by Colab.

**Pertemuan 12: Evaluasi Performa Model Klasifikasi**
Confusion Matrix, Precision, Recall, F1-Score, ROC-AUC
"""

# Install library jika diperlukan
# scikit-plot version 0.3.7 is incompatible with Python 3.12 due to scipy dependency.
# Scipy versions older than 1.9.0 (which scikit-plot 0.3.7 requires) are not available for Python 3.12.
!pip install imbalanced-learn

# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                           confusion_matrix, classification_report, roc_curve, auc,
                           roc_auc_score, precision_recall_curve, average_precision_score)

# Model classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Visualization (scikit-plot removed due to incompatibility, using matplotlib/seaborn directly for plots)
# import scikitplot as skplt # Removed
from matplotlib import cm

print("Semua library berhasil diimport!")

In [None]:
# ---------------------------------------------
# PART 1: DATASET & PREPROCESSING
# ---------------------------------------------

# Section banner: title line followed by a 60-character rule.
print("BAGIAN 1: DATASET & PREPROCESSING", "=" * 60, sep="\n")

In [None]:
# Step 1: choose the dataset used for model evaluation
print("PILIH DATASET UNTUK EVALUASI MODEL:")

# Menu of candidate datasets (informational only; no user input is read)
print("\n1. Customer Churn Dataset (Telecom)")
print("2. Bank Marketing Dataset")
print("3. Credit Risk Dataset")
print("4. Upload dataset sendiri")

# The UCI Bank Marketing dataset is used for the rest of the notebook.
# NOTE(review): the original note called this dataset "more balanced"; the
# EDA step measures the actual class ratio, so rely on that, not this note.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

print(f"\n Mendownload dataset dari: {url}")

# Modules needed only for the download/extract step
import zipfile
import io
import requests

# Download the archive. A timeout prevents the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page fail later inside ZipFile with a confusing message.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Extract into the current working directory; the context manager closes the
# archive handle once extraction is done.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall()

# Load the full dataset (the CSV is semicolon-separated)
df = pd.read_csv('bank-additional/bank-additional-full.csv', sep=';')

print(f"Dataset berhasil diload")
print(f"Shape: {df.shape}")
print(f"\n5 data pertama:")
print(df.head())

In [None]:
# Step 2: Exploratory Data Analysis

print("\n EXPLORATORY DATA ANALYSIS (EDA)")

# Dataset structure. DataFrame.info() writes its report itself and returns
# None, so it is called bare; wrapping it in print() emitted a stray "None".
print("INFORMASI DATASET:")
df.info()

# Descriptive statistics of the numeric columns
print("\n STATISTIK DESKRIPTIF (Numerik):")
print(df.describe())

# Class counts and percentages of the target column 'y'
print("\n ANALISIS VARIABLE TARGET ('y'):")
target_dist = df['y'].value_counts()
print(target_dist)
print(f"\nPersentase:")
print((target_dist / len(df) * 100).round(2))

# Target distribution: count plot (left) and pie chart (right)
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
sns.countplot(data=df, x='y', palette='Set2')
plt.title('Distribusi Target Variable')
plt.xlabel('Deposit Subscription')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
plt.pie(target_dist.values, labels=target_dist.index, autopct='%1.1f%%',
        colors=['lightcoral', 'lightgreen'])
plt.title('Persentase Target Variable')

plt.tight_layout()
plt.show()

# Class-imbalance check. value_counts() orders by descending frequency, so
# position 0 is the most frequent class and position 1 the next one. The
# Series is indexed by class label, so the lookup must be positional (.iloc);
# integer keys via [] rely on a deprecated fallback in pandas.
imbalance_ratio = target_dist.iloc[0] / target_dist.iloc[1]
print(f"\n CLASS IMBALANCE RATIO: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 3:
    print("WARNING: Dataset memiliki class imbalance yang signifikan!")
else:
    print("Dataset cukup balanced untuk evaluasi")

In [None]:
# Step 3: prepare the data for modelling

print("\n DATA PREPROCESSING")

# Target column on one side, every other column on the other
y = df['y']
X = df.drop(columns='y')

# Turn the string target into integer codes and show the resulting mapping
le = LabelEncoder()
y_encoded = le.fit_transform(y)
target_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"Target encoding: {target_mapping}")

# Object-typed columns are treated as categorical, the rest as numerical
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

print(f"\n JENIS FEATURES:")
print(f"Categorical features: {len(categorical_cols)}")
print(f"Numerical features: {len(numerical_cols)}")

# One-hot encode the categorical columns, dropping the first level of each
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

encoded_shape = X_encoded.shape
print(f"\n DATA SETELAH ENCODING:")
print(f"Shape: {encoded_shape}")
print(f"Jumlah features setelah encoding: {encoded_shape[1]}")

# Stratified 70/30 train/test split with a fixed seed
split_parts = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

print(f"\n DATA SPLITTING:")
print(f"Train set: {X_train.shape} samples")
print(f"Test set: {X_test.shape} samples")
print(f"Train target distribution: {np.bincount(y_train)}")
print(f"Test target distribution: {np.bincount(y_test)}")

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Preprocessing selesai!")

In [None]:
# ---------------------------------------------
# PART 2: TRAINING MULTIPLE MODELS
# ---------------------------------------------

# Section banner: title line followed by a 60-character rule.
print("\n BAGIAN 2: TRAINING MULTIPLE MODELS", "=" * 60, sep="\n")

In [None]:
# Step 4: set up the classifiers, fit each one and score it on the test split

print("INISIALISASI MODEL KLASIFIKASI")

# Candidate classifiers, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=5),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    # probability=True so that predict_proba is available for the ROC curve
    'SVM': SVC(random_state=42, probability=True),
}

# Column-oriented store for the metric table (one list per column)
results = {
    column: []
    for column in ('Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC')
}

# Fitted estimators, hard predictions and positive-class scores, keyed by name
trained_models, predictions, probabilities = {}, {}, {}

print("TRAINING MODEL...")
print("-" * 50)

for name, model in models.items():
    print(f"Training {name}...")

    # Fit on the scaled training split
    model.fit(X_train_scaled, y_train)
    trained_models[name] = model

    # Hard class predictions on the scaled test split
    y_pred = model.predict(X_test_scaled)
    predictions[name] = y_pred

    # Continuous score for ROC analysis: positive-class probability when the
    # estimator offers predict_proba, otherwise its decision function
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        y_prob = model.decision_function(X_test_scaled)
    probabilities[name] = y_prob

    # Threshold-based metrics from the hard predictions
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Ranking metric from the continuous score
    auc_roc = roc_auc_score(y_test, y_prob)

    # Append this model's row to the column-oriented store
    row = {
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC-ROC': auc_roc,
    }
    for column, value in row.items():
        results[column].append(value)

    print(f"  {name} - Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")

print("\n SEMUA MODEL SELESAI DITRAINING!")

In [None]:
# Step 5: compare the test-set results of all models

# Headings below had mis-decoded UTF-8 emoji (mojibake); restored here.
print("\n📊 PERBANDINGAN HASIL MODEL")
print("=" * 60)

# Metric table, best F1-Score first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('F1-Score', ascending=False)

# String-formatted copy for printing; results_df keeps the numeric values
display_df = results_df.copy()
for col in ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']:
    display_df[col] = display_df[col].apply(lambda x: f"{x:.4f}")

print("🏆 PERINGKAT MODEL BERDASARKAN F1-SCORE:")
print(display_df.to_string(index=False))

# One horizontal bar chart per metric
plt.figure(figsize=(15, 10))

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
colors = plt.cm.Set2(np.linspace(0, 1, len(metrics_to_plot)))

# Bar positions come from the frame being plotted rather than from `models`,
# so the chart stays consistent with the table it visualises.
bar_positions = range(len(results_df))

for idx, metric in enumerate(metrics_to_plot):
    plt.subplot(2, 3, idx+1)
    # argsort() yields positional indices (ascending), hence the .iloc lookups
    sorted_idx = results_df[metric].argsort()
    plt.barh(bar_positions, results_df[metric].iloc[sorted_idx],
             color=colors[idx])
    plt.yticks(bar_positions, results_df['Model'].iloc[sorted_idx])
    plt.xlabel(metric)
    plt.title(f'{metric} Comparison')

    # Annotate each bar with its value
    for i, v in enumerate(results_df[metric].iloc[sorted_idx]):
        plt.text(v + 0.01, i, f'{v:.3f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Heatmap of all metrics (rows: models, columns: metrics)
plt.figure(figsize=(12, 8))
heatmap_data = results_df.set_index('Model')[metrics_to_plot]
sns.heatmap(heatmap_data, annot=True, fmt='.3f', cmap='YlOrRd',
            linewidths=1, linecolor='black')
plt.title('Model Performance Comparison Heatmap')
plt.tight_layout()
plt.show()

In [None]:
# ---------------------------------------------
# PART 3: DETAILED MODEL EVALUATION
# ---------------------------------------------

# Section banner: title line followed by a 60-character rule.
print("\n BAGIAN 3: DETAILED MODEL EVALUATION", "=" * 60, sep="\n")

In [None]:
# Step 6: confusion matrix for every trained model

print("CONFUSION MATRIX ANALYSIS")

# 2x3 grid, one panel per model
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# zip() stops at the shorter sequence, so models beyond the available panels
# are skipped (same effect as the former `idx < len(axes)` guard).
for ax, name in zip(axes, trained_models):
    y_pred = predictions[name]

    # Named `conf_mat` rather than `cm`: `cm` is the matplotlib colormap
    # module imported at the top of the notebook and must not be overwritten.
    # labels=[0, 1] pins the matrix to 2x2 even if a model never predicts one
    # of the classes, so the four-way unpacking below cannot fail.
    conf_mat = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Heatmap of the raw counts
    sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
                ax=ax, cbar=False,
                xticklabels=['No (0)', 'Yes (1)'],
                yticklabels=['No (0)', 'Yes (1)'])

    ax.set_title(f'{name}\nAccuracy: {accuracy_score(y_test, y_pred):.3f}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    # Row-major flattening of [[tn, fp], [fn, tp]]
    tn, fp, fn, tp = conf_mat.ravel()

    # Count summary placed to the right of the panel (axes coordinates)
    info_text = f'TP: {tp}\nFN: {fn}\nFP: {fp}\nTN: {tn}'
    ax.text(1.2, 0.5, info_text, transform=ax.transAxes,
            fontsize=10, verticalalignment='center',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

# Detailed analysis of the best model: first row of results_df, which is
# sorted by F1-Score in descending order
best_model_name = results_df.iloc[0]['Model']
best_model = trained_models[best_model_name]
best_predictions = predictions[best_model_name]

print(f"\n ANALISIS DETAIL UNTUK MODEL TERBAIK: {best_model_name}")
print("-" * 50)

# labels=[0, 1] guarantees a 2x2 matrix so the four-way unpacking is safe
# even if the model never predicts one of the classes.
cm_best = confusion_matrix(y_test, best_predictions, labels=[0, 1])
tn, fp, fn, tp = cm_best.ravel()

print(f"Confusion Matrix:")
print(f"                Predicted")
print(f"               No     Yes")
print(f"Actual No   [{tn:5d}  {fp:5d}]")
print(f"       Yes  [{fn:5d}  {tp:5d}]")

print(f"\n INTERPRETASI:")
print(f"True Positives (TP): {tp} - Correctly predicted 'Yes'")
print(f"True Negatives (TN): {tn} - Correctly predicted 'No'")
print(f"False Positives (FP): {fp} - Predicted 'Yes' but actually 'No' (Type I Error)")
print(f"False Negatives (FN): {fn} - Predicted 'No' but actually 'Yes' (Type II Error)")

# Business reading of the two error types.
# The bullet character was mojibake (mis-decoded UTF-8) and is restored here.
print(f"\n BUSINESS INTERPRETATION:")
print(f"• {fp} customer yang TIDAK akan subscribe diprediksi AKAN subscribe")
print(f"  (Marketing cost wasted: targeting wrong customers)")
print(f"• {fn} customer yang AKAN subscribe diprediksi TIDAK akan subscribe")
print(f"  (Lost opportunity: missing potential customers)")

In [None]:
# Step 7: precision, recall and F1-score per model and per class

print("\n PRECISION, RECALL, F1-SCORE ANALYSIS")

print("\n CLASSIFICATION REPORTS:")
print("=" * 60)

# Scorers applied to each class in turn, in the order they are reported
per_class_scorers = (precision_score, recall_score, f1_score)

for name in trained_models:
    y_pred = predictions[name]

    print(f"\n {name}:")
    print("-" * 40)

    # Full scikit-learn text report with readable class names
    report = classification_report(
        y_test, y_pred, target_names=['No Deposit', 'Deposit']
    )
    print(report)

    # Same three metrics, computed treating class 0 and then class 1 as positive
    precision_0, recall_0, f1_0 = (
        scorer(y_test, y_pred, pos_label=0) for scorer in per_class_scorers
    )
    precision_1, recall_1, f1_1 = (
        scorer(y_test, y_pred, pos_label=1) for scorer in per_class_scorers
    )

    print(f"Class 0 (No Deposit):")
    print(f"  Precision: {precision_0:.3f} - Dari yang diprediksi 'No', {precision_0*100:.1f}% benar")
    print(f"  Recall: {recall_0:.3f} - Dari yang sebenarnya 'No', {recall_0*100:.1f}% terdeteksi")
    print(f"  F1-Score: {f1_0:.3f}")

    print(f"\nClass 1 (Deposit):")
    print(f"  Precision: {precision_1:.3f} - Dari yang diprediksi 'Yes', {precision_1*100:.1f}% benar")
    print(f"  Recall: {recall_1:.3f} - Dari yang sebenarnya 'Yes', {recall_1*100:.1f}% terdeteksi")
    print(f"  F1-Score: {f1_1:.3f}")

# Precision-recall trade-off, one panel per model
print("\n PRECISION-RECALL TRADE-OFF ANALYSIS")

plt.figure(figsize=(15, 10))

for idx, name in enumerate(trained_models):
    # Models without a stored score vector are skipped (their panel slot
    # stays empty because idx still advances)
    if name not in probabilities:
        continue

    y_prob = probabilities[name]

    # Curve points and the average-precision summary
    precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
    avg_precision = average_precision_score(y_test, y_prob)

    # F1 at every curve point; the small epsilon avoids division by zero
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    optimal_idx = np.argmax(f1_scores)

    plt.subplot(2, 3, idx+1)
    plt.plot(recall, precision, lw=2, label=f'AP={avg_precision:.3f}')

    # Mark the curve point with the highest F1
    plt.scatter(recall[optimal_idx], precision[optimal_idx],
                color='red', s=100, zorder=5,
                label=f'Optimal (F1={f1_scores[optimal_idx]:.3f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{name}\nPrecision-Recall Curve')
    plt.grid(True, alpha=0.3)
    # Single legend call after both artists exist
    plt.legend(loc='best')

plt.tight_layout()
plt.show()

# Business recommendation derived from the precision-recall trade-off
print("\n BUSINESS RECOMMENDATION:")
print("Berdasarkan Precision-Recall Trade-off:")

for name in trained_models.keys():
    if name in probabilities:
        y_prob = probabilities[name]
        precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

        # precision/recall have one more element than thresholds (the final
        # point has no threshold), so drop it to keep the indices aligned
        # with `thresholds`.
        f1_scores = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-10)
        optimal_idx = np.argmax(f1_scores)
        optimal_threshold = thresholds[optimal_idx]

        print(f"\n{name}:")
        print(f"  Optimal Threshold: {optimal_threshold:.3f}")
        print(f"  Precision at optimal: {precision[optimal_idx]:.3f}")
        print(f"  Recall at optimal: {recall[optimal_idx]:.3f}")

        # Label the model by which of the two metrics dominates at the
        # F1-optimal threshold. The arrow was mojibake (mis-decoded UTF-8)
        # and is restored here.
        if precision[optimal_idx] > recall[optimal_idx]:
            print(f"  → Model lebih PRECISION-ORIENTED")
            print(f"    Cocok untuk: Targeting marketing, minimize wasted cost")
        else:
            print(f"  → Model lebih RECALL-ORIENTED")
            print(f"    Cocok untuk: Customer retention, minimize missed opportunities")

In [None]:
# Step 8: ROC curves and AUC for all models on a single figure

print("\n ROC CURVE DAN AUC ANALYSIS")

plt.figure(figsize=(12, 10))

# Chance-level reference: the diagonal
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.5)')

# One curve per model that has a stored score vector
for name in trained_models:
    if name not in probabilities:
        continue
    y_prob = probabilities[name]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    curve_label = f'{name} (AUC = {roc_auc:.3f})'
    plt.plot(fpr, tpr, lw=2, label=curve_label)

plt.title('ROC Curves - Model Comparison')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Recall/Sensitivity)')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)

# Faint guide lines at TPR = 0.9 and FPR = 0.1
plt.axhline(y=0.9, color='gray', linestyle='--', alpha=0.3)
plt.axvline(x=0.1, color='gray', linestyle='--', alpha=0.3)

plt.tight_layout()
plt.show()

# AUC interpretation guide followed by each model's rating
print("\n AUC INTERPRETATION GUIDE:")
print("AUC Range    | Interpretation")
print("-" * 40)
print("0.90 - 1.00  | Excellent discrimination")
print("0.80 - 0.90  | Good discrimination")
print("0.70 - 0.80  | Fair discrimination")
print("0.60 - 0.70  | Poor discrimination")
print("0.50 - 0.60  | Fail discrimination")

# Lower bound of each band, highest first; anything below the last bound
# is rated "FAIL". Mirrors the guide printed above.
auc_bands = [(0.9, "EXCELLENT"), (0.8, "GOOD"), (0.7, "FAIR"), (0.6, "POOR")]

print("\n MODEL PERFORMANCE BERDASARKAN AUC:")
for name in trained_models.keys():
    if name in probabilities:
        y_prob = probabilities[name]
        roc_auc = roc_auc_score(y_test, y_prob)

        # First band whose lower bound the AUC reaches
        performance = next(
            (label for floor, label in auc_bands if roc_auc >= floor),
            "FAIL",
        )

        # The arrow was mojibake (mis-decoded UTF-8) and is restored here.
        print(f"{name:25} AUC: {roc_auc:.3f} → {performance}")

# TPR and FPR as functions of the decision threshold, one panel per model
print("\n DETAILED THRESHOLD ANALYSIS")
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, (name, model) in enumerate(trained_models.items()):
    if idx < len(axes) and name in probabilities:
        y_prob = probabilities[name]
        fpr, tpr, thresholds = roc_curve(y_test, y_prob)

        # roc_curve prepends an artificial first threshold (np.inf in recent
        # scikit-learn, max(score) + 1 in older releases) for the point where
        # nothing is predicted positive. It is not a usable cut-off and
        # distorts the x-axis, so drop it from plotting and selection.
        fpr, tpr, thresholds = fpr[1:], tpr[1:], thresholds[1:]

        ax = axes[idx]
        ax.plot(thresholds, tpr, 'b-', label='True Positive Rate', lw=2)
        ax.plot(thresholds, fpr, 'r-', label='False Positive Rate', lw=2)
        ax.set_xlabel('Threshold')
        ax.set_ylabel('Rate')
        ax.set_title(f'{name}\nTPR vs FPR by Threshold')
        ax.grid(True, alpha=0.3)

        # Optimal threshold by Youden's J statistic (max of TPR - FPR)
        youden_j = tpr - fpr
        optimal_idx = np.argmax(youden_j)
        optimal_threshold = thresholds[optimal_idx]

        ax.axvline(x=optimal_threshold, color='green', linestyle='--',
                  label=f'Optimal: {optimal_threshold:.2f}')
        # Single legend call once all labelled artists exist
        ax.legend(loc='best')

plt.tight_layout()
plt.show()

In [None]:
# ---------------------------------------------
# PART 4: CROSS-VALIDATION AND ROBUST EVALUATION
# ---------------------------------------------

# Section banner: title line followed by a 60-character rule.
print("\n BAGIAN 4: CROSS-VALIDATION DAN ROBUST EVALUATION", "=" * 60, sep="\n")

In [None]:
# Step 9: stratified K-fold cross-validation for a more robust estimate

print("K-FOLD CROSS VALIDATION")

# Shuffled, stratified folds with a fixed seed
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Column-oriented store: mean and standard deviation of each metric per model
cv_results = {
    'Model': [],
    'CV_Accuracy_mean': [],
    'CV_Accuracy_std': [],
    'CV_Precision_mean': [],
    'CV_Precision_std': [],
    'CV_Recall_mean': [],
    'CV_Recall_std': [],
    'CV_F1_mean': [],
    'CV_F1_std': [],
    'CV_AUC_mean': [],
    'CV_AUC_std': []
}

print(f"Menjalankan {k_folds}-Fold Cross Validation...")
print("-" * 50)

for name, model in models.items():
    print(f"Cross-validating {name}...")

    # Per-fold scores for this model
    acc_scores, prec_scores, rec_scores, f1_scores, auc_scores = [], [], [], [], []

    for train_idx, val_idx in skf.split(X_encoded, y_encoded):
        # Fold split (X_encoded is a DataFrame, y_encoded a NumPy array)
        X_train_cv, X_val_cv = X_encoded.iloc[train_idx], X_encoded.iloc[val_idx]
        y_train_cv, y_val_cv = y_encoded[train_idx], y_encoded[val_idx]

        # Use a scaler local to the fold. Refitting the notebook-level
        # `scaler` here would silently overwrite the statistics it learned
        # from X_train, leaving it mismatched with the trained models.
        fold_scaler = StandardScaler()
        X_train_cv_scaled = fold_scaler.fit_transform(X_train_cv)
        X_val_cv_scaled = fold_scaler.transform(X_val_cv)

        # Fresh, unfitted estimator with the same hyper-parameters
        model_cv = model.__class__(**model.get_params())
        model_cv.fit(X_train_cv_scaled, y_train_cv)

        # Hard predictions on the validation fold
        y_pred_cv = model_cv.predict(X_val_cv_scaled)

        # Continuous score for AUC
        if hasattr(model_cv, "predict_proba"):
            y_prob_cv = model_cv.predict_proba(X_val_cv_scaled)[:, 1]
        else:
            y_prob_cv = model_cv.decision_function(X_val_cv_scaled)

        # Fold metrics
        acc_scores.append(accuracy_score(y_val_cv, y_pred_cv))
        prec_scores.append(precision_score(y_val_cv, y_pred_cv))
        rec_scores.append(recall_score(y_val_cv, y_pred_cv))
        f1_scores.append(f1_score(y_val_cv, y_pred_cv))
        auc_scores.append(roc_auc_score(y_val_cv, y_prob_cv))

    # Aggregate the folds into mean/std columns
    cv_results['Model'].append(name)
    fold_scores = (
        ('Accuracy', acc_scores),
        ('Precision', prec_scores),
        ('Recall', rec_scores),
        ('F1', f1_scores),
        ('AUC', auc_scores),
    )
    for label, scores in fold_scores:
        cv_results[f'CV_{label}_mean'].append(np.mean(scores))
        cv_results[f'CV_{label}_std'].append(np.std(scores))

    # The plus-minus sign was mojibake (mis-decoded UTF-8) and is restored here.
    print(f"  {name} - CV F1-Score: {np.mean(f1_scores):.4f} (±{np.std(f1_scores):.4f})")

# Cross-validation table, best mean F1 first
cv_results_df = pd.DataFrame(cv_results)
cv_results_df = cv_results_df.sort_values('CV_F1_mean', ascending=False)

print("\n CROSS-VALIDATION RESULTS:")
print("=" * 70)
print(cv_results_df.to_string(index=False))

# One horizontal bar chart per metric, with the fold standard deviation as
# error bars
plt.figure(figsize=(15, 10))

metrics_cv = ['CV_Accuracy_mean', 'CV_Precision_mean', 'CV_Recall_mean', 'CV_F1_mean', 'CV_AUC_mean']
errors_cv = ['CV_Accuracy_std', 'CV_Precision_std', 'CV_Recall_std', 'CV_F1_std', 'CV_AUC_std']
titles = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']

for idx, (metric, error, title) in enumerate(zip(metrics_cv, errors_cv, titles)):
    plt.subplot(2, 3, idx+1)

    # argsort() yields positional indices (ascending), hence the .iloc lookups
    sorted_idx = cv_results_df[metric].argsort()
    x_pos = range(len(cv_results_df))
    sorted_means = cv_results_df[metric].iloc[sorted_idx]
    sorted_stds = cv_results_df[error].iloc[sorted_idx]

    plt.barh(x_pos, sorted_means,
             xerr=sorted_stds,
             color=plt.cm.Set2(idx/len(metrics_cv)),
             alpha=0.7, ecolor='black', capsize=5)

    plt.yticks(x_pos, cv_results_df['Model'].iloc[sorted_idx])
    plt.xlabel(f'{title} Score')
    # The plus-minus sign in the title and labels was mojibake (mis-decoded
    # UTF-8) and is restored here.
    plt.title(f'Cross-Validation {title}\n(Mean ± Std)')

    # Annotate each bar with mean ± std
    for i, (v, e) in enumerate(zip(sorted_means, sorted_stds)):
        plt.text(v + 0.01, i, f'{v:.3f}±{e:.3f}', va='center', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Step 10: model selection and final recommendation

print("\n MODEL SELECTION DAN FINAL RECOMMENDATION")
print("=" * 60)

# Join the single-split results with the cross-validation results per model
final_comparison = results_df.merge(cv_results_df, on='Model')

# Stability = 1 - coefficient of variation of the cross-validated F1
cv_f1_variation = final_comparison['CV_F1_std'] / final_comparison['CV_F1_mean']
final_comparison['Stability_Score'] = 1 - cv_f1_variation

# Weighted blend: test F1 (0.3), CV F1 (0.4), test AUC (0.2), stability (0.1)
final_comparison['Overall_Score'] = (
    final_comparison['F1-Score'] * 0.3
    + final_comparison['CV_F1_mean'] * 0.4
    + final_comparison['AUC-ROC'] * 0.2
    + final_comparison['Stability_Score'] * 0.1
)

# Highest overall score first
final_comparison = final_comparison.sort_values('Overall_Score', ascending=False)

print("FINAL MODEL COMPARISON:")
print("=" * 80)

# Columns shown in the printed table
display_cols = ['Model', 'F1-Score', 'CV_F1_mean', 'CV_F1_std',
                'AUC-ROC', 'Stability_Score', 'Overall_Score']
display_final = final_comparison[display_cols].copy()

# Every column except the leading 'Model' is numeric; render to 4 decimals
for col in display_cols[1:]:
    display_final[col] = display_final[col].map("{:.4f}".format)

print(display_final.to_string(index=False))

# Best model under each criterion.
# final_comparison is ordered by Overall_Score, so taking its first row would
# give the best *overall* model. best_f1 is reported as "Best F1-Score (Test
# Set)" and compared against the CV winner, so it must be ranked on the
# test-set 'F1-Score' column explicitly.
best_f1 = final_comparison.sort_values('F1-Score', ascending=False).iloc[0]['Model']
best_cv = final_comparison.sort_values('CV_F1_mean', ascending=False).iloc[0]['Model']
best_auc = final_comparison.sort_values('AUC-ROC', ascending=False).iloc[0]['Model']
best_stable = final_comparison.sort_values('Stability_Score', ascending=False).iloc[0]['Model']

# Bullets and arrows below were mojibake (mis-decoded UTF-8); restored here.
print(f"\n BEST MODEL BY CRITERIA:")
print(f"• Best F1-Score (Test Set): {best_f1}")
print(f"• Best CV F1-Score: {best_cv}")
print(f"• Best AUC-ROC: {best_auc}")
print(f"• Most Stable (Lowest Variance): {best_stable}")

# Final recommendation: a single pick when the test-set and CV winners agree,
# otherwise present both options
print(f"\n FINAL RECOMMENDATION:")

if best_f1 == best_cv:
    print(f" RECOMMENDED: {best_f1}")
    print(f"   Alasan: Konsisten performa tinggi di test set dan cross-validation")
else:
    print(f" Model berbeda berdasarkan kriteria:")
    print(f"   • Untuk deployment cepat: Gunakan {best_f1}")
    print(f"   • Untuk robustness: Gunakan {best_cv}")

# Recommendation by business priority
print(f"\n BUSINESS CONTEXT RECOMMENDATION:")

print("Pertimbangkan kebutuhan spesifik bisnis:")
print("1. Jika FOCUS pada MINIMIZE FALSE POSITIVES (hemat biaya marketing):")
# Highest test-set precision
precision_ranking = final_comparison.sort_values('Precision', ascending=False)
best_precision_model = precision_ranking.iloc[0]['Model']
print(f"   → Gunakan {best_precision_model} (Precision tertinggi)")

print("\n2. Jika FOCUS pada MINIMIZE FALSE NEGATIVES (tidak ingin kehilangan customer):")
# Highest test-set recall
recall_ranking = final_comparison.sort_values('Recall', ascending=False)
best_recall_model = recall_ranking.iloc[0]['Model']
print(f"   → Gunakan {best_recall_model} (Recall tertinggi)")

print("\n3. Jika FOCUS pada BALANCE antara Precision dan Recall:")
print(f"   → Gunakan {best_f1} (F1-Score tertinggi)")

print("\n4. Jika BUTUH MODEL YANG STABLE dan RELIABLE:")
print(f"   → Gunakan {best_stable} (Stability Score tertinggi)")

# Final comparison visualised as a radar chart
plt.figure(figsize=(14, 8))

# Axes of the radar chart
categories = ['F1-Score', 'CV_F1_mean', 'AUC-ROC', 'Precision', 'Recall', 'Stability_Score']
N = len(categories)

# Scale every metric by its column maximum so the best model scores 1.0 on
# each axis. Each entry is a Series that keeps final_comparison's index.
normalized_scores = {}
for category in categories:
    max_val = final_comparison[category].max()
    normalized_scores[category] = final_comparison[category] / max_val

# Plot the first three rows of final_comparison (its current sort order)
top_models = final_comparison.head(3)['Model'].tolist()
colors = ['b', 'g', 'r']

# One angle per category; repeat the first angle to close the polygon
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]  # Close the loop

ax = plt.subplot(111, polar=True)
plt.xticks(angles[:-1], categories, color='grey', size=10)

for idx, model in enumerate(top_models):
    # Index *label* of this model's row. final_comparison has been re-sorted,
    # so labels no longer equal row positions; the lookup must use .loc.
    # Using .iloc with the label picked another model's values.
    model_idx = final_comparison[final_comparison['Model'] == model].index[0]
    values = [normalized_scores[category].loc[model_idx] for category in categories]
    values += values[:1]

    ax.plot(angles, values, color=colors[idx], linewidth=2, linestyle='-')