# MLOps Workshop: Modell-Evaluierung und Validierung

## Einführung
In diesem Notebook werden wir die Modelle aus dem vorherigen Training evaluieren und validieren. Wir werden verschiedene Metriken berechnen, Cross-Validation durchführen und die Ergebnisse im MLflow Model Registry dokumentieren.

## Lernziele
- Evaluierung von ML-Modellen mit verschiedenen Metriken
- Validierung von Modellvorhersagen
- Speichern und Laden von Train/Test Datensätzen
- Effektive Nutzung des MLflow Model Registry

## 1. Setup und Daten laden

In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
from mlflow.tracking import MlflowClient
import joblib
import os

# Apply one consistent plot style to all figures created below
plt.style.use('seaborn-v0_8-darkgrid')

# Point MLflow at the local file store under ./mlruns, then create the
# registry/tracking client used by the later cells
mlflow.set_tracking_uri("file:./mlruns")
client = MlflowClient()


In [8]:
# Load the preprocessed churn dataset
processed_data = pd.read_csv('../data/processed/telco_customer_churn_processed.csv')

# Standardise the numeric columns in place.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so test rows influence the scaling statistics.
# Confirm this matches the preprocessing used when the model was trained.
numeric_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(processed_data[numeric_columns])
processed_data[numeric_columns] = scaled_values

# Every object-typed column except the target and the ID gets one-hot encoded
non_feature_columns = ['Churn', 'customerID']
categorical_columns = [
    col
    for col in processed_data.select_dtypes(include=['object']).columns
    if col not in non_feature_columns
]

# Build the feature matrix (one-hot encoded) and the target vector
feature_frame = processed_data.drop(columns=non_feature_columns)
X = pd.get_dummies(feature_frame, columns=categorical_columns)
y = processed_data['Churn']

# Hold out 20 % of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 2. Detaillierte Modellevaluierung

### Aufgabe 1
Implementieren Sie eine umfassende Evaluierungsfunktion für das Modell.

<details>
<summary>👉 Lösung anzeigen</summary>

```python
def evaluate_model_comprehensive(model, X_test, y_test, model_name="Modell"):
    """
    Run a comprehensive evaluation of a fitted classifier on test data.

    Computes the confusion matrix, the ROC curve with its AUC (using column 1
    of ``predict_proba`` as the score) and the classification report. Both
    plots are written as PNG files into the current working directory and
    logged, together with the metrics, to a new MLflow run named
    ``evaluation_<model_name>``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to evaluate on.
        y_test: True labels belonging to ``X_test``.
        model_name: Label used in the plot titles and in the MLflow run name.

    Returns:
        dict with the keys ``'confusion_matrix'`` (array), ``'roc_auc'``
        (float) and ``'classification_report'`` (DataFrame, one row per
        report entry).
    """
    # Generate predictions and positive-class scores
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Confusion matrix plot; explicit figure/axes objects avoid relying on
    # pyplot's implicit "current figure" state
    cm = confusion_matrix(y_test, y_pred)
    fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
    ax_cm.set_title(f'Confusion Matrix - {model_name}')
    ax_cm.set_ylabel('Tatsächlicher Wert')
    ax_cm.set_xlabel('Vorhergesagter Wert')
    fig_cm.savefig('confusion_matrix.png')
    plt.close(fig_cm)

    # ROC curve plot
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    fig_roc, ax_roc = plt.subplots(figsize=(10, 8))
    ax_roc.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.2f})')
    # Diagonal reference line
    ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title(f'ROC Curve - {model_name}')
    ax_roc.legend(loc="lower right")
    fig_roc.savefig('roc_curve.png')
    plt.close(fig_roc)

    # Classification report as dict (for logging) and DataFrame (for display)
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    # Log results to MLflow
    with mlflow.start_run(run_name=f"evaluation_{model_name}"):
        mlflow.log_metric("auc_roc", roc_auc)
        mlflow.log_artifact("confusion_matrix.png")
        mlflow.log_artifact("roc_curve.png")

        for label, metrics in report.items():
            if isinstance(metrics, dict):
                # Per-class and averaged entries: one metric per statistic
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)
            else:
                # Scalar entries (the overall "accuracy") are plain floats;
                # log them as well instead of silently dropping them
                mlflow.log_metric(label, metrics)

    return {
        'confusion_matrix': cm,
        'roc_auc': roc_auc,
        'classification_report': report_df
    }

# Example usage

# Reload the persisted train/test split when a loader helper is available;
# otherwise keep using the split created in section 1.
# NOTE(review): load_train_test_data() is not defined anywhere in this
# notebook, so an unconditional call would raise NameError. Confirm where
# the helper is supposed to come from.
if "load_train_test_data" in globals():
    X_train, X_test, y_train, y_test = load_train_test_data()

# Load the model that is currently in the "Staging" stage of the registry
staging_versions = client.get_latest_versions("customer_churn_predictor", stages=["Staging"])
if not staging_versions:
    # Fail with a clear message instead of an IndexError on the empty list
    raise RuntimeError(
        "Keine Version von 'customer_churn_predictor' in der Stage 'Staging' gefunden."
    )
best_model_version = staging_versions[0]
best_model = mlflow.sklearn.load_model(f"runs:/{best_model_version.run_id}/model")

# Run the evaluation
evaluation_results = evaluate_model_comprehensive(best_model, X_test, y_test, "Best_Model")

# Show the results
print("\nClassification Report:")
print(evaluation_results['classification_report'])
print(f"\nROC AUC Score: {evaluation_results['roc_auc']:.4f}")
```
</details>


## 3. Modellvalidierung mit Cross-Validation

### Aufgabe 2
Implementieren Sie eine Cross-Validation zur Modellvalidierung.

<details>
<summary>👉 Lösung anzeigen</summary>

```python
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold

def validate_model_cv(model, X, y, cv=5):
    """
    Validate a model with stratified, shuffled K-fold cross-validation.

    Accuracy, precision, recall and F1 are all computed in a single
    ``cross_validate`` pass, so every metric comes from the same fitted
    models and the estimator is fitted only ``cv`` times. The mean and
    standard deviation of each metric are logged to a new MLflow run named
    ``cross_validation``.

    Args:
        model: Unfitted or fitted estimator; it is cloned and refitted per fold.
        X: Feature matrix.
        y: Target labels.
        cv: Number of folds.

    Returns:
        dict mapping each metric name (``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'f1'``) to a dict with ``'mean'``, ``'std'`` and the
        per-fold ``'scores'`` array.
    """
    # Stratified folds keep the class ratio stable for imbalanced data
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # One pass over the folds yields every metric under the key "test_<name>"
    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    cv_output = cross_validate(model, X, y, cv=skf, scoring=metric_names)

    # Summarise the per-fold scores
    results = {}
    for metric in metric_names:
        scores = cv_output[f'test_{metric}']
        results[metric] = {
            'mean': scores.mean(),
            'std': scores.std(),
            'scores': scores
        }

    # MLflow logging
    with mlflow.start_run(run_name="cross_validation"):
        for metric, metric_dict in results.items():
            mlflow.log_metric(f"{metric}_mean", metric_dict['mean'])
            mlflow.log_metric(f"{metric}_std", metric_dict['std'])

    return results

# Example usage: cross-validate the loaded model on the training split
cv_results = validate_model_cv(best_model, X_train, y_train)

# Print mean and standard deviation per metric
# (the plus/minus sign was mis-encoded as "Â±" and is repaired here)
for metric, values in cv_results.items():
    print(f"\n{metric.capitalize()}:")
    print(f"Mean: {values['mean']:.4f} (±{values['std']:.4f})")
```
</details>


## 4. Performance-Dashboard erstellen

### Aufgabe 3
Erstellen Sie ein Dashboard zur Visualisierung der Modellperformance.

<details>
<summary>👉 Lösung anzeigen</summary>

```python
def create_performance_dashboard(evaluation_results, cv_results, output_path='performance_dashboard.png',
                                 model=None, X_eval=None, y_eval=None, feature_names=None):
    """
    Build a performance dashboard image and log it to MLflow.

    The figure has four panels: confusion matrix, ROC curve, cross-validation
    metrics (bars with standard-deviation error bars) and, when the model
    exposes ``feature_importances_``, a feature-importance chart. The image is
    written to ``output_path`` and logged as an artifact of a new MLflow run
    named ``performance_dashboard``.

    Args:
        evaluation_results: dict providing ``'confusion_matrix'`` and
            ``'roc_auc'``.
        cv_results: dict mapping ``'accuracy'``, ``'precision'``, ``'recall'``
            and ``'f1'`` to dicts with ``'mean'`` and ``'std'``.
        output_path: File the dashboard image is written to.
        model: Fitted classifier with ``predict_proba``. Defaults to the
            notebook-level ``best_model``.
        X_eval: Features for the ROC curve. Defaults to the notebook-level
            ``X_test``.
        y_eval: True labels for the ROC curve. Defaults to the notebook-level
            ``y_test``.
        feature_names: Names matching ``model.feature_importances_``. Defaults
            to the columns of the notebook-level ``X_train``.

    Returns:
        None.
    """
    # Fall back to the notebook-level objects so that existing calls with
    # only two or three arguments keep working unchanged
    if model is None:
        model = best_model
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    fig = plt.figure(figsize=(20, 12))
    gs = fig.add_gridspec(2, 3)

    # 1. Confusion matrix
    ax1 = fig.add_subplot(gs[0, 0])
    sns.heatmap(evaluation_results['confusion_matrix'], annot=True, fmt='d', cmap='Blues', ax=ax1)
    ax1.set_title('Confusion Matrix')
    ax1.set_ylabel('Tatsächlicher Wert')
    ax1.set_xlabel('Vorhergesagter Wert')

    # 2. ROC curve
    ax2 = fig.add_subplot(gs[0, 1])
    fpr, tpr, _ = roc_curve(y_eval, model.predict_proba(X_eval)[:, 1])
    ax2.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (AUC = {evaluation_results["roc_auc"]:.2f})')
    ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax2.set_title('ROC Curve')
    ax2.set_xlabel('False Positive Rate')
    ax2.set_ylabel('True Positive Rate')
    ax2.legend()

    # 3. Cross-validation results
    ax3 = fig.add_subplot(gs[0, 2])
    metrics = ['accuracy', 'precision', 'recall', 'f1']
    means = [cv_results[m]['mean'] for m in metrics]
    stds = [cv_results[m]['std'] for m in metrics]
    ax3.bar(metrics, means, yerr=stds, capsize=5)
    ax3.set_title('Cross-Validation Metriken')
    ax3.set_ylabel('Score')
    # Rotate the labels on this axes explicitly rather than on whatever
    # pyplot currently considers the active axes
    ax3.tick_params(axis='x', labelrotation=45)

    # 4. Feature importance (if the model provides it)
    ax4 = fig.add_subplot(gs[1, :])
    if hasattr(model, 'feature_importances_'):
        if feature_names is None:
            feature_names = X_train.columns
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=True)

        positions = range(len(feature_importance))
        ax4.barh(positions, feature_importance['importance'])
        ax4.set_yticks(positions)
        ax4.set_yticklabels(feature_importance['feature'])
        ax4.set_title('Feature Importance')
        ax4.set_xlabel('Importance')
    else:
        # Nothing to show: hide the panel instead of drawing an empty frame
        ax4.set_axis_off()

    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Log the dashboard image to MLflow
    with mlflow.start_run(run_name="performance_dashboard"):
        mlflow.log_artifact(output_path)

# Create the dashboard, passing the model and data explicitly
create_performance_dashboard(
    evaluation_results,
    cv_results,
    model=best_model,
    X_eval=X_test,
    y_eval=y_test,
    feature_names=X_train.columns,
)
```
</details>


## 5. Modellversionen verwalten

### Aufgabe 4
Aktualisieren Sie die Modellversion im Registry basierend auf den Evaluierungsergebnissen.

<details>
<summary>👉 Lösung anzeigen</summary>

```python
def update_model_version(model_name, run_id, evaluation_results, cv_results):
    """
    Register a new model version and attach the evaluation results to it.

    Registers the artifact ``runs:/<run_id>/model`` under ``model_name`` and
    sets the new version's description to a summary of the ROC AUC and the
    cross-validation metrics, plus the creation timestamp (local time).

    Args:
        model_name: Name of the registered model.
        run_id: ID of the MLflow run that holds the ``model`` artifact.
        evaluation_results: dict providing ``'roc_auc'``.
        cv_results: dict mapping ``'accuracy'``, ``'precision'``, ``'recall'``
            and ``'f1'`` to dicts with ``'mean'`` and ``'std'``.

    Returns:
        The model version object returned by ``mlflow.register_model``.
    """
    # Imported locally because the notebook's setup cell does not import
    # datetime; without this the timestamp line raises NameError
    from datetime import datetime

    # Register the new version
    new_version = mlflow.register_model(
        f"runs:/{run_id}/model",
        model_name
    )

    def _mean_std(metric):
        """Format one cross-validation metric as 'mean (±std)'."""
        stats = cv_results[metric]
        return f"{stats['mean']:.4f} (±{stats['std']:.4f})"

    # Build the description line by line so it carries no source-code
    # indentation or leading/trailing blank lines
    description = "\n".join([
        f"Churn Prediction Model v{new_version.version}",
        "",
        "Evaluierungsmetriken:",
        f"- ROC AUC: {evaluation_results['roc_auc']:.4f}",
        "",
        "Cross-Validation Ergebnisse:",
        f"- Accuracy: {_mean_std('accuracy')}",
        f"- Precision: {_mean_std('precision')}",
        f"- Recall: {_mean_std('recall')}",
        f"- F1: {_mean_std('f1')}",
        "",
        f"Erstellt am: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
    ])

    client.update_model_version(
        name=model_name,
        version=new_version.version,
        description=description
    )

    return new_version

# Example usage: register a new version for the run of the loaded model
new_version = update_model_version(
    model_name="customer_churn_predictor",
    run_id=best_model_version.run_id,
    evaluation_results=evaluation_results,
    cv_results=cv_results,
)
```
</details>







## Hausaufgaben
1. Implementieren Sie zusätzliche Evaluierungsmetriken (z.B. Kolmogorov-Smirnov-Test)
2. Erstellen Sie eine Funktion zur Modellkalibrierung
3. Implementieren Sie ein automatisches A/B-Testing-System
4. Erweitern Sie das Dashboard um interaktive Visualisierungen

## Nützliche MLflow-Befehle
```bash
# Start the MLflow UI
mlflow ui

# List experiments
# NOTE(review): newer MLflow releases appear to expose this as
# `mlflow experiments search` -- confirm against the installed version.
mlflow experiments list

# List model versions
# NOTE(review): `mlflow models` may not provide a `list` subcommand --
# verify with `mlflow models --help`.
mlflow models list
```