# In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
=================================================================
Aktives und Passives Lernen ohne modAL - Korrigierte Version
=================================================================
Professionelles Skript für MNIST-Experimente mit fairem Vergleich
zwischen aktivem und passivem Lernen.

Hauptkorrektur: Der aktive Lerner hat jetzt Zugriff auf den GESAMTEN
Trainingspool und kann daraus die informativsten Samples auswählen,
anstatt nur aus einer vorab zufällig ausgewählten Teilmenge.

Wichtig: KEIN Warm-Start - CNN wird bei jedem Schritt neu initialisiert

Metriken:
- Anzahl gelabelter Instanzen
- Accuracy, F1-Score (macro)
- Trainingszeit pro Query-Zyklus
- Auswahl-Diversität im flachen Feature-Space

Classifiers:
- CNN (PyTorch)
- SVM, RandomForest, LogisticRegression, NaiveBayes (scikit-learn)

Optimierungen:
- Reproduzierbarkeit (Seed)
- Batch-Mode bei Queries (50 Instanzen pro Zyklus)
- Subsampling des Pools (max. 10k Trainingsbeispiele)
- Effiziente Pool-Verwaltung mit Boolean-Masken
- Fairer Vergleich: CNN wird bei jedem Schritt neu trainiert (kein Warm-Start)

Version: 2.1 - Fehlerkorrektur und Verbesserungen (MNIST-Version)
"""

import time
import logging
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from functools import partial
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import ssl
# SECURITY NOTE(review): this swaps the factory for the default HTTPS context with
# the *unverified* one at module import, so TLS certificate verification is turned
# off for everything in this process that relies on that default (e.g. urllib).
# Presumably a workaround for certificate errors during the MNIST download —
# confirm it is still needed and prefer fixing the local CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context

# -------------------------------------------------------------------------------
# Reproducibility
# -------------------------------------------------------------------------------
# A single fixed seed is applied to NumPy's and PyTorch's global random number
# generators; the CUDA generators are seeded only when a GPU is available.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# -------------------------------------------------------------------------------
# Configure logging
# -------------------------------------------------------------------------------
# Records are emitted from INFO level upwards as "HH:MM:SS [LEVEL] message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S"
)
# Module-level logger used by all functions in this file.
logger = logging.getLogger(__name__)

# -------------------------------------------------------------------------------
# 1) MNIST laden, Subsampling, Normalisierung, Split
# -------------------------------------------------------------------------------
def load_mnist_data(test_size=0.2, random_state=SEED, max_train=10000):
    """
    Load MNIST, subsample the training set and split it into train/validation.

    Images are scaled to [0, 1] and standardized with mean=0.1307, std=0.3081.
    The official MNIST test split is returned in full (normalized the same way).

    Parameters
    ----------
    test_size : float
        Fraction of the subsample that goes into the validation split
        (forwarded to ``train_test_split``).
    random_state : int
        Seed for the train/validation split. NOTE: the subsample itself is drawn
        with NumPy's *global* RNG (``np.random.choice``), so it is reproducible
        only through the module-level ``np.random.seed`` call, not through this
        argument.
    max_train : int
        Upper bound on the number of training images kept before the split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as NumPy arrays;
        image arrays have a channel axis inserted at position 1.
    """
    mean_val = 0.1307
    std_val = 0.3081

    def _normalize(raw_images):
        # Add the channel axis, scale to [0, 1], then standardize.
        scaled = raw_images.unsqueeze(1).float() / 255.0
        return ((scaled - mean_val) / std_val).numpy()

    train_ds = torchvision.datasets.MNIST(
        root='./data', train=True, download=True, transform=transforms.ToTensor()
    )
    test_ds = torchvision.datasets.MNIST(
        root='./data', train=False, download=True, transform=transforms.ToTensor()
    )

    # Pick the subsample FIRST and normalize only those images. Converting all
    # training images to float32 just to discard most of them wastes memory;
    # the operations are element-wise, so the resulting values are unchanged.
    n_total = len(train_ds.data)
    idx_sub = np.random.choice(n_total, size=min(max_train, n_total), replace=False)
    X_sub = _normalize(train_ds.data[idx_sub])
    y_sub = train_ds.targets.numpy()[idx_sub]

    # The test set is normalized with the same statistics.
    X_test = _normalize(test_ds.data)
    y_test = test_ds.targets.numpy()

    # Train/validation split of the subsample.
    X_train, X_val, y_train, y_val = train_test_split(
        X_sub, y_sub, test_size=test_size, random_state=random_state
    )

    logger.info(f"Daten geladen: Train={len(X_train)}, Val={len(X_val)}, Test={len(y_test)}")

    return X_train, y_train, X_val, y_val, X_test, y_test

# -------------------------------------------------------------------------------
# 2) Einfaches CNN (PyTorch)
# -------------------------------------------------------------------------------
class SimpleCNN(nn.Module):
    """
    Small convolutional network for 10-class classification of 1x28x28 images.

    Two 3x3 convolutions (1 -> 16 -> 32 channels, padding 1) are followed by a
    single 2x2 max-pool, which leaves a 32x14x14 feature map that feeds two
    fully connected layers (128 hidden units, 10 outputs). Dropout (p=0.2) is
    applied before and after the first fully connected layer.

    The module moves itself to CUDA when available, otherwise it stays on CPU;
    the chosen device is stored in ``self.device``.
    """
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool  = nn.MaxPool2d(2)
        self.relu  = nn.ReLU()
        self.fc1   = nn.Linear(32 * 14 * 14, 128)
        self.fc2   = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.2)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, x):
        """Return the raw class logits for a batch of images."""
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch axis
        x = self.dropout(x)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

    def fit_cnn(self, X_np, y_np, epochs=2, lr=1e-3, batch_size=128):
        """
        Train the network with Adam and cross-entropy loss.

        Parameters
        ----------
        X_np : np.ndarray
            Training images; converted to float32 tensors.
        y_np : np.ndarray
            Integer class labels; converted to int64 tensors.
        epochs : int
            Number of full passes over the data.
        lr : float
            Adam learning rate.
        batch_size : int
            Mini-batch size; batches are reshuffled every epoch.

        Returns
        -------
        SimpleCNN
            ``self``, so the call can be chained.
        """
        self.train()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        loss_fn = nn.CrossEntropyLoss()

        ds = torch.utils.data.TensorDataset(
            torch.from_numpy(X_np).float(),
            torch.from_numpy(y_np).long()
        )
        loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)

        for _ in range(epochs):
            for xb, yb in loader:
                xb, yb = xb.to(self.device), yb.to(self.device)
                optimizer.zero_grad()
                loss = loss_fn(self(xb), yb)
                loss.backward()
                optimizer.step()
        return self

    def predict_proba(self, X_np, batch_size=512):
        """
        Return softmax class probabilities as an ``(N, n_classes)`` NumPy array.

        Inference runs in mini-batches of ``batch_size`` samples so that the
        intermediate activations of a large pool or test set never have to be
        held in (GPU) memory at once.

        Parameters
        ----------
        X_np : np.ndarray
            Images to score.
        batch_size : int
            Number of samples per forward pass; must be >= 1.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        self.eval()  # disables dropout for deterministic predictions
        chunks = []
        with torch.no_grad():
            for start in range(0, len(X_np), batch_size):
                xb = torch.from_numpy(X_np[start:start + batch_size]).float().to(self.device)
                chunks.append(torch.softmax(self(xb), dim=1).cpu())

        if not chunks:
            # Empty input: keep the (0, n_classes) shape callers would expect.
            return np.empty((0, self.fc2.out_features), dtype=np.float32)
        return torch.cat(chunks).numpy()

# -------------------------------------------------------------------------------
# 3) Klassische Klassifikatoren (Sklearn)
# -------------------------------------------------------------------------------
# Registry of model factories keyed by classifier name: calling an entry with
# no arguments yields a fresh, untrained model configured as listed here.
CLASSIFIERS = dict(
    SVM=partial(SVC, kernel='rbf', probability=True, random_state=SEED),
    RandomForest=partial(
        RandomForestClassifier, n_estimators=100, random_state=SEED, n_jobs=-1
    ),
    LogisticRegression=partial(
        LogisticRegression, solver='lbfgs', max_iter=1000, n_jobs=-1, random_state=SEED
    ),
    NaiveBayes=partial(GaussianNB),
    CNN=SimpleCNN,
)

def train_and_predict(clf_key, X_tr, y_tr, X_te):
    """
    Fit a fresh model of the requested kind and predict labels for ``X_te``.

    ``clf_key == 'CNN'`` builds a new ``SimpleCNN`` (trained for 2 epochs on the
    image arrays as given); any other key is looked up in ``CLASSIFIERS`` and
    the resulting estimator is fitted on the images flattened to one feature
    row per sample.

    Returns a ``(predictions, fitted_model)`` tuple.
    """
    if clf_key != 'CNN':
        # scikit-learn path: estimators expect 2-D input (n_samples, n_features).
        estimator = CLASSIFIERS[clf_key]()
        flat_train = X_tr.reshape(len(X_tr), -1)
        flat_eval = X_te.reshape(len(X_te), -1)
        estimator.fit(flat_train, y_tr)
        return estimator.predict(flat_eval), estimator

    # CNN path: always start from newly initialized weights (no warm start).
    network = SimpleCNN()
    network.fit_cnn(X_tr, y_tr, epochs=2)
    class_probs = network.predict_proba(X_te)
    return np.argmax(class_probs, axis=1), network

# -------------------------------------------------------------------------------
# 4) Query-/Uncertainty-Funktionen
# -------------------------------------------------------------------------------
def predict_proba(model, X):
    """
    Return class probabilities from either a ``SimpleCNN`` or another model.

    A ``SimpleCNN`` receives ``X`` unchanged; every other model receives ``X``
    reshaped to one flat feature row per sample before ``predict_proba`` is
    called on it.
    """
    if not isinstance(model, SimpleCNN):
        features = X.reshape(len(X), -1)
        return model.predict_proba(features)
    return model.predict_proba(X)

def least_confidence(model, X, k=1):
    """
    Return the indices of the ``k`` samples whose top class probability is lowest.

    Indices refer to rows of ``X`` and are ordered from least to most confident.
    """
    class_probs = predict_proba(model, X)
    top_prob = class_probs.max(axis=1)  # confidence = probability of the predicted class
    ranking = np.argsort(top_prob)
    return ranking[:k]

def margin(model, X, k=1):
    """
    Return the indices of the ``k`` samples with the smallest top-2 margin.

    The margin is the difference between the largest and the second-largest
    class probability of a sample; indices refer to rows of ``X`` and are
    ordered by increasing margin.
    """
    class_probs = predict_proba(model, X)
    ordered = np.sort(class_probs, axis=1)  # ascending per row
    gaps = ordered[:, -1] - ordered[:, -2]
    return np.argsort(gaps)[:k]

def entropy_uncertainty(model, X, k=1):
    """
    Return the indices of the ``k`` samples with the highest predictive entropy.

    Parameters
    ----------
    model : object
        Fitted model accepted by the module-level ``predict_proba`` helper.
    X : np.ndarray
        Candidate samples.
    k : int
        Number of samples to select. ``k <= 0`` selects nothing.

    Returns
    -------
    np.ndarray
        Row indices into ``X``, ordered by increasing entropy (the most
        uncertain sample comes last).
    """
    if k <= 0:
        # A plain ``[-k:]`` slice would degenerate to ``[0:]`` for k == 0 and
        # hand back EVERY index, so an empty request is answered explicitly.
        return np.empty(0, dtype=np.intp)

    p = predict_proba(model, X)
    # The small constant keeps log() finite for zero probabilities.
    ent = (-p * np.log(p + 1e-9)).sum(axis=1)
    return np.argsort(ent)[-k:]

def information_density(model, X, k=1, subsample_size=1000):
    """
    Select the ``k`` samples with the highest entropy-times-density score.

    The density of a sample is the inverse of its mean Euclidean distance to
    its (at most 10) nearest neighbours in flattened feature space. To bound
    the cost, densities are computed only for a random subsample of at most
    ``subsample_size`` points (neighbours are searched within that subsample);
    every other point receives the subsample's mean density.

    Parameters
    ----------
    model : object
        Fitted model accepted by the module-level ``predict_proba`` helper.
    X : np.ndarray
        Candidate samples.
    k : int
        Number of samples to select. ``k <= 0`` selects nothing.
    subsample_size : int
        Maximum number of points used for the density estimate. The subsample
        is drawn with NumPy's global RNG.

    Returns
    -------
    np.ndarray
        Row indices into ``X``, ordered by increasing score.
    """
    if k <= 0:
        # ``[-k:]`` with k == 0 would return every index instead of none.
        return np.empty(0, dtype=np.intp)

    p = predict_proba(model, X)
    ent = (-p * np.log(p + 1e-9)).sum(axis=1)

    # For large pools the density is estimated on a random subsample only.
    n_total = len(X)
    if n_total > subsample_size:
        sub_idx = np.random.choice(n_total, subsample_size, replace=False)
        X_sub = X[sub_idx]
    else:
        X_sub = X
        sub_idx = np.arange(n_total)

    flat = X_sub.reshape(len(X_sub), -1)
    k_neighbors = min(10, len(X_sub) - 1)

    # Neutral weight by default: with fewer than two subsample points there is
    # no neighbour to measure a distance to, so the score falls back to pure
    # entropy instead of becoming NaN.
    densities = np.ones(n_total)

    if k_neighbors > 0:
        sub_density = np.empty(len(X_sub))
        for i in range(len(X_sub)):
            dists = np.sqrt(((flat[i] - flat) ** 2).sum(axis=1))
            dists[i] = np.inf  # exclude the point itself
            nearest = np.sort(dists)[:k_neighbors]
            sub_density[i] = 1.0 / (nearest.mean() + 1e-9)

        # Points outside the subsample get the mean density; subsample points
        # keep their own estimate.
        densities[:] = sub_density.mean()
        densities[sub_idx] = sub_density

    scores = ent * densities
    return np.argsort(scores)[-k:]

# -------------------------------------------------------------------------------
# 5) Active-Learning-Schleife im Batch-Mode (KORRIGIERT)
# -------------------------------------------------------------------------------
def _mean_pairwise_distance(samples):
    """Return the mean Euclidean distance over all unordered pairs of flattened samples (0.0 for fewer than two)."""
    if len(samples) < 2:
        return 0.0
    flat = samples.reshape(len(samples), -1)
    return np.mean([np.linalg.norm(a - b) for a, b in combinations(flat, 2)])


def active_learning(X_train, y_train, X_val, y_val, X_test, y_test,
                    strategy, clf_key, budget, runs, run_offset=0,
                    batch_size=50):
    """
    Run the passive baseline and batch-mode active learning for one setting.

    For each of ``runs`` repetitions:

    1. Passive baseline: ``int(budget * len(X_train))`` random samples are
       labeled at once, a model is trained and evaluated on the test set.
    2. Active learning: starting from an empty labeled set, ``batch_size``
       samples per cycle are picked from the *entire* unlabeled training pool
       until the same label budget is used up. The first batch (and any batch
       while fewer than 10 labels exist) is drawn at random; afterwards the
       given strategy ranks the unlabeled pool with the model of the previous
       cycle. After every cycle a freshly initialized model is trained on all
       labels collected so far and evaluated on the test set.

    Exactly one model is trained per cycle: the model evaluated in cycle ``q``
    is also the one that ranks the pool in cycle ``q + 1`` (both need a model
    fitted on the same labeled set, so training it twice would only double the
    cost).

    Parameters
    ----------
    X_train, y_train : np.ndarray
        Training pool and its labels.
    X_val, y_val : np.ndarray
        Accepted for interface compatibility; not used by this function.
    X_test, y_test : np.ndarray
        Test set on which accuracy and macro-F1 are reported.
    strategy : str
        One of 'least_confidence', 'margin', 'entropy', 'information_density'.
        Unknown names fall back to 'entropy' (a warning is logged).
    clf_key : str
        Classifier name understood by ``train_and_predict``.
    budget : float
        Fraction (0.0 - 1.0) of the training pool that may be labeled.
    runs : int
        Number of repetitions.
    run_offset : int
        Added to the repetition index to form ``run_id``.
    batch_size : int
        Number of samples queried per cycle.

    Returns
    -------
    list[list]
        One row per passive run and per active cycle:
        ``[mode, clf_key, strategy, budget, run_id, cycle, n_labels, accuracy,
        f1_macro, train_time, diversity]``. ``train_time`` is the wall-clock
        time of the ``train_and_predict`` call that produced the row's metrics;
        ``diversity`` is the mean pairwise Euclidean distance of the samples
        selected in that cycle (0.0 for passive rows). The list is empty when
        the budget yields fewer labels than one batch.
    """
    query_functions = {
        'least_confidence': least_confidence,
        'margin': margin,
        'entropy': entropy_uncertainty,
        'information_density': information_density,
    }
    query_fn = query_functions.get(strategy)
    if query_fn is None:
        # Keep the historical best-effort fallback, but make it visible.
        logger.warning(f"Unbekannte Strategie '{strategy}' - verwende 'entropy' als Fallback")
        query_fn = entropy_uncertainty

    results = []
    n = len(X_train)
    n_label = int(budget * n)

    if n_label < batch_size:
        logger.warning(f"Budget zu klein (n_label={n_label} < batch_size={batch_size})")
        return results

    for run_i in range(runs):
        run_id = run_offset + run_i
        logger.info(f"[{clf_key}][{strategy}] Run {run_i+1}/{runs} — Budget {budget:.1%} ({n_label} samples)")

        # ================================
        # Passive learning (baseline)
        # ================================
        idx_passive = np.random.choice(n, n_label, replace=False)

        t0 = time.time()
        y_pred_passive, _ = train_and_predict(
            clf_key,
            X_train[idx_passive],
            y_train[idx_passive],
            X_test
        )
        pass_train_time = time.time() - t0

        acc_passive = accuracy_score(y_test, y_pred_passive)
        f1_passive = f1_score(y_test, y_pred_passive, average='macro')

        results.append([
            "passiv",
            clf_key,
            strategy,
            budget,
            run_id,
            "final",
            n_label,
            acc_passive,
            f1_passive,
            pass_train_time,
            0.0  # diversity is not defined for the passive baseline
        ])

        logger.info(f"  Passiv: {n_label} labels → Acc={acc_passive:.3f}, F1={f1_passive:.3f}")

        # ================================
        # Active learning
        # ================================
        # The pool is the whole training set; a boolean mask tracks which
        # samples already carry a label (no copies of the data are needed).
        is_labeled = np.zeros(n, dtype=bool)
        labeled_indices = []
        model = None  # model of the previous cycle, used to rank the pool

        q_steps = int(np.ceil(n_label / batch_size))

        for q in range(q_steps):
            # The last batch may be smaller so the budget is met exactly.
            b_size_current = min(batch_size, n_label - len(labeled_indices))
            if b_size_current <= 0:
                break

            unlabeled_indices = np.where(~is_labeled)[0]

            if model is None or len(labeled_indices) < 10:
                # Bootstrap: nothing useful to rank with yet, so sample at random.
                selected_pool_idx = np.random.choice(
                    len(unlabeled_indices),
                    size=min(b_size_current, len(unlabeled_indices)),
                    replace=False
                )
            else:
                selected_pool_idx = query_fn(
                    model, X_train[unlabeled_indices], k=b_size_current
                )

            # Strategies return positions within the unlabeled subset; map
            # them back to indices into the full training set.
            selected_global_idx = unlabeled_indices[selected_pool_idx]

            diversity = _mean_pairwise_distance(X_train[selected_global_idx])

            labeled_indices.extend(selected_global_idx.tolist())
            is_labeled[selected_global_idx] = True

            # Train once on everything labeled so far. The fitted model gives
            # this cycle's test metrics and ranks the pool in the next cycle.
            t0 = time.time()
            y_eval, model = train_and_predict(
                clf_key,
                X_train[labeled_indices],
                y_train[labeled_indices],
                X_test
            )
            train_time = time.time() - t0

            acc = accuracy_score(y_test, y_eval)
            f1 = f1_score(y_test, y_eval, average='macro')

            results.append([
                "aktiv",
                clf_key,
                strategy,
                budget,
                run_id,
                q,
                len(labeled_indices),
                acc,
                f1,
                train_time,
                diversity
            ])

            logger.info(f"  Aktiv [{strategy}] Schritt {q+1}/{q_steps}: "
                      f"{len(labeled_indices)} labels → Acc={acc:.3f}, F1={f1:.3f}")

    return results

# -------------------------------------------------------------------------------
# 6) Evaluation Utilities
# -------------------------------------------------------------------------------
def print_summary_statistics(df):
    """
    Print an aggregated overview of the experiment results to stdout.

    ``df`` must provide the columns 'lernmodus', 'klassifizierer', 'strategie',
    'budget', 'run_id', 'accuracy', 'f1_macro' and 'train_time'. For every run
    only its last row is considered. The output consists of a mean/std table
    per setting, followed by a passive-vs-active accuracy comparison for each
    budget and classifier, including the best active strategy.
    """
    run_keys = ['lernmodus', 'klassifizierer', 'strategie', 'budget', 'run_id']
    setting_keys = run_keys[:-1]

    # Last row of each run = result after the final cycle.
    final_results = df.groupby(run_keys).last().reset_index()

    metric_aggs = {
        metric: ['mean', 'std']
        for metric in ('accuracy', 'f1_macro', 'train_time')
    }
    summary = final_results.groupby(setting_keys).agg(metric_aggs).round(4)

    wide_rule = "=" * 80
    print("\n" + wide_rule)
    print("ZUSAMMENFASSUNG DER ERGEBNISSE")
    print(wide_rule)
    print(summary)

    narrow_rule = "=" * 60
    classifier_names = sorted(df['klassifizierer'].unique())
    strategy_names = sorted(df['strategie'].unique())

    # Active vs. passive comparison, one section per budget.
    for budget in sorted(df['budget'].unique()):
        print("\n" + narrow_rule)
        print(f"BUDGET: {budget:.1%}")
        print(narrow_rule)

        at_budget = final_results[final_results['budget'] == budget]

        for clf in classifier_names:
            clf_rows = at_budget[at_budget['klassifizierer'] == clf]
            baseline_acc = clf_rows.loc[clf_rows['lernmodus'] == 'passiv', 'accuracy']
            if baseline_acc.empty:
                continue

            passive_mean = baseline_acc.mean()
            passive_std = baseline_acc.std()

            print(f"\n{clf}:")
            print(f"  Passiv: {passive_mean:.3f} ± {passive_std:.3f}")

            active_rows = clf_rows[clf_rows['lernmodus'] == 'aktiv']
            improvements = []
            for strategy in strategy_names:
                strategy_acc = active_rows.loc[active_rows['strategie'] == strategy, 'accuracy']
                if strategy_acc.empty:
                    continue
                active_mean = strategy_acc.mean()
                active_std = strategy_acc.std()
                improvement = (active_mean - passive_mean) * 100
                improvements.append((strategy, active_mean, active_std, improvement))
                print(f"  Aktiv ({strategy:20}): {active_mean:.3f} ± {active_std:.3f} ({improvement:+.1f}%)")

            # Highlight the strategy with the highest mean accuracy.
            if improvements:
                winner = max(improvements, key=lambda entry: entry[1])
                print(f"  → Beste Strategie: {winner[0]} mit {winner[3]:+.1f}% Verbesserung")

# -------------------------------------------------------------------------------
# Hauptprogramm
# -------------------------------------------------------------------------------
def main():
    """
    Run the full experiment grid and report the results.

    Steps:
    1) load the MNIST data,
    2) define budgets, query strategies and classifiers,
    3) run ``active_learning`` for every (budget, classifier, strategy) setting,
    4) write all result rows to a CSV file and print a summary.
    """
    # 1) Data (max_train can be raised for larger experiments).
    X_train, y_train, X_val, y_val, X_test, y_test = load_mnist_data(
        test_size=0.2,
        random_state=SEED,
        max_train=10000
    )

    # 2) Experiment grid.
    budgets = [0.3, 0.6, 0.9]  # 30%, 60%, 90% of the training set
    strategies = ["least_confidence", "margin", "entropy", "information_density"]
    clf_keys = ["CNN", "SVM", "RandomForest", "LogisticRegression", "NaiveBayes"]
    runs = 5  # repetitions per setting

    # Budget varies slowest, strategy fastest.
    settings = [
        (budget, clf_key, strategy)
        for budget in budgets
        for clf_key in clf_keys
        for strategy in strategies
    ]
    total_experiments = len(settings)

    # 3) Run every setting and collect the result rows.
    all_results = []
    for exp_count, (budget, clf_key, strategy) in enumerate(settings, start=1):
        print(f"\n[{exp_count}/{total_experiments}] "
              f"Budget={budget:.1%}, Classifier={clf_key}, Strategy={strategy}")

        all_results.extend(
            active_learning(
                X_train, y_train,
                X_val, y_val,
                X_test, y_test,
                strategy=strategy,
                clf_key=clf_key,
                budget=budget,
                runs=runs,
                batch_size=50
            )
        )

    # 4) Tabulate, persist and summarize.
    column_names = [
        "lernmodus",       # "passiv" or "aktiv"
        "klassifizierer",  # "CNN", "SVM", ...
        "strategie",       # "least_confidence", ...
        "budget",          # 0.3, 0.6, 0.9, ...
        "run_id",
        "zyklus",
        "anz_label",
        "accuracy",
        "f1_macro",
        "train_time",
        "diversity",
    ]
    df = pd.DataFrame(all_results, columns=column_names)

    output_file = "ergebnisse_mnist_corrected.csv"
    df.to_csv(output_file, index=False)
    logger.info(f"Ergebnisse in '{output_file}' gespeichert.")

    print_summary_statistics(df)


# Run the experiments only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()