In [1]:
#!/usr/bin/env python
# coding: utf-8

# # Active Learning für Dachmaterial-Klassifikation mit F1-Score
# 
# ## Robuste Version mit statistischer Analyse für unausgewogene Datensätze
# 
# Dieses Notebook implementiert Active Learning Experimente für die Klassifikation von Dachmaterialien. 
# Da der Datensatz sehr unausgewogen ist, verwenden wir den **F1-Score (Macro)** als Hauptmetrik anstatt der Accuracy.
# 
# ### Features:
# - Verwendet den kompletten Dachmaterial-Datensatz
# - F1-Score als primäre Evaluationsmetrik
# - Statistische Analyse mit Wilcoxon Signed-Rank Test
# - Cliff's Delta für Effektstärken
# - Label-Einsparungs-Analyse

# ## 1. Import und Setup

# In[1]:


import os
import sys
import time
import logging
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Matplotlib Backend setzen bevor pyplot importiert wird
import matplotlib
matplotlib.use('Agg')  # Für Server ohne GUI
import matplotlib.pyplot as plt

# Seaborn mit Fehlerbehandlung
try:
    import seaborn as sns
    # Prüfe ob der Style verfügbar ist
    try:
        plt.style.use('seaborn-v0_8-whitegrid')
    except:
        try:
            plt.style.use('seaborn-whitegrid')
        except:
            plt.style.use('ggplot')
except ImportError:
    print("Warnung: Seaborn nicht installiert. Verwende Standard-Matplotlib.")
    sns = None

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import sklearn

# Statistische Tests
import scipy
from scipy import stats
from scipy.stats import wilcoxon
import itertools

# Excel-Export
try:
    import openpyxl
    EXCEL_AVAILABLE = True
except ImportError:
    print("Warnung: openpyxl nicht installiert. Excel-Export wird deaktiviert.")
    EXCEL_AVAILABLE = False

# Avoid SSL errors on download by replacing the ssl module's default HTTPS
# context factory with the unverified one.
# NOTE(review): this switches off certificate verification for code in this
# process that relies on the default HTTPS context. The visible part of the
# file only reads a local CSV - confirm the override is still required.
import ssl
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except:
    # Best effort: any failure while patching is ignored.
    pass


# ## 2. Konfiguration und Reproduzierbarkeit

# In[2]:


# -------------------------------------------------------------------------------
# Reproducibility
# -------------------------------------------------------------------------------
# Seed NumPy and PyTorch (CPU and, when available, all CUDA devices) with one
# fixed value so repeated runs start from the same random state.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    # Request deterministic cuDNN kernels and disable the cuDNN auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------
BUDGET_PERCENTAGES = [0.2, 0.4, 0.6, 0.8, 1.0]  # 20%, 40%, 60%, 80%, 100%
BATCH_SIZE = 500  # Larger batches for more efficient training
N_RUNS = 5  # Raised from 3 to 5 for better statistical power
INITIAL_PERCENTAGE = 0.01  # 1% initial labeling
SIGNIFICANCE_LEVEL = 0.05  # For statistical tests
MIN_SAMPLES_PER_CLASS = 20  # Minimum number of samples per class

# Roof-material classes (filled in dynamically when the data is loaded)
DACHMATERIAL_CLASSES = []

# Create the output directories if they do not exist yet
output_dirs = ["plots", "results", "reports"]
for dir_name in output_dirs:
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

# -------------------------------------------------------------------------------
# Configure logging with UTF-8 encoding
# -------------------------------------------------------------------------------
# Create the log directory
log_dir = "logs"
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Logging setup: one timestamped log file written as UTF-8 (for Windows) plus a
# stream handler bound to the sys.stdout object that exists at this point.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.FileHandler(
            os.path.join(log_dir, f"dachmaterial_active_learning_{time.strftime('%Y%m%d_%H%M%S')}.log"),
            encoding='utf-8'
        ),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Switch stdout/stderr to UTF-8 so Unicode can be printed - only where possible
if sys.platform == 'win32':
    # Check whether we are running inside a Jupyter/IPython environment
    try:
        # get_ipython only exists under IPython; elsewhere the lookup raises NameError
        get_ipython()
        # In Jupyter/IPython - nothing to change
        logger.info("Jupyter/IPython Umgebung erkannt - UTF-8 Handling bereits aktiv")
    except NameError:
        # Plain Python - try to re-wrap the standard streams as UTF-8
        try:
            import io
            if hasattr(sys.stdout, 'buffer'):
                sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
                sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
        except Exception as e:
            logger.warning(f"Konnte UTF-8 Encoding nicht setzen: {e}")


# ## 3. Daten laden und vorbereiten

# In[3]:


def load_dachmaterial_data(filepath='umrisse_with_all_data_and_shape_and_patch_and_normal.csv'):
    """
    Load the complete roof-material dataset and prepare it for training.

    Steps: read the CSV, drop rows without a target, drop classes with fewer
    than ``MIN_SAMPLES_PER_CLASS`` rows, label-encode the target, make a
    stratified 80/20 train/test split and fit the preprocessing pipeline
    (median imputation + scaling for ``area``; constant imputation + one-hot
    encoding for ``area_type``, ``Shape``, ``ezg``) on the training part only.

    Side effect: the module-level ``DACHMATERIAL_CLASSES`` list is replaced by
    the class names found by the label encoder.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        Tuple ``(X_train_processed, y_train, X_test_processed, y_test,
        label_encoder, preprocessor)``; the feature matrices are float32
        arrays, the targets are the encoded integer labels.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        KeyError: If the target or one of the feature columns is missing.
        ValueError: If no class has enough samples after filtering.
    """
    logger.info("Lade vollständigen Dachmaterial-Datensatz...")

    try:
        # Load the data
        df = pd.read_csv(filepath)
        logger.info(f"[ok] Datensatz geladen: {len(df):,} Zeilen, {len(df.columns)} Spalten")
    except FileNotFoundError:
        logger.error(f"Datei '{filepath}' nicht gefunden!")
        logger.error("Bitte stellen Sie sicher, dass die CSV-Datei im aktuellen Verzeichnis liegt.")
        raise
    except Exception as e:
        logger.error(f"Fehler beim Laden der Daten: {e}")
        raise

    # Target variable and features
    target_col = 'mat_qgis'
    feature_cols = ['area', 'area_type', 'Shape', 'ezg']

    # Fail early with a readable message instead of a bare KeyError further down
    missing_cols = [col for col in [target_col] + feature_cols if col not in df.columns]
    if missing_cols:
        logger.error(f"Fehlende Spalten im Datensatz: {missing_cols}")
        raise KeyError(f"Fehlende Spalten im Datensatz: {missing_cols}")

    # Keep only rows with a valid target value
    df = df[df[target_col].notna()].copy()

    # Show the class distribution
    class_dist = df[target_col].value_counts()
    logger.info(f"Ursprüngliche Klassen-Verteilung:\n{class_dist}")

    # Filter out classes with too few samples
    valid_classes = class_dist[class_dist >= MIN_SAMPLES_PER_CLASS].index.tolist()
    removed_classes = class_dist[class_dist < MIN_SAMPLES_PER_CLASS].index.tolist()

    if removed_classes:
        logger.warning(f"Entferne Klassen mit weniger als {MIN_SAMPLES_PER_CLASS} Samples:")
        for cls in removed_classes:
            logger.warning(f"  - {cls}: {class_dist[cls]} Samples")

    # Without any remaining class the split below would fail with an obscure error
    if not valid_classes:
        raise ValueError(
            f"Keine Klasse erreicht die Mindestanzahl von {MIN_SAMPLES_PER_CLASS} Samples."
        )

    # Keep only samples of the valid classes
    df = df[df[target_col].isin(valid_classes)].copy()

    # Updated class distribution
    class_dist_filtered = df[target_col].value_counts()
    logger.info(f"Gefilterte Klassen-Verteilung:\n{class_dist_filtered}")

    # Separate features and target
    X = df[feature_cols].copy()
    y = df[target_col].copy()

    # Label encoding for the target
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Publish the class names module-wide
    global DACHMATERIAL_CLASSES
    DACHMATERIAL_CLASSES = list(label_encoder.classes_)

    logger.info(f"[ok] Dachmaterial-Datensatz vorbereitet: {len(X):,} Samples")
    # str() each label: the join would raise TypeError for non-string class labels
    logger.info(f"  Klassen: {len(DACHMATERIAL_CLASSES)} - {', '.join(map(str, DACHMATERIAL_CLASSES))}")

    # Feature types
    numeric_features = ['area']
    categorical_features = ['area_type', 'Shape', 'ezg']

    # Build the preprocessing pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Stratified train/test split (StratifiedShuffleSplit for explicit control)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)
    train_idx, test_idx = next(splitter.split(X, y_encoded))

    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y_encoded[train_idx]
    y_test = y_encoded[test_idx]

    # Preprocessing: fit on the training part only, then apply to the test part
    X_train_processed = preprocessor.fit_transform(X_train).astype(np.float32)
    X_test_processed = preprocessor.transform(X_test).astype(np.float32)

    # Validate class coverage of both splits
    train_classes = set(np.unique(y_train))
    test_classes = set(np.unique(y_test))
    all_classes = set(range(len(DACHMATERIAL_CLASSES)))

    logger.info(f"Klassen im Trainingsset: {len(train_classes)}")
    logger.info(f"Klassen im Testset: {len(test_classes)}")

    # Warn when some classes are absent from the test set (sorted for a stable log)
    missing_in_test = sorted(all_classes - test_classes)
    if missing_in_test:
        logger.warning(f"Folgende Klassen fehlen im Test-Set: {[DACHMATERIAL_CLASSES[i] for i in missing_in_test]}")
        logger.warning("Dies kann bei sehr unbalancierten Datensätzen vorkommen.")

    # Require that at least the majority of classes is represented in the test set
    if len(test_classes) < len(all_classes) * 0.7:
        logger.error("Zu wenige Klassen im Test-Set! Versuche anderen Random State.")
        # Retry with a few other random states
        for attempt in range(5):
            new_seed = SEED + attempt + 1
            splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=new_seed)
            train_idx, test_idx = next(splitter.split(X, y_encoded))

            y_train_temp = y_encoded[train_idx]
            y_test_temp = y_encoded[test_idx]

            test_classes_temp = set(np.unique(y_test_temp))
            if len(test_classes_temp) >= len(all_classes) * 0.8:
                logger.info(f"Besserer Split gefunden mit Seed {new_seed}")
                X_train = X.iloc[train_idx]
                X_test = X.iloc[test_idx]
                y_train = y_train_temp
                y_test = y_test_temp

                # Re-fit preprocessing on the new training part
                X_train_processed = preprocessor.fit_transform(X_train).astype(np.float32)
                X_test_processed = preprocessor.transform(X_test).astype(np.float32)
                break
        else:
            # No attempt succeeded: say so instead of silently keeping the first split
            logger.warning("Kein besserer Split gefunden; verwende ursprünglichen Split.")

    logger.info(f"[ok] Daten vorbereitet: {len(X_train):,} Trainingssamples, {len(X_test):,} Testsamples")
    logger.info(f"  Feature-Dimension nach Preprocessing: {X_train_processed.shape[1]}")
    logger.info(f"  Klassen: {len(np.unique(y_train))} im Training, {len(np.unique(y_test))} im Test")
    logger.info(f"  Speicherbedarf: {(X_train_processed.nbytes + X_test_processed.nbytes) / 1024**2:.1f} MB")

    return X_train_processed, y_train, X_test_processed, y_test, label_encoder, preprocessor


# ## 4. Neural Network Modell

# In[4]:


class OptimizedTabularNN(nn.Module):
    """
    Feed-forward network for tabular roof-material data.

    Three ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stages (256, 128 and 64
    units) followed by a linear classification head. On construction the module
    moves itself to CUDA when it is usable and to the CPU otherwise. ``fit``,
    ``predict_proba`` and ``predict`` work on NumPy arrays.
    """
    def __init__(self, input_dim, num_classes=11):
        """
        Args:
            input_dim: Number of input features.
            num_classes: Number of output classes (size of the logit vector).
        """
        super(OptimizedTabularNN, self).__init__()

        # Architecture for tabular data
        self.features = nn.Sequential(
            # Layer 1
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.4),

            # Layer 2
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),

            # Layer 3
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
        )

        self.classifier = nn.Linear(64, num_classes)

        # Device selection: fall back to the CPU when CUDA is reported but unusable
        if torch.cuda.is_available():
            try:
                self.device = torch.device('cuda')
                # Probe that CUDA really works by allocating a tiny tensor
                test_tensor = torch.zeros(1).cuda()
                del test_tensor
            except Exception:
                logger.warning("CUDA verfügbar aber nicht nutzbar. Verwende CPU.")
                self.device = torch.device('cpu')
        else:
            self.device = torch.device('cpu')

        self.to(self.device)

        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-normal weights / zero bias for Linear, unit scale / zero shift for BatchNorm1d."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Return the class logits for a batch of feature rows."""
        x = self.features(x)
        x = self.classifier(x)
        return x

    def fit(self, X_np, y_np, epochs=10, lr=1e-3, batch_size=256, verbose=False):
        """
        Train the network with AdamW, cosine learning-rate annealing and
        cross-entropy loss.

        For fewer than 1000 samples the batch size is capped at 32 and the
        learning rate is divided by ten.

        Args:
            X_np: NumPy feature matrix.
            y_np: NumPy array of integer class labels.
            epochs: Number of passes over the data.
            lr: Base learning rate.
            batch_size: Mini-batch size.
            verbose: If True, log the mean loss every second epoch.

        Returns:
            self, to allow call chaining.
        """
        self.train()

        # Adapt hyperparameters to the dataset size
        if len(X_np) < 1000:
            batch_size = min(32, len(X_np))
            lr = lr * 0.1

        optimizer = optim.AdamW(self.parameters(), lr=lr, weight_decay=1e-4)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
        loss_fn = nn.CrossEntropyLoss()

        # Create dataset
        try:
            dataset = TensorDataset(
                torch.from_numpy(X_np).float(),
                torch.from_numpy(y_np).long()
            )
        except Exception as e:
            logger.error(f"Fehler beim Erstellen des Datasets: {e}")
            raise

        # DataLoader settings
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=0,  # Always 0 for compatibility
            pin_memory=(self.device.type == 'cuda'),
            drop_last=False
        )

        # Training loop with error handling
        try:
            for epoch in range(epochs):
                total_loss = 0.0
                batch_count = 0

                for xb, yb in loader:
                    # BatchNorm1d cannot compute batch statistics from a single
                    # sample in training mode and raises; this happens whenever
                    # len(dataset) % batch_size == 1, so skip such a batch.
                    if xb.size(0) < 2:
                        continue

                    xb, yb = xb.to(self.device), yb.to(self.device)

                    optimizer.zero_grad()
                    outputs = self(xb)
                    loss = loss_fn(outputs, yb)

                    # Gradient clipping for stability
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)

                    optimizer.step()

                    total_loss += loss.item()
                    batch_count += 1

                scheduler.step()

                if verbose and (epoch + 1) % 2 == 0:
                    avg_loss = total_loss / max(batch_count, 1)
                    logger.info(f"    Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        except Exception as e:
            logger.error(f"Fehler während des Trainings: {e}")
            raise

        return self

    def predict_proba(self, X_np, batch_size=1024):
        """
        Return softmax class probabilities, computed in batches.

        Args:
            X_np: NumPy feature matrix.
            batch_size: Rows per forward pass; lowered to 256 on CUDA when less
                than 1 GB (total minus allocated memory) appears to be free.

        Returns:
            Array with one probability row per input row, or an empty array
            when ``X_np`` has no rows.
        """
        self.eval()
        probs = []

        # Reduce the batch size when GPU memory is scarce
        if self.device.type == 'cuda':
            try:
                free_memory = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()
                if free_memory < 1024**3:  # Less than 1 GB free
                    batch_size = 256
            except Exception:
                # The memory probe is only a heuristic; keep the requested batch size
                pass

        with torch.no_grad():
            try:
                for i in range(0, len(X_np), batch_size):
                    batch = torch.from_numpy(X_np[i:i+batch_size]).float().to(self.device)
                    logits = self(batch)
                    batch_probs = F.softmax(logits, dim=1)
                    probs.append(batch_probs.cpu().numpy())

                    # Drop the references so the tensors can be freed
                    del batch, logits, batch_probs

                # Release cached GPU blocks once, not after every batch
                if self.device.type == 'cuda':
                    torch.cuda.empty_cache()

            except Exception as e:
                logger.error(f"Fehler bei predict_proba: {e}")
                raise

        return np.vstack(probs) if probs else np.array([])

    def predict(self, X_np, batch_size=1024):
        """
        Return the index of the most probable class per row (empty array for
        empty input).
        """
        probs = self.predict_proba(X_np, batch_size)
        return np.argmax(probs, axis=1) if len(probs) > 0 else np.array([])


# ## 5. Sklearn Wrapper und Klassifikator Factory

# In[5]:


class SklearnWrapper:
    """
    Wrapper that gives scikit-learn style classifiers the same
    ``fit`` / ``predict_proba`` / ``predict`` API as the neural network.

    An optional scaler is fitted together with the classifier and applied to
    every input before prediction.
    """
    def __init__(self, classifier, scaler=None):
        """
        Args:
            classifier: Object with ``fit`` and ``predict``; ``predict_proba``
                or ``decision_function`` are used when present.
            scaler: Optional transformer with ``fit_transform`` / ``transform``.
        """
        self.classifier = classifier
        self.scaler = scaler
        self.is_fitted = False

    def _apply_scaler(self, X_np):
        """Return ``X_np`` transformed by the fitted scaler, or unchanged without one."""
        if self.scaler is not None:
            return self.scaler.transform(X_np)
        return X_np

    def fit(self, X_np, y_np, **kwargs):
        """
        Fit the scaler (if any) and the classifier.

        Extra keyword arguments are accepted for API compatibility with the
        neural network and ignored. Returns self.
        """
        try:
            if self.scaler is not None:
                X_scaled = self.scaler.fit_transform(X_np)
            else:
                X_scaled = X_np

            self.classifier.fit(X_scaled, y_np)
            self.is_fitted = True

        except Exception as e:
            logger.error(f"Fehler beim Training des sklearn-Modells: {e}")
            raise

        return self

    def predict_proba(self, X_np):
        """
        Return class probabilities.

        Uses the classifier's ``predict_proba`` when available; otherwise a
        softmax (multi-class) or sigmoid (binary) over ``decision_function``;
        as a last resort a one-hot encoding of the hard predictions.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.is_fitted:
            raise RuntimeError("Modell wurde noch nicht trainiert!")

        try:
            X_scaled = self._apply_scaler(X_np)

            if hasattr(self.classifier, 'predict_proba'):
                return self.classifier.predict_proba(X_scaled)

            # E.g. SVM with probability=False or other classifiers
            if hasattr(self.classifier, 'decision_function'):
                decision = self.classifier.decision_function(X_scaled)

                if len(decision.shape) == 2:
                    # Multi-class: numerically stable softmax over the decision values
                    exp_decision = np.exp(decision - np.max(decision, axis=1, keepdims=True))
                    probs = exp_decision / np.sum(exp_decision, axis=1, keepdims=True)
                else:
                    # Binary: convert to two-class probabilities via a sigmoid
                    probs = np.zeros((len(decision), 2))
                    probs[:, 1] = 1 / (1 + np.exp(-decision))
                    probs[:, 0] = 1 - probs[:, 1]
                return probs

            # Fallback: one-hot encoding of the predictions. Columns follow the
            # classifier's known classes so the matrix keeps one column per
            # class even when a class is never predicted or the labels are not
            # 0..k-1; without ``classes_`` the predicted labels define them.
            predictions = np.asarray(self.classifier.predict(X_scaled))
            classes = getattr(self.classifier, 'classes_', None)
            if classes is None:
                classes = np.unique(predictions)
            column_of = {label: col for col, label in enumerate(np.asarray(classes).tolist())}
            probs = np.zeros((len(predictions), len(column_of)))
            for row, label in enumerate(predictions.tolist()):
                probs[row, column_of[label]] = 1.0
            return probs

        except Exception as e:
            logger.error(f"Fehler bei predict_proba: {e}")
            raise

    def predict(self, X_np):
        """
        Return the classifier's hard predictions for the (scaled) input.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if not self.is_fitted:
            raise RuntimeError("Modell wurde noch nicht trainiert!")

        try:
            return self.classifier.predict(self._apply_scaler(X_np))

        except Exception as e:
            logger.error(f"Fehler bei predict: {e}")
            raise


def create_classifier(classifier_name, input_dim=None, n_classes=None):
    """
    Create a classifier by name.

    Supported names: 'Neural Network', 'Naive Bayes', 'Random Forest',
    'Logistic Regression', 'SVM'.

    Args:
        classifier_name: Name of the classifier to build.
        input_dim: Input dimension (required for 'Neural Network').
        n_classes: Number of classes (required for 'Neural Network').

    Returns:
        An ``OptimizedTabularNN`` or a ``SklearnWrapper``, both offering
        ``fit`` / ``predict_proba`` / ``predict``.

    Raises:
        ValueError: For an unknown name, or when the neural network is
            requested without ``input_dim`` / ``n_classes``.
    """
    try:
        if classifier_name == 'Neural Network':
            if input_dim is None or n_classes is None:
                raise ValueError("input_dim und n_classes müssen für Neural Network angegeben werden")
            return OptimizedTabularNN(input_dim=input_dim, num_classes=n_classes)

        elif classifier_name == 'Naive Bayes':
            return SklearnWrapper(GaussianNB())

        elif classifier_name == 'Random Forest':
            # The former expression min(os.cpu_count() - 1, -1) always evaluated
            # to -1 (all cores); state that value directly.
            return SklearnWrapper(
                RandomForestClassifier(
                    n_estimators=100,
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    n_jobs=-1,
                    random_state=SEED,
                    verbose=0
                )
            )

        elif classifier_name == 'Logistic Regression':
            # multi_class is intentionally not passed: it is deprecated since
            # scikit-learn 1.5 and scheduled for removal. With the saga solver
            # the default already fits a multinomial model for 3+ classes.
            return SklearnWrapper(
                LogisticRegression(
                    max_iter=1000,
                    solver='saga',
                    n_jobs=-1,
                    random_state=SEED,
                    verbose=0
                ),
                scaler=StandardScaler()
            )

        elif classifier_name == 'SVM':
            return SklearnWrapper(
                SVC(
                    kernel='rbf',
                    gamma='scale',
                    decision_function_shape='ovr',
                    probability=True,
                    cache_size=500,  # Larger kernel cache for better performance
                    random_state=SEED,
                    verbose=False
                ),
                scaler=StandardScaler()
            )

        else:
            raise ValueError(f"Unbekannter Klassifikator: {classifier_name}")

    except Exception as e:
        logger.error(f"Fehler beim Erstellen des Klassifikators {classifier_name}: {e}")
        raise


# ## 6. Query-Strategien

# In[6]:


def entropy_sampling(model, X_pool, n_instances=1):
    """
    Select the pool samples with the highest predictive entropy.
    H(x) = -Σ p(y|x) * log(p(y|x))

    Args:
        model: Object with ``predict_proba(X_pool)``.
        X_pool: Unlabeled pool; only its length and the model output are used.
        n_instances: Number of samples to select (capped at the pool size).

    Returns:
        Array of pool indices, ordered by ascending entropy; empty when
        nothing can be selected. On any error falls back to random sampling.
    """
    try:
        # Never request more samples than are available
        n_instances = min(n_instances, len(X_pool))

        # Guard: with n_instances == 0 the slice [-0:] below would return the
        # whole pool instead of nothing.
        if n_instances <= 0:
            return np.array([], dtype=int)

        probs = model.predict_proba(X_pool)

        # Clip away exact zeros to avoid log(0)
        epsilon = 1e-10
        probs = np.clip(probs, epsilon, 1.0 - epsilon)

        # Compute the entropy per sample
        entropies = -np.sum(probs * np.log(probs), axis=1)

        # Indices with the highest entropy
        return np.argsort(entropies)[-n_instances:]

    except Exception as e:
        logger.error(f"Fehler bei Entropy Sampling: {e}")
        # Fall back to random sampling
        return random_sampling(model, X_pool, n_instances)


def margin_sampling(model, X_pool, n_instances=1):
    """
    Pick the pool samples whose two most probable classes are closest.
    margin = P(y1|x) - P(y2|x)

    Returns the indices of the ``n_instances`` smallest margins (capped at the
    pool size). Any error triggers the random-sampling fallback.
    """
    try:
        # Ascending per-row sort: the last column holds the top probability
        ranked = np.sort(model.predict_proba(X_pool), axis=1)
        top = ranked[:, -1]

        if ranked.shape[1] < 2:
            # Only one class column: use 1 - max_prob as the margin
            gaps = 1.0 - top
        else:
            gaps = top - ranked[:, -2]

        # Never request more samples than are available
        take = min(n_instances, len(X_pool))

        # Smallest margins first
        return np.argsort(gaps)[:take]

    except Exception as e:
        logger.error(f"Fehler bei Margin Sampling: {e}")
        # Fall back to random sampling
        return random_sampling(model, X_pool, n_instances)


def least_confidence_sampling(model, X_pool, n_instances=1):
    """
    Pick the pool samples the model is least confident about.
    confidence = max P(y|x)

    Returns the indices of the ``n_instances`` lowest confidences (capped at
    the pool size). Any error triggers the random-sampling fallback.
    """
    try:
        # Confidence is the largest class probability of each row
        top_prob = model.predict_proba(X_pool).max(axis=1)

        # Never request more samples than are available
        take = min(n_instances, len(X_pool))

        # Lowest confidence first
        order = np.argsort(top_prob)
        return order[:take]

    except Exception as e:
        logger.error(f"Fehler bei Least Confidence Sampling: {e}")
        # Fall back to random sampling
        return random_sampling(model, X_pool, n_instances)


def random_sampling(model, X_pool, n_instances=1):
    """
    Uniform random selection without replacement (baseline strategy).

    ``model`` is unused and only present for a uniform strategy signature.
    Returns up to ``n_instances`` distinct pool indices, or an empty integer
    array when nothing can be drawn.
    """
    try:
        pool_size = len(X_pool)

        # Never request more samples than are available
        n_instances = min(n_instances, pool_size)

        if n_instances > 0:
            return np.random.choice(pool_size, size=n_instances, replace=False)
        return np.array([], dtype=int)

    except Exception as e:
        logger.error(f"Fehler bei Random Sampling: {e}")
        # Emergency fallback: simply take the first indices
        return np.arange(min(n_instances, len(X_pool)))


# ## 7. Statistische Analyse Funktionen

# In[7]:


def cliffs_delta(x, y):
    """
    Compute Cliff's Delta, a non-parametric effect size.

    d = (#{x_i > y_j} - #{x_i < y_j}) / (len(x) * len(y)), clipped to [-1, 1].
    Returns 0.0 when either input is empty or when the computation fails.

    Interpretation:
    |d| < 0.147 "negligible"
    |d| < 0.33  "small"
    |d| < 0.474 "medium"
    |d| >= 0.474 "large"
    """
    try:
        size_x, size_y = len(x), len(y)
        if not (size_x and size_y):
            return 0.0

        # Convert to NumPy arrays if necessary
        x = np.asarray(x)
        y = np.asarray(y)

        # For every x value count the y values it beats / loses against
        wins = sum(np.sum(value > y) for value in x)
        losses = sum(np.sum(value < y) for value in x)

        # Cliff's Delta, kept inside [-1, 1]
        delta = (wins - losses) / (size_x * size_y)
        return np.clip(delta, -1.0, 1.0)

    except Exception as e:
        logger.error(f"Fehler bei Cliff's Delta Berechnung: {e}")
        return 0.0


def interpret_cliffs_delta(d):
    """
    Translate a Cliff's Delta value into a verbal effect-size label.

    Args:
        d: Effect size; anything ``float()`` accepts.

    Returns:
        "negligible" (|d| < 0.147), "small" (|d| < 0.33), "medium"
        (|d| < 0.474), "large" otherwise, or "unknown" when ``d`` is NaN or
        cannot be converted to a float.
    """
    try:
        abs_d = abs(float(d))
    except (TypeError, ValueError):
        # Only conversion failures are expected here; do not swallow
        # unrelated errors such as KeyboardInterrupt.
        return "unknown"

    # NaN fails every "<" comparison and would otherwise be reported as "large"
    if np.isnan(abs_d):
        return "unknown"

    if abs_d < 0.147:
        return "negligible"
    elif abs_d < 0.33:
        return "small"
    elif abs_d < 0.474:
        return "medium"
    else:
        return "large"


def perform_statistical_analysis(results_df, metric='f1_score'):
    """
    Compare every query strategy against 'Random Sampling' per classifier and
    budget level.

    For each combination with at least ``N_RUNS`` values on both sides, a
    one-sided Wilcoxon signed-rank test (strategy greater than baseline) and
    Cliff's Delta are computed. The p-values are Bonferroni-corrected over all
    collected comparisons and flagged against ``SIGNIFICANCE_LEVEL``.

    Args:
        results_df: DataFrame with the columns 'classifier', 'strategy',
            'budget_pct' and the metric column.
        metric: Name of the metric column to analyse.

    Returns:
        DataFrame with one row per comparison; an empty DataFrame when the
        analysis raises.
    """
    statistical_results = []

    def _metric_values(clf, strat, budget):
        # Metric values of one classifier / strategy / budget combination
        mask = (
            (results_df['classifier'] == clf) &
            (results_df['strategy'] == strat) &
            (results_df['budget_pct'] == budget)
        )
        return results_df[mask][metric].values

    def _mean_std(values):
        # Mean and standard deviation, with 0 for both on empty input
        if len(values) > 0:
            return np.mean(values), np.std(values)
        return 0, 0

    try:
        classifiers = results_df['classifier'].unique()
        strategies = results_df['strategy'].unique()
        budget_levels = results_df['budget_pct'].unique()

        for classifier in classifiers:
            for budget_pct in budget_levels:
                # Random Sampling is the baseline
                baseline_data = _metric_values(classifier, 'Random Sampling', budget_pct)

                # Compare against every other strategy
                for strategy in strategies:
                    if strategy == 'Random Sampling':
                        continue

                    strategy_data = _metric_values(classifier, strategy, budget_pct)

                    # Require at least N_RUNS data points on both sides
                    if len(baseline_data) < N_RUNS or len(strategy_data) < N_RUNS:
                        continue

                    # Wilcoxon signed-rank test
                    try:
                        if np.allclose(strategy_data, baseline_data):
                            # Identical values: report "no difference" directly
                            statistic, p_value = 0.0, 1.0
                        else:
                            statistic, p_value = wilcoxon(
                                strategy_data, baseline_data,
                                alternative='greater',
                                zero_method='zsplit'
                            )
                    except Exception as e:
                        logger.warning(f"Wilcoxon Test fehlgeschlagen für {classifier}-{strategy}: {e}")
                        statistic, p_value = 0.0, 1.0

                    # Effect size
                    effect_size = cliffs_delta(strategy_data, baseline_data)
                    effect_interpretation = interpret_cliffs_delta(effect_size)

                    # Means and standard deviations
                    baseline_mean, baseline_std = _mean_std(baseline_data)
                    strategy_mean, strategy_std = _mean_std(strategy_data)

                    # Improvement, guarded against division by zero
                    improvement = strategy_mean - baseline_mean
                    if baseline_mean > 0:
                        improvement_pct = (improvement / baseline_mean) * 100
                    else:
                        improvement_pct = 0

                    statistical_results.append({
                        'classifier': classifier,
                        'budget_pct': budget_pct,
                        'strategy': strategy,
                        'baseline_mean': baseline_mean,
                        'baseline_std': baseline_std,
                        'strategy_mean': strategy_mean,
                        'strategy_std': strategy_std,
                        'improvement': improvement,
                        'improvement_pct': improvement_pct,
                        'wilcoxon_statistic': float(statistic),
                        'p_value': float(p_value),
                        'cliffs_delta': float(effect_size),
                        'effect_size': effect_interpretation,
                        'n_samples': len(strategy_data)
                    })

        # Convert to a DataFrame
        stat_df = pd.DataFrame(statistical_results)

        if len(stat_df) == 0:
            logger.warning("Keine statistischen Ergebnisse generiert!")
        else:
            # Bonferroni correction for multiple comparisons
            n_comparisons = len(stat_df)
            stat_df['p_value_corrected'] = np.minimum(stat_df['p_value'] * n_comparisons, 1.0)
            stat_df['significant'] = stat_df['p_value_corrected'] < SIGNIFICANCE_LEVEL

        return stat_df

    except Exception as e:
        logger.error(f"Fehler bei der statistischen Analyse: {e}")
        return pd.DataFrame()  # Empty DataFrame as fallback


# ## 8. Active Learning Hauptfunktion

# In[8]:


def run_active_learning_experiment(X_train, y_train, X_test, y_test,
                                 classifier_name, strategy_name, strategy_func,
                                 budget_percentages, batch_size=500,
                                 input_dim=None, n_classes=None):
    """
    Run pool-based active learning for one classifier / query-strategy pair.

    For every budget fraction and each of ``N_RUNS`` seeded repetitions the
    function draws a random initial labeled set, then repeatedly trains a
    fresh classifier, scores it on the test set (macro F1) and lets
    ``strategy_func`` choose the next batch until the budget is reached or
    the pool is exhausted. Afterwards a final model is trained on everything
    that was labeled and evaluated once more.

    Args:
        X_train, y_train: Training pool. Indexed with integer index lists, so
            presumably NumPy arrays (verify against caller).
        X_test, y_test: Held-out data used for every evaluation.
        classifier_name: Name forwarded to ``create_classifier``.
        strategy_name: Label used in log messages and result records.
        strategy_func: Callable ``(model, X_pool, n_query)`` returning
            positions into ``X_pool``.
        budget_percentages: Iterable of budget fractions of the pool size.
        batch_size: Maximum number of samples queried per iteration.
        input_dim: Feature dimension; defaults to ``X_train.shape[1]``.
        n_classes: Number of classes; defaults to the number of unique labels
            in ``y_train``.

    Returns:
        list[dict]: One record per run that reached the final evaluation, with
        the keys ``classifier``, ``strategy``, ``budget_pct``, ``run``,
        ``n_labeled``, ``accuracy``, ``f1_score``, ``f1_scores``,
        ``n_labeled_list``, ``avg_query_time`` and ``avg_train_time``.
        Runs that raise an unexpected exception are logged and skipped.
    """
    results = []
    n_total = len(y_train)

    # Derive input dimension and class count when the caller did not pass them
    if input_dim is None:
        input_dim = X_train.shape[1]
    if n_classes is None:
        n_classes = len(np.unique(y_train))

    for budget_pct in budget_percentages:
        n_budget = int(budget_pct * n_total)

        logger.info(f"\n{classifier_name} + {strategy_name} - Budget: {budget_pct:.0%} ({n_budget:,} Samples)")

        for run in range(N_RUNS):
            logger.info(f"  Run {run+1}/{N_RUNS}")

            try:
                # Seed every RNG per run so repetitions are reproducible
                np.random.seed(SEED + run)
                torch.manual_seed(SEED + run)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(SEED + run)

                # Start with the complete pool and a random initial labeled set
                pool_indices = np.arange(n_total)

                n_initial = max(100, int(INITIAL_PERCENTAGE * n_total))
                n_initial = min(n_initial, len(pool_indices))  # never more than the pool holds

                initial_indices = np.random.choice(pool_indices, size=n_initial, replace=False)
                labeled_indices = list(initial_indices)
                pool_indices = np.setdiff1d(pool_indices, labeled_indices)

                # Per-run tracking (learning curve is recorded as macro F1)
                f1_scores = []
                n_labeled_list = []
                query_times = []
                train_times = []

                while len(labeled_indices) < n_budget and len(pool_indices) > 0:
                    # A fresh model is created and trained in every iteration
                    model = create_classifier(classifier_name, input_dim=input_dim, n_classes=n_classes)

                    train_start = time.time()
                    try:
                        model.fit(X_train[labeled_indices], y_train[labeled_indices])
                    except Exception as e:
                        logger.error(f"Fehler beim Training in Run {run+1}: {e}")
                        # Abort the query loop for this run
                        break

                    train_time = time.time() - train_start
                    train_times.append(train_time)

                    # Evaluate with macro-averaged F1 (suited to imbalanced classes)
                    try:
                        y_pred = model.predict(X_test)
                        f1 = f1_score(y_test, y_pred, average='macro')
                    except Exception as e:
                        logger.error(f"Fehler bei der Evaluation in Run {run+1}: {e}")
                        f1 = 0.0

                    f1_scores.append(f1)
                    n_labeled_list.append(len(labeled_indices))

                    # Size of the next batch: bounded by batch size, budget and pool
                    n_query = min(batch_size, n_budget - len(labeled_indices), len(pool_indices))
                    if n_query <= 0:
                        break

                    # Query with timing; fall back to random sampling on failure
                    query_start = time.time()
                    try:
                        query_indices = strategy_func(model, X_train[pool_indices], n_query)
                    except Exception as e:
                        logger.error(f"Fehler bei Query-Strategie {strategy_name}: {e}")
                        query_indices = random_sampling(model, X_train[pool_indices], n_query)

                    query_time = time.time() - query_start
                    query_times.append(query_time)

                    # Validate the returned pool positions:
                    #  - negative values would silently wrap around, so drop them
                    #  - duplicates would inflate the labeled count, so keep the
                    #    first occurrence only (order preserved)
                    #  - never accept more than requested, to respect the budget
                    query_indices = np.asarray(query_indices)
                    in_range = (query_indices >= 0) & (query_indices < len(pool_indices))
                    query_indices = query_indices[in_range]
                    _, first_seen = np.unique(query_indices, return_index=True)
                    query_indices = query_indices[np.sort(first_seen)][:n_query]

                    if len(query_indices) == 0:
                        logger.warning(f"Keine gueltigen Query-Indizes in Run {run+1}")
                        break

                    selected_indices = pool_indices[query_indices]

                    # Move the selected samples from the pool to the labeled set
                    labeled_indices.extend(selected_indices)
                    pool_indices = np.setdiff1d(pool_indices, selected_indices)

                    # Progress logging (ASCII arrow on purpose to avoid encoding issues)
                    if len(labeled_indices) % 2000 == 0 or len(labeled_indices) == n_budget:
                        logger.info(f"    {len(labeled_indices):,} labeled -> F1-Score: {f1:.4f} "
                                  f"(Train: {train_time:.1f}s, Query: {query_time:.2f}s)")

                    # Release cached GPU memory when the NN runs on CUDA
                    if classifier_name == 'Neural Network' and hasattr(model, 'device') and model.device.type == 'cuda':
                        torch.cuda.empty_cache()

                # Final evaluation on everything labeled (longer training for the NN)
                if len(labeled_indices) > 0:
                    try:
                        model = create_classifier(classifier_name, input_dim=input_dim, n_classes=n_classes)

                        if classifier_name == 'Neural Network':
                            model.fit(X_train[labeled_indices], y_train[labeled_indices],
                                     epochs=20, verbose=False)
                        else:
                            model.fit(X_train[labeled_indices], y_train[labeled_indices])

                        y_pred = model.predict(X_test)
                        final_acc = accuracy_score(y_test, y_pred)
                        final_f1 = f1_score(y_test, y_pred, average='macro')

                    except Exception as e:
                        logger.error(f"Fehler bei finaler Evaluation in Run {run+1}: {e}")
                        final_acc = 0.0
                        # Fall back to the last score of THIS run only; a plain
                        # `f1` lookup could pick up a value from an earlier run.
                        final_f1 = f1_scores[-1] if f1_scores else 0.0

                    results.append({
                        'classifier': classifier_name,
                        'strategy': strategy_name,
                        'budget_pct': budget_pct,
                        'run': run,
                        'n_labeled': len(labeled_indices),
                        'accuracy': final_acc,
                        'f1_score': final_f1,
                        'f1_scores': f1_scores,
                        'n_labeled_list': n_labeled_list,
                        'avg_query_time': np.mean(query_times) if query_times else 0,
                        'avg_train_time': np.mean(train_times) if train_times else 0
                    })

                    logger.info(f"    Final: {len(labeled_indices):,} labeled -> "
                              f"Accuracy: {final_acc:.4f}, F1: {final_f1:.4f}")

                # Release cached GPU memory between runs
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                # logger.exception records the traceback through the logging setup
                logger.exception(f"Unerwarteter Fehler in Run {run+1}: {e}")
                continue

    return results


# ## 9. Visualisierungsfunktionen mit F1-Score

# In[9]:


def plot_per_classifier_with_significance(all_results, stat_results):
    """
    Plot macro-F1 learning curves per classifier with significance markers.

    One figure is written per classifier, with one panel per entry of
    ``BUDGET_PERCENTAGES``. Each panel shows the mean learning curve of every
    known strategy with a +/- one standard deviation band. Strategies that
    are flagged significant in ``stat_results`` are drawn dashed and get
    their effect-size interpretation appended to the legend label.

    Args:
        all_results: List of run records as produced by
            ``run_active_learning_experiment`` (uses the keys ``classifier``,
            ``strategy``, ``budget_pct``, ``n_labeled``, ``n_labeled_list``
            and ``f1_scores``).
        stat_results: DataFrame with the columns ``classifier``, ``strategy``,
            ``budget_pct``, ``significant`` and ``effect_size``; may be empty.

    Returns:
        None. Figures are saved below ``plots/``; errors are logged.
    """
    try:
        # Pick the first style this matplotlib version actually ships
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid', 'ggplot'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        # Fixed colour per strategy
        strategy_colors = {
            'Random Sampling': '#808080',
            'Entropy Sampling': '#1f77b4',
            'Margin Sampling': '#ff7f0e',
            'Least Confidence': '#2ca02c'
        }

        classifiers = sorted(set(r['classifier'] for r in all_results))

        # Pool size, taken from the 100%-budget runs. This is loop-invariant,
        # so it is computed once. Without any 100% run it is estimated from the
        # remaining runs instead of failing on an empty max().
        n_total_samples = max(
            (r['n_labeled'] for r in all_results if r['budget_pct'] == 1.0),
            default=None
        )
        if n_total_samples is None:
            n_total_samples = max(
                (r['n_labeled'] / r['budget_pct'] for r in all_results if r['budget_pct'] > 0),
                default=0
            )
            if all_results:
                logger.warning("Keine Ergebnisse mit 100% Budget gefunden!")

        for classifier in classifiers:
            # squeeze=False always yields a 2-D axes array, also for one panel
            fig, axes = plt.subplots(1, len(BUDGET_PERCENTAGES), figsize=(20, 4), squeeze=False)
            axes = axes[0]

            try:
                fig.suptitle(f'{classifier} - Active Learning Performance with Statistical Significance (F1-Score)',
                             fontsize=16, y=1.02)

                for budget_idx, budget_pct in enumerate(BUDGET_PERCENTAGES):
                    ax = axes[budget_idx]

                    for strategy, color in strategy_colors.items():
                        strategy_results = [r for r in all_results
                                          if r['classifier'] == classifier
                                          and r['strategy'] == strategy
                                          and r['budget_pct'] == budget_pct]

                        if not strategy_results:
                            continue

                        # Interpolate every run onto a common x grid so runs
                        # can be averaged point-wise
                        max_samples = int(budget_pct * n_total_samples)
                        x_common = np.linspace(100, max_samples, 100)
                        y_interpolated = []

                        for r in strategy_results:
                            if 'f1_scores' in r and len(r['n_labeled_list']) > 1:
                                try:
                                    y_interpolated.append(
                                        np.interp(x_common, r['n_labeled_list'], r['f1_scores'])
                                    )
                                except (ValueError, TypeError) as e:
                                    logger.warning(f"Interpolation fehlgeschlagen für {classifier}-{strategy}: {e}")

                        if not y_interpolated:
                            continue

                        y_mean = np.mean(y_interpolated, axis=0)
                        y_std = np.std(y_interpolated, axis=0)

                        # Look up significance for this classifier/strategy/budget
                        is_significant = False
                        effect_size = ""
                        if strategy != 'Random Sampling' and not stat_results.empty:
                            sig_data = stat_results[
                                (stat_results['classifier'] == classifier) &
                                (stat_results['strategy'] == strategy) &
                                (stat_results['budget_pct'] == budget_pct)
                            ]
                            if not sig_data.empty:
                                is_significant = bool(sig_data.iloc[0]['significant'])
                                effect_size = sig_data.iloc[0]['effect_size']

                        label = strategy
                        if is_significant:
                            label += f" *({effect_size})"

                        is_baseline = strategy == 'Random Sampling'

                        # Mean curve: dashed when significant, markers for the baseline
                        ax.plot(x_common, y_mean,
                               label=label,
                               color=color,
                               linewidth=2.5,
                               linestyle='--' if is_significant and not is_baseline else '-',
                               marker='o' if is_baseline else None,
                               markevery=10 if is_baseline else None,
                               markersize=4)

                        # +/- one standard deviation band
                        ax.fill_between(x_common,
                                      y_mean - y_std,
                                      y_mean + y_std,
                                      color=color,
                                      alpha=0.2)

                    ax.set_xlabel('Number of Labeled Samples', fontsize=11)
                    ax.set_ylabel('Test F1-Score (Macro)', fontsize=11)
                    # round() before int(): 0.29 * 100 is 28.999... in floating point
                    ax.set_title(f'Budget: {int(round(budget_pct * 100))}%', fontsize=12)

                    ax.grid(True, alpha=0.3)
                    ax.set_ylim([0.0, 1.0])

                    # Plain numbers on the x axis, abbreviated as "k" from 1000 on
                    ax.ticklabel_format(style='plain', axis='x')
                    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x/1000)}k' if x >= 1000 else str(int(x))))

                    # Legend only in the first panel
                    if budget_idx == 0:
                        ax.legend(loc='lower right', fontsize=9, framealpha=0.9)

                # Footnote explaining the significance marker
                fig.text(0.5, -0.05,
                        '* = statistically significant (p < 0.05 with Bonferroni correction); ' +
                        'Effect size in parentheses (negligible/small/medium/large)',
                        ha='center', fontsize=10, style='italic')

                fig.tight_layout()

                filename = f'plots/dachmaterial_{classifier.lower().replace(" ", "_")}_f1_with_significance.png'
                try:
                    # Make sure the target directory exists before saving
                    os.makedirs(os.path.dirname(filename), exist_ok=True)
                    fig.savefig(filename, dpi=300, bbox_inches='tight')
                    logger.info(f"[ok] Visualisierung mit F1-Score und Signifikanz für {classifier} erstellt: {filename}")
                except Exception as e:
                    logger.error(f"Fehler beim Speichern der Visualisierung für {classifier}: {e}")
            finally:
                # Always release the figure, also when plotting failed midway
                plt.close(fig)

    except Exception as e:
        logger.exception(f"Fehler bei plot_per_classifier_with_significance: {e}")


# In[10]:


def plot_statistical_summary(stat_results):
    """
    Draw a 2x2 overview figure of the statistical results (macro F1).

    Panels:
        1. Heatmap of the corrected p-values per classifier/strategy and budget.
        2. Heatmap of Cliff's delta per classifier/strategy and budget.
        3. Bar chart with the number of significant results per strategy.
        4. Bar chart with the mean Cliff's delta per classifier.

    Each panel is built independently; a failure in one panel is logged and
    replaced by a placeholder text without affecting the others.

    Args:
        stat_results: DataFrame with (at least) the columns ``classifier``,
            ``strategy``, ``budget_pct``, ``p_value_corrected``,
            ``cliffs_delta`` and ``significant``.

    Returns:
        None. The figure is saved to
        ``plots/dachmaterial_f1_statistical_summary.png``.
    """
    try:
        # Pick the first style this matplotlib version actually ships
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid', 'ggplot'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        if stat_results.empty:
            logger.warning("Keine statistischen Ergebnisse zum Visualisieren!")
            return

        # Only rows flagged significant feed the "significant" bar chart. There
        # is deliberately no fallback to all rows: that would count
        # non-significant comparisons under a "significant" title.
        if 'significant' in stat_results.columns:
            sig_results = stat_results[stat_results['significant']].copy()
        else:
            sig_results = pd.DataFrame()

        if sig_results.empty:
            logger.warning("Keine signifikanten Ergebnisse gefunden!")

        def draw_heatmap(ax, value_column, fill_value, title, **heatmap_kwargs):
            """Pivot ``value_column`` by (classifier, strategy) x budget and draw it on ``ax``."""
            pivot = stat_results.pivot_table(
                values=value_column,
                index=['classifier', 'strategy'],
                columns='budget_pct',
                fill_value=fill_value
            )
            # The axis is labelled "Budget (%)", so show percentages, not fractions
            pivot = pivot.rename(columns=lambda c: f"{c * 100:g}")

            if sns is not None and not pivot.empty:
                sns.heatmap(pivot, annot=True, fmt='.3f', ax=ax, **heatmap_kwargs)
            else:
                ax.text(0.5, 0.5, 'Heatmap nicht verfügbar', ha='center', va='center')

            ax.set_title(title)
            ax.set_xlabel('Budget (%)')

        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        try:
            fig.suptitle('Statistical Analysis Summary - Dachmaterial Dataset (F1-Score)', fontsize=16)

            # 1. Heatmap of corrected p-values
            try:
                draw_heatmap(axes[0, 0], 'p_value_corrected', 1.0,
                             'Corrected p-values (Wilcoxon Signed-Rank Test)',
                             cmap='RdYlGn_r', vmin=0, vmax=0.1,
                             cbar_kws={'label': 'Corrected p-value'})
            except Exception as e:
                logger.warning(f"Fehler bei p-Wert Heatmap: {e}")
                axes[0, 0].text(0.5, 0.5, 'Fehler bei der Erstellung', ha='center', va='center')

            # 2. Heatmap of effect sizes
            try:
                draw_heatmap(axes[0, 1], 'cliffs_delta', 0.0,
                             "Effect Size (Cliff's Delta)",
                             cmap='coolwarm', center=0, vmin=-1, vmax=1,
                             cbar_kws={'label': "Cliff's Delta"})
            except Exception as e:
                logger.warning(f"Fehler bei Effektstärken Heatmap: {e}")
                axes[0, 1].text(0.5, 0.5, 'Fehler bei der Erstellung', ha='center', va='center')

            # 3. Number of significant results per strategy
            try:
                ax3 = axes[1, 0]
                if not sig_results.empty and 'strategy' in sig_results.columns:
                    sig_counts = sig_results.groupby('strategy').size().sort_values(ascending=False)
                    sig_counts.plot(kind='bar', ax=ax3, color='steelblue')
                    ax3.set_title('Number of Significant F1-Score Improvements per Strategy')
                    ax3.set_xlabel('Strategy')
                    ax3.set_ylabel('Count')
                    ax3.grid(axis='y', alpha=0.3)
                else:
                    ax3.text(0.5, 0.5, 'Keine signifikanten Ergebnisse', ha='center', va='center')
            except Exception as e:
                logger.warning(f"Fehler bei Signifikanz-Barplot: {e}")
                axes[1, 0].text(0.5, 0.5, 'Fehler bei der Erstellung', ha='center', va='center')

            # 4. Mean effect size per classifier
            try:
                ax4 = axes[1, 1]
                if 'classifier' in stat_results.columns and 'cliffs_delta' in stat_results.columns:
                    avg_effect = stat_results.groupby('classifier')['cliffs_delta'].agg(['mean', 'std'])
                    # std is NaN for classifiers with a single row; draw no error bar then
                    avg_effect['mean'].plot(kind='bar', ax=ax4, yerr=avg_effect['std'].fillna(0),
                                           capsize=5, color='darkorange')
                    ax4.set_title("Average Effect Size per Classifier (F1-Score)")
                    ax4.set_xlabel('Classifier')
                    ax4.set_ylabel("Mean Cliff's Delta")
                    ax4.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
                    ax4.grid(axis='y', alpha=0.3)
                else:
                    ax4.text(0.5, 0.5, 'Daten nicht verfügbar', ha='center', va='center')
            except Exception as e:
                logger.warning(f"Fehler bei Effektstärken-Barplot: {e}")
                axes[1, 1].text(0.5, 0.5, 'Fehler bei der Erstellung', ha='center', va='center')

            fig.tight_layout()

            filename = 'plots/dachmaterial_f1_statistical_summary.png'
            try:
                # Make sure the target directory exists before saving
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                fig.savefig(filename, dpi=300, bbox_inches='tight')
                logger.info(f"[ok] Statistische F1-Score Zusammenfassung erstellt: {filename}")
            except Exception as e:
                logger.error(f"Fehler beim Speichern der statistischen Zusammenfassung: {e}")
        finally:
            # Always release the figure, also when something failed midway
            plt.close(fig)

    except Exception as e:
        logger.exception(f"Fehler bei plot_statistical_summary: {e}")


# In[11]:


def create_statistical_report(stat_results):
    """
    Build, print and save a plain-text statistical report (macro F1).

    The report lists all rows flagged significant (sorted by Cliff's delta,
    largest first), followed by a summary per strategy and per classifier.

    Args:
        stat_results: DataFrame with the columns ``classifier``, ``strategy``,
            ``budget_pct``, ``improvement_pct``, ``p_value_corrected``,
            ``cliffs_delta``, ``effect_size`` and ``significant``. Missing
            columns (e.g. an empty frame) are tolerated.

    Returns:
        str: The report text, or ``"Fehler bei der Berichterstellung"`` when
        building the report failed.
    """
    try:
        columns = stat_results.columns
        has_significance = 'significant' in columns
        separator = "-" * 100

        # Sort by effect size, largest first
        if not stat_results.empty and 'cliffs_delta' in columns:
            stat_results_sorted = stat_results.sort_values('cliffs_delta', ascending=False)
        else:
            stat_results_sorted = stat_results

        # Report header
        report = [
            "\n" + "=" * 100,
            "DETAILLIERTER STATISTISCHER BERICHT - DACHMATERIAL (F1-SCORE)",
            "=" * 100,
            "Primäre Metrik: F1-Score (Macro Average)",
            f"Signifikanzniveau: {SIGNIFICANCE_LEVEL} (mit Bonferroni-Korrektur)",
            f"Anzahl Runs pro Experiment: {N_RUNS}",
            "Statistischer Test: Wilcoxon Signed-Rank Test",
            "Effektstärkemaß: Cliff's Delta",
            "\n",
        ]

        # Significant results
        if has_significance:
            sig_results = stat_results_sorted[stat_results_sorted['significant']]
        else:
            sig_results = pd.DataFrame()

        if not sig_results.empty:
            report.append("SIGNIFIKANTE F1-SCORE VERBESSERUNGEN GEGENÜBER RANDOM SAMPLING:")
            report.append(separator)
            report.append(f"{'Klassifikator':<20} {'Strategie':<20} {'Budget':<10} {'Verbesserung':<15} "
                         f"{'p-Wert':<12} {'Effekt':<15} {'Interpretation':<15}")
            report.append(separator)

            for _, row in sig_results.iterrows():
                # Field widths match the header widths (10/15/12/15) so the
                # columns line up; round() avoids 0.29 * 100 -> 28.
                report.append(f"{row['classifier']:<20} {row['strategy']:<20} "
                             f"{int(round(row['budget_pct'] * 100)):>9}% "
                             f"{row['improvement_pct']:>14.2f}% "
                             f"{row['p_value_corrected']:>12.4f} "
                             f"{row['cliffs_delta']:>15.3f} "
                             f"{row['effect_size']:<15}")
        else:
            report.append("Keine signifikanten F1-Score Verbesserungen gefunden!")

        # Summary per strategy
        report.append("\n\nZUSAMMENFASSUNG NACH STRATEGIE (F1-SCORE):")
        report.append(separator)

        if 'strategy' in columns:
            for strategy in ['Entropy Sampling', 'Margin Sampling', 'Least Confidence']:
                strategy_data = stat_results[stat_results['strategy'] == strategy]
                if strategy_data.empty:
                    continue

                sig_count = strategy_data['significant'].sum() if has_significance else 0
                avg_improvement = strategy_data['improvement_pct'].mean() if 'improvement_pct' in columns else 0
                avg_effect = strategy_data['cliffs_delta'].mean() if 'cliffs_delta' in columns else 0

                report.append(f"\n{strategy}:")
                report.append(f"  - Signifikante F1-Score Verbesserungen: {sig_count}/{len(strategy_data)} "
                             f"({sig_count/len(strategy_data)*100:.1f}%)")
                report.append(f"  - Durchschnittliche F1-Score Verbesserung: {avg_improvement:.2f}%")
                report.append(f"  - Durchschnittliche Effektstärke: {avg_effect:.3f}")

        # Summary per classifier
        report.append("\n\nZUSAMMENFASSUNG NACH KLASSIFIKATOR (F1-SCORE):")
        report.append(separator)

        if 'classifier' in columns:
            for classifier in stat_results['classifier'].unique():
                classifier_data = stat_results[stat_results['classifier'] == classifier]
                sig_count = classifier_data['significant'].sum() if has_significance else 0

                report.append(f"\n{classifier}:")
                report.append(f"  - Signifikante F1-Score Verbesserungen: {sig_count}/{len(classifier_data)} "
                             f"({sig_count/len(classifier_data)*100:.1f}%)")

                # NOTE(review): the "best" row is the largest Cliff's delta over
                # ALL rows of the classifier, not only the significant ones;
                # confirm that this is intended.
                if sig_count > 0 and 'cliffs_delta' in columns:
                    best_strategy = classifier_data.loc[classifier_data['cliffs_delta'].idxmax()]
                    report.append(f"  - Beste Strategie: {best_strategy['strategy']} "
                                 f"bei {int(round(best_strategy['budget_pct'] * 100))}% Budget "
                                 f"(Effekt: {best_strategy['cliffs_delta']:.3f})")

        report.append("\n" + "=" * 100)

        # Print the report
        report_text = "\n".join(report)
        print(report_text)

        # Save the report
        report_filename = 'reports/dachmaterial_f1_statistical_report.txt'
        try:
            # Make sure the target directory exists before writing
            os.makedirs(os.path.dirname(report_filename), exist_ok=True)
            with open(report_filename, 'w', encoding='utf-8') as f:
                f.write(report_text)
            logger.info(f"[ok] Statistischer F1-Score Bericht gespeichert: {report_filename}")
        except Exception as e:
            logger.error(f"Fehler beim Speichern des Berichts: {e}")

        return report_text

    except Exception as e:
        logger.error(f"Fehler bei create_statistical_report: {e}")
        return "Fehler bei der Berichterstellung"


# In[12]:


def plot_final_comparison(all_results):
    """
    Compare all classifier / query-strategy combinations at 100% budget.

    Aggregates the final macro F1 and accuracy (mean and standard deviation
    over runs) per combination, draws both as heatmaps side by side, saves
    the figure and prints the best combination plus the top five by F1.

    Args:
        all_results: List of run records as produced by
            ``run_active_learning_experiment`` (uses the keys ``classifier``,
            ``strategy``, ``budget_pct``, ``accuracy`` and ``f1_score``).

    Returns:
        None. The figure is saved to
        ``plots/dachmaterial_f1_final_comparison.png``.
    """
    try:
        # Pick the first style this matplotlib version actually ships
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid', 'ggplot'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        # Only runs with the full budget
        final_results = [r for r in all_results if r['budget_pct'] == 1.0]

        if not final_results:
            logger.warning("Keine Ergebnisse mit 100% Budget gefunden!")
            return

        # Aggregate mean/std per classifier and strategy
        summary_data = []
        classifiers = sorted(set(r['classifier'] for r in final_results))
        strategies = ['Random Sampling', 'Entropy Sampling', 'Margin Sampling', 'Least Confidence']

        for classifier in classifiers:
            for strategy in strategies:
                runs = [r for r in final_results
                        if r['classifier'] == classifier and r['strategy'] == strategy]
                if not runs:
                    continue

                accuracies = [r['accuracy'] for r in runs]
                f1_values = [r['f1_score'] for r in runs]

                summary_data.append({
                    'Classifier': classifier,
                    'Strategy': strategy,
                    'Accuracy': np.mean(accuracies),
                    'Accuracy_std': np.std(accuracies),
                    'F1-Score': np.mean(f1_values),
                    'F1-Score_std': np.std(f1_values)
                })

        if not summary_data:
            logger.warning("Keine Daten für finale Vergleichsmatrix!")
            return

        df = pd.DataFrame(summary_data)

        # Classifier x strategy matrices for the two heatmaps
        pivot_f1 = df.pivot(index='Classifier', columns='Strategy', values='F1-Score')
        pivot_acc = df.pivot(index='Classifier', columns='Strategy', values='Accuracy')

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        try:
            # Heatmap of the F1-score (main focus)
            if sns is not None and not pivot_f1.empty:
                sns.heatmap(pivot_f1,
                            annot=True,
                            fmt='.4f',
                            cmap='RdYlGn',
                            vmin=0.0,
                            vmax=1.0,
                            cbar_kws={'label': 'Test F1-Score (Macro)'},
                            ax=ax1)
            else:
                ax1.text(0.5, 0.5, 'Heatmap nicht verfügbar', ha='center', va='center')

            ax1.set_title('Test F1-Score at 100% Budget', fontsize=14)
            ax1.set_xlabel('Query Strategy', fontsize=12)
            ax1.set_ylabel('Classifier', fontsize=12)

            # Heatmap of the accuracy (for comparison)
            if sns is not None and not pivot_acc.empty:
                sns.heatmap(pivot_acc,
                            annot=True,
                            fmt='.4f',
                            cmap='RdYlGn',
                            vmin=0.0,
                            vmax=1.0,
                            cbar_kws={'label': 'Test Accuracy'},
                            ax=ax2)
            else:
                ax2.text(0.5, 0.5, 'Heatmap nicht verfügbar', ha='center', va='center')

            ax2.set_title('Test Accuracy at 100% Budget', fontsize=14)
            ax2.set_xlabel('Query Strategy', fontsize=12)
            ax2.set_ylabel('Classifier', fontsize=12)

            fig.suptitle('Final Performance Comparison - Full Dachmaterial Dataset (100% Budget)', fontsize=16)
            fig.tight_layout()

            filename = 'plots/dachmaterial_f1_final_comparison.png'
            try:
                # Make sure the target directory exists before saving
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                fig.savefig(filename, dpi=300, bbox_inches='tight')
                logger.info(f"[ok] Finale F1-Score Vergleichsvisualisierung erstellt: {filename}")
            except Exception as e:
                logger.error(f"Fehler beim Speichern der finalen Vergleichsmatrix: {e}")
        finally:
            # Always release the figure, also when plotting failed midway
            plt.close(fig)

        # Best combination by F1-score. idxmax() returns an index LABEL, so it
        # must be resolved with .loc rather than the positional .iloc.
        best_result = df.loc[df['F1-Score'].idxmax()]

        print("\n" + "="*60)
        print("BESTE KOMBINATION BEI 100% BUDGET (F1-SCORE):")
        print("="*60)
        print(f"Klassifikator: {best_result['Classifier']}")
        print(f"Query-Strategie: {best_result['Strategy']}")
        print(f"Test F1-Score: {best_result['F1-Score']:.4f} (±{best_result['F1-Score_std']:.4f})")
        print(f"Test Accuracy: {best_result['Accuracy']:.4f} (±{best_result['Accuracy_std']:.4f})")
        print("="*60)

        # Top five combinations by F1-score
        print("\nTOP 5 KOMBINATIONEN (F1-SCORE):")
        print("-"*60)
        top5 = df.nlargest(5, 'F1-Score')[['Classifier', 'Strategy', 'F1-Score', 'Accuracy']]
        for rank, (_, row) in enumerate(top5.iterrows(), start=1):
            print(f"{rank}. {row['Classifier']} + {row['Strategy']}: "
                  f"F1={row['F1-Score']:.4f}, Acc={row['Accuracy']:.4f}")

    except Exception as e:
        logger.exception(f"Fehler bei plot_final_comparison: {e}")


# In[13]:


def plot_improvement_analysis(all_results):
    """
    Plot the relative macro-F1 gain of each active learning strategy over Random Sampling.

    For every classifier, every non-random query strategy and every budget level in
    ``BUDGET_PERCENTAGES``, the mean ``f1_score`` of the strategy is compared with the
    mean ``f1_score`` of 'Random Sampling' at the same budget. The percentage
    differences are drawn as a grouped bar chart (one group per budget level) and
    saved to ``plots/dachmaterial_f1_improvement_analysis.png``.

    Args:
        all_results: Iterable of result dicts providing at least the keys
            'classifier', 'strategy', 'budget_pct' and 'f1_score'.

    Returns:
        None. Errors are logged (with traceback) instead of being raised, so a failing
        plot does not abort the caller.
    """
    fig = None
    try:
        # Pick the first available style; 'ggplot' is the last resort.
        for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
            try:
                plt.style.use(style_name)
                break
            except Exception:
                continue
        else:
            plt.style.use('ggplot')

        classifiers = sorted(set(r['classifier'] for r in all_results))
        strategies = ['Entropy Sampling', 'Margin Sampling', 'Least Confidence']

        # One pass over the results instead of re-filtering the full list for every
        # (classifier, strategy, budget) combination.
        f1_by_cell = {}
        for r in all_results:
            cell = (r['classifier'], r['strategy'], r['budget_pct'])
            f1_by_cell.setdefault(cell, []).append(r['f1_score'])

        # Percentage improvement over the Random Sampling baseline per cell.
        improvement_by_cell = {}
        for classifier in classifiers:
            for strategy in strategies:
                for budget_pct in BUDGET_PERCENTAGES:
                    random_scores = f1_by_cell.get((classifier, 'Random Sampling', budget_pct))
                    strategy_scores = f1_by_cell.get((classifier, strategy, budget_pct))
                    if not random_scores or not strategy_scores:
                        continue

                    random_f1 = np.mean(random_scores)
                    strategy_f1 = np.mean(strategy_scores)

                    # A zero baseline makes the ratio undefined; report 0 in that case.
                    improvement_by_cell[(classifier, strategy, budget_pct)] = (
                        ((strategy_f1 - random_f1) / random_f1) * 100 if random_f1 > 0 else 0
                    )

        if not improvement_by_cell:
            logger.warning("Keine Verbesserungsdaten gefunden!")
            return

        fig, ax = plt.subplots(figsize=(12, 8))

        # Grouped bar plot: one group per budget level, one bar per classifier/strategy.
        x = np.arange(len(BUDGET_PERCENTAGES))
        n_series = len(classifiers) * len(strategies)
        # All series of a group share 80% of the unit spacing between ticks. A fixed
        # bar width would make groups spill into each other once there are many series.
        width = 0.8 / n_series

        # One distinct colour per classifier/strategy combination
        colors = plt.cm.tab20(np.linspace(0, 1, n_series))
        color_idx = 0

        for i, classifier in enumerate(classifiers):
            for j, strategy in enumerate(strategies):
                has_data = any((classifier, strategy, b) in improvement_by_cell
                               for b in BUDGET_PERCENTAGES)
                if not has_data:
                    continue

                # Budgets without a measurement are drawn as a zero-height bar.
                values = [improvement_by_cell.get((classifier, strategy, b), 0)
                          for b in BUDGET_PERCENTAGES]

                # Centre the block of bars on the tick position.
                slot = i * len(strategies) + j
                offset = (slot - (n_series - 1) / 2) * width
                ax.bar(x + offset, values, width,
                       label=f'{classifier} - {strategy}',
                       color=colors[color_idx])
                color_idx += 1

        # Axis labelling
        ax.set_xlabel('Budget (%)', fontsize=12)
        ax.set_ylabel('F1-Score Improvement over Random Sampling (%)', fontsize=12)
        ax.set_title('Active Learning F1-Score Improvement Analysis - Dachmaterial Dataset', fontsize=14)
        ax.set_xticks(x)
        ax.set_xticklabels([f'{int(b*100)}%' for b in BUDGET_PERCENTAGES])

        # Zero line separates gains from losses
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

        # Legend outside the axes so it does not cover bars
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)

        ax.grid(True, alpha=0.3, axis='y')

        plt.tight_layout()

        # Save; a failing write is logged but does not count as a plotting error.
        filename = 'plots/dachmaterial_f1_improvement_analysis.png'
        try:
            plt.savefig(filename, dpi=300, bbox_inches='tight')
            logger.info(f"[ok] F1-Score Improvement-Analyse erstellt: {filename}")
        except Exception as e:
            logger.error(f"Fehler beim Speichern der F1-Score Improvement-Analyse: {e}")

    except Exception as e:
        logger.error(f"Fehler bei plot_improvement_analysis: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Release the figure on the error path as well.
        if fig is not None:
            plt.close(fig)


# ## 10. Label-Einsparungs-Analyse mit F1-Score

# In[14]:


def calculate_label_savings(all_results, target_performance_percentages=[0.90, 0.95, 0.98]):
    """
    Compute how many labels each query strategy needs to reach a share of the baseline F1.

    The baseline of a classifier is the mean ``f1_score`` of its 'Random Sampling'
    results at ``budget_pct == 1.0``. For every target share, the learning curve of
    each result (``n_labeled_list`` / ``f1_scores``) is scanned for the first point
    whose F1 is at least ``baseline * share``; the label count at that point is the
    number of labels needed. Curves that never reach the target count as needing the
    full dataset.

    NOTE(review): every result dict that carries a curve is used, whatever its
    ``budget_pct``. If results recorded at lower budgets hold truncated curves, they
    will be counted as "target not reached" - confirm against the experiment runner.

    Args:
        all_results: List of result dicts with the keys 'classifier', 'strategy',
            'budget_pct', 'n_labeled' and 'f1_score', and optionally the learning
            curve keys 'n_labeled_list' and 'f1_scores'.
        target_performance_percentages: Shares (0..1) of the Random Sampling 100%
            baseline F1 that should be reached. The argument is only iterated,
            never mutated.

    Returns:
        DataFrame with one row per (classifier, target, strategy) and the columns
        'classifier', 'strategy', 'target_performance', 'target_f1_score',
        'avg_labels_needed', 'std_labels_needed', 'savings_pct',
        'relative_savings_pct' and 'random_100_f1'. The frame is empty (but still has
        these columns) when nothing could be computed.
    """
    columns = ['classifier', 'strategy', 'target_performance', 'target_f1_score',
               'avg_labels_needed', 'std_labels_needed', 'savings_pct',
               'relative_savings_pct', 'random_100_f1']
    # Random Sampling must stay first: its result is the reference for the others.
    strategy_order = ['Random Sampling', 'Entropy Sampling', 'Margin Sampling', 'Least Confidence']

    # Total number of samples = largest labelled set seen at 100% budget.
    full_budget_sizes = [r['n_labeled'] for r in all_results if r['budget_pct'] == 1.0]
    if not full_budget_sizes:
        # Without a 100% run there is neither a dataset size nor a baseline.
        return pd.DataFrame(columns=columns)
    n_total_samples = max(full_budget_sizes)

    savings_results = []

    for classifier in sorted(set(r['classifier'] for r in all_results)):
        classifier_results = [r for r in all_results if r['classifier'] == classifier]

        # Reference: mean F1 of Random Sampling at 100% budget
        random_100_scores = [r['f1_score'] for r in classifier_results
                             if r['strategy'] == 'Random Sampling' and r['budget_pct'] == 1.0]
        if not random_100_scores:
            continue
        random_100_f1 = np.mean(random_100_scores)

        # The learning curves do not depend on the target, so collect them once.
        curves_by_strategy = {}
        for strategy in strategy_order:
            curves = [(r['n_labeled_list'], r['f1_scores'])
                      for r in classifier_results
                      if r['strategy'] == strategy
                      and 'n_labeled_list' in r and 'f1_scores' in r]
            if curves:
                curves_by_strategy[strategy] = curves

        for target_pct in target_performance_percentages:
            target_f1_score = random_100_f1 * target_pct
            # round() instead of int(): 0.29 * 100 is 28.999... and would truncate to 28.
            target_label = int(round(target_pct * 100))

            # Labels Random Sampling needs for this target; the full dataset is
            # assumed when Random Sampling has no curve.
            random_labels = n_total_samples

            for strategy in strategy_order:
                curves = curves_by_strategy.get(strategy)
                if not curves:
                    continue

                # First curve point at or above the target; no interpolation between
                # points. zip() tolerates curves whose two lists differ in length.
                labels_needed = [
                    next((n for n, f1 in zip(n_labeled_list, f1_scores)
                          if f1 >= target_f1_score), n_total_samples)
                    for n_labeled_list, f1_scores in curves
                ]

                avg_labels_needed = np.mean(labels_needed)
                std_labels_needed = np.std(labels_needed)

                # Saving relative to labelling the full dataset
                savings_pct = ((n_total_samples - avg_labels_needed) / n_total_samples) * 100

                # Saving relative to Random Sampling at the same target
                if strategy == 'Random Sampling':
                    random_labels = avg_labels_needed
                    relative_savings_pct = 0
                else:
                    relative_savings_pct = (
                        ((random_labels - avg_labels_needed) / random_labels) * 100
                        if random_labels > 0 else 0
                    )

                savings_results.append({
                    'classifier': classifier,
                    'strategy': strategy,
                    'target_performance': target_label,
                    'target_f1_score': target_f1_score,
                    'avg_labels_needed': avg_labels_needed,
                    'std_labels_needed': std_labels_needed,
                    'savings_pct': savings_pct,
                    'relative_savings_pct': relative_savings_pct,
                    'random_100_f1': random_100_f1
                })

    return pd.DataFrame(savings_results, columns=columns)


# In[15]:


def plot_label_savings(savings_df, dataset_name="Dachmaterial"):
    """
    Visualise the F1-based label savings as a 2x2 figure and save it as PNG.

    Panels:
        1. Labels needed per strategy, one bar series per (classifier, target) pair.
        2. Heatmap of the savings relative to Random Sampling (only if seaborn is available).
        3. Labels needed per classifier and strategy at the 95% target.
        4. Summary table for the 95% target.

    Args:
        savings_df: DataFrame with the columns 'classifier', 'strategy',
            'target_performance', 'avg_labels_needed', 'std_labels_needed',
            'savings_pct' and 'relative_savings_pct'.
        dataset_name: Used in the figure title and (lower-cased) in the file name.

    Returns:
        The unchanged ``savings_df``. Nothing is plotted when it is empty.

    Raises:
        Whatever matplotlib raises while drawing or saving; the figure is closed first.
    """
    if savings_df.empty:
        logger.warning("Keine Label-Einsparungsdaten vorhanden - Visualisierung übersprungen")
        return savings_df

    try:
        plt.style.use('seaborn-v0_8-whitegrid')
    except Exception:
        plt.style.use('ggplot')

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    try:
        fig.suptitle(f'{dataset_name} - Label Savings Analysis (F1-Score Based)', fontsize=16)

        # 1. Labels needed for the different performance levels
        ax1 = axes[0, 0]

        # One fixed strategy axis for all series, so tick labels always match the bars.
        panel_strategies = savings_df['strategy'].unique().tolist()

        groups = []
        for classifier in savings_df['classifier'].unique():
            for target in savings_df['target_performance'].unique():
                data = savings_df[(savings_df['classifier'] == classifier) &
                                  (savings_df['target_performance'] == target)]
                if not data.empty:
                    groups.append((classifier, target, data))

        x_strategies = np.arange(len(panel_strategies))
        # Every (classifier, target) series gets its own slot inside 80% of the tick
        # spacing; an offset based on the target alone would stack classifiers on top
        # of each other.
        group_width = 0.8 / max(len(groups), 1)

        for slot, (classifier, target, data) in enumerate(groups):
            by_strategy = data.drop_duplicates('strategy').set_index('strategy')
            heights = [by_strategy['avg_labels_needed'].get(s, 0) for s in panel_strategies]
            errors = [by_strategy['std_labels_needed'].get(s, 0) for s in panel_strategies]
            offset = (slot - (len(groups) - 1) / 2) * group_width

            ax1.bar(x_strategies + offset, heights, group_width,
                    yerr=errors, capsize=5,
                    label=f'{classifier} - {target}% of baseline F1',
                    alpha=0.7)

        ax1.set_xlabel('Strategy')
        ax1.set_ylabel('Labels Needed')
        ax1.set_title('Labels Required to Reach Target F1-Score Performance')
        ax1.set_xticks(x_strategies)
        ax1.set_xticklabels(panel_strategies, rotation=45, ha='right')
        if groups:
            ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
        ax1.grid(axis='y', alpha=0.3)

        # 2. Savings relative to Random Sampling
        ax2 = axes[0, 1]

        active_rows = savings_df[savings_df['strategy'] != 'Random Sampling']
        if not active_rows.empty and sns is not None:
            # Pivot for the heatmap: rows = classifier/strategy, columns = target
            pivot_savings = active_rows.pivot_table(
                values='relative_savings_pct',
                index=['classifier', 'strategy'],
                columns='target_performance',
                fill_value=0
            )

            if not pivot_savings.empty:
                sns.heatmap(pivot_savings,
                            annot=True,
                            fmt='.1f',
                            cmap='RdYlGn',
                            center=0,
                            cbar_kws={'label': 'Label Savings (%)'},
                            ax=ax2)
                ax2.set_title('Label Savings vs Random Sampling (%) - F1-Score Based')
                ax2.set_xlabel('Target F1-Score Performance (% of baseline)')

        # 3. Absolute label counts per classifier at the 95% target
        ax3 = axes[1, 0]

        data_95 = savings_df[savings_df['target_performance'] == 95]

        classifiers = data_95['classifier'].unique()
        strategies = ['Random Sampling', 'Entropy Sampling', 'Margin Sampling', 'Least Confidence']

        x = np.arange(len(classifiers))
        width = 0.2

        if len(classifiers) > 0:
            for i, strategy in enumerate(strategies):
                values = []
                errors = []
                for classifier in classifiers:
                    row = data_95[(data_95['classifier'] == classifier) &
                                  (data_95['strategy'] == strategy)]
                    if not row.empty:
                        values.append(row['avg_labels_needed'].values[0])
                        errors.append(row['std_labels_needed'].values[0])
                    else:
                        values.append(0)
                        errors.append(0)

                ax3.bar(x + i*width - 1.5*width, values, width,
                        yerr=errors, capsize=5,
                        label=strategy, alpha=0.8)

        ax3.set_xlabel('Classifier')
        ax3.set_ylabel('Labels Needed')
        ax3.set_title('Labels Needed to Reach 95% of Baseline F1-Score Performance')
        ax3.set_xticks(x)
        ax3.set_xticklabels(classifiers, rotation=45, ha='right')
        ax3.grid(axis='y', alpha=0.3)

        # Reference line at 100% of the data. 'savings_pct' is the saving relative to
        # the full dataset, so the dataset size can be recovered as
        # avg_labels_needed / (1 - savings_pct / 100) instead of being guessed.
        retained_share = 1.0 - savings_df['savings_pct'] / 100.0
        recoverable = retained_share > 0
        if recoverable.any():
            n_total_samples = float(
                (savings_df.loc[recoverable, 'avg_labels_needed'] / retained_share[recoverable]).median()
            )
        else:
            # Nothing to derive from: fall back to a rough estimate.
            n_total_samples = savings_df['avg_labels_needed'].max() * 1.1
        ax3.axhline(y=n_total_samples, color='red', linestyle='--', alpha=0.5, label='Full Dataset')

        # Build the legend after the reference line so that it is listed as well.
        ax3.legend()

        # 4. Summary table
        ax4 = axes[1, 1]
        ax4.axis('tight')
        ax4.axis('off')

        # Summary for the 95% target: labels needed and share saved per strategy
        summary_data = []
        for classifier in classifiers:
            row_data = [classifier]
            for strategy in strategies:
                data = data_95[(data_95['classifier'] == classifier) &
                               (data_95['strategy'] == strategy)]
                if not data.empty:
                    labels = data['avg_labels_needed'].values[0]
                    savings = data['savings_pct'].values[0]
                    row_data.append(f'{int(labels):,}\n({savings:.1f}% saved)')
                else:
                    row_data.append('N/A')
            summary_data.append(row_data)

        # A table cannot be built from zero rows, so skip it when no 95% data exists.
        if summary_data:
            table = ax4.table(cellText=summary_data,
                              colLabels=['Classifier'] + strategies,
                              cellLoc='center',
                              loc='center')

            table.auto_set_font_size(False)
            table.set_fontsize(9)
            table.scale(1.2, 2)

            # Style the header row
            for i in range(len(strategies) + 1):
                table[(0, i)].set_facecolor('#40466e')
                table[(0, i)].set_text_props(weight='bold', color='white')

        ax4.set_title('Summary: Labels Needed for 95% F1-Score Performance', pad=20)

        fig.tight_layout()

        # Save
        filename = f'plots/{dataset_name.lower()}_f1_label_savings_analysis.png'
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        logger.info(f"[ok] F1-Score basierte Label-Einsparungs-Analyse erstellt: {filename}")
    finally:
        # Close the figure even if drawing or saving failed.
        plt.close(fig)

    return savings_df


# In[16]:


def create_label_savings_report(savings_df, dataset_name="Dachmaterial"):
    """
    Build, print and save a plain-text report about the F1-based label savings.

    The report lists, for every target level and classifier, the labels each strategy
    needs (sorted ascending), followed by the best strategy per classifier and the
    mean relative savings per strategy, both at the 95% target. It is written to
    ``reports/<dataset_name lower-cased>_f1_label_savings_report.txt``.

    Args:
        savings_df: DataFrame with the columns 'classifier', 'strategy',
            'target_performance', 'target_f1_score', 'avg_labels_needed',
            'std_labels_needed', 'savings_pct', 'relative_savings_pct' and
            'random_100_f1'. An empty frame yields a report stating that no data
            is available.
        dataset_name: Used in the report heading and in the file name.

    Returns:
        The report text.
    """
    report = []
    report.append("\n" + "="*80)
    report.append(f"LABEL-EINSPARUNGS-BERICHT - {dataset_name} (F1-SCORE BASIERT)")
    report.append("="*80)

    if savings_df.empty:
        # An empty frame may not even have columns; report that instead of failing.
        report.append("\nKeine Label-Einsparungsdaten vorhanden.")
    else:
        # One section per target performance level
        for target_perf in sorted(savings_df['target_performance'].unique()):
            report.append(f"\nZIEL: {target_perf}% der Baseline F1-Score Performance")
            report.append("-"*60)

            target_data = savings_df[savings_df['target_performance'] == target_perf]

            # Group by classifier
            for classifier in sorted(target_data['classifier'].unique()):
                classifier_data = target_data[target_data['classifier'] == classifier]
                baseline_f1 = classifier_data['random_100_f1'].iloc[0]
                target_f1 = classifier_data['target_f1_score'].iloc[0]

                report.append(f"\n{classifier}:")
                report.append(f"  Baseline F1-Score (Random 100%): {baseline_f1:.4f}")
                report.append(f"  Ziel F1-Score: {target_f1:.4f}")
                report.append(f"  Labels benötigt:")

                # Cheapest strategy first
                sorted_data = classifier_data.sort_values('avg_labels_needed')

                for _, row in sorted_data.iterrows():
                    strategy = row['strategy']
                    labels = row['avg_labels_needed']
                    std = row['std_labels_needed']
                    savings = row['savings_pct']
                    rel_savings = row['relative_savings_pct']

                    report.append(f"    - {strategy:<20}: {int(labels):>6,} ± {int(std):>4} "
                                  f"({savings:>5.1f}% gespart)")

                    # Only strategies that beat Random Sampling get the extra line.
                    if strategy != 'Random Sampling' and rel_savings > 0:
                        report.append(f"      -> {rel_savings:.1f}% weniger Labels als Random Sampling")

        # Best strategy per classifier
        report.append("\n\nBESTE STRATEGIEN (bei 95% F1-Score Performance):")
        report.append("-"*60)

        data_95 = savings_df[savings_df['target_performance'] == 95]

        for classifier in sorted(data_95['classifier'].unique()):
            classifier_data = data_95[data_95['classifier'] == classifier]
            best_row = classifier_data.loc[classifier_data['avg_labels_needed'].idxmin()]

            report.append(f"{classifier}: {best_row['strategy']} "
                          f"(nur {int(best_row['avg_labels_needed']):,} Labels = "
                          f"{best_row['savings_pct']:.1f}% Einsparung)")

        # Mean relative savings per strategy (computed on the 95% rows)
        report.append("\n\nDURCHSCHNITTLICHE EINSPARUNGEN ÜBER ALLE KLASSIFIKATOREN:")
        report.append("-"*60)

        avg_savings = data_95.groupby('strategy')['relative_savings_pct'].mean()
        for strategy, savings in avg_savings.items():
            if strategy != 'Random Sampling':
                report.append(f"{strategy}: {savings:.1f}% weniger Labels als Random Sampling")

    report.append("\n" + "="*80)

    # Print and save
    report_text = "\n".join(report)
    print(report_text)

    filename = f'reports/{dataset_name.lower()}_f1_label_savings_report.txt'

    # Make sure the target directory exists so the finished report is not lost.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(report_text)

    logger.info(f"[ok] F1-Score basierter Label-Einsparungs-Bericht gespeichert: {filename}")

    return report_text


# ## 11. Hauptprogramm

# In[17]:


def main():
    """
    Entry point of the active learning experiment on the Dachmaterial data (F1-score version).

    Steps: print version and configuration info, load the data, run every
    classifier/strategy combination, then perform the statistical analysis, the
    label-savings analysis, the visualisations and the CSV/Excel export. Each
    post-processing step is wrapped in its own try/except, so one failing step is
    logged and the remaining steps still run.

    Returns:
        0 when the run reaches the final summary; 1 when loading the data fails,
        when no experiment produced results, or when the results DataFrame cannot
        be built.
    """
    import traceback

    print("="*80)
    print("ACTIVE LEARNING AUF DACHMATERIAL - F1-SCORE VERSION")
    print("="*80)

    # System info
    print(f"Python Version: {sys.version.split()[0]}")
    print(f"PyTorch Version: {torch.__version__}")
    print(f"NumPy Version: {np.__version__}")
    print(f"Pandas Version: {pd.__version__}")
    print(f"Scikit-learn Version: {sklearn.__version__}")
    print(f"SciPy Version: {scipy.__version__}")

    # Device info (only reported here)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if device.type == 'cuda':
        print(f"Verwende GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("Verwende CPU (keine GPU gefunden)")

    print(f"\nExperiment-Konfiguration:")
    print(f"- Hauptmetrik: F1-Score (Macro Average)")
    print(f"- Anzahl Runs: {N_RUNS}")
    print(f"- Budget-Stufen: {[f'{int(b*100)}%' for b in BUDGET_PERCENTAGES]}")
    print(f"- Batch-Größe: {BATCH_SIZE}")
    print(f"- Signifikanzniveau: {SIGNIFICANCE_LEVEL}")
    print(f"- Mindest-Samples pro Klasse: {MIN_SAMPLES_PER_CLASS}")
    print("="*80)

    # Load data
    try:
        X_train, y_train, X_test, y_test, label_encoder, preprocessor = load_dachmaterial_data()

        # Values the experiment runner needs to build the models
        input_dim = X_train.shape[1]
        n_classes = len(np.unique(y_train))

    except Exception as e:
        logger.error(f"Kritischer Fehler beim Laden der Daten: {e}")
        return 1

    # Classifiers and query strategies
    classifiers = ['Neural Network', 'Naive Bayes', 'Random Forest', 'Logistic Regression', 'SVM']
    strategies = [
        ('Random Sampling', random_sampling),
        ('Entropy Sampling', entropy_sampling),
        ('Margin Sampling', margin_sampling),
        ('Least Confidence', least_confidence_sampling)
    ]

    # Run the experiments
    all_results = []
    total_start_time = time.time()

    total_experiments = len(classifiers) * len(strategies)
    current_experiment = 0

    for classifier_name in classifiers:
        for strategy_name, strategy_func in strategies:
            current_experiment += 1
            print(f"\n{'='*60}")
            print(f"Experiment {current_experiment}/{total_experiments}: {classifier_name} + {strategy_name}")
            print(f"{'='*60}")

            experiment_start_time = time.time()

            # A failing combination is logged and skipped; the others still run.
            try:
                results = run_active_learning_experiment(
                    X_train, y_train, X_test, y_test,
                    classifier_name, strategy_name, strategy_func,
                    BUDGET_PERCENTAGES, BATCH_SIZE,
                    input_dim=input_dim, n_classes=n_classes
                )
                all_results.extend(results)

                experiment_time = time.time() - experiment_start_time
                print(f"\n[ok] {classifier_name} + {strategy_name} abgeschlossen in {experiment_time/60:.1f} Minuten")

            except Exception as e:
                logger.error(f"Kritischer Fehler bei {classifier_name} + {strategy_name}: {e}")
                traceback.print_exc()

    # Total runtime
    total_time = time.time() - total_start_time
    print(f"\n[ok] Alle Experimente abgeschlossen in {total_time/60:.1f} Minuten")

    # Nothing to analyse if every experiment failed
    if not all_results:
        logger.error("Keine Experimenteergebnisse vorhanden!")
        return 1

    # Flatten the scalar fields of the results into a DataFrame for the statistics
    try:
        results_df = pd.DataFrame([{
            'classifier': r['classifier'],
            'strategy': r['strategy'],
            'budget_pct': r['budget_pct'],
            'run': r['run'],
            'n_labeled': r['n_labeled'],
            'accuracy': r['accuracy'],
            'f1_score': r['f1_score'],
            'avg_query_time': r.get('avg_query_time', 0),
            'avg_train_time': r.get('avg_train_time', 0)
        } for r in all_results])
    except Exception as e:
        logger.error(f"Fehler beim Erstellen des Results DataFrame: {e}")
        return 1

    # Statistical analysis on the F1 score
    print("\n" + "="*60)
    print("Führe statistische Analyse durch (F1-Score)...")
    print("="*60)

    try:
        stat_results = perform_statistical_analysis(results_df, metric='f1_score')
    except Exception as e:
        logger.error(f"Fehler bei der statistischen Analyse: {e}")
        stat_results = pd.DataFrame()

    # Statistical report (called for its side effects; the return value is not needed)
    try:
        create_statistical_report(stat_results)
    except Exception as e:
        logger.error(f"Fehler beim Erstellen des statistischen Berichts: {e}")

    # Label-savings analysis on the F1 score
    print("\n" + "="*60)
    print("Berechne Label-Einsparungen (F1-Score basiert)...")
    print("="*60)

    # Defined up front so the export below can test it without inspecting locals().
    savings_df = pd.DataFrame()
    label_savings_ok = False

    try:
        # Label savings for several performance levels
        savings_df = calculate_label_savings(all_results, target_performance_percentages=[0.90, 0.95, 0.98])

        # Visualise the label savings
        plot_label_savings(savings_df, dataset_name="Dachmaterial")

        # Detailed text report
        create_label_savings_report(savings_df, dataset_name="Dachmaterial")

        # Save as CSV
        savings_csv = 'results/dachmaterial_f1_label_savings.csv'
        savings_df.to_csv(savings_csv, index=False)
        print(f"[ok] F1-Score basierte Label-Einsparungen gespeichert: {savings_csv}")

        label_savings_ok = not savings_df.empty

    except Exception as e:
        logger.error(f"Fehler bei der Label-Einsparungs-Analyse: {e}")
        traceback.print_exc()

    # Visualise the results
    print("\n" + "="*60)
    print("Erstelle Visualisierungen (F1-Score basiert)...")
    print("="*60)

    try:
        # Per classifier, with significance markers
        plot_per_classifier_with_significance(all_results, stat_results)

        # Statistical summary
        plot_statistical_summary(stat_results)

        # Final comparison matrix
        plot_final_comparison(all_results)

        # Improvement analysis
        plot_improvement_analysis(all_results)

        print("[ok] Alle F1-Score basierten Visualisierungen erstellt")
    except Exception as e:
        logger.error(f"Fehler bei der Visualisierung: {e}")
        traceback.print_exc()

    # Save the results
    try:
        # Detailed results
        csv_filename = 'results/dachmaterial_f1_active_learning_results.csv'
        results_df.to_csv(csv_filename, index=False)
        print(f"\n[ok] F1-Score Ergebnisse gespeichert in: {csv_filename}")

        # Statistical results
        if not stat_results.empty:
            stat_csv_filename = 'results/dachmaterial_f1_statistical_analysis.csv'
            stat_results.to_csv(stat_csv_filename, index=False)
            print(f"[ok] Statistische F1-Score Analyse gespeichert in: {stat_csv_filename}")

        # Summary workbook (only if the Excel backend is available)
        if EXCEL_AVAILABLE:
            excel_filename = 'results/dachmaterial_f1_active_learning_summary.xlsx'
            try:
                with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
                    # Raw results
                    results_df.to_excel(writer, sheet_name='Raw Results', index=False)

                    # Statistical analysis
                    if not stat_results.empty:
                        stat_results.to_excel(writer, sheet_name='Statistical Analysis', index=False)

                    # Summary by classifier and strategy
                    summary = results_df.groupby(['classifier', 'strategy', 'budget_pct'])[['accuracy', 'f1_score']].agg(['mean', 'std'])
                    summary.to_excel(writer, sheet_name='Summary Statistics')

                    # Best combinations at 100% budget
                    final_results = results_df[results_df['budget_pct'] == 1.0]
                    if not final_results.empty:
                        best_combinations = final_results.groupby(['classifier', 'strategy'])[['accuracy', 'f1_score']].mean()
                        best_combinations = best_combinations.sort_values('f1_score', ascending=False)
                        best_combinations.to_excel(writer, sheet_name='Best Combinations')

                    # Significant improvements
                    if not stat_results.empty and 'significant' in stat_results.columns:
                        sig_improvements = stat_results[stat_results['significant']].sort_values('cliffs_delta', ascending=False)
                        if not sig_improvements.empty:
                            sig_improvements.to_excel(writer, sheet_name='Significant Improvements', index=False)

                    # Label savings
                    if not savings_df.empty:
                        savings_df.to_excel(writer, sheet_name='Label Savings', index=False)

                print(f"[ok] F1-Score Zusammenfassung gespeichert in: {excel_filename}")
            except Exception as e:
                logger.error(f"Fehler beim Excel-Export: {e}")

    except Exception as e:
        logger.error(f"Fehler beim Speichern der Ergebnisse: {e}")

    # Final summary
    print("\n" + "="*80)
    print("EXPERIMENT ERFOLGREICH ABGESCHLOSSEN (F1-SCORE VERSION)")
    print(f"Hauptmetrik: F1-Score (Macro Average)")
    print(f"Gesamtanzahl Experimente: {len(all_results)}")
    print(f"Datensatzgröße: {len(y_train):,} Trainingssamples")
    print(f"Klassifikatoren: {len(classifiers)}")
    print(f"Query-Strategien: {len(strategies)}")
    print(f"Budget-Stufen: {len(BUDGET_PERCENTAGES)}")
    print(f"Wiederholungen pro Experiment: {N_RUNS}")

    # Statistical summary
    if not stat_results.empty:
        total_comparisons = len(stat_results)
        significant_count = stat_results['significant'].sum() if 'significant' in stat_results.columns else 0
        print(f"\nStatistische Analyse (F1-Score):")
        print(f"- Anzahl Vergleiche: {total_comparisons}")
        print(f"- Signifikante F1-Score Verbesserungen: {significant_count} ({significant_count/total_comparisons*100:.1f}%)")
        print(f"- Verwendeter Test: Wilcoxon Signed-Rank Test")
        print(f"- Effektstärkemaß: Cliff's Delta")
        print(f"- Multiple Vergleiche: Bonferroni-Korrektur")

    # Point to the label-savings artefacts only if that step really produced them.
    if label_savings_ok:
        print("\nF1-Score basierte Label-Einsparungs-Analyse durchgeführt!")
        print("- Visualisierung: plots/dachmaterial_f1_label_savings_analysis.png")
        print("- Bericht: reports/dachmaterial_f1_label_savings_report.txt")
    else:
        print("\nF1-Score basierte Label-Einsparungs-Analyse fehlgeschlagen - siehe Log-Ausgabe.")
    print("="*80)

    return 0


# ## 12. Ausführung

# In[18]:


# Run the main program when the file is executed as a script / notebook cell.
# The exit code is only printed and kept in `exit_code`; it is never passed to
# sys.exit() (presumably so a notebook kernel is not terminated - confirm).
if __name__ == "__main__":
    try:
        exit_code = main()
        print(f"\nProgramm beendet mit Exit-Code: {exit_code}")
    except Exception as e:
        # Last line of defence: log anything main() did not handle itself.
        logger.error(f"Unerwarteter Fehler im Hauptprogramm: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1

14:39:05 [INFO] Jupyter/IPython Umgebung erkannt - UTF-8 Handling bereits aktiv
ACTIVE LEARNING AUF DACHMATERIAL - F1-SCORE VERSION
Python Version: 3.13.4
PyTorch Version: 2.7.0+cpu
NumPy Version: 2.2.6
Pandas Version: 2.2.3
Scikit-learn Version: 1.6.1
SciPy Version: 1.15.3
Verwende CPU (keine GPU gefunden)

Experiment-Konfiguration:
- Hauptmetrik: F1-Score (Macro Average)
- Anzahl Runs: 5
- Budget-Stufen: ['20%', '40%', '60%', '80%', '100%']
- Batch-Größe: 500
- Signifikanzniveau: 0.05
- Mindest-Samples pro Klasse: 20
14:39:05 [INFO] Lade vollständigen Dachmaterial-Datensatz...
14:39:05 [INFO] [ok] Datensatz geladen: 8,213 Zeilen, 10 Spalten
14:39:05 [INFO] Ursprüngliche Klassen-Verteilung:
mat_qgis
Ziegel                4112
Metallbahn            1212
Asbest|Faserzement     892
Beton                  838
Bitumen                734
PVC|Polycarbonat       140
Schiefer               128
Glas                   119
Dachbegrünung           24
Kunststoffbahn          12
Kupfer              