In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import warnings
warnings.filterwarnings('ignore')

# Matplotlib defaults: a font intended to render German umlauts, and a plain
# ASCII hyphen instead of the Unicode minus sign on tick labels.
plt.rcParams.update({
    'font.family': 'DejaVu Sans',
    'axes.unicode_minus': False,
})

# Seaborn look: white background with grid lines and the "husl" colour cycle.
sns.set_style(style="whitegrid")
sns.set_palette(palette="husl")

# Data extracted from the images.
# Layout: data[classifier][query strategy][budget] -> accuracy, where the
# budget keys are the fractions 0.2, 0.4, 0.6, 0.8 and 1.0 (converted to
# percent further down by multiplying with 100).
# NOTE(review): every Naive Bayes entry is exactly 0.1, independent of strategy
# and budget — looks like a degenerate result; confirm against the source images.
data = {
    'CNN': {
        'Entropy Sampling': {0.2: 0.90866, 0.4: 0.92344, 0.6: 0.92632, 0.8: 0.92784, 1.0: 0.92922},
        'Least Confidence': {0.2: 0.91166, 0.4: 0.9237, 0.6: 0.92702, 0.8: 0.92878, 1.0: 0.92984},
        'Margin Sampling': {0.2: 0.9142, 0.4: 0.92296, 0.6: 0.92722, 0.8: 0.92876, 1.0: 0.93028},
        'Random Sampling': {0.2: 0.89808, 0.4: 0.91444, 0.6: 0.9208, 0.8: 0.92538, 1.0: 0.92934}
    },
    'Logistic Regression': {
        'Entropy Sampling': {0.2: 0.82298, 0.4: 0.83456, 0.6: 0.83374, 0.8: 0.83494, 1.0: 0.8352},
        'Least Confidence': {0.2: 0.8253, 0.4: 0.83366, 0.6: 0.83362, 0.8: 0.83538, 1.0: 0.83516},
        'Margin Sampling': {0.2: 0.82624, 0.4: 0.83376, 0.6: 0.83384, 0.8: 0.83504, 1.0: 0.83538},
        'Random Sampling': {0.2: 0.80432, 0.4: 0.8195, 0.6: 0.82648, 0.8: 0.83214, 1.0: 0.8349}
    },
    'Naive Bayes': {
        'Entropy Sampling': {0.2: 0.1, 0.4: 0.1, 0.6: 0.1, 0.8: 0.1, 1.0: 0.1},
        'Least Confidence': {0.2: 0.1, 0.4: 0.1, 0.6: 0.1, 0.8: 0.1, 1.0: 0.1},
        'Margin Sampling': {0.2: 0.1, 0.4: 0.1, 0.6: 0.1, 0.8: 0.1, 1.0: 0.1},
        'Random Sampling': {0.2: 0.1, 0.4: 0.1, 0.6: 0.1, 0.8: 0.1, 1.0: 0.1}
    },
    'Random Forest': {
        'Entropy Sampling': {0.2: 0.8241, 0.4: 0.83746, 0.6: 0.8336, 0.8: 0.82494, 1.0: 0.81976},
        'Least Confidence': {0.2: 0.84678, 0.4: 0.84984, 0.6: 0.83322, 0.8: 0.82516, 1.0: 0.81982},
        'Margin Sampling': {0.2: 0.85946, 0.4: 0.84912, 0.6: 0.8346, 0.8: 0.82544, 1.0: 0.82044},
        'Random Sampling': {0.2: 0.81314, 0.4: 0.81798, 0.6: 0.81824, 0.8: 0.81978, 1.0: 0.81944}
    },
    'Support Vector Machine': {
        'Entropy Sampling': {0.2: 0.87148, 0.4: 0.8843, 0.6: 0.88552, 0.8: 0.8859, 1.0: 0.88554},
        'Least Confidence': {0.2: 0.88232, 0.4: 0.8846, 0.6: 0.8861, 0.8: 0.8859, 1.0: 0.88542},
        'Margin Sampling': {0.2: 0.88176, 0.4: 0.88304, 0.6: 0.88514, 0.8: 0.88528, 1.0: 0.88508},
        'Random Sampling': {0.2: 0.85724, 0.4: 0.86926, 0.6: 0.87644, 0.8: 0.88096, 1.0: 0.88356}
    }
}

# Flatten the nested results into one record per (classifier, strategy, budget)
# so that pandas can filter and pivot them; budgets are stored in percent.
df_list = [
    {
        'Klassifikator': clf_name,
        'Query-Strategie': strategy_name,
        'Budget (%)': fraction * 100,
        'Accuracy': acc,
    }
    for clf_name, curves in data.items()
    for strategy_name, curve in curves.items()
    for fraction, acc in curve.items()
]

df = pd.DataFrame(df_list)

# Open the multi-page PDF that collects all plots; each figure below is
# appended to it with pdf.savefig().
# NOTE(review): the file is only finalised by the pdf.close() call at the end of
# the script, so an exception in any plotting step skips it — consider wrapping
# the plotting code in `with PdfPages(...) as pdf:`.
pdf = PdfPages('active_learning_analysis.pdf')

# Fixed colour per query strategy (used wherever curves or bars are grouped by strategy).
colors_strategies = dict([
    ('Entropy Sampling', '#1f77b4'),
    ('Least Confidence', '#ff7f0e'),
    ('Margin Sampling', '#2ca02c'),
    ('Random Sampling', '#d62728'),
])

# Fixed colour per classifier (used wherever curves or bars are grouped by classifier).
colors_classifiers = dict([
    ('CNN', '#9467bd'),
    ('Logistic Regression', '#8c564b'),
    ('Naive Bayes', '#e377c2'),
    ('Random Forest', '#7f7f7f'),
    ('Support Vector Machine', '#bcbd22'),
])

# Plot 1: overview grid — one panel per classifier, one accuracy curve per query strategy
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

classifiers = list(data.keys())
for idx, classifier in enumerate(classifiers):
    ax = axes[idx]
    curves = data[classifier]

    # Pool every accuracy of this classifier; the y-limits are derived from it.
    all_values = [acc for curve in curves.values() for acc in curve.values()]

    if classifier != 'Naive Bayes':
        # Pad the observed value range by 10 % on both sides.
        lowest, highest = min(all_values), max(all_values)
        y_range = highest - lowest
        y_min = lowest - y_range * 0.1
        y_max = highest + y_range * 0.1
    else:
        # All Naive Bayes values in `data` are 0.1 (zero range), so a fixed
        # narrow window around that value is used instead.
        y_min, y_max = 0.095, 0.105

    for strategy, curve in curves.items():
        ax.plot(
            [fraction * 100 for fraction in curve.keys()],
            list(curve.values()),
            marker='o', linewidth=2.5, markersize=8,
            label=strategy, color=colors_strategies[strategy],
        )

    ax.set_ylim(y_min, y_max)
    ax.set_xticks([20, 40, 60, 80, 100])
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Trainingsbudget (%)', fontsize=12)
    ax.set_ylabel('Genauigkeit', fontsize=12)
    ax.set_title(f'{classifier}', fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10)

# The 2x3 grid has one panel more than there are classifiers; blank the last one.
axes[-1].axis('off')

fig.suptitle('Active Learning: Genauigkeit nach Klassifikator und Query-Strategie',
             fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 2: one panel per query strategy, comparing the classifiers (Naive Bayes excluded)
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

strategies = ['Entropy Sampling', 'Least Confidence', 'Margin Sampling', 'Random Sampling']
classifiers_filtered = list(filter(lambda name: name != 'Naive Bayes', classifiers))

for ax, strategy in zip(axes, strategies):
    for classifier in classifiers_filtered:
        curve = data[classifier][strategy]
        ax.plot(
            [fraction * 100 for fraction in curve.keys()],
            list(curve.values()),
            marker='o', linewidth=2.5, markersize=8,
            label=classifier, color=colors_classifiers[classifier],
        )

    # Fixed y-window shared by all four panels (hard-coded, not derived from the data).
    ax.set_ylim(0.78, 0.94)
    ax.set_xticks([20, 40, 60, 80, 100])
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Trainingsbudget (%)', fontsize=12)
    ax.set_ylabel('Genauigkeit', fontsize=12)
    ax.set_title(f'{strategy}', fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=10)

fig.suptitle('Vergleich der Query-Strategien über Klassifikatoren',
             fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 3: heatmap of the accuracies reached with the full (100 %) budget
full_budget_rows = df.loc[df['Budget (%)'] == 100]
final_accuracies = full_budget_rows.pivot(
    index='Klassifikator',
    columns='Query-Strategie',
    values='Accuracy',
)

fig, ax = plt.subplots(figsize=(12, 8))

# Cell annotations with four decimals, black cell borders, labelled colour bar.
heatmap_options = dict(
    annot=True,
    fmt='.4f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Genauigkeit'},
    linewidths=1,
    linecolor='black',
)
sns.heatmap(final_accuracies, ax=ax, **heatmap_options)

ax.set_xlabel('Query-Strategie', fontsize=12)
ax.set_ylabel('Klassifikator', fontsize=12)
ax.set_title('Finale Genauigkeiten bei 100% Trainingsbudget',
             fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 4: efficiency analysis — accuracy at 20 % budget relative to accuracy at 100 % budget
fig, ax = plt.subplots(figsize=(14, 8))

# One record per (classifier, strategy); Naive Bayes is left out.
efficiency_data = []
for classifier in classifiers:
    if classifier != 'Naive Bayes':
        for strategy in strategies:
            acc_20 = data[classifier][strategy][0.2]
            acc_100 = data[classifier][strategy][1.0]
            # Ratio in percent; it exceeds 100 whenever the 20 % model beats the 100 % model.
            efficiency = (acc_20 / acc_100) * 100
            efficiency_data.append({
                'Klassifikator': classifier,
                'Query-Strategie': strategy,
                'Effizienz (%)': efficiency
            })

efficiency_df = pd.DataFrame(efficiency_data)
efficiency_pivot = efficiency_df.pivot(
    index='Klassifikator',
    columns='Query-Strategie',
    values='Effizienz (%)'
)

# Grouped bars: one group per classifier, one bar per strategy, centred on the tick.
x = np.arange(len(efficiency_pivot.index))
width = 0.2

for i, strategy in enumerate(strategies):
    offset = (i - 1.5) * width
    ax.bar(x + offset, efficiency_pivot[strategy], width,
           label=strategy, color=colors_strategies[strategy])

ax.set_xlabel('Klassifikator', fontsize=12)
ax.set_ylabel('Effizienz (%) - Genauigkeit bei 20% / Genauigkeit bei 100%', fontsize=12)
ax.set_title('Effizienz der Query-Strategien bei frühem Trainingsbudget',
             fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(efficiency_pivot.index, rotation=45, ha='right')
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3, axis='y')

# The y-range is derived from the data. A fixed (94, 100) window cut off every
# bar whose ratio exceeds 100 % (Random Forest reaches about 104.8 %). The old
# bounds are kept as the minimum extent so the usual case looks unchanged.
efficiency_values = efficiency_pivot.to_numpy()
y_low = min(94, float(np.nanmin(efficiency_values)) - 0.5)
y_high = max(100, float(np.nanmax(efficiency_values)) + 0.5)
ax.set_ylim(y_low, y_high)

# Reference line: bars above it mean the 20 % budget outperformed the full budget.
ax.axhline(y=100, color='black', linestyle='--', linewidth=0.8)

plt.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 5: box plot of the accuracy spread across all budgets
fig, ax = plt.subplots(figsize=(14, 8))

# One box per (classifier, strategy) combination, Naive Bayes excluded.
box_data = []
box_labels = []
box_colors = []

for classifier in classifiers_filtered:
    for strategy in strategies:
        values = list(data[classifier][strategy].values())
        box_data.append(values)
        # Two-line tick label: first three letters of the classifier, then of the strategy.
        box_labels.append(f'{classifier[:3]}\n{strategy[:3]}')
        box_colors.append(colors_classifiers[classifier])

# The tick labels are NOT passed through boxplot(labels=...): that keyword is
# deprecated since Matplotlib 3.9 (renamed to `tick_labels`) and the warning is
# hidden by the global warnings filter. Setting them on the axis works on every version.
bp = ax.boxplot(box_data, patch_artist=True,
                showmeans=True, meanline=True)

# boxplot places the boxes at positions 1..N by default.
ax.set_xticks(np.arange(1, len(box_labels) + 1))
ax.set_xticklabels(box_labels, rotation=45, ha='right')

# Colour each box by its classifier.
for patch, color in zip(bp['boxes'], box_colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_xlabel('Klassifikator / Query-Strategie', fontsize=12)
ax.set_ylabel('Genauigkeit', fontsize=12)
ax.set_title('Verteilung der Genauigkeiten über alle Trainingsbudgets',
             fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 6: learning-curve slope between consecutive budget levels, one panel per classifier
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

for ax, classifier in zip(axes, classifiers_filtered):
    for strategy in strategies:
        # (budget fraction, accuracy) pairs in the order stored in `data`.
        points = list(data[classifier][strategy].items())
        neighbours = list(zip(points, points[1:]))

        # Accuracy change divided by budget change for each pair of neighbours.
        # NOTE(review): with budgets stored as fractions this is a slope per unit
        # budget, whereas the y-label below reads "per budget step" — confirm
        # which unit is intended.
        improvements = [
            (acc_next - acc_prev) / (b_next - b_prev)
            for (b_prev, acc_prev), (b_next, acc_next) in neighbours
        ]

        # Each slope is drawn at the midpoint of its budget interval, in percent.
        budget_steps = [
            (b_prev + b_next) / 2 * 100
            for (b_prev, _), (b_next, _) in neighbours
        ]

        ax.plot(budget_steps, improvements,
                marker='s', linewidth=2, markersize=7,
                label=strategy, color=colors_strategies[strategy])

    # Zero line separates gains from losses.
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Trainingsbudget (%)', fontsize=12)
    ax.set_ylabel('Genauigkeitsverbesserung pro Budget-Schritt', fontsize=12)
    ax.set_title(f'{classifier}', fontsize=14, fontweight='bold')
    ax.legend(loc='best', fontsize=9)

fig.suptitle('Lernkurven-Steigung: Effizienz der Datennutzung',
             fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 7: ranking of all classifier/strategy combinations by mean accuracy
fig, ax = plt.subplots(figsize=(14, 10))

# Mean accuracy over all budgets for every combination (Naive Bayes still included here).
avg_accuracies = [
    {
        'Kombination': f'{clf_name} - {strategy_name}',
        'Durchschnitt': np.mean(list(data[clf_name][strategy_name].values())),
        'Klassifikator': clf_name,
        'Strategie': strategy_name,
    }
    for clf_name in classifiers
    for strategy_name in strategies
]

# Ascending order puts the best combination at the top of the horizontal bar chart.
avg_df = pd.DataFrame(avg_accuracies).sort_values('Durchschnitt', ascending=True)

# Naive Bayes is dropped for the visualisation.
avg_df_filtered = avg_df.loc[avg_df['Klassifikator'] != 'Naive Bayes']

y_pos = np.arange(len(avg_df_filtered))
colors_bar = [colors_classifiers[name] for name in avg_df_filtered['Klassifikator']]

bars = ax.barh(y_pos, avg_df_filtered['Durchschnitt'], color=colors_bar, alpha=0.8)

# Print the value just right of each bar's end.
for bar, value in zip(bars, avg_df_filtered['Durchschnitt']):
    ax.text(value + 0.002, bar.get_y() + bar.get_height()/2,
            f'{value:.4f}', va='center', fontsize=9)

ax.set_yticks(y_pos)
ax.set_yticklabels(avg_df_filtered['Kombination'], fontsize=10)
ax.set_xlabel('Durchschnittliche Genauigkeit über alle Budgets', fontsize=12)
ax.set_title('Ranking der Klassifikator-Strategie-Kombinationen',
             fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

# x-limits follow the plotted means with a margin of 0.01 on each side.
x_min = avg_df_filtered['Durchschnitt'].min() - 0.01
x_max = avg_df_filtered['Durchschnitt'].max() + 0.01
ax.set_xlim(x_min, x_max)

fig.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 8: convergence — mean accuracy per classifier at each budget level
fig, ax = plt.subplots(figsize=(14, 8))

budget_levels = [20, 40, 60, 80, 100]
x = np.arange(len(classifiers_filtered))
width = 0.15

for i, budget in enumerate(budget_levels):
    # `data` is keyed by budget fraction; budget / 100 has to reproduce those
    # float keys exactly (0.2, 0.4, ...).
    fraction = budget / 100

    # Mean over all strategies at this budget, one value per classifier.
    accuracies_at_budget = [
        np.mean([data[classifier][strategy][fraction] for strategy in strategies])
        for classifier in classifiers_filtered
    ]

    # Five bars per group, centred on the classifier tick.
    offset = (i - 2) * width
    bars = ax.bar(x + offset, accuracies_at_budget, width,
                  label=f'{budget}% Budget', alpha=0.8)

    # Vertical value label just above each bar.
    for bar, value in zip(bars, accuracies_at_budget):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.002,
                f'{value:.3f}', ha='center', va='bottom', fontsize=8, rotation=90)

ax.set_xticks(x)
ax.set_xticklabels(classifiers_filtered)
ax.set_xlabel('Klassifikator', fontsize=12)
ax.set_ylabel('Durchschnittliche Genauigkeit', fontsize=12)
ax.set_title('Konvergenz der Genauigkeit bei steigendem Trainingsbudget',
             fontsize=14, fontweight='bold')
ax.legend(loc='upper left', ncol=5)
ax.grid(True, alpha=0.3, axis='y')

# y-limits from the raw accuracies of the plotted classifiers, with a larger
# margin at the top than at the bottom.
all_values = [
    acc
    for classifier in classifiers_filtered
    for strategy in strategies
    for acc in data[classifier][strategy].values()
]
y_min = min(all_values) - 0.02
y_max = max(all_values) + 0.03
ax.set_ylim(y_min, y_max)

fig.tight_layout()
pdf.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Finalise the PDF file.
pdf.close()

print("Analyse abgeschlossen! Die Plots wurden in 'active_learning_analysis.pdf' gespeichert.")
print("\nZusammenfassung der Ergebnisse:")
print("=" * 60)

# Best combinations by mean accuracy over all budgets (Naive Bayes excluded).
print("\nTop 5 Klassifikator-Strategie-Kombinationen (Durchschnitt über alle Budgets):")
top_5 = avg_df_filtered.nlargest(5, 'Durchschnitt')
for rank, (_, entry) in enumerate(top_5.iterrows(), start=1):
    print(f"{rank}. {entry['Kombination']}: {entry['Durchschnitt']:.4f}")

# For each classifier, the strategy with the highest accuracy at full budget.
print("\nBeste Strategie pro Klassifikator (bei 100% Budget):")
for classifier in classifiers_filtered:
    best_name, best_curve = max(data[classifier].items(),
                                key=lambda item: item[1][1.0])
    print(f"{classifier}: {best_name} ({best_curve[1.0]:.4f})")

# Combinations with the highest accuracy at the smallest budget (20 %).
print("\nEffizienteste Kombinationen (höchste Genauigkeit bei 20% Budget):")
efficiency_20 = [
    {
        'Kombination': f'{clf_name} - {strategy_name}',
        'Genauigkeit bei 20%': data[clf_name][strategy_name][0.2],
    }
    for clf_name in classifiers_filtered
    for strategy_name in strategies
]
efficiency_20_df = pd.DataFrame(efficiency_20).nlargest(5, 'Genauigkeit bei 20%')
for rank, (_, entry) in enumerate(efficiency_20_df.iterrows(), start=1):
    print(f"{rank}. {entry['Kombination']}: {entry['Genauigkeit bei 20%']:.4f}")

Analyse abgeschlossen! Die Plots wurden in 'active_learning_analysis.pdf' gespeichert.

Zusammenfassung der Ergebnisse:

Top 5 Klassifikator-Strategie-Kombinationen (Durchschnitt über alle Budgets):
1. CNN - Margin Sampling: 0.9247
2. CNN - Least Confidence: 0.9242
3. CNN - Entropy Sampling: 0.9231
4. CNN - Random Sampling: 0.9176
5. Support Vector Machine - Least Confidence: 0.8849

Beste Strategie pro Klassifikator (bei 100% Budget):
CNN: Margin Sampling (0.9303)
Logistic Regression: Margin Sampling (0.8354)
Random Forest: Margin Sampling (0.8204)
Support Vector Machine: Entropy Sampling (0.8855)

Effizienteste Kombinationen (höchste Genauigkeit bei 20% Budget):
1. CNN - Margin Sampling: 0.9142
2. CNN - Least Confidence: 0.9117
3. CNN - Entropy Sampling: 0.9087
4. CNN - Random Sampling: 0.8981
5. Support Vector Machine - Least Confidence: 0.8823
