In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import warnings
warnings.filterwarnings('ignore')

# Global Matplotlib font sizes shared by every figure (plot texts are German).
plt.rcParams.update({
    'font.size': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.titlesize': 16,
})

# Seaborn appearance: white background with grid lines, HUSL colour cycle.
sns.set_style("whitegrid")
sns.set_palette("husl")

# Data entered by hand, based on the PDFs.
# The data from the 5 PDFs represent the 5 classifiers.

# Data for the CNN (document 1).
# Column layout: 4 strategies x 5 budget levels x 5 runs = 100 rows,
# ordered by strategy, then by budget level, then by run index.
data_cnn = {
    'strategy': ['Random Sampling']*25 + ['Entropy Sampling']*25 + ['Margin Sampling']*25 + ['Least Confidence']*25,
    'budget_pct': ([0.2]*5 + [0.4]*5 + [0.6]*5 + [0.8]*5 + [1.0]*5) * 4,
    'run': list(range(5)) * 20,
    'n_labeled': ([12000]*5 + [24000]*5 + [36000]*5 + [48000]*5 + [60000]*5) * 4,
    # One row of five values per budget level (runs 0-4); plotted later as seconds.
    'avg_train_time': [
        # Random Sampling
        1.521085076, 1.502775908, 1.501781723, 1.502621081, 1.502706331,
        2.97725921, 2.983116505, 2.997533179, 3.012997597, 3.020966895,
        4.5361837, 4.544467459, 4.550611335, 4.553623045, 4.556777531,
        6.068494465, 6.071016299, 6.071812213, 6.073508913, 6.076823483,
        7.592866254, 7.595250643, 7.596366393, 7.597735401, 7.597040092,
        # Entropy Sampling
        1.533692256, 1.529967018, 1.530879539, 1.530919199, 1.529847155,
        3.042272654, 3.044967428, 3.04630348, 3.044037535, 3.042879394,
        4.562415472, 4.564509718, 4.564144403, 4.56408135, 4.562399176,
        6.083620132, 6.083112973, 6.082519689, 6.081528182, 6.083057825,
        7.606978691, 7.605105717, 7.633382693, 7.662749339, 7.662503158,
        # Margin Sampling
        1.544966718, 1.544624308, 1.541594868, 1.541721904, 1.543142764,
        3.068383024, 3.070673745, 3.069041729, 3.071545784, 3.069628832,
        4.599050807, 4.599122638, 4.600143036, 4.598187269, 4.599487278,
        6.128177532, 6.126837956, 6.132447323, 6.15017252, 6.154459896,
        7.693038536, 7.693774452, 7.691228987, 7.689596619, 7.693505323,
        # Least Confidence
        1.549689438, 1.549501222, 1.550653592, 1.551303781, 1.581333907,
        3.159619839, 3.156467524, 3.156243116, 3.154623001, 3.154910296,
        4.719349008, 4.721554813, 4.723619152, 4.723466504, 4.722391434,
        6.290463124, 6.289522076, 6.288772335, 6.292358554, 6.29081067,
        7.862094092, 7.861531244, 7.856163269, 7.856267458, 7.854389507
    ]
}

# Data for Logistic Regression (document 2).
# Column layout: 4 strategies x 5 budget levels x 5 runs = 100 rows,
# ordered by strategy, then by budget level, then by run index.
data_lr = {
    'strategy': ['Random Sampling']*25 + ['Entropy Sampling']*25 + ['Margin Sampling']*25 + ['Least Confidence']*25,
    'budget_pct': ([0.2]*5 + [0.4]*5 + [0.6]*5 + [0.8]*5 + [1.0]*5) * 4,
    'run': list(range(5)) * 20,
    'n_labeled': ([12000]*5 + [24000]*5 + [36000]*5 + [48000]*5 + [60000]*5) * 4,
    # One row of five values per budget level (runs 0-4); plotted later as seconds.
    'avg_train_time': [
        # Random Sampling
        0.430414107, 0.434070763, 0.437924333, 0.442383891, 0.44850282,
        0.666724175, 0.663643624, 0.666885457, 0.666870538, 0.66949228,
        0.906450863, 0.905770513, 0.906783383, 0.90681656, 0.91027872,
        1.153505619, 1.153761931, 1.155046857, 1.157732933, 1.158681829,
        1.40018023, 1.399510408, 1.397971267, 1.393453348, 1.394937257,
        # Entropy Sampling
        0.44753084, 0.443597534, 0.446408096, 0.445562954, 0.448979077,
        0.669268989, 0.668161316, 0.67034522, 0.672580425, 0.673893269,
        0.902393724, 0.903276306, 0.903596949, 0.902608667, 0.905928283,
        1.142473753, 1.144409172, 1.144791229, 1.143618097, 1.148521042,
        1.386908441, 1.388435424, 1.391244007, 1.388608534, 1.391419936,
        # Margin Sampling
        0.449506324, 0.446377806, 0.450042009, 0.449820829, 0.451139512,
        0.67007042, 0.667958665, 0.670687914, 0.670846893, 0.670404673,
        0.900362717, 0.902063168, 0.905314432, 0.903093623, 0.904682129,
        1.141420533, 1.145052533, 1.14466871, 1.144293281, 1.147952557,
        1.387121649, 1.388144371, 1.390303175, 1.387223406, 1.389719134,
        # Least Confidence
        0.452533763, 0.450354348, 0.446699371, 0.449374271, 0.453598955,
        0.670029097, 0.67045781, 0.675982176, 0.673277388, 0.676585908,
        0.910121202, 0.907668372, 0.910441033, 0.909627055, 0.912553582,
        1.152339536, 1.150865906, 1.150365531, 1.15430273, 1.150210135,
        1.3962539, 1.397003805, 1.398311481, 1.395124504, 1.400465492
    ]
}

# Data for Naive Bayes (document 3 - very small times).
# Column layout: 4 strategies x 5 budget levels x 5 runs = 100 rows,
# ordered by strategy, then by budget level, then by run index.
data_nb = {
    'strategy': ['Random Sampling']*25 + ['Entropy Sampling']*25 + ['Margin Sampling']*25 + ['Least Confidence']*25,
    'budget_pct': ([0.2]*5 + [0.4]*5 + [0.6]*5 + [0.8]*5 + [1.0]*5) * 4,
    'run': list(range(5)) * 20,
    'n_labeled': ([12000]*5 + [24000]*5 + [36000]*5 + [48000]*5 + [60000]*5) * 4,
    # One row of five values per budget level (runs 0-4); plotted later as seconds.
    'avg_train_time': [
        # Random Sampling
        0.044904242, 0.045427913, 0.045572312, 0.044479152, 0.044497075,
        0.085929891, 0.087938755, 0.088246969, 0.086517978, 0.089378403,
        0.128625044, 0.128465757, 0.128066493, 0.126750429, 0.129766797,
        0.172569523, 0.172209065, 0.173458007, 0.171681811, 0.17276723,
        0.215535144, 0.216414027, 0.213230311, 0.213274355, 0.215519274,
        # Entropy Sampling
        0.044630714, 0.044069798, 0.044171157, 0.044869796, 0.044918496,
        0.088534279, 0.088299097, 0.086302138, 0.086233722, 0.088829167,
        0.130589102, 0.130465763, 0.131825827, 0.130653297, 0.128563243,
        0.173025558, 0.176241031, 0.171646522, 0.17273794, 0.173139507,
        0.216089974, 0.213906793, 0.214336381, 0.215689477, 0.21318728,
        # Margin Sampling
        0.044133394, 0.044300017, 0.044421072, 0.044250955, 0.044895411,
        0.087127107, 0.086885615, 0.08565505, 0.088557522, 0.087548271,
        0.129344158, 0.1297024, 0.12924326, 0.130578421, 0.129503398,
        0.171091529, 0.172271104, 0.171494788, 0.170917556, 0.171385983,
        0.213984505, 0.212621154, 0.214580682, 0.213818883, 0.214850883,
        # Least Confidence
        0.044761658, 0.044804884, 0.043863711, 0.044454968, 0.044124821,
        0.088369187, 0.086711336, 0.088337807, 0.087724036, 0.08805419,
        0.129984503, 0.130504967, 0.129831418, 0.130670269, 0.13072482,
        0.172372896, 0.172822473, 0.171587184, 0.172248715, 0.172725286,
        0.217317597, 0.216626286, 0.215034569, 0.213721053, 0.214255287
    ]
}

# Data for Random Forest (document 4).
# Column layout: 4 strategies x 5 budget levels x 5 runs = 100 rows,
# ordered by strategy, then by budget level, then by run index.
data_rf = {
    'strategy': ['Random Sampling']*25 + ['Entropy Sampling']*25 + ['Margin Sampling']*25 + ['Least Confidence']*25,
    'budget_pct': ([0.2]*5 + [0.4]*5 + [0.6]*5 + [0.8]*5 + [1.0]*5) * 4,
    'run': list(range(5)) * 20,
    'n_labeled': ([12000]*5 + [24000]*5 + [36000]*5 + [48000]*5 + [60000]*5) * 4,
    # One row of five values per budget level (runs 0-4); plotted later as seconds.
    'avg_train_time': [
        # Random Sampling
        0.181024375, 0.181443173, 0.181925473, 0.182831526, 0.182940079,
        0.209142624, 0.204839443, 0.20475943, 0.206234283, 0.209145926,
        0.231014534, 0.231087369, 0.231550378, 0.231923066, 0.23275368,
        0.261499407, 0.25845696, 0.260742654, 0.258089964, 0.259150809,
        0.279392623, 0.278795467, 0.279237723, 0.281439607, 0.285700948,
        # Entropy Sampling
        0.189325571, 0.188209254, 0.189695483, 0.196063819, 0.189229022,
        0.214630142, 0.213050604, 0.21443714, 0.21203073, 0.212053624,
        0.235228374, 0.235411547, 0.235309104, 0.237508985, 0.235569195,
        0.262416443, 0.261908689, 0.261195991, 0.259595264, 0.258364911,
        0.28291877, 0.282267979, 0.282410894, 0.282449556, 0.285054654,
        # Margin Sampling
        0.189271616, 0.187176829, 0.188204071, 0.187763297, 0.190631379,
        0.21764808, 0.21725611, 0.213284726, 0.215038132, 0.21258655,
        0.237819007, 0.236781735, 0.23719843, 0.236060928, 0.235635825,
        0.259415494, 0.25997328, 0.258534948, 0.257463741, 0.258455475,
        0.280901923, 0.283022277, 0.2842962, 0.284526024, 0.285941531,
        # Least Confidence
        0.186542625, 0.187671703, 0.187047585, 0.191757814, 0.187757471,
        0.213432155, 0.213734201, 0.21515722, 0.213531854, 0.214701226,
        0.24190396, 0.241706741, 0.24122898, 0.241975929, 0.241126111,
        0.262677755, 0.264882198, 0.265062676, 0.262004348, 0.267042474,
        0.292114174, 0.28661645, 0.288433734, 0.287224451, 0.289441369
    ]
}

# Data for the SVM (document 5).
# Column layout: 4 strategies x 5 budget levels x 5 runs = 100 rows,
# ordered by strategy, then by budget level, then by run index.
data_svm = {
    'strategy': ['Random Sampling']*25 + ['Entropy Sampling']*25 + ['Margin Sampling']*25 + ['Least Confidence']*25,
    'budget_pct': ([0.2]*5 + [0.4]*5 + [0.6]*5 + [0.8]*5 + [1.0]*5) * 4,
    'run': list(range(5)) * 20,
    'n_labeled': ([12000]*5 + [24000]*5 + [36000]*5 + [48000]*5 + [60000]*5) * 4,
    # One row of five values per budget level (runs 0-4); plotted later as seconds.
    'avg_train_time': [
        # Random Sampling
        5.59030737, 5.631493206, 5.637021417, 5.714793775, 5.692046165,
        6.680006372, 6.678584266, 6.655204053, 6.704958682, 6.669765878,
        7.703251607, 7.709751032, 7.695497489, 7.716707777, 7.718714835,
        8.827618975, 8.825608959, 8.809704961, 8.846616552, 8.819319994,
        12.17497655, 12.0745011, 12.10531111, 12.07346025, 12.16441602,
        # Entropy Sampling
        # NOTE(review): in the 48000-label row below, runs 0-1 are ~9.5 while runs 2-4
        # jump to ~14.5-15.2; every other row in this table varies far less between
        # runs. Confirm these values against the source PDF.
        5.791202027, 5.759870301, 5.784416054, 5.787054186, 5.769426107,
        7.090640139, 7.07274338, 7.065608319, 7.074797514, 7.070599307,
        8.271367547, 8.267165543, 8.263351273, 8.267215151, 8.252499574,
        9.469496285, 9.45080651, 14.47313201, 15.22126127, 15.19501162,
        19.86465102, 19.80036875, 19.78375989, 19.73566187, 19.74067761,
        # Margin Sampling
        7.443813708, 7.377446714, 7.40087108, 7.412813207, 7.401774406,
        10.13653782, 10.16182711, 10.14165094, 10.15075541, 10.1491361,
        12.67734949, 12.71951891, 12.68432982, 12.69772379, 12.68531133,
        15.2147598, 15.23573265, 15.19877471, 15.19631885, 15.21041072,
        19.79250878, 19.8052629, 19.84635638, 19.76931348, 19.72871738,
        # Least Confidence
        7.410335292, 7.416148144, 7.402038149, 7.42436493, 7.435452969,
        10.17758531, 10.17286485, 10.18398625, 10.17652081, 10.17764466,
        12.7190653, 12.73554912, 12.70581536, 12.71939814, 12.72156518,
        15.22794602, 15.24144237, 15.22603206, 15.22580796, 15.23181833,
        19.75822038, 19.84954281, 19.82456296, 19.77718003, 19.84717054
    ]
}

# Turn each raw dict into a DataFrame and tag it with its classifier name.
df_cnn = pd.DataFrame(data_cnn).assign(classifier='CNN')
df_lr = pd.DataFrame(data_lr).assign(classifier='Logistic Regression')
df_nb = pd.DataFrame(data_nb).assign(classifier='Naive Bayes')
df_rf = pd.DataFrame(data_rf).assign(classifier='Random Forest')
df_svm = pd.DataFrame(data_svm).assign(classifier='SVM')

# Stack the five per-classifier frames into one long table with a fresh index.
df_all = pd.concat([df_cnn, df_lr, df_nb, df_rf, df_svm], ignore_index=True)

# German display names for the query strategies (used as plot labels).
strategy_translation = dict([
    ('Random Sampling', 'Zufallsauswahl'),
    ('Entropy Sampling', 'Entropie-Auswahl'),
    ('Margin Sampling', 'Margin-Auswahl'),
    ('Least Confidence', 'Geringste Konfidenz'),
])

df_all['strategy_de'] = df_all['strategy'].map(strategy_translation)

# Mean and standard deviation over the runs of every
# (classifier, strategy, labelled-set size) combination.
df_stats = (
    df_all
    .groupby(['classifier', 'strategy_de', 'n_labeled'])['avg_train_time']
    .agg(['mean', 'std'])
    .reset_index()
)

# Multi-page PDF that collects every figure produced below.
# NOTE(review): this handle is only released by the explicit close() call after
# the last plot; if any plot raises in between, it stays open.
pdf_pages = PdfPages('active_learning_trainingszeiten.pdf')

# Plot 1: one panel per classifier, one error-bar curve per query strategy.
classifiers = ['CNN', 'Logistic Regression', 'Naive Bayes', 'Random Forest', 'SVM']
colors = sns.color_palette("husl", 4)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Active Learning: Trainingszeiten nach Klassifikator und Query-Strategie', fontsize=16, y=1.02)

# Styling shared by all error-bar curves of this figure.
errorbar_style = dict(marker='o', capsize=5, capthick=2, linewidth=2, markersize=8, alpha=0.8)

for idx, classifier in enumerate(classifiers):
    ax = axes.flat[idx]
    df_classifier = df_all.loc[df_all['classifier'] == classifier]

    strategy_names = df_classifier['strategy_de'].unique()
    for i, strategy in enumerate(strategy_names):
        # Mean / std over the runs for each labelled-set size.
        df_grouped = (
            df_classifier.loc[df_classifier['strategy_de'] == strategy]
            .groupby('n_labeled')['avg_train_time']
            .agg(['mean', 'std'])
            .reset_index()
        )
        ax.errorbar(df_grouped['n_labeled'], df_grouped['mean'], yerr=df_grouped['std'],
                    label=strategy, color=colors[i], **errorbar_style)

    ax.set_xlabel('Anzahl gelabelter Daten', fontsize=11)
    ax.set_ylabel('Trainingszeit (Sekunden)', fontsize=11)
    ax.set_title(classifier, fontsize=13, fontweight='bold')
    ax.legend(loc='best', frameon=True, fancybox=True, shadow=True)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(8000, 64000)

    # Widen the y-range by 10 % on each side when the curves span less than one second.
    y_min, y_max = ax.get_ylim()
    y_range = y_max - y_min
    if y_range < 1:
        y_pad = 0.1 * y_range
        ax.set_ylim(y_min - y_pad, y_max + y_pad)

# The 2x3 grid has six slots but only five classifiers: drop the unused last panel.
axes.flat[-1].remove()

plt.tight_layout()
pdf_pages.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 2: one panel per query strategy, comparing all classifiers.
strategies_de = df_all['strategy_de'].unique()
classifier_colors = sns.color_palette("Set2", 5)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Trainingszeiten: Vergleich der Klassifikatoren nach Query-Strategie', fontsize=16, y=1.02)

# Styling shared by all error-bar curves of this figure.
curve_style = dict(marker='o', capsize=3, linewidth=2, markersize=7, alpha=0.8)

for idx, strategy in enumerate(strategies_de):
    ax = axes.flat[idx]
    df_strategy = df_all.loc[df_all['strategy_de'] == strategy]

    for i, classifier in enumerate(classifiers):
        # Mean / std over the runs for each labelled-set size.
        df_grouped = (
            df_strategy.loc[df_strategy['classifier'] == classifier]
            .groupby('n_labeled')['avg_train_time']
            .agg(['mean', 'std'])
            .reset_index()
        )
        ax.errorbar(df_grouped['n_labeled'], df_grouped['mean'], yerr=df_grouped['std'],
                    label=classifier, color=classifier_colors[i], **curve_style)

    ax.set_xlabel('Anzahl gelabelter Daten', fontsize=11)
    ax.set_ylabel('Trainingszeit (Sekunden)', fontsize=11)
    ax.set_title(strategy, fontsize=13, fontweight='bold')
    ax.legend(loc='best', frameon=True, fancybox=True, shadow=True)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(8000, 64000)

    # Log scale so the fast classifiers stay visible next to the slow ones.
    ax.set_yscale('log')

plt.tight_layout()
pdf_pages.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 3: heatmap of the mean training times.
fig, ax = plt.subplots(figsize=(14, 8))

# Rows: classifiers; columns: (strategy, labelled-set size); cells: mean over the runs.
pivot_data = df_all.pivot_table(
    values='avg_train_time',
    index=['classifier'],
    columns=['strategy_de', 'n_labeled'],
    aggfunc='mean',
)

# Order the columns strategy by strategy, each with ascending labelled-set size.
n_labeled_levels = sorted(df_all['n_labeled'].unique())
columns_order = [
    (strategy_name, n_level)
    for strategy_name in strategies_de
    for n_level in n_labeled_levels
]
pivot_data = pivot_data[columns_order]

# Colour encodes log10 of the training time.
im = ax.imshow(np.log10(pivot_data.values), cmap='YlOrRd', aspect='auto')

# Tick labels: strategy name plus labelled-set size in thousands.
x_labels = [f'{s}\n{n//1000}k' for s, n in pivot_data.columns]
ax.set_xticks(range(len(pivot_data.columns)))
ax.set_xticklabels(x_labels, rotation=45, ha='right', fontsize=9)
ax.set_yticks(range(len(pivot_data.index)))
ax.set_yticklabels(pivot_data.index, fontsize=11)

# Colour bar labelled in log10 units to match the plotted values.
cbar = fig.colorbar(im, ax=ax)
cbar.set_label('Log10(Trainingszeit in Sekunden)', fontsize=11)

ax.set_title('Heatmap: Logarithmische Trainingszeiten nach Klassifikator und Strategie',
             fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
pdf_pages.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 4: box plots showing the spread of the training times.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
ax1, ax2 = axes
fig.suptitle('Variabilität der Trainingszeiten', fontsize=16, y=1.02)

# Left panel: horizontal boxes per classifier, ordered by ascending median time.
df_melted = df_all.loc[:, ['classifier', 'avg_train_time']].copy()
classifiers_sorted = (
    df_melted.groupby('classifier')['avg_train_time']
    .median()
    .sort_values()
    .index
)
sns.boxplot(data=df_melted, y='classifier', x='avg_train_time',
            order=classifiers_sorted, ax=ax1, palette='Set2')
ax1.set_xlabel('Trainingszeit (Sekunden)', fontsize=11)
ax1.set_ylabel('Klassifikator', fontsize=11)
ax1.set_title('Verteilung nach Klassifikator', fontsize=13, fontweight='bold')
ax1.set_xscale('log')
ax1.grid(True, alpha=0.3, axis='x')

# Right panel: vertical boxes per query strategy (all classifiers pooled).
df_melted2 = df_all.loc[:, ['strategy_de', 'avg_train_time']].copy()
sns.boxplot(data=df_melted2, x='strategy_de', y='avg_train_time', ax=ax2, palette='husl')
ax2.set_xlabel('Query-Strategie', fontsize=11)
ax2.set_ylabel('Trainingszeit (Sekunden)', fontsize=11)
ax2.set_title('Verteilung nach Query-Strategie', fontsize=13, fontweight='bold')
ax2.set_yscale('log')
ax2.tick_params(axis='x', rotation=45)
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
pdf_pages.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 5: scaling behaviour (mean training time vs. number of labelled samples).
fig, ax = plt.subplots(figsize=(14, 10))

# One fixed colour per classifier. With the implicit colour cycle every
# classifier consumed two colours (data line + trend line), so a trend line
# never matched its data line and the cycle could wrap around, giving
# different classifiers the same colour.
scaling_colors = sns.color_palette("Set2", len(classifiers))

# Mean training time per classifier and labelled-set size (all strategies pooled).
for classifier, line_color in zip(classifiers, scaling_colors):
    df_clf = df_all[df_all['classifier'] == classifier]
    df_grouped = df_clf.groupby('n_labeled')['avg_train_time'].agg(['mean', 'std']).reset_index()

    ax.plot(df_grouped['n_labeled'], df_grouped['mean'], marker='o',
            markersize=10, linewidth=3, label=classifier, alpha=0.8, color=line_color)

    # Least-squares linear trend in the same colour as its data line.
    # It has no label, so it does not show up in the legend.
    z = np.polyfit(df_grouped['n_labeled'], df_grouped['mean'], 1)
    p = np.poly1d(z)
    ax.plot(df_grouped['n_labeled'], p(df_grouped['n_labeled']),
            linestyle='--', alpha=0.5, linewidth=1, color=line_color)

ax.set_xlabel('Anzahl gelabelter Daten', fontsize=12)
ax.set_ylabel('Durchschnittliche Trainingszeit (Sekunden)', fontsize=12)
ax.set_title('Skalierungsverhalten der Klassifikatoren', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
ax.set_xlim(8000, 64000)

# Second y-axis for Naive Bayes, whose values are too small to read on the main axis.
ax2 = ax.twinx()
df_nb_only = df_all[df_all['classifier'] == 'Naive Bayes']
df_nb_grouped = df_nb_only.groupby('n_labeled')['avg_train_time'].agg(['mean']).reset_index()
ax2.plot(df_nb_grouped['n_labeled'], df_nb_grouped['mean'],
         color='red', marker='^', markersize=8, linewidth=2,
         alpha=0.6, linestyle=':', label='Naive Bayes (rechte Achse)')
ax2.set_ylabel('Trainingszeit Naive Bayes (Sekunden)', fontsize=11, color='red')
ax2.tick_params(axis='y', labelcolor='red')
# Keep only the primary grid; a second grid on the twin axis would not line up with it.
ax2.grid(False)

# Single legend covering both axes. Previously the legend was built before the
# twin axis existed, so the labelled right-axis curve was never listed.
# It is attached to ax2 (the top-most axes) so no curve is drawn over it.
handles_left, labels_left = ax.get_legend_handles_labels()
handles_right, labels_right = ax2.get_legend_handles_labels()
ax2.legend(handles_left + handles_right, labels_left + labels_right,
           loc='upper left', frameon=True, fancybox=True, shadow=True, fontsize=11)

plt.tight_layout()
pdf_pages.savefig(fig, bbox_inches='tight')
plt.close(fig)

# Plot 6: training time of each strategy relative to random sampling (= 100 %).
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Relative Trainingszeit im Vergleich zur Zufallsauswahl', fontsize=16, y=1.02)

for idx, classifier in enumerate(classifiers):
    ax = axes.flat[idx]
    df_classifier = df_all.loc[df_all['classifier'] == classifier]

    # Baseline: mean time of random sampling for each labelled-set size.
    df_random = df_classifier.loc[df_classifier['strategy_de'] == 'Zufallsauswahl']
    baseline = df_random.groupby('n_labeled')['avg_train_time'].mean()

    for strategy in df_classifier['strategy_de'].unique():
        if strategy == 'Zufallsauswahl':
            # The baseline itself is drawn as the horizontal 100 % line below.
            continue

        df_strategy = df_classifier.loc[df_classifier['strategy_de'] == strategy]
        df_grouped = df_strategy.groupby('n_labeled')['avg_train_time'].mean()

        # Index-aligned division by the baseline, expressed in percent.
        relative_time = df_grouped.div(baseline) * 100
        ax.plot(relative_time.index, relative_time.values,
                marker='o', label=strategy, linewidth=2, markersize=8)

    ax.axhline(y=100, color='black', linestyle='--', alpha=0.5, label='Zufallsauswahl (100%)')
    ax.set_xlabel('Anzahl gelabelter Daten', fontsize=11)
    ax.set_ylabel('Relative Trainingszeit (%)', fontsize=11)
    ax.set_title(classifier, fontsize=13, fontweight='bold')
    ax.legend(loc='best', frameon=True, fancybox=True, shadow=True, fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.set_xlim(8000, 64000)

# The 2x3 grid has six slots but only five classifiers: drop the unused last panel.
axes.flat[-1].remove()

plt.tight_layout()
pdf_pages.savefig(fig, bbox_inches='tight')
plt.close(fig)

# All figures have been written: finalise the PDF file.
pdf_pages.close()

# Print summary statistics.
heavy_rule = "=" * 80
light_rule = "-" * 60
summary_aggs = ['mean', 'std', 'min', 'max']

print(heavy_rule)
print("ZUSAMMENFASSUNG DER TRAININGSZEITEN")
print(heavy_rule)

# Per-classifier statistics over all strategies, budgets and runs.
print("\nDurchschnittliche Trainingszeiten nach Klassifikator (in Sekunden):")
print(light_rule)
summary_clf = df_all.groupby('classifier')['avg_train_time'].agg(summary_aggs).round(4)
print(summary_clf.to_string())

# Per-strategy statistics over all classifiers, budgets and runs.
print("\n\nDurchschnittliche Trainingszeiten nach Query-Strategie (in Sekunden):")
print(light_rule)
summary_strategy = df_all.groupby('strategy_de')['avg_train_time'].agg(summary_aggs).round(4)
print(summary_strategy.to_string())

# Ratio of the mean time at 60000 labelled samples to the mean time at 12000.
print("\n\nSkalierungsfaktor (60k vs 12k Daten) nach Klassifikator:")
print(light_rule)
for classifier in classifiers:
    df_clf = df_all.loc[df_all['classifier'] == classifier]
    time_12k = df_clf.loc[df_clf['n_labeled'] == 12000, 'avg_train_time'].mean()
    time_60k = df_clf.loc[df_clf['n_labeled'] == 60000, 'avg_train_time'].mean()
    scaling_factor = time_60k / time_12k
    print(f"{classifier:20s}: {scaling_factor:.2f}x")

# Number of rows listed in each ranking. The headings announce a "Top 5", but
# the code previously printed ten rows; this constant keeps the two in sync.
top_n = 5

# Mean training time over the runs of every
# (classifier, strategy, labelled-set size) combination.
df_mean = df_all.groupby(['classifier', 'strategy_de', 'n_labeled'])['avg_train_time'].mean().reset_index()

print(f"\n\nEffizienteste Kombination (Top {top_n} schnellste):")
print("-" * 60)
df_mean_sorted = df_mean.sort_values('avg_train_time').head(top_n)
for row in df_mean_sorted.itertuples(index=False):
    print(f"{row.classifier:20s} | {row.strategy_de:20s} | {row.n_labeled:6.0f} Daten | {row.avg_train_time:.4f} Sek")

print(f"\n\nLangsamste Kombination (Top {top_n} langsamste):")
print("-" * 60)
df_mean_sorted_slow = df_mean.sort_values('avg_train_time', ascending=False).head(top_n)
for row in df_mean_sorted_slow.itertuples(index=False):
    print(f"{row.classifier:20s} | {row.strategy_de:20s} | {row.n_labeled:6.0f} Daten | {row.avg_train_time:.4f} Sek")

print("\n" + "=" * 80)
print("PDF 'active_learning_trainingszeiten.pdf' wurde erfolgreich erstellt!")
print("=" * 80)

ZUSAMMENFASSUNG DER TRAININGSZEITEN

Durchschnittliche Trainingszeiten nach Klassifikator (in Sekunden):
------------------------------------------------------------
                        mean     std     min      max
classifier                                           
CNN                   4.6101  2.1901  1.5018   7.8621
Logistic Regression   0.9130  0.3373  0.4304   1.4005
Naive Bayes           0.1298  0.0604  0.0439   0.2173
Random Forest         0.2362  0.0346  0.1810   0.2921
SVM                  11.2627  4.6274  5.5903  19.8647


Durchschnittliche Trainingszeiten nach Query-Strategie (in Sekunden):
------------------------------------------------------------
                       mean     std     min      max
strategy_de                                         
Entropie-Auswahl     3.3164  4.7994  0.0441  19.8647
Geringste Konfidenz  3.8148  5.3964  0.0439  19.8495
Margin-Auswahl       3.7870  5.3817  0.0441  19.8464
Zufallsauswahl       2.8032  3.4544  0.0445  12.1750


Ska