# Notebook 03: Klassische Modelle mit TF-IDF

Vergleich zweier linearer Baselines (Logistic Regression, Multinomial Naive Bayes) auf den in Notebook 02 erzeugten Textvarianten. Fokus: schneller, reproduzierbarer Benchmark + Interpretierbarkeit (Koeffizienten / Log-Wahrscheinlichkeiten).

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report
from scipy.sparse import hstack

# --- Global run configuration ---------------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2
WORD_VOCAB_MAX = 50_000
CHAR_VOCAB_MAX = 60_000
DATA_DIR = Path('../data/processed/')

# Supported text variants: 'raw', 'clean', 'raw_lemma', 'clean_lemma'
VARIANT = 'clean'  # change this for a single run

_VARIANT_NAMES = ('raw', 'clean', 'raw_lemma', 'clean_lemma')

# Variant name -> CSV file name inside DATA_DIR.
FILE_MAP = {name: f'cves_processed_text_{name}.csv' for name in _VARIANT_NAMES}

# Variant name -> explicit text column; None means "detect automatically".
TEXT_COL_MAP = dict.fromkeys(_VARIANT_NAMES)

def detect_text_column(df: pd.DataFrame) -> str:
    """Pick the column of ``df`` that most likely holds the description text.

    Columns whose name starts with ``description_`` are preferred; if there
    are none, every column whose name contains ``description`` is considered.
    Among the candidates the one with the largest mean string length wins
    (the first candidate in column order on ties).

    Missing values count as length 0. Casting them with ``astype(str)`` alone
    turns them into literals such as ``'nan'``/``'None'``, which would let a
    mostly empty column outrank a populated one.

    Args:
        df: Frame to inspect. Non-string column labels are ignored.

    Returns:
        The name of the selected column.

    Raises:
        ValueError: If no column name contains ``description``.
    """
    # Only string labels can be matched by name; this also avoids an
    # AttributeError on integer column labels.
    names = [c for c in df.columns if isinstance(c, str)]
    candidates = [c for c in names if c.startswith('description_')]
    if not candidates:
        candidates = [c for c in names if 'description' in c]
    if not candidates:
        raise ValueError(f"Keine Beschreibungsspalte gefunden. Spalten: {df.columns.tolist()}")

    def mean_text_length(column: str) -> float:
        values = df[column]
        lengths = values.astype(str).str.len()
        # Zero out the entries that were missing before the string cast.
        return lengths.where(values.notna(), 0).mean()

    return max(candidates, key=mean_text_length)

# Resolve the CSV of the selected variant and fail early with a clear message.
# Explicit raises replace `assert`, which is stripped when Python runs with -O.
if VARIANT not in FILE_MAP:
    raise KeyError(f"Unknown VARIANT {VARIANT!r}; expected one of {sorted(FILE_MAP)}")
variant_file = DATA_DIR / FILE_MAP[VARIANT]
if not variant_file.exists():
    raise FileNotFoundError(f"Variant file missing: {variant_file}")
print({'variant': VARIANT, 'file': str(variant_file)})

## 1. Konfiguration & Variantenauswahl

Legt globale Parameter (Random State, Vokabulargrenzen) fest und wählt eine konkrete Textvariante (`VARIANT`). Unterstützt: raw, clean, raw_lemma, clean_lemma.

In [None]:
# Load the selected variant and verify the columns this cell relies on.
df = pd.read_csv(variant_file)
required_base = {'cve_id', 'severity', 'severity_id'}
missing_base = required_base - set(df.columns)
if missing_base:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    raise ValueError(f'Missing columns: {missing_base}')

# Identify the text column: an explicit TEXT_COL_MAP entry wins, otherwise auto-detect.
mapped = TEXT_COL_MAP.get(VARIANT)
if mapped is not None and mapped in df.columns:
    text_col = mapped
else:
    text_col = detect_text_column(df)
print('Verwendete Textspalte:', text_col)

print('Shape (original):', df.shape)
print('Klassenverteilung (original):', df['severity_id'].value_counts().to_dict())

# Missing descriptions become empty strings. A bare astype(str) would turn
# them into the literal text "nan", which TF-IDF would treat as a real token.
X = df[text_col].fillna('').astype(str).values
y = df['severity_id'].values

# Stratified hold-out split so both parts keep the class proportions of y.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)
print({'train': len(X_train_text), 'test': len(X_test_text), 'text_col': text_col})

## 2. Daten laden & Train/Test Split

Liest die ausgewählte Variantendatei, erkennt dynamisch die Textspalte, filtert nichts weiter und erstellt einen stratifizierten Train/Test Split.

In [None]:
# Debug marker: echoes the text column chosen by the loading cell.
# Note that no actual check is performed here; the line only prints.
print('DEBUG detect_text_column output check passed for:', text_col)

In [None]:
# Settings shared by the word-level and the character-level TF-IDF.
_tfidf_shared = dict(min_df=2, lowercase=False, sublinear_tf=True, dtype=np.float32)

# Word uni-/bigrams, with accent stripping.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=WORD_VOCAB_MAX,
    strip_accents='unicode',
    **_tfidf_shared,
)
# Character 3- to 5-grams inside word boundaries.
char_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    max_features=CHAR_VOCAB_MAX,
    **_tfidf_shared,
)

# Vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely transformed.
X_train_word = word_vectorizer.fit_transform(X_train_text)
X_train_char = char_vectorizer.fit_transform(X_train_text)
X_test_word = word_vectorizer.transform(X_test_text)
X_test_char = char_vectorizer.transform(X_test_text)

# Word and char features side by side, stored as CSR.
X_train = hstack((X_train_word, X_train_char)).tocsr()
X_test = hstack((X_test_word, X_test_char)).tocsr()
print({'X_train': X_train.shape, 'X_test': X_test.shape})

## 3. TF-IDF Vektorisierung (Wort + Zeichen)

Zwei getrennte TF-IDF Repräsentationen (Word 1–2grams, Char 3–5grams) werden gebildet und dann horizontal zusammengeführt. Char n-grams fangen Subwort-/Muster (z.B. API, Datei-Endungen, Exploit-Signaturen) ein.

In [None]:
def _baseline_scores(y_true, y_pred):
    """Return ``(accuracy, macro-averaged F1)`` for one set of predictions."""
    return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred, average='macro')


# Logistic Regression
logreg = LogisticRegression(max_iter=500, solver='lbfgs', random_state=RANDOM_STATE)
log_preds = logreg.fit(X_train, y_train).predict(X_test)
log_acc, log_f1 = _baseline_scores(y_test, log_preds)
print('LogReg:', {'accuracy': round(log_acc, 4), 'macro_f1': round(log_f1, 4)})

# Multinomial Naive Bayes
nb = MultinomialNB()
nb_preds = nb.fit(X_train, y_train).predict(X_test)
nb_acc, nb_f1 = _baseline_scores(y_test, nb_preds)
print('MultinomialNB:', {'accuracy': round(nb_acc, 4), 'macro_f1': round(nb_f1, 4)})

# One summary row per model, in training order.
results = [
    {'model': 'LogisticRegression', 'accuracy': log_acc, 'macro_f1': log_f1},
    {'model': 'MultinomialNB', 'accuracy': nb_acc, 'macro_f1': nb_f1},
]
results_df = pd.DataFrame(results)
print('\nErgebnisübersicht:')
print(results_df)

## 4. Baseline-Training der Modelle

Trainiert Logistic Regression (multinomial) und Multinomial Naive Bayes auf der kombinierten TF-IDF Matrix. Bewertet mit Accuracy & Macro-F1.

In [None]:
# Rank the baselines by macro-F1 and print the per-class report of the winner.
ranked_results = results_df.sort_values(by='macro_f1', ascending=False)
best_row = ranked_results.iloc[0]
print('\nBestes Modell nach macro_f1:', best_row['model'])
best_preds = log_preds if best_row['model'] == 'LogisticRegression' else nb_preds
print(classification_report(y_test, best_preds))

### 4.1 Klassifikationsbericht (Bestes Modell)

Erzeugt einen detaillierten Report (Precision/Recall/F1) für das leistungsstärkste Baseline-Modell nach Macro-F1.

### 5.1 Support-Filter für seltene Klassen

Optional: Entfernt Klassen mit sehr geringem Test-Support (< `MIN_TEST_SUPPORT`) aus dem Bericht, um verzerrte Metriken / Warnungen zu reduzieren.

In [None]:
from time import time
from pathlib import Path as _Path

# Benchmark every available text variant with the same vectorizer and model
# settings as the baseline run and append the scores to a CSV log.
VARIANT_LIST = ['raw', 'clean', 'raw_lemma', 'clean_lemma']
comp_rows = []
results_path_variants = _Path('../results/classic_variant_comparison.csv')

for v in VARIANT_LIST:
    f = DATA_DIR / FILE_MAP[v]
    if not f.exists():
        print(f"[SKIP] Datei fehlt: {f}")
        continue
    d = pd.read_csv(f)
    if 'severity_id' not in d.columns:
        # Without labels the variant cannot be scored; skip instead of crashing the whole run.
        print(f"[SKIP] Spalte severity_id fehlt in {f.name}")
        continue

    # Text column: an explicit mapping wins, otherwise auto-detection.
    tcol = TEXT_COL_MAP.get(v)
    if not tcol or tcol not in d.columns:
        try:
            tcol = detect_text_column(d)
        except Exception as e:  # best effort: one broken variant must not abort the comparison
            print(f"[SKIP] Keine geeignete Textspalte in {f.name}: {e}")
            continue

    # Missing texts become '' (astype(str) alone would create the token "nan").
    Xv = d[tcol].fillna('').astype(str).values
    yv = d['severity_id'].values
    # Stratified split
    X_tr, X_te, y_tr, y_te = train_test_split(
        Xv, yv, test_size=TEST_SIZE, stratify=yv, random_state=RANDOM_STATE
    )

    # Vectorizers (same settings as the baseline run)
    w_vec = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=WORD_VOCAB_MAX,
                            strip_accents='unicode', lowercase=False, sublinear_tf=True, dtype=np.float32)
    c_vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=2, max_features=CHAR_VOCAB_MAX,
                            lowercase=False, sublinear_tf=True, dtype=np.float32)
    t0 = time()
    X_tr_w = w_vec.fit_transform(X_tr)
    X_te_w = w_vec.transform(X_te)
    X_tr_c = c_vec.fit_transform(X_tr)
    X_te_c = c_vec.transform(X_te)
    X_tr_all = hstack([X_tr_w, X_tr_c]).tocsr()
    X_te_all = hstack([X_te_w, X_te_c]).tocsr()
    vec_time = time() - t0

    # Models. The loop uses its own variable names so that the baseline cell's
    # globals (nb_preds, nb_acc, nb_f1) are not overwritten with the
    # predictions of whichever variant happens to run last.
    variant_models = (
        ('LogisticRegression', LogisticRegression(max_iter=500, solver='lbfgs', random_state=RANDOM_STATE)),
        ('MultinomialNB', MultinomialNB()),
    )
    for model_label, variant_model in variant_models:
        variant_model.fit(X_tr_all, y_tr)
        variant_preds = variant_model.predict(X_te_all)
        comp_rows.append({
            'variant': v,
            'model': model_label,
            'accuracy': accuracy_score(y_te, variant_preds),
            'macro_f1': f1_score(y_te, variant_preds, average='macro'),
            'vectorize_time_sec': vec_time,
        })

comp_df = pd.DataFrame(comp_rows)
if not comp_df.empty:
    display(comp_df.sort_values(['macro_f1', 'accuracy'], ascending=False))
    try:
        # The results directory may not exist yet when this cell runs before the grid cell.
        results_path_variants.parent.mkdir(parents=True, exist_ok=True)
        if results_path_variants.exists():
            prev = pd.read_csv(results_path_variants)
            comp_df = pd.concat([prev, comp_df], ignore_index=True)
        comp_df.to_csv(results_path_variants, index=False)
        print('Variant Vergleich gespeichert unter', results_path_variants)
    except Exception as e:  # saving is best effort; the table was already displayed
        print('Speichern fehlgeschlagen:', e)
else:
    print('Keine Varianten verarbeitet (Dateien/Spalten fehlen?).')

## 5. Variant-Vergleich (Optional)

Benchmark über alle vorhandenen Textvarianten (`raw`, `clean`, `raw_lemma`, `clean_lemma`) mit identischer Vektorisierung. Liefert schnelle Indikation, welche Vorverarbeitung den größten Nutzen bietet.

In [None]:
from collections import Counter
MIN_TEST_SUPPORT = 5  # adjust the threshold as needed

# Uses the already computed best_preds & y_test.
test_support = Counter(y_test)
# Labels whose number of test samples reaches the threshold.
keep_labels = {lab for lab, count in test_support.items() if count >= MIN_TEST_SUPPORT}
kept_sorted = sorted(keep_labels)

# Boolean mask over the test samples whose true label is kept.
mask = np.isin(y_test, kept_sorted)
filtered_true = y_test[mask]
if 'best_preds' in globals():
    filtered_pred = np.asarray(best_preds)[mask]
else:
    filtered_pred = np.array([])

print('Original Klassenanzahl:', len(test_support))
print('Gefiltert (>= support):', len(keep_labels))
print('Verworfene Klassen:', set(test_support.keys()) - keep_labels)

if len(filtered_true) and len(filtered_pred):
    # labels= restricts the report to the kept classes. Without it a dropped
    # class reappears (with support 0) whenever the model predicts it for one
    # of the remaining samples.
    print(classification_report(
        filtered_true, filtered_pred, labels=kept_sorted, zero_division=0, digits=4
    ))
else:
    print('Zu wenige verbleibende Klassen oder keine Vorhersagen nach Filter.')

## 6. Hyperparameter Grid & Logging

Kleiner kombinierter Grid über TF-IDF (word/char) Parameter und Modell-Hyperparameter (LogReg: C, NB: alpha). Ergebnisse werden in `../results/classic_models_baseline.csv` protokolliert (append; Zeilen früherer Läufe bleiben erhalten).

In [None]:
import time, json, os
from itertools import product
from pathlib import Path as _Path
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack

RESULTS_DIR = _Path('../results')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_PATH = RESULTS_DIR / 'classic_models_baseline.csv'

# Re-use the original split (X_train_text, X_test_text, y_train, y_test).
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Search space
word_ngrams_choices = [(1, 1), (1, 2)]
char_ngrams_choices = [(3, 5)]
min_df_choices = [1, 2]
word_max_feats_choices = [20000, 50000]
char_max_feats_choices = [30000]
logreg_C_choices = [0.5, 1.0, 2.0]
nb_alpha_choices = [0.5, 1.0]

rows = []
start_global = time.time()
run_id_base = int(start_global)


def _grid_result_row(model_name, model, features_train, features_test,
                     vec_settings, vec_seconds, *, C=None, alpha=None):
    """Fit ``model``, score it on the test features and return one log row.

    NOTE(review): the scores come from the held-out test split, so choosing
    the best grid row afterwards gives an optimistic estimate.
    """
    fit_started = time.time()
    model.fit(features_train, y_train_enc)
    fit_seconds = time.time() - fit_started
    predicted = model.predict(features_test)
    accuracy = accuracy_score(y_test_enc, predicted)
    macro = f1_score(y_test_enc, predicted, average='macro')
    return {
        'run_id': run_id_base,
        'timestamp': time.time(),
        'model': model_name,
        'variant': VARIANT,
        **vec_settings,
        'C': C,
        'alpha': alpha,
        'accuracy': accuracy,
        'macro_f1': macro,
        'vectorize_time_sec': vec_seconds,
        'train_time_sec': fit_seconds,
        'n_train': features_train.shape[0],
        'n_test': features_test.shape[0],
        'notes': '',
    }


vectorizer_grid = product(
    word_ngrams_choices,
    char_ngrams_choices,
    min_df_choices,
    word_max_feats_choices,
    char_max_feats_choices,
)
for w_ng, c_ng, min_df, w_max, c_max in vectorizer_grid:
    # Vectorize once per TF-IDF setting; every model variation reuses the matrices.
    word_vec = TfidfVectorizer(ngram_range=w_ng, min_df=min_df, max_features=w_max,
                               strip_accents='unicode', lowercase=False, sublinear_tf=True, dtype=np.float32)
    char_vec = TfidfVectorizer(analyzer='char_wb', ngram_range=c_ng, min_df=min_df, max_features=c_max,
                               lowercase=False, sublinear_tf=True, dtype=np.float32)
    t0 = time.time()
    X_tr_w = word_vec.fit_transform(X_train_text)
    X_te_w = word_vec.transform(X_test_text)
    X_tr_c = char_vec.fit_transform(X_train_text)
    X_te_c = char_vec.transform(X_test_text)
    X_tr = hstack([X_tr_w, X_tr_c]).tocsr()
    X_te = hstack([X_te_w, X_te_c]).tocsr()
    vec_time = time.time() - t0

    # Columns describing the vectorizer setting, in log-column order.
    vec_settings = {
        'word_ngrams': str(w_ng),
        'char_ngrams': str(c_ng),
        'min_df': min_df,
        'word_max_features': w_max,
        'char_max_features': c_max,
    }

    # Logistic Regression variations
    for C in logreg_C_choices:
        logreg = LogisticRegression(max_iter=400, solver='lbfgs', C=C, n_jobs=None, random_state=RANDOM_STATE)
        rows.append(_grid_result_row('LogisticRegression', logreg, X_tr, X_te, vec_settings, vec_time, C=C))

    # Naive Bayes variations
    for alpha in nb_alpha_choices:
        nb = MultinomialNB(alpha=alpha)
        rows.append(_grid_result_row('MultinomialNB', nb, X_tr, X_te, vec_settings, vec_time, alpha=alpha))

results_grid_df = pd.DataFrame(rows)
print('Grid Rows:', results_grid_df.shape)
# Append to the existing log: earlier rows first, then this run.
if RESULTS_PATH.exists():
    prev = pd.read_csv(RESULTS_PATH)
    results_grid_df = pd.concat([prev, results_grid_df], ignore_index=True)
results_grid_df.to_csv(RESULTS_PATH, index=False)
results_grid_df.sort_values('macro_f1', ascending=False).head()

### 6.1 Feature Importance (LogReg)

Extrahiert für das beste Logistic-Regression-Modell die wichtigsten (positiven & negativen) Tokens je Klasse anhand der Koeffizienten.

In [None]:
# Refit the best LogReg configuration found in the grid log and export, per
# class, the features with the largest positive and negative coefficients.
# NOTE(review): results_grid_df can also contain rows appended from earlier
# runs (the grid cell concatenates the existing CSV), while the refit below
# always uses the current X_train_text - confirm that this mix is intended.
import ast
import json
from math import isfinite


def summarize_logreg_coefficients(coefs, feature_names, class_labels, top_k=15):
    """Return the strongest positive/negative features per class.

    Args:
        coefs: Coefficient matrix of shape [n_classes, n_features]. A single
            row together with two class labels is treated as the binary case:
            the row belongs to the second label and its negation is used for
            the first one.
        feature_names: Feature names aligned with the columns of ``coefs``.
        class_labels: Labels aligned with the rows of ``coefs``; each must be
            convertible with ``int``.
        top_k: Number of features to keep per direction.

    Returns:
        ``{int(label): {'top_positive': [(name, weight), ...],
        'top_negative': [(name, weight), ...]}}`` with the positives ordered
        from largest to smallest and the negatives from smallest upwards.
    """
    weights_by_class = np.asarray(coefs)
    if weights_by_class.shape[0] == 1 and len(class_labels) == 2:
        # Without this, indexing row 1 raises IndexError for two-class data.
        weights_by_class = np.vstack([-weights_by_class[0], weights_by_class[0]])

    summary = {}
    for class_index, class_label in enumerate(class_labels):
        weights = weights_by_class[class_index]
        order = np.argsort(weights)
        top_pos_idx = order[-top_k:][::-1]
        top_neg_idx = order[:top_k]
        summary[int(class_label)] = {
            'top_positive': [(feature_names[i], float(weights[i])) for i in top_pos_idx],
            'top_negative': [(feature_names[i], float(weights[i])) for i in top_neg_idx],
        }
    return summary


if 'results_grid_df' in globals():
    # Determine the best LogReg row by macro-F1.
    lr_rows = results_grid_df[results_grid_df['model'] == 'LogisticRegression']
    best_lr_row = lr_rows.sort_values('macro_f1', ascending=False).head(1)
    if not best_lr_row.empty:
        r = best_lr_row.iloc[0]
        print('Best LogReg Params:', r.to_dict())
        # The n-gram ranges are stored as strings such as "(1, 2)".
        # literal_eval only accepts Python literals, so a manipulated CSV cell
        # cannot execute code the way eval() would.
        w_ng = tuple(ast.literal_eval(r['word_ngrams']))
        c_ng = tuple(ast.literal_eval(r['char_ngrams']))
        min_df = int(r['min_df'])
        w_max = int(r['word_max_features']) if isfinite(r['word_max_features']) else None
        c_max = int(r['char_max_features']) if isfinite(r['char_max_features']) else None
        # NaN is truthy, so it has to be tested explicitly before the default applies.
        C = float(r['C']) if pd.notna(r['C']) and r['C'] else 1.0

        # Refit for a transparent pipeline.
        word_vec_best = TfidfVectorizer(ngram_range=w_ng, min_df=min_df, max_features=w_max, strip_accents='unicode', lowercase=False, sublinear_tf=True, dtype=np.float32)
        char_vec_best = TfidfVectorizer(analyzer='char_wb', ngram_range=c_ng, min_df=min_df, max_features=c_max, lowercase=False, sublinear_tf=True, dtype=np.float32)
        X_tr_w_best = word_vec_best.fit_transform(X_train_text)
        X_te_w_best = word_vec_best.transform(X_test_text)
        X_tr_c_best = char_vec_best.fit_transform(X_train_text)
        X_te_c_best = char_vec_best.transform(X_test_text)
        X_tr_best = hstack([X_tr_w_best, X_tr_c_best]).tocsr()
        X_te_best = hstack([X_te_w_best, X_te_c_best]).tocsr()

        logreg_best = LogisticRegression(max_iter=400, solver='lbfgs', C=C, random_state=RANDOM_STATE)
        logreg_best.fit(X_tr_best, y_train_enc)

        # Same column order as the hstack above: word features first, then char features.
        feature_names = list(word_vec_best.get_feature_names_out()) + list(char_vec_best.get_feature_names_out())
        coefs = logreg_best.coef_
        top_k = 15
        class_importance = summarize_logreg_coefficients(coefs, feature_names, le.classes_, top_k)

        imp_path = _Path('../results/logreg_feature_importance.json')
        imp_path.parent.mkdir(parents=True, exist_ok=True)
        with open(imp_path, 'w', encoding='utf-8') as f:
            json.dump(class_importance, f, indent=2)
        print('Feature Importance gespeichert unter', imp_path)
    else:
        print('Kein LogReg Ergebnis im Grid gefunden.')
else:
    print('results_grid_df nicht definiert - Grid vorher ausführen.')

## 7. Visualisierungen & Analyse

Grafische Auswertung der Grid-Ergebnisse und Modellinterpretationen:
- 7.1 Modellvergleich
- 7.2 Hyperparameter-Einflüsse
- 7.3 LogReg Feature Importance (Balken)
- 7.4 Naive Bayes Top Tokens (log P(token|class))

### 7.1 Modellvergleich (Best Scores)

Zeigt für jedes Modell das jeweils beste Ergebnis (Accuracy & Macro-F1) aus dem Grid / Baseline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path as _Path

sns.set_theme(style='whitegrid')

# Fall back to the persisted grid log when the grid cell was not run in this session.
if 'results_grid_df' not in globals():
    path = _Path('../results/classic_models_baseline.csv')
    if path.exists():
        results_grid_df = pd.read_csv(path)
    else:
        print('Keine Grid-Ergebnisse gefunden. Abschnitt übersprungen.')

if 'results_grid_df' in globals():
    # Best row per model by macro-F1. drop_duplicates keeps whole rows, whereas
    # groupby(...).first() returns the first non-null value of each column and
    # can therefore combine values from different runs in one row.
    best_per_model = (
        results_grid_df.dropna(subset=['model'])
        .sort_values('macro_f1', ascending=False)
        .drop_duplicates(subset='model', keep='first')
        .sort_values('model')  # same model order as a groupby result
        .reset_index(drop=True)
    )
    display(best_per_model[['model', 'accuracy', 'macro_f1']])

    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    # hue=model plus legend=False keeps the per-bar palette without a seaborn deprecation warning.
    sns.barplot(data=best_per_model, x='model', y='accuracy', hue='model', ax=axes[0], palette='Blues_d', legend=False)
    axes[0].set_title('Accuracy (best pro Modell)')
    sns.barplot(data=best_per_model, x='model', y='macro_f1', hue='model', ax=axes[1], palette='Greens_d', legend=False)
    axes[1].set_title('Macro-F1 (best pro Modell)')
    # Write the value above each bar.
    for ax in axes:
        for p in ax.patches:
            ax.annotate(f"{p.get_height():.3f}", (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='bottom', fontsize=9)
    plt.tight_layout()
else:
    print('results_grid_df nicht verfügbar.')

### 7.2 Hyperparameter-Einflüsse

Untersucht den Einfluss einzelner Einstellungen (C, alpha, min_df, word ngram_range) auf die Macro-F1 Performance.

In [None]:
# Four panels: C and alpha against macro-F1, plus the best macro-F1 per
# (model, min_df) and per (model, word n-gram range).
if 'results_grid_df' in globals():
    df_hp = results_grid_df.copy()
    # n-gram ranges as plain strings so they can be used as categories.
    df_hp['word_ngrams'] = df_hp['word_ngrams'].astype(str)
    df_hp['char_ngrams'] = df_hp['char_ngrams'].astype(str)

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    # C vs Macro-F1 (LogReg)
    lr_df = df_hp[df_hp['model'] == 'LogisticRegression'].dropna(subset=['C'])
    if not lr_df.empty:
        sns.lineplot(data=lr_df, x='C', y='macro_f1', marker='o', ax=axes[0, 0])
        axes[0, 0].set_title('LogReg: C vs Macro-F1')
    else:
        axes[0, 0].text(0.5, 0.5, 'Keine LogReg Daten', ha='center')

    # alpha vs Macro-F1 (NB)
    nb_df = df_hp[df_hp['model'] == 'MultinomialNB'].dropna(subset=['alpha'])
    if not nb_df.empty:
        sns.lineplot(data=nb_df, x='alpha', y='macro_f1', marker='o', color='orange', ax=axes[0, 1])
        axes[0, 1].set_title('NaiveBayes: alpha vs Macro-F1')
    else:
        axes[0, 1].text(0.5, 0.5, 'Keine NB Daten', ha='center')

    # Best row per group. drop_duplicates keeps whole rows; groupby(...).first()
    # would take the first non-null value per column and could mix rows.
    ranked_hp = df_hp.sort_values('macro_f1', ascending=False)

    # min_df effect (best per model & min_df)
    agg_min_df = (
        ranked_hp.dropna(subset=['model', 'min_df'])
        .drop_duplicates(subset=['model', 'min_df'], keep='first')
        .sort_values(['model', 'min_df'])
        .reset_index(drop=True)
    )
    sns.barplot(data=agg_min_df, x='min_df', y='macro_f1', hue='model', ax=axes[1, 0])
    axes[1, 0].set_title('min_df Effekt (best per model & min_df)')

    # n-gram range effect (word)
    agg_ng = (
        ranked_hp.dropna(subset=['model', 'word_ngrams'])
        .drop_duplicates(subset=['model', 'word_ngrams'], keep='first')
        .sort_values(['model', 'word_ngrams'])
        .reset_index(drop=True)
    )
    sns.barplot(data=agg_ng, x='word_ngrams', y='macro_f1', hue='model', ax=axes[1, 1])
    axes[1, 1].set_title('Word ngram_range Effekt')
    plt.tight_layout()
else:
    print('results_grid_df nicht verfügbar für Hyperparameter-Plots.')

### 7.3 LogReg Feature Importance Visualisierung

Barplots der Top positiven und negativen Tokens pro Klasse basierend auf den Koeffizienten des besten Logistic-Regression Modells.

In [None]:
import json
from math import isfinite

# Plot the exported LogReg coefficients: one row of axes per class,
# positive tokens on the left and negative tokens on the right.
imp_json_path = _Path('../results/logreg_feature_importance.json')
if imp_json_path.exists():
    with open(imp_json_path, encoding='utf-8') as f:
        imp_data = json.load(f)
    n_classes = len(imp_data)
    if n_classes:
        # squeeze=False always yields a 2-D axes array, also for a single class.
        fig, axes = plt.subplots(n_classes, 2, figsize=(10, 3 * n_classes), squeeze=False)
        # JSON keys are strings; sort them numerically by class id.
        for idx, (cls, val) in enumerate(sorted(imp_data.items(), key=lambda x: int(x[0]))):
            pos = val['top_positive']
            neg = val['top_negative']
            pos_df = pd.DataFrame(pos, columns=['token', 'weight']).sort_values('weight')
            neg_df = pd.DataFrame(neg, columns=['token', 'weight']).sort_values('weight')
            sns.barplot(data=pos_df, x='weight', y='token', hue='token', dodge=False, legend=False, ax=axes[idx, 0])
            axes[idx, 0].set_title(f'Class {cls} Top Positive')
            sns.barplot(data=neg_df, x='weight', y='token', hue='token', dodge=False, legend=False, ax=axes[idx, 1])
            axes[idx, 1].set_title(f'Class {cls} Top Negative')
        plt.tight_layout()
    else:
        # plt.subplots(0, 2) would raise, so report the empty export instead.
        print('logreg_feature_importance.json enthält keine Klassen.')
else:
    # The JSON is written by the feature-importance cell of section 6.1.
    print('Keine logreg_feature_importance.json gefunden. Vorher Section 6.1 ausführen.')

### 7.4 Naive Bayes Top Tokens

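Refit des besten NB Modells und Darstellung der höchsten log P(token|class) Werte je Klasse – liefert eine probabilistische Sicht auf die Tokens. Hinweis: Hohe Werte kennzeichnen zunächst nur Tokens, die innerhalb einer Klasse häufig sind; diskriminativ ist ein Token erst, wenn sich sein Wert zwischen den Klassen deutlich unterscheidet.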

In [None]:
# Refit the best NB configuration from the grid log (analogous to the LogReg
# importance cell), plot the tokens with the highest log P(token|class) per
# class and export them as JSON.
import ast
import json
from math import isfinite


def top_nb_tokens(log_prob, feature_names, class_labels, top_k=15):
    """Return, per class, the ``top_k`` features with the highest log-probability.

    Args:
        log_prob: Matrix of shape [n_classes, n_features].
        feature_names: Feature names aligned with the columns of ``log_prob``.
        class_labels: Labels aligned with the rows of ``log_prob``; each must
            be convertible with ``int``.
        top_k: Number of features to keep per class.

    Returns:
        ``{int(label): [(name, log_prob), ...]}`` ordered from the highest
        value downwards.
    """
    log_prob = np.asarray(log_prob)
    export = {}
    for idx, class_label in enumerate(class_labels):
        weights = log_prob[idx]
        top_idx = np.argsort(weights)[-top_k:][::-1]
        export[int(class_label)] = [(feature_names[i], float(weights[i])) for i in top_idx]
    return export


if 'results_grid_df' in globals():
    nb_rows = results_grid_df[results_grid_df['model'] == 'MultinomialNB']
    best_nb_row = nb_rows.sort_values('macro_f1', ascending=False).head(1)
    if not best_nb_row.empty:
        r = best_nb_row.iloc[0]
        print('Best NB Params:', r.to_dict())
        # The n-gram ranges are stored as strings such as "(1, 2)".
        # literal_eval only accepts Python literals, so a manipulated CSV cell
        # cannot execute code the way eval() would.
        w_ng = tuple(ast.literal_eval(r['word_ngrams']))
        c_ng = tuple(ast.literal_eval(r['char_ngrams']))
        min_df = int(r['min_df'])
        w_max = int(r['word_max_features']) if isfinite(r['word_max_features']) else None
        c_max = int(r['char_max_features']) if isfinite(r['char_max_features']) else None
        # NaN is truthy, so it has to be tested explicitly before the default applies.
        alpha = float(r['alpha']) if pd.notna(r['alpha']) and r['alpha'] else 1.0

        word_vec_nb = TfidfVectorizer(ngram_range=w_ng, min_df=min_df, max_features=w_max, strip_accents='unicode', lowercase=False, sublinear_tf=True, dtype=np.float32)
        char_vec_nb = TfidfVectorizer(analyzer='char_wb', ngram_range=c_ng, min_df=min_df, max_features=c_max, lowercase=False, sublinear_tf=True, dtype=np.float32)
        X_tr_w_nb = word_vec_nb.fit_transform(X_train_text)
        X_te_w_nb = word_vec_nb.transform(X_test_text)
        X_tr_c_nb = char_vec_nb.fit_transform(X_train_text)
        X_te_c_nb = char_vec_nb.transform(X_test_text)
        X_tr_nb = hstack([X_tr_w_nb, X_tr_c_nb]).tocsr()
        X_te_nb = hstack([X_te_w_nb, X_te_c_nb]).tocsr()

        nb_best = MultinomialNB(alpha=alpha)
        nb_best.fit(X_tr_nb, y_train_enc)

        # Same column order as the hstack above: word features first, then char features.
        feature_names_nb = list(word_vec_nb.get_feature_names_out()) + list(char_vec_nb.get_feature_names_out())
        log_prob = nb_best.feature_log_prob_
        top_k = 15
        nb_tokens_export = top_nb_tokens(log_prob, feature_names_nb, le.classes_, top_k)

        n_classes = len(le.classes_)
        # squeeze=False always yields a 2-D axes array, also for a single class.
        fig, axes = plt.subplots(n_classes, 1, figsize=(10, 3 * n_classes), squeeze=False)
        for idx, class_label in enumerate(le.classes_):
            top_tokens = nb_tokens_export[int(class_label)]
            plot_df = pd.DataFrame(top_tokens, columns=['token', 'log_prob']).sort_values('log_prob')
            sns.barplot(data=plot_df, x='log_prob', y='token', hue='token', dodge=False, legend=False, ax=axes[idx, 0])
            axes[idx, 0].set_title(f'NB log P(token|class={class_label}) Top {top_k}')
        plt.tight_layout()

        out_path = _Path('../results/nb_feature_logprob.json')
        out_path.parent.mkdir(parents=True, exist_ok=True)
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(nb_tokens_export, f, indent=2)
        print('NB Token Log-Prob exportiert nach', out_path)
    else:
        print('Kein NB Modell im Grid gefunden.')
else:
    print('results_grid_df nicht verfügbar – Grid zuerst ausführen.')