In [4]:
!pip install -q transformers
!pip install -q evaluate
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/3.2 MB[0m [31m27.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m44.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.1


In [2]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/happy522/Argument-Mining/refs/heads/main/AM_gelabeled-2.csv")
df
keep_classes = [
    "Prämisse",
    "Behauptung",
    "Schlussfolgerung",
    "Nicht-argumentativer Text"
]

# Remove leading "/" in case your labels look like "/Zurückweisung"
df["Label"] = df["Label"].str.replace("/", "", regex=False).str.strip()

# Filter DataFrame
filtered_df = df[df["Label"].isin(keep_classes)]

# Classes you want to keep


# Reset index if needed
df = filtered_df.reset_index(drop=True)

#print(df)
#df.to_csv("new_arg_mining.csv")

In [5]:
# Nur notwendige Imports
from transformers import pipeline
from rapidfuzz import process, fuzz
import evaluate
from sklearn.metrics import classification_report
import pandas as pd
import logging
from getpass import getpass
from huggingface_hub import InferenceApi
from tqdm import tqdm

DEVICE = 0   # GPU 0 oder -1 für CPU
HF_TOKEN = getpass("HuggingFace Token:")


CLASSES = [
    "Prämisse",
    "Behauptung",
    "Schlussfolgerung",
    "Nicht-argumentativer Text"
]

# Prompt-Builder (1 Beispiel pro Klasse)
def build_few_shot_prompt(df, classes, text, random_state=42):
    examples = df.groupby("Label", group_keys=False).sample(n=1, random_state=random_state)
    lines = []
    lines.append("Du bist ein Assistent, der kurze deutsche Textabschnitte klassifiziert.")
    lines.append("Mögliche Labels: " + " | ".join(classes))
    lines.append("")
    lines.append("Beispiele:")
    for i, row in enumerate(examples.itertuples(index=False), 1):
        t_clean = str(getattr(row, "Komponente", getattr(row, "Text", ""))).replace("\n", " ").strip()
        lines.append(f"{i}. Text: „{t_clean}“\n   → Label: {row.Label}")
    lines.append("")
    lines.append("Jetzt klassifiziere den folgenden Text. Gib **nur** das Label zurück.")
    lines.append(f"Text: „{text.replace(chr(10),' ')}“")
    return "\n".join(lines)

# fuzzy map helper
def fuzzy_map(pred, classes=CLASSES):
    match, score, _ = process.extractOne(pred, classes, scorer=fuzz.WRatio)
    return match

# Evaluation helpers
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def evaluate_model_seq2seq(model_name, df, device=DEVICE, hf_token=HF_TOKEN):
    gen = pipeline("text2text-generation", model=model_name, device=device)
    preds = []
    for text in tqdm(df["Komponente"].astype(str), total=len(df)):
        prompt = build_few_shot_prompt(df, CLASSES, text)
        #print(prompt)
        out = gen(prompt, max_new_tokens=64, num_return_sequences=1)[0]["generated_text"]
        pred = fuzzy_map(out)
        #print("Prediction",pred)
        preds.append(pred)

    # metrics (map to ints)
    label2id = {l:i for i,l in enumerate(CLASSES)}
    y_true = df["Label"].map(label2id).astype(int).tolist()
    y_pred = [label2id.get(p, label2id["Nicht-argumentativer Text"]) for p in preds]

    acc = metric_acc.compute(predictions=y_pred, references=y_true)
    f1 = metric_f1.compute(predictions=y_pred, references=y_true, average="macro")

    print(f"Model: {model_name}  Acc={acc['accuracy']:.4f}  MacroF1={f1['f1']:.4f}")
    print(classification_report(y_true, y_pred, target_names=CLASSES, zero_division=0))
    return {"model": model_name, "accuracy": float(acc["accuracy"]), "macro_f1": float(f1["f1"])}


# Beispielaufruf:
"""
def safe_sample_group(g, n):
 #Sample up to n rows per class (safe if fewer available).
    return g.sample(min(len(g), n), random_state=42)
df = df.groupby("Label", group_keys=False).apply(lambda g: safe_sample_group(g, n=10)).reset_index(drop=True)

"""


HuggingFace Token:··········


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

'\ndef safe_sample_group(g, n):\n #Sample up to n rows per class (safe if fewer available).\n    return g.sample(min(len(g), n), random_state=42)\ndf = df.groupby("Label", group_keys=False).apply(lambda g: safe_sample_group(g, n=10)).reset_index(drop=True)\n\n'

In [6]:
res = evaluate_model_seq2seq("Shahm/t5-small-german", df)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
 18%|█▊        | 192/1073 [11:33<51:53,  3.53s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1073/1073 [1:06:33<00:00,  3.72s/it]

Model: Shahm/t5-small-german  Acc=0.4902  MacroF1=0.1936
                           precision    recall  f1-score   support

                 Prämisse       0.13      0.10      0.11       193
               Behauptung       0.00      0.00      0.00       204
         Schlussfolgerung       0.00      0.00      0.00        18
Nicht-argumentativer Text       0.58      0.77      0.66       658

                 accuracy                           0.49      1073
                macro avg       0.18      0.22      0.19      1073
             weighted avg       0.38      0.49      0.43      1073






In [7]:
res = evaluate_model_seq2seq("google/flan-t5-base", df)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu
 18%|█▊        | 192/1073 [27:30<1:52:08,  7.64s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1073/1073 [3:09:21<00:00, 10.59s/it]

Model: google/flan-t5-base  Acc=0.2833  MacroF1=0.1953
                           precision    recall  f1-score   support

                 Prämisse       0.07      0.05      0.06       193
               Behauptung       0.28      0.25      0.27       204
         Schlussfolgerung       0.01      0.11      0.01        18
Nicht-argumentativer Text       0.57      0.37      0.45       658

                 accuracy                           0.28      1073
                macro avg       0.23      0.19      0.20      1073
             weighted avg       0.42      0.28      0.33      1073






In [8]:
res = evaluate_model_seq2seq("google/mt5-small", df)

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu

  0%|          | 0/1073 [00:00<?, ?it/s][A
  0%|          | 1/1073 [00:02<41:41,  2.33s/it][A
  0%|          | 2/1073 [00:03<33:18,  1.87s/it][A
  0%|          | 3/1073 [00:04<26:05,  1.46s/it][A
  0%|          | 4/1073 [00:05<22:36,  1.27s/it][A
  0%|          | 5/1073 [00:06<20:30,  1.15s/it][A
  1%|          | 6/1073 [00:07<19:32,  1.10s/it][A
  1%|          | 7/1073 [00:08<18:53,  1.06s/it][A
  1%|          | 8/1073 [00:09<18:30,  1.04s/it][A
  1%|          | 9/1073 [00:10<18:12,  1.03s/it][A
  1

Model: google/mt5-small  Acc=0.5843  MacroF1=0.1869
                           precision    recall  f1-score   support

                 Prämisse       0.08      0.01      0.01       193
               Behauptung       0.00      0.00      0.00       204
         Schlussfolgerung       0.00      0.00      0.00        18
Nicht-argumentativer Text       0.60      0.95      0.74       658

                 accuracy                           0.58      1073
                macro avg       0.17      0.24      0.19      1073
             weighted avg       0.38      0.58      0.45      1073

