# SemEval 2024 Task 2: Safe Biomedical Natural Language Inference for Clinical Trials

----------------

In [43]:
# Könyvtárak telepítése (ha még nem telepítetted)
# !pip install --quiet transformers datasets torch scikit-learn pandas tqdm
# !pip install -U transformers

# Adatok klónozása és kicsomagolása (egyszeri lépés)
# !git clone https://github.com/ai-systems/Task-2-SemEval-2024.git

# !unzip -q /content/Task-2-SemEval-2024/training_data.zip -d /content/Task-2-SemEval-2024/

In [44]:
import transformers
print(transformers.__version__)

4.52.4


In [45]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

import torch
import accelerate
import transformers

print(torch.__version__)
print(accelerate.__version__)
print(transformers.__version__)

CUDA available: True
GPU: NVIDIA GeForce RTX 2070
2.7.1+cu118
1.7.0
4.52.4


In [46]:
import os
import json
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer
import torch

# DATA_DIR = "/content/Task-2-SemEval-2024"
DATA_DIR = "Task-2-SemEval-2024"
TRAIN_PATH = f"{DATA_DIR}/train.json"
DEV_PATH = f"{DATA_DIR}/dev.json"
TEST_PATH = f"{DATA_DIR}/test.json"
CTRS_DIR = f"{DATA_DIR}/CT json"  # CTR fájlok könyvtára
# DEV_PATH = "/content/Task-2-SemEval-2024/dev.json"
DEV_PATH = "Task-2-SemEval-2024/dev.json"

# Modell neve (cserélhető pl. 'emilyalsentzer/Bio_ClinicalBERT')
MODEL_NAME = "bert-base-uncased"
# MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
# MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
# MODEL_NAME = "microsoft/deberta-v3-base"


In [47]:
def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Betöltés
train_dict = load_json(TRAIN_PATH)
dev_dict = load_json(DEV_PATH)
test_dict = load_json(TEST_PATH)

# Átalakítás lista formátumra, ID-k hozzáadása
def dict_to_list(data_dict):
    data_list = []
    for ex_id, ex in data_dict.items():
        ex["id"] = ex_id
        data_list.append(ex)
    return data_list

train_data = dict_to_list(train_dict)
dev_data = dict_to_list(dev_dict)
test_data = dict_to_list(test_dict)

In [48]:
def process_examples(data_list, ctr_dir):
    processed = []
    for ex in tqdm(data_list):
        ctr_id = ex["Primary_id"]
        section = ex["Section_id"]
        hypothesis = ex["Statement"]
        label = ex.get("Label")  # tesztnél nem biztos, hogy van

        ctr_file = os.path.join(ctr_dir, f"{ctr_id}.json")
        if not os.path.isfile(ctr_file):
            print(f"WARNING: CTR file not found: {ctr_file}")
            continue

        with open(ctr_file, "r", encoding="utf-8") as f:
            ctr = json.load(f)

        # A szekció neve a CTR-ben pontos egyezést igényel
        if section not in ctr:
            print(f"WARNING: section '{section}' not found in CTR {ctr_id}")
            continue

        section_content = ctr[section]
        premise_text = "\n".join(section_content) if isinstance(section_content, list) else str(section_content)

        input_text = f"[{section}] {premise_text.strip()}\nHypothesis: {hypothesis.strip()}"

        processed.append({
            "id": ex["id"],
            "ctr_id": ctr_id,
            "section": section,
            "hypothesis": hypothesis,
            "label": label,
            "input_text": input_text
        })
    return pd.DataFrame(processed)

# Feldolgozás
df_train = process_examples(train_data, CTRS_DIR)
df_dev = process_examples(dev_data, CTRS_DIR)
df_test = process_examples(test_data, CTRS_DIR)

100%|██████████| 1700/1700 [00:00<00:00, 3471.65it/s]
100%|██████████| 1700/1700 [00:00<00:00, 3471.65it/s]
100%|██████████| 200/200 [00:00<00:00, 3184.51it/s]
100%|██████████| 200/200 [00:00<00:00, 3184.51it/s]
 82%|████████▏ | 4531/5500 [00:01<00:00, 2478.54it/s]



KeyboardInterrupt: 

In [None]:
# Mentés CSV-be, ha szükséges
df_train.to_csv("processed_train.csv", index=False)
df_dev.to_csv("processed_dev.csv", index=False)
df_test.to_csv("processed_test.csv", index=False)

# HuggingFace Dataset objektumok létrehozása
dataset_train = Dataset.from_pandas(df_train)
dataset_dev = Dataset.from_pandas(df_dev)
dataset_test = Dataset.from_pandas(df_test)

In [None]:
# Tokenizer és modell betöltése
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)  # 3 label: Entailment, Neutral, Contradiction

# GPU beállítása (ha elérhető)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Címkék számokká konvertálása
label2id = {"Entailment": 0, "Neutral": 1, "Contradiction": 2}
id2label = {v: k for k, v in label2id.items()}

def encode_labels(example):
    example["label"] = label2id[example["label"]]
    return example

dataset_train = dataset_train.map(encode_labels)

Map: 100%|██████████| 1700/1700 [00:00<00:00, 7413.04 examples/s]


In [None]:
def tokenize_function(example):
    return tokenizer(
        example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_train = dataset_train.map(tokenize_function, batched=True)

Map: 100%|██████████| 1700/1700 [00:01<00:00, 1699.85 examples/s]


In [None]:
tokenized_train = tokenized_train.train_test_split(test_size=0.1)
train_dataset = tokenized_train["train"]
eval_dataset = tokenized_train["test"]

In [None]:
# dev_dict betöltése
with open(DEV_PATH, "r", encoding="utf-8") as f:
    dev_dict = json.load(f)

# Átalakítás listává
dev_data = []
for ex_id, ex in dev_dict.items():
    ex["id"] = ex_id
    dev_data.append(ex)

In [None]:
# dev_dict = json.load(...) és process_examples(...) a dev-re is
df_dev = process_examples(dev_data, CTRS_DIR)
dataset_dev = Dataset.from_pandas(df_dev)
dataset_dev = dataset_dev.map(encode_labels)
tokenized_dev = dataset_dev.map(tokenize_function, batched=True)

train_dataset = tokenized_train
eval_dataset = tokenized_dev

100%|██████████| 200/200 [00:00<00:00, 3006.18it/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 8521.54 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 1316.06 examples/s]


In [None]:
training_args = TrainingArguments(
    output_dir="/content/bert-semeval-results",
    eval_strategy="epoch",  # ← ***
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="/content/logs",
    logging_steps=50,
    report_to=[]  # kikapcsolja az összes külső riportáló rendszert (beleértve a wandb-t),
)

In [None]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

In [None]:
train_dataset = tokenized_train["train"]
# eval_dataset = tokenized_train["test"]
eval_dataset = tokenized_dev

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.7231,0.707118,0.5,0.333333
2,0.7071,0.699011,0.5,0.333333
3,0.7073,0.693676,0.53,0.524243


TrainOutput(global_step=576, training_loss=0.7146381901370155, metrics={'train_runtime': 410.5661, 'train_samples_per_second': 11.18, 'train_steps_per_second': 1.403, 'total_flos': 1207690587371520.0, 'train_loss': 0.7146381901370155, 'epoch': 3.0})

In [None]:
# Tokenizált tesztadat újragenerálása, ha szükséges
tokenized_test = dataset_test.map(tokenize_function, batched=True)

# A label mező eltávolítása (mivel a teszt set nem tartalmaz címkéket)
tokenized_test = tokenized_test.remove_columns("label")

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

Map: 100%|██████████| 5500/5500 [00:03<00:00, 1708.23 examples/s]


In [None]:
predictions = trainer.predict(tokenized_test)
predicted_labels = np.argmax(predictions.predictions, axis=1)
predicted_classes = [id2label[p] for p in predicted_labels]

# Mentés CSV-be
df_preds = df_test.copy()
df_preds["predicted_label"] = predicted_classes
df_preds[["id", "predicted_label"]].to_csv("test_predictions.csv", index=False)

In [None]:
trainer.evaluate(eval_dataset=tokenized_dev)

{'eval_loss': 0.6936758160591125,
 'eval_accuracy': 0.53,
 'eval_f1_macro': 0.5242433444680635,
 'eval_runtime': 6.0037,
 'eval_samples_per_second': 33.313,
 'eval_steps_per_second': 4.164,
 'epoch': 3.0}

## Bináris címkézés: Neutral → Contradiction

A verseny értékelése bináris, ezért a Neutral predikciókat Contradiction-nak tekintjük.

In [57]:
# Bináris címkézés: Neutral → Contradiction
def to_binary_label(label):
    # Neutral → Contradiction
    return "Contradiction" if label != "Entailment" else "Entailment"

binary_predicted_classes = [to_binary_label(lbl) for lbl in predicted_classes]

In [58]:
# Bináris predikciók mentése CSV-be
df_preds_bin = df_test.copy()
df_preds_bin["predicted_label"] = binary_predicted_classes
df_preds_bin[["id", "predicted_label"]].to_csv("test_predictions_binary.csv", index=False)

# ellenőzzés
""" from collections import Counter
print("Bináris predikciók eloszlása:", Counter(binary_predicted_classes))
print(df_preds_bin[["id", "predicted_label"]].head()) """

' from collections import Counter\nprint("Bináris predikciók eloszlása:", Counter(binary_predicted_classes))\nprint(df_preds_bin[["id", "predicted_label"]].head()) '

## Faithfulness & Consistency metrikák számítása

A következő cellák kiszámítják a Faithfulness és Consistency metrikákat a dev halmazon, ha az adathalmaz tartalmazza a szükséges mezőket (`Type`, `group_id`).

In [59]:
# Példa: group_id és Type mezők generálása, ha van 'perturbed_group' vagy hasonló
# (Ha nincs ilyen mező, módosítsd a logikát a saját adathalmazadhoz!)

if "group_id" not in df_dev.columns or "Type" not in df_dev.columns:
    # Példa: ha van 'perturbed_group' mező, azt használjuk group_id-nak
    if "perturbed_group" in df_dev.columns:
        df_dev["group_id"] = df_dev["perturbed_group"]
    else:
        # Ha nincs, minden eredeti példához (Type==original) hozzárendeljük a következő sorokat csoportként
        # Itt csak minta, igazítsd a saját adathalmazadhoz!
        df_dev["group_id"] = None
        df_dev["Type"] = "original"
        group_counter = 0
        for idx, row in df_dev.iterrows():
            if "perturbed" in str(row.get("id", "")):
                df_dev.at[idx, "Type"] = "altering"
            elif "preserving" in str(row.get("id", "")):
                df_dev.at[idx, "Type"] = "preserving"
            else:
                group_counter += 1
            df_dev.at[idx, "group_id"] = group_counter

    print("group_id és Type mezők generálva.")
else:
    print("A szükséges mezők már léteznek.")

A szükséges mezők már léteznek.


In [60]:
# Feltételezzük, hogy df_dev tartalmazza a 'Type' (original/preserving/altering) és 'group_id' mezőket
# Ha nem, előbb egészítsd ki az adathalmazt ezekkel!

# 1. Predikciók hozzárendelése az id-khez
dev_pred_dict = dict(zip(df_dev["id"], binary_predicted_classes[:len(df_dev)]))

faithful_total = 0
faithful_count = 0
consistent_total = 0
consistent_count = 0

for group_id, group in df_dev.groupby("group_id"):
    orig = group[group["Type"] == "original"]
    if orig.empty:
        continue
    orig_pred = dev_pred_dict.get(orig["id"].values[0], None)
    for _, row in group.iterrows():
        if row["Type"] == "altering":
            faithful_total += 1
            pert_pred = dev_pred_dict.get(row["id"], None)
            if pert_pred is not None and pert_pred != orig_pred:
                faithful_count += 1
        elif row["Type"] == "preserving":
            consistent_total += 1
            pert_pred = dev_pred_dict.get(row["id"], None)
            if pert_pred is not None and pert_pred == orig_pred:
                consistent_count += 1

faithfulness = faithful_count / faithful_total if faithful_total else None
consistency = consistent_count / consistent_total if consistent_total else None

if faithful_total:
    print(f"Faithfulness: {faithfulness:.3f} ({faithful_count}/{faithful_total})")
else:
    print("Faithfulness: N/A (nincs altering példacsoport)")

if consistent_total:
    print(f"Consistency: {consistency:.3f} ({consistent_count}/{consistent_total})")
else:
    print("Consistency: N/A (nincs preserving példacsoport)")

Faithfulness: N/A (nincs altering példacsoport)
Consistency: N/A (nincs preserving példacsoport)
