# **1. Starting Setup**




In [1]:
!pip install -q "transformers>=4.40.0" datasets accelerate bitsandbytes sentencepiece protobuf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import torch
import re
from tqdm import tqdm
from pathlib import Path
from datasets import Dataset

## 1.2 Setting runtime

In [4]:
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("⚠️ No GPU. Go to Runtime → Change runtime type → set GPU.")

CUDA available: True
GPU: Tesla T4


## 1.3 Import Dataset

In [5]:
!git clone https://github.com/sooo66/semeval2026-task12-dataset.git
!ls semeval2026-task12-dataset

Cloning into 'semeval2026-task12-dataset'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 27 (delta 7), reused 21 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 6.12 MiB | 26.21 MiB/s, done.
Resolving deltas: 100% (7/7), done.
dev_data  README.md  sample_data  test_data  train_data


## 1.4 Constants definition

In [6]:


# Root of the cloned repo in Colab
DATA_ROOT = Path("/content/semeval2026-task12-dataset")

# Choose the split you want to evaluate: "train_data", "dev_data", or "sample_data"
SPLIT = "dev_data"  # you can change this

QUESTIONS_FILE = DATA_ROOT / SPLIT / "questions.jsonl"
DOCS_FILE      = DATA_ROOT / SPLIT / "docs.json"

print("Questions file:", QUESTIONS_FILE)
print("Docs file:", DOCS_FILE)

Questions file: /content/semeval2026-task12-dataset/dev_data/questions.jsonl
Docs file: /content/semeval2026-task12-dataset/dev_data/docs.json


# 1.5 Import Repository and load dataset

In [7]:
%cd /content
!git clone https://github.com/irenebartolini02/LLM-abductive-event-reasoning.git
%cd /content/LLM-abductive-event-reasoning

/content
Cloning into 'LLM-abductive-event-reasoning'...
remote: Enumerating objects: 112, done.[K
remote: Counting objects: 100% (112/112), done.[K
remote: Compressing objects: 100% (85/85), done.[K
remote: Total 112 (delta 58), reused 72 (delta 26), pack-reused 0 (from 0)[K
Receiving objects: 100% (112/112), 397.20 KiB | 10.45 MiB/s, done.
Resolving deltas: 100% (58/58), done.
/content/LLM-abductive-event-reasoning


In [8]:
from utils.data_loader import load_jsonl, load_json, index_docs_by_topic

questions = load_jsonl(QUESTIONS_FILE)
docs = load_json(DOCS_FILE)
docs_by_topic = index_docs_by_topic(docs)

print("Num questions:", len(questions))
print("Num doc groups:", len(docs_by_topic))
print("Example question keys:", questions[0].keys())

Num questions: 400
Num doc groups: 36
Example question keys: dict_keys(['topic_id', 'uuid', 'target_event', 'option_A', 'option_B', 'option_C', 'option_D', 'golden_answer'])


# **2. Loading Model**



In [9]:
!pip install -U bitsandbytes



In [10]:

from utils.model_utils import load_model

# set yor model name
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"


model, tokenizer= load_model(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Model loaded on: cuda:0


# 3 Checkpints loading

In [None]:
%pip install -q google-colab

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m49.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

import json
from google.colab import drive
import os

if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

CHECKPOINT_FILE = "/content/drive/MyDrive/LLM/Qwen/evaluation_checkpoint.jsonl"
WRONG_ANSWERS_FILE = "/content/drive/MyDrive/LLM/Qwen/wrong_answers.json"

In [None]:
results = []

processed_uuids = set() # Set to check which question to skip (already processed)

if os.path.exists(CHECKPOINT_FILE):
    print(f"📂 Found checkpoint: {CHECKPOINT_FILE}. Loading checkpoints...")
    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                try:
                    data = json.loads(line)
                    results.append(data)
                    processed_uuids.add(data['uuid']) # Memorize processed ID
                    total_score += data['score']
                except json.JSONDecodeError:
                    continue
    print(f"Restored {len(results)} previous results.")
else:
    print("No checkpoint found. Start a new evaluation")

📂 Found checkpoint: /content/drive/MyDrive/LLM/Qwen/evaluation_checkpoint.jsonl. Loading checkpoints...
Restored 0 previous results.


# 4. Evaluation Loop

In [None]:

SYSTEM_PROMPT = (
    "You are solving SemEval 2026 Task 12: Abductive Event Reasoning. "
    "Given an event, context documents, and four options (A–D), "
    "choose which option(s) are the most plausible direct cause of the event. "
    "Respond ONLY with the letters of all correct options, "
    "separated by commas (e.g. 'A', 'A,B', or 'D'). "
    "Do not output any explanations."
)

In [None]:
SYSTEM_PROMPT_STEP_BACK = (
    "You are an expert in causal logic for SemEval 2026. Your task is"
    " to identify only the DIRECT CAUSES of the target event."
    "Strictly follow these steps for each option:"
    "1. Identify the timestamp or logical order of the option relative to"
    " the event."
    "2. Exclude options that are CONSEQUENCES of the event (those that happened after)."
    "3. Exclude options that are only BACKGROUND information (those that happened a long time before)."
    "4. Verify the SUBJECT: ensure the subject of the option matches the subject of the event (e.g., Signature Bank is not the same as SVB)."
    "Respond ONLY with the letters (A, B, C, D) separated by commas. Do not provide any explanations."
)

# 5. Results

# **RAG implementation**

# 1. Setup, divide doc in chuncks

In [11]:

# Installiamo i pacchetti specifici per LangChain 0.3+
!pip install -q langchain langchain-text-splitters langchain-community sentence-transformers faiss-cpu


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[3

In [12]:

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Strategia di Chunking: Recursive è meglio perché rispetta i paragrafi
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200, # Sovrapposizione per non perdere i nessi causali tra i pezzi
    separators=["\n\n", "\n", ".", " "]
)



In [13]:
import torch

# Controlla se la GPU è disponibile
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Sto usando: {device}")
#sentence-transformers/all-MiniLM-L6-v2: È circa 5-10 volte più veloce del precedente e molto più leggero in memoria.
# Per compiti di recupero indiziario spesso è più che sufficiente
# Inizializza embeddings sulla GPU
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': device}, # <-- Modificato qui
    encode_kwargs={'batch_size': 32} # <-- Aumenta la velocità processando blocchi di testi
)


Sto usando: cuda


  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
from langchain_core.documents import Document

# impiega 1h questa cella, forse dovremmo usare un modello embeddings più leggero e veloce

# create a database dictionary for each topic id
vector_db_per_topic= dict()
for topic_id, docs_list in docs_by_topic.items():
  # 1. Convert to Document objects
    langchain_docs = [Document(page_content=d.get('content', ''), metadata={'title': d.get('title', '')}) for d in docs_list]
    # 2. SPLIT doc in chunks
    split_docs = text_splitter.split_documents(langchain_docs)
    # 3. create database for the document's chunks and insert it in the dictionary
    vector_db_per_topic[topic_id] = FAISS.from_documents(split_docs, embeddings)

# 2. Retrival (Recupero Intelligente)

Naive RAG

In [21]:
def retrieve_relevant_context(vector_db, target_event, k=5):
    # vector search (FAISS) is that it calculates the similarity based on the overall distance of the text
    # Naive RAG
    relevant_docs = vector_db.similarity_search(target_event, k=k)
    return "\n---\n".join([d.page_content for d in relevant_docs])


Hybrid

In [None]:

def retrieve_hybrid_context(vector_db, item, k=5):
    # Adding answer option in the searching
    causal_queries = [
        item['target_event'],
        f"Why did {item['target_event']} happen?",
        f"Reason for {item['target_event']}",
        f"{item['option_A']} {item['option_B']} {item['option_C']} {item['option_D']}"
    ]

    all_docs = []
    for q in causal_queries:
        all_docs.extend(vector_db.similarity_search(q, k=2))

    seen = set()
    unique_docs = []
    for d in all_docs:
        if d.page_content not in seen:
            unique_docs.append(d)
            seen.add(d.page_content)

    return "\n---\n".join([d.page_content for d in unique_docs[:k]])



Reranking

In [16]:
!pip install sentence-transformers



In [17]:
from sentence_transformers import CrossEncoder

# Rileva automaticamente se hai CUDA disponibile
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 1. Initialize Reranker
reranker_model = CrossEncoder('BAAI/bge-reranker-base', device=device)

def retrieve_relevant_chunks_with_reranking(vector_db, query, k_initial=20, k_final=5):
    #  1: we take k_initial (high) candidates to make sure the answer is among them
    initial_docs = vector_db.similarity_search(query, k=k_initial)

    if not initial_docs:
        return []

    # 2: Preparing couples for the Reranker [Query, Document]
    pairs = [[query, doc.page_content] for doc in initial_docs]

    # 3: Scoring (Reranking)
    scores = reranker_model.predict(pairs)

    # 4: Sorting and selecting the final top-K
    scored_docs = sorted(zip(initial_docs, scores), key=lambda x: x[1], reverse=True)
    final_docs = [doc for doc, score in scored_docs[:k_final]]

    return "\n---\n".join([d.page_content for d in final_docs[:k_final]])

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

# 2. Evaluation loop

In [None]:
import gc   # garbage collector
from utils.model_utils import format_qwen_prompt
from utils.output_utils import clean_response, calculate_score, print_metrics

WRONG_ANSWERS_FILE = "wrong_answers.jsonl"
obj={}
total_score = 0
count = 0
errors = 0
results=[]
print(f"Starting evaluation on {len(questions)} questions...")

i=1
# we use tqdm to see the progress bar
for entry in tqdm(questions):
    # Inizializziamo a None per evitare errori nel blocco 'del'
    inputs = None
    outputs = None

    try:
        topic_id = entry['topic_id']
        question_uuid = entry['uuid']
        golden_ans = entry['golden_answer']
        target_event = entry['target_event']
        #RAG

        #all chunks for item id
        vector_db = vector_db_per_topic.get(topic_id, [])

        # select 1 more util chuncks
        context_text= retrieve_relevant_chunks_with_reranking(vector_db, target_event, 20, 3)

        # TASK 2 - causal reasoning with extracted keywords------------------------------------------
        prompt_inference= format_qwen_prompt(tokenizer, SYSTEM_PROMPT_STEP_BACK, entry, context_text , max_total_chars=20_000)

         # 2. Tokenization
        inputs = tokenizer(prompt_inference, return_tensors="pt").to(model.device)
         # 3. Generation (Greedy decoding for reproducibility)
        with torch.no_grad():
            outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=False,      # Determinism
            temperature=0.0,
            pad_token_id=tokenizer.eos_token_id
          )
      # 4. Decoding e Cleaning
      # Cutting the input prompt -> the response of the model also contains the input prompt we provided
        generated_ids = outputs[0][inputs.input_ids.shape[1]:]
        raw_response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

      # Extract the letters set (es. {"A", "B"})
        pred_set = clean_response(raw_response)
        if pred_set == set():
          print("no answer")

      # 5. Calculate score
        score = calculate_score(pred_set, golden_ans)
        if score==0:
          # save wrong answer
          with open(WRONG_ANSWERS_FILE, 'a', encoding='utf-8') as f:
            obj["prompt"]= prompt_inference
            obj["gold"]= golden_ans
            obj["pred"]= raw_response
            f.write( json.dumps(obj, indent=3))

        # We prepare the result object (a list of result_item, which is the following dictionare)
        result_item = {
            "uuid": question_uuid,
            "topic_id": topic_id,
            "golden_raw": golden_ans,
            "prediction_raw": raw_response,
            "prediction_set": list(pred_set),
            "score": score
        }


        # Updating variables in memory
        results.append(result_item)
        processed_uuids.add(question_uuid)
        total_score += score
        count += 1

    except RuntimeError as e:
        if "out of memory" in str(e):
            print(f"UUID:{entry['uuid']} process skipped due to OOM")
            errors += 1
            continue
        else:
            print(f"Errore generico: {e}")
            continue
    finally:
        # Questo blocco viene eseguito SEMPRE, sia se il codice va bene sia se crasha
        if inputs is not None: inputs = None  # Reassign to None instead of del
        if outputs is not None: outputs = None # Reassign to None instead of del
        torch.cuda.empty_cache()
        gc.collect()

Starting evaluation on 400 questions...


  0%|          | 0/400 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 27%|██▋       | 108/400 [22:39<58:49, 12.09s/it]

no answer


 98%|█████████▊| 391/400 [1:22:50<01:59, 13.32s/it]

In [None]:
print_metrics(results)

Total questions: 400
Correct answers: 205 - 51.24999999999999%
Partial answers: 91 - 22.75%
Wrong answers: 104 - 26.0%
Total score: 250.5
Performance of the score: 62.625%


# Nuova versione, per ogni Option generiamo del contesto e facciamo scegliere a Qwen se è causa diretta dell'evento, True/False

Migioramento: applicare cusal reranking (capire come fare)

In [22]:
SYSTEM_PROMPT_BINARY=(
  "You are a Causal Reasoning Verifier specializing in Abductive Analysis. Your goal is to determine if a specific 'Candidate Cause' is the DIRECT reason for a given 'Target Event'."
 # "Follow this rigorous verification protocol:"
 # "1. Chronological Check: A cause MUST occur before the event. If the candidate cause is a result or a reaction to the event, label it as FALSE."
 # "2. Directness Test: A direct cause is a necessary event that happened before the Target Event. If the Target Event would not have happened without the Candidate Cause, consider it a DIRECT cause."
 #"3. Evidence-Based: Answer TRUE only if the provided context explicitly supports the causal link."
  "Respond ONLY with 'TRUE' or 'FALSE'. Do not provide any explanation."
)

def format_qwen_prompt_binary(tokenizer, system_prompt, entry, context_text, option_label):
    """
    Formatta il prompt per una singola opzione in modalità binaria (TRUE/FALSE).

    Args:
        tokenizer: Il tokenizer del modello Qwen.
        SYSTEM_PROMPT: Il prompt di sistema "Causal Reasoning Verifier" che abbiamo scritto.
        entry: L'elemento del dataset (contiene 'target_event' e le opzioni).
        context_text: Il contesto recuperato (idealmente specifico per quell'opzione).
        option_label: La lettera dell'opzione da valutare (A, B, C o D).
    """

    event = entry.get('target_event')

    # Recuperiamo il testo dell'opzione specifica (es. entry['option_A'])
    option_text = entry.get(f'option_{option_label}')

    # Costruiamo il messaggio dell'utente per la singola opzione
    user_content = (
        f"TARGET EVENT: \"{event}\"\n"
        f"CANDIDATE CAUSE: \"{option_text}\"\n"
        f"CONTEXT:\n{context_text}\n\n"
        f"Does the Candidate Cause directly trigger the Target Event according to the context?\n"
        f"Answer (TRUE/FALSE):"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content}
    ]

    # Utilizziamo l'apply_chat_template del tokenizer per mantenere il formato <|im_start|>
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    return prompt

In [32]:
import gc   # garbage collector
from utils.model_utils import format_qwen_prompt
from utils.output_utils import clean_response, calculate_score, print_metrics
import json

WRONG_ANSWERS_FILE = "wrong_answers.jsonl"

total_score = 0
count = 0
errors = 0
results=[]
processed_uuids=set()
print(f"Starting evaluation on {len(questions)} questions...")

i=1
# we use tqdm to see the progress bar
for entry in tqdm(questions[:20]):
    # Inizializziamo a None per evitare errori nel blocco 'del'
    inputs = None
    outputs = None

    try:
        topic_id = entry['topic_id']
        question_uuid = entry['uuid']
        golden_ans = entry['golden_answer']
        target_event = entry['target_event']
        #RAG

        #all chunks for item id
        vector_db = vector_db_per_topic.get(topic_id, [])

        all_prompts_responses={}
        none_of_the_others= None
        pred_set= set()

        for op_label in ['A', 'B', 'C', 'D']:

          option_text= entry[f'option_{op_label}']

          # none of the others detection
          if "none of the others are correct causes" in option_text.lower().strip():
            none_of_the_others= op_label
            continue

          # select 1 more util chuncks
          query= f"{option_text} causes {target_event}"
          context_text= retrieve_relevant_chunks_with_reranking(vector_db, query, 20, 3)

          # Prompt binary------------------------------------------
          prompt_inference= format_qwen_prompt_binary(tokenizer, SYSTEM_PROMPT_BINARY, entry, context_text , op_label)

          # 2. Tokenization
          inputs = tokenizer(prompt_inference, return_tensors="pt").to(model.device)
          # 3. Generation (Greedy decoding for reproducibility)
          with torch.no_grad():
              outputs = model.generate(
              **inputs,
              max_new_tokens=10,
              do_sample=False,      # Determinism
              temperature=0.0,
              pad_token_id=tokenizer.eos_token_id
          )
          # 4. Decoding e Cleaning
          # Cutting the input prompt -> the response of the model also contains the input prompt we provided
          generated_ids = outputs[0][inputs.input_ids.shape[1]:]
          # Extract true or false
          raw_response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
          all_prompts_responses[op_label]={'response': raw_response,
                                                'prompt': prompt_inference,
                                               }

          if raw_response.upper().strip() == "TRUE":
            pred_set.add(op_label)
        # outise the loop
        if pred_set == set():
          if none_of_the_others:
            pred_set.add(none_of_the_others)
          else:
            print("no answer")


      # 5. Calculate score
        score = calculate_score(pred_set, golden_ans)
        if score!=1:
          # save wrong answer
          obj = {}
          with open(WRONG_ANSWERS_FILE, 'a', encoding='utf-8') as f:
            obj = all_prompts_responses
            obj["gold"]= golden_ans
            obj["pred"]= list(pred_set)
            f.write( json.dumps(obj, indent=3))

        # We prepare the result object (a list of result_item, which is the following dictionare)
        result_item = {
            "uuid": question_uuid,
            "topic_id": topic_id,
            "golden_raw": golden_ans,
            "prediction_raw": raw_response,
            "prediction_set": list(pred_set),
            "score": score
        }


        # Updating variables in memory
        results.append(result_item)
        processed_uuids.add(question_uuid)
        total_score += score
        count += 1

    except RuntimeError as e:
        if "out of memory" in str(e):
            print(f"UUID:{entry['uuid']} process skipped due to OOM")
            errors += 1
            continue
        else:
            print(f"Errore generico: {e}")
            continue
    finally:
        # Questo blocco viene eseguito SEMPRE, sia se il codice va bene sia se crasha
        if inputs is not None: inputs = None  # Reassign to None instead of del
        if outputs is not None: outputs = None # Reassign to None instead of del
        torch.cuda.empty_cache()
        gc.collect()

Starting evaluation on 400 questions...


  5%|▌         | 1/20 [00:05<01:35,  5.00s/it]

no answer


 25%|██▌       | 5/20 [00:26<01:20,  5.39s/it]

no answer


 30%|███       | 6/20 [00:32<01:16,  5.45s/it]

no answer


 40%|████      | 8/20 [00:42<01:01,  5.16s/it]

no answer


 75%|███████▌  | 15/20 [01:19<00:26,  5.35s/it]

no answer


100%|██████████| 20/20 [01:46<00:00,  5.33s/it]


In [33]:
from utils.output_utils import print_metrics
print_metrics(results, "Qwen")

Total questions: 20
Correct answers: 12 - 60.0%
Partial answers: 2 - 10.0%
Wrong answers: 6 - 30.0%
Total score: 13.0
Performance of the score: 65.0%
