# **1. Starting Setup**




## 1.1 Download dependencies

In [1]:
!pip install -q "transformers>=4.40.0" datasets accelerate bitsandbytes sentencepiece protobuf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import re
from tqdm import tqdm

## 1.2 Setting runtime

In [3]:
# Report whether PyTorch can see a CUDA device and, if it can, the name of device 0;
# otherwise print a hint on how to enable a GPU runtime.
print("CUDA available:", torch.cuda.is_available())
print(
    "GPU: " + torch.cuda.get_device_name(0)
    if torch.cuda.is_available()
    else "⚠️ No GPU. Go to Runtime → Change runtime type → set GPU."
)

CUDA available: True
GPU: Tesla T4


## 1.3 Import Dataset

In [4]:
!git clone https://github.com/sooo66/semeval2026-task12-dataset.git
!ls semeval2026-task12-dataset

Cloning into 'semeval2026-task12-dataset'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 27 (delta 7), reused 21 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 6.12 MiB | 12.31 MiB/s, done.
Resolving deltas: 100% (7/7), done.
dev_data  README.md  sample_data  test_data  train_data


## 1.4 Constants definition

In [5]:
from pathlib import Path
import json
from typing import Dict, List, Set

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Location of the cloned dataset repository inside the Colab filesystem
DATA_ROOT = Path("/content/semeval2026-task12-dataset")

# Dataset split to evaluate: "train_data", "dev_data", or "sample_data"
SPLIT = "dev_data"

# Input files of the selected split
QUESTIONS_FILE = DATA_ROOT.joinpath(SPLIT, "questions.jsonl")
DOCS_FILE = DATA_ROOT.joinpath(SPLIT, "docs.json")

print(f"Questions file: {QUESTIONS_FILE}")
print(f"Docs file: {DOCS_FILE}")

Questions file: /content/semeval2026-task12-dataset/dev_data/questions.jsonl
Docs file: /content/semeval2026-task12-dataset/dev_data/docs.json


# **2. Functions to load dataset**

In [6]:
def load_jsonl(path: Path) -> List[Dict]:
    """Parse a JSON Lines file: one JSON value per non-blank line.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        The decoded values in file order. Lines that are empty after
        stripping surrounding whitespace are ignored.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

def load_json(path: Path):
    """Parse a UTF-8 encoded ``.json`` file and return the decoded value."""
    with path.open("r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def index_docs_by_topic(docs_list: List[Dict]) -> Dict[int, List[Dict]]:
    """Index document groups by their topic id.

    Args:
        docs_list: Items shaped like ``{"topic_id": int, "docs": [ {...}, ... ]}``.

    Returns:
        A mapping ``topic_id -> docs``. When a topic id occurs more than once,
        the last occurrence wins.
    """
    return {group["topic_id"]: group["docs"] for group in docs_list}

# Load the selected split and index its document groups by topic id
questions = load_jsonl(QUESTIONS_FILE)
docs = load_json(DOCS_FILE)
docs_by_topic = index_docs_by_topic(docs)

# Short summary of what was loaded
print(f"Num questions: {len(questions)}")
print(f"Num doc groups: {len(docs_by_topic)}")
print(f"Example question keys: {questions[0].keys()}")

Num questions: 400
Num doc groups: 36
Example question keys: dict_keys(['topic_id', 'uuid', 'target_event', 'option_A', 'option_B', 'option_C', 'option_D', 'golden_answer'])


# **3. Loading Model**



In [7]:
MODEL_NAME = "upstage/SOLAR-10.7B-Instruct-v1.0"

# 1. Tokenizer belonging to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# 2. Quantization settings handed to the model loader
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit
    bnb_4bit_quant_type="nf4",             # use the "nf4" quantization type
    bnb_4bit_compute_dtype=torch.float16,  # compute dtype: float16
)

# 3. Model loading
# NOTE: the recorded output of this cell reports that `torch_dtype` is deprecated
# in favour of `dtype`; the keyword is kept because the install cell only requires
# transformers>=4.40.0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Switch to evaluation mode and report the device the model reports
model.eval()
print("Model loaded on:", model.device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

Model loaded on: cuda:0


# **4. Prompt Generation**

## 4.1 Build Context

In [8]:
def build_context(docs, max_total_chars):
    """Concatenate the documents of a topic into one size-capped context string.

    Each document is rendered as a "--- Document i ---" block with its title and
    its whitespace-normalized text (``content`` if present, else ``snippet``).
    Blocks are appended until the next one would exceed ``max_total_chars``; that
    block is cut to the remaining budget, suffixed with "... [TRUNCATED]", and
    iteration stops. The suffix itself is not counted against the budget.

    Args:
        docs: List of document dicts (keys read: 'content', 'snippet', 'title'),
            or an empty/None value.
        max_total_chars: Character budget for the rendered document blocks.

    Returns:
        The context string, or "No context document available" when ``docs`` is
        empty or None.
    """
    if not docs:
        return "No context document available"

    full_context_list = []
    current_length = 0

    for i, doc in enumerate(docs, 1):
        # Prefer the full content, fall back to the snippet; the trailing `or ''`
        # covers documents where both fields are missing or null
        raw_text = doc.get('content') or doc.get('snippet') or ''

        # Collapse runs of whitespace so the character budget is not wasted on blanks
        clean_text = " ".join(raw_text.split())

        doc_str = f"--- Document {i} ---\nTitle: {doc.get('title')}\nText: {clean_text}\n\n"

        # If this document does not fit entirely, keep only the part within the limit
        if current_length + len(doc_str) > max_total_chars:
            remaining = max(max_total_chars - current_length, 0)
            full_context_list.append(doc_str[:remaining] + "... [TRUNCATED]")
            break

        full_context_list.append(doc_str)
        current_length += len(doc_str)

    # The original implementation built this string but never returned it
    return "".join(full_context_list)

In [9]:
def build_context_B(docs, max_docs=5, max_chars_per_doc=1000):
    """Render the first ``max_docs`` documents, each truncated on its own (version B).

    NOTE: a later cell of this notebook defines ``build_context_B`` again with
    different defaults and a different output format; when the notebook is run
    top to bottom, that later definition is the one in effect.

    Args:
        docs: List of document dicts (keys read: 'content', 'snippet', 'title'),
            or an empty/None value.
        max_docs: Number of leading documents to include.
        max_chars_per_doc: Maximum length of each document's normalized text;
            longer texts are cut and suffixed with "...".

    Returns:
        The concatenated "Doc i:" blocks, or "No context document available"
        when ``docs`` is empty or None.
    """
    if not docs:
        return "No context document available"

    full_context_list = []

    # Only the first N documents are used
    for i, doc in enumerate(docs[:max_docs], 1):
        # Prefer the full content, fall back to the snippet; the trailing `or ''`
        # avoids calling .split() on None when both fields are missing or null
        raw_text = doc.get('content') or doc.get('snippet') or ''
        clean_text = " ".join(raw_text.split())

        # Per-document truncation (version B)
        if len(clean_text) > max_chars_per_doc:
            clean_text = clean_text[:max_chars_per_doc] + "..."

        # Version B layout: title, raw snippet and (possibly truncated) content
        doc_str = (
            f"Doc {i}:\n"
            f"Title: {doc.get('title')}\n"
            f"Snippet: {doc.get('snippet', '')}\n"
            f"Content: {clean_text}\n\n"
        )
        full_context_list.append(doc_str)

    return "".join(full_context_list)

## 4.2 Prompt Structure

In [10]:
# VERSION A
# Task instructions placed at the top of every user message
SYSTEM_PROMPT = " ".join([
    "You are solving SemEval 2026 Task 12: Abductive Event Reasoning.",
    "Given an event, context documents, and four options (A–D),",
    "choose which option(s) are the most plausible direct cause of the event.",
    "Respond ONLY with the letters of all correct options,",
    "separated by commas (e.g. 'A', 'A,B', or 'D').",
    "Do not output any explanations.",
])

def format_solar_prompt_A(question, docs, max_total_chars=6000):
    """Build the chat-formatted prompt (version A) for one question.

    The user message consists of SYSTEM_PROMPT, the target event, the context
    produced by ``build_context(docs, max_total_chars)``, the four options and a
    final "Answer (letters only):" cue. It is wrapped as a single user turn and
    passed through the tokenizer's chat template.

    Args:
        question: Dict providing 'target_event' and 'option_A' .. 'option_D'.
        docs: Documents forwarded to ``build_context``.
        max_total_chars: Character budget forwarded to ``build_context``.

    Returns:
        The prompt string returned by ``tokenizer.apply_chat_template`` (not
        tokenized, with the generation prompt appended).
    """
    # 1. Obtain the context
    context_text = build_context(docs, max_total_chars)

    # 2. Compose the user message line by line
    event_line = 'Event: "{}"'.format(question['target_event'])
    option_lines = [f"{letter}) {question['option_' + letter]}" for letter in "ABCD"]
    user_message = "\n".join([
        SYSTEM_PROMPT,
        "",
        event_line,
        "",
        "Context Documents:",
        f"{context_text}",
        "",
        "Options:",
        *option_lines,
        "",
        "Answer (letters only):",
    ])

    # 3. Wrap as a single user turn and apply the chat template
    chat = [{"role": "user", "content": user_message}]
    return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

In [11]:
# VERSION B
# Same instruction text as the SYSTEM_PROMPT defined in the version A cell
SYSTEM_PROMPT = " ".join([
    "You are solving SemEval 2026 Task 12: Abductive Event Reasoning.",
    "Given an event, context documents, and four options (A–D),",
    "choose which option(s) are the most plausible direct cause of the event.",
    "Respond ONLY with the letters of all correct options,",
    "separated by commas (e.g. 'A', 'A,B', or 'D').",
    "Do not output any explanations.",
])

# The signature accepts the version B parameters
def format_solar_prompt_B(question, docs, max_docs=5, max_chars_per_doc=1000):
    """Build the chat-formatted prompt (version B) for one question.

    The user message consists of SYSTEM_PROMPT, the target event, the context
    produced by ``build_context_B``, the four options and a final
    "Answer (letters only):" cue. It is wrapped as a single user turn and passed
    through the tokenizer's chat template.

    Args:
        question: Dict providing 'target_event' and 'option_A' .. 'option_D'.
        docs: Documents forwarded to ``build_context_B``.
        max_docs: Forwarded to ``build_context_B``.
        max_chars_per_doc: Forwarded to ``build_context_B``.

    Returns:
        The prompt string returned by ``tokenizer.apply_chat_template`` (not
        tokenized, with the generation prompt appended).
    """
    # 1. Obtain the context; parameters are passed by keyword to avoid confusion
    context_text = build_context_B(docs, max_docs=max_docs, max_chars_per_doc=max_chars_per_doc)

    # 2. Compose the user message line by line
    event_line = 'Event: "{}"'.format(question['target_event'])
    option_lines = [f"{letter}) {question['option_' + letter]}" for letter in "ABCD"]
    user_message = "\n".join([
        SYSTEM_PROMPT,
        "",
        event_line,
        "",
        "Context Documents:",
        f"{context_text}",
        "",
        "Options:",
        *option_lines,
        "",
        "Answer (letters only):",
    ])

    # 3. Wrap as a single user turn and apply the chat template
    chat = [{"role": "user", "content": user_message}]
    return tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# **5. Output parsing and scoring function**

In [12]:
def clean_response(response_text):
    """Extract the answer letters A-D from raw model output.

    The text is upper-cased and every stand-alone letter A, B, C or D (delimited
    by word boundaries) is collected, which covers outputs such as "Option A",
    "A.", "A, B" or "The answer is C".

    NOTE: because of the upper-casing, a stand-alone lowercase "a" (e.g. the
    English article) is also counted as the letter A.

    Returns:
        A set of the letters found; an empty set when there are none. A set is
        used because the order of the letters does not matter for scoring.
    """
    normalized = response_text.upper()
    return set(re.findall(r'\b[A-D]\b', normalized))


def calculate_score(prediction_set, golden_string):
    """Score one prediction against the golden answer.

    Scoring:
        1.0 (exact)   - the predicted letters equal the golden letters.
        0.5 (partial) - the prediction is non-empty and every predicted letter is
                        in the golden set, but some golden letters are missing.
        0.0 (wrong)   - the prediction is empty or contains a letter that is not
                        in the golden set.

    Args:
        prediction_set: Collection of predicted letters (e.g. {"A", "B"}).
        golden_string: Golden answer as a string, e.g. "C" or "A,B".

    Returns:
        The score as a float. An empty or malformed golden answer prints a
        warning and scores 0.0.
    """
    # The golden answer arrives as a string ("C" or "A,B"); parse it into a set
    golden_set = set(re.findall(r'\b[A-D]\b', golden_string.upper()))

    if not golden_set:
        print(f"Warning: Golden answer empty or malformed: {golden_string}")
        return 0.0

    predicted = set(prediction_set)

    # Case 1: identical sets -> exact match
    if predicted == golden_set:
        return 1.0

    # Case 2: golden answer contains the prediction -> partial credit.
    # Example: Gold={A, B}, Pred={A} -> partial; Gold={A}, Pred={A, B} -> wrong.
    # The emptiness check matters: the empty set is a subset of every set, so an
    # unparsable/empty prediction would otherwise be rewarded with 0.5.
    if predicted and predicted.issubset(golden_set):
        return 0.5

    # Case 3: empty prediction or at least one letter outside the golden set
    return 0.0

# **6. Evaluation loop**




## 6.1 Function to compute metrics over the results

In [13]:
def print_metrics(results):
    """Print a summary of the evaluation results.

    Args:
        results: List of result dicts, each with a numeric 'score' entry.
            A score of exactly 1.0 counts as correct, exactly 0.5 as partial,
            anything else as wrong.

    Prints the report header (using the module-level MODEL_NAME), the number of
    questions and, when there is at least one result, the counts and percentages
    of correct / partial / wrong answers plus the total and normalized score.
    With an empty list only the header, the zero total and a short notice are
    printed, instead of failing with a division by zero.
    """
    n_total = len(results)
    n_correct = sum(1 for r in results if r['score'] == 1.0)
    n_partial = sum(1 for r in results if r['score'] == 0.5)
    n_wrong = n_total - n_correct - n_partial

    total_score = sum(r['score'] for r in results)

    print(f"======= {MODEL_NAME} Causal Reasoning Results =======")
    print(f"Total questions: {n_total}")

    # Percentages are undefined without results
    if n_total == 0:
        print("No results to report.")
        return

    print(f"Correct answers: {n_correct} - {n_correct/n_total*100}%")
    print(f"Partial answers: {n_partial} - {n_partial/n_total*100}%")
    print(f"Wrong answers: {n_wrong} - {n_wrong/n_total*100}%")
    print(f"Total score: {total_score}")
    print(f"Performance of the score: {total_score/n_total*100}%")


## 6.2 Checkpoints loading

### Mount The Drive and set the Checkpoint path

In [14]:
import json
from google.colab import drive
import os

# Mount Google Drive; skipped when /content/drive already exists
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# JSONL file that stores one result per evaluated question
CHECKPOINT_FILE = "/content/drive/MyDrive/LLM_Project/evaluation_checkpoint_vB.jsonl"

# Create the checkpoint folder if needed: opening the file in append mode later
# does not create missing parent directories.
os.makedirs(os.path.dirname(CHECKPOINT_FILE), exist_ok=True)

Mounted at /content/drive


### Remove previous checkpoints

**CAREFUL: RUN THIS SNIPPET ONLY IF YOU WANT TO START FROM THE BEGINNING**

In [15]:
## !!! Uncommenting the line below deletes the checkpoint file on your Drive !!!
## It is left commented out so that "Run all" / "Run before" cannot delete it by accident.

# !rm -rf $CHECKPOINT_FILE

### Load the checkpoint if it exists

In [16]:
# Results restored from the checkpoint; the evaluation loop appends to this list
results = []

# UUIDs of the questions that already have a stored result (skipped by the loop)
processed_uuids = set()

if os.path.exists(CHECKPOINT_FILE):
    print(f"📂 Found checkpoint: {CHECKPOINT_FILE}. Loading checkpoints...")
    skipped_lines = 0
    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # e.g. a line cut short by an interrupted write
                skipped_lines += 1
                continue

            # Ignore rows that lack the fields the rest of the notebook reads
            if not isinstance(data, dict) or 'uuid' not in data or 'score' not in data:
                skipped_lines += 1
                continue

            # Keep a single result per question
            if data['uuid'] in processed_uuids:
                continue

            results.append(data)
            processed_uuids.add(data['uuid'])
            # No running score is accumulated here: `total_score` is not defined
            # at this point of the notebook, and print_metrics() recomputes the
            # total from `results`.
    print(f"Restored {len(results)} previous results.")
    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed checkpoint lines.")
else:
    print("No checkpoint found. Start a new evaluation")

No checkpoint found. Start a new evaluation


## 6.3 Evaluation Loop

In [19]:
# --- SPEED-UP PATCH ---
import gc

# 1. Keep only the document groups whose topic is referenced by some question
needed_topics = {q['topic_id'] for q in questions}
docs_by_topic = {
    topic: topic_docs
    for topic, topic_docs in docs_by_topic.items()
    if topic in needed_topics
}
# Drop the raw `docs` list if it is still defined, then run the garbage collector
globals().pop('docs', None)
gc.collect()

# 2. OPTIMIZED variant of build_context_B (cuts the raw text before splitting it)
def build_context_B(docs, max_docs=3, max_chars_per_doc=700):
    """Render the first ``max_docs`` documents as compact "Doc i:" blocks.

    For every selected document the text is taken from 'content', falling back
    to 'snippet' and finally to an empty string. The raw text is first cut to
    twice ``max_chars_per_doc`` so that whitespace normalization never runs on a
    very long string; the normalized text is then cut to ``max_chars_per_doc``
    and suffixed with "..." when it is longer.

    Returns:
        The concatenated blocks, or "No context" when ``docs`` is empty or None.
    """
    if not docs:
        return "No context"

    pre_cut = max_chars_per_doc * 2
    rendered = []

    # Only the first `max_docs` documents are used
    for position, doc in enumerate(docs[:max_docs], start=1):
        raw_text = doc.get('content') or doc.get('snippet', '') or ""

        # Preliminary cut, done before the split/join pass
        if len(raw_text) > pre_cut:
            raw_text = raw_text[:pre_cut]

        clean_text = " ".join(raw_text.split())

        # Final cut to the per-document limit
        if len(clean_text) > max_chars_per_doc:
            clean_text = clean_text[:max_chars_per_doc] + "..."

        rendered.append(f"Doc {position}:\nTitle: {doc.get('title')}\nContent: {clean_text}\n\n")

    return "".join(rendered)

In [20]:
import gc   # garbage collector

# Running tallies. `total_score` starts from the rows restored from the checkpoint
# so that it stays consistent with `results`; `count` and `errors` cover only the
# questions handled in this run.
total_score = sum(r.get('score', 0.0) for r in results)
count = 0
errors = 0

print(f"Starting evaluation on {len(questions)} questions...")


# tqdm shows a progress bar over the question list
for entry in tqdm(questions):
    question_uuid = entry['uuid']

    # --- SKIP LOGIC ---
    # Questions whose UUID is already in the processed set are not evaluated again
    if question_uuid in processed_uuids:
        continue
    # ------------------

    # Reset per iteration so that the error handler below never refers to an
    # undefined name or to the batch of a previous question
    inputs = None
    outputs = None

    try:
        topic_id = entry['topic_id']
        golden_ans = entry['golden_answer']

        # Documents of this topic (empty list when the topic has none)
        context_docs = docs_by_topic.get(topic_id, [])

        # 1. Create the prompt

        # VERSION A
        # prompt = format_solar_prompt_A(entry, context_docs, max_total_chars=6000)
        # VERSION B
        prompt = format_solar_prompt_B(entry, context_docs, max_docs=3, max_chars_per_doc=700)

        # 2. Tokenization
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Debug aid: print the prompt length for the first three evaluated
        # questions only. Author's rule of thumb: above ~2500-3000 tokens is bad,
        # around 1000 is fine.
        if count < 3:
            print(f"DEBUG - Input Tokens Length: {inputs.input_ids.shape[1]}")

        # 3. Generation (greedy decoding for reproducibility).
        # `temperature` is not passed: it only applies to sampling, which is
        # disabled by do_sample=False.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=15,    # Enough for "A, B, C"
                do_sample=False,      # Determinism
                pad_token_id=tokenizer.eos_token_id
            )

        # 4. Decoding and cleaning
        # The generated sequence starts with the prompt tokens; slice them off
        generated_ids = outputs[0][inputs.input_ids.shape[1]:]
        raw_response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        # Extract the set of letters (e.g. {"A", "B"})
        pred_set = clean_response(raw_response)

        # 5. Calculate score
        score = calculate_score(pred_set, golden_ans)

        # Record stored for this question
        result_item = {
            "uuid": question_uuid,
            "topic_id": topic_id,
            "golden_raw": golden_ans,
            "prediction_raw": raw_response,
            "prediction_set": list(pred_set),
            "score": score
        }

        # --- IMMEDIATE SAVING (Checkpoint) ---
        # Append (mode 'a') one line per answer to the JSONL file on the Drive,
        # so that an interrupted run can be restored from it
        with open(CHECKPOINT_FILE, 'a', encoding='utf-8') as f:
            f.write(json.dumps(result_item) + "\n")

        # Update the in-memory state
        results.append(result_item)
        processed_uuids.add(question_uuid)
        total_score += score
        count += 1

    except RuntimeError as e:
        errors += 1
        if "out of memory" in str(e):
            print(f"UUID:{entry['uuid']} process skipped due to OOM")
            # Drop the references to the tensors of this question, then free the cache
            inputs = None
            outputs = None
            torch.cuda.empty_cache()
            gc.collect()
        else:
            print(f"Errore generico: {e}")
        continue

print(f"Evaluated {count} new questions; {errors} skipped due to errors.")

Starting evaluation on 400 questions...


  0%|          | 0/400 [00:00<?, ?it/s]

DEBUG - Input Tokens Length: 838


 28%|██▊       | 110/400 [00:06<00:15, 18.17it/s]

DEBUG - Input Tokens Length: 782


 28%|██▊       | 111/400 [00:28<01:39,  2.91it/s]

DEBUG - Input Tokens Length: 847


100%|██████████| 400/400 [42:34<00:00,  6.39s/it]


# **7. Results**

In [21]:
# Final Report -- Printing Correct, Partial, Wrong answers and the score

print_metrics(results)

Total questions: 400
Correct answers: 118 - 29.5%
Partial answers: 134 - 33.5%
Wrong answers: 148 - 37.0%
Total score: 185.0
Performance of the score: 46.25%
