In [None]:
from huggingface_hub import login

import os

# Read the token from the HF_TOKEN environment variable instead of pasting it
# into the notebook, so the secret is never saved or committed with the file.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN", "")

# Log in; when no token is configured, pass None so huggingface_hub prompts for one
login(token=HUGGINGFACE_TOKEN or None)

In [None]:
import re
from pathlib import Path

# The style token may contain letters, digits, dashes and underscores; it sits between
# an underscore and a trailing "_<8 digits>_<6 digits>.json" suffix.
STYLE_RE = re.compile(r'_(?P<style>[A-Za-z0-9_-]+)_(?:\d{8})_\d{6}\.json$')


def extract_style(filename: str):
    """Return the ``style`` group of ``STYLE_RE`` found in ``filename``, or ``None`` if there is no match."""
    match = STYLE_RE.search(filename)
    if match is None:
        return None
    return match.group('style')

In [None]:
import json

def evaluate_model(eval_model, prompt_dir):
    """
    Evaluate ``eval_model`` on each rephrased prompt file of one prompt directory.

    Every ``rephrased_plain_one_option*.json`` file directly inside
    ``rephrased_prompts/<prompt_dir>`` is parsed as JSON and passed to
    ``eval_model.evaluate_task`` together with the style extracted from its file name.
    """
    prompt_folder = Path(f"rephrased_prompts/{prompt_dir}")
    for prompt_path in prompt_folder.glob("rephrased_plain_one_option*.json"):
        style_name = extract_style(prompt_path.name)
        with prompt_path.open("r", encoding="utf-8") as handle:
            task_data = json.load(handle)

        eval_model.evaluate_task(task_data, style_name)

In [None]:
import torch
from typing import List, Dict, Tuple
import re
from abc import ABC, abstractmethod
from collections import defaultdict
from datetime import datetime
import random
import json
from tqdm.notebook import tqdm

class MCQ(ABC):
    """Base class for multiple-choice evaluators: prompt formatting plus the shared evaluation loop."""

    def __init__(self, model=None, model_name: str = "meta-llama/Llama-2-7b-hf"):
        """
        Initialize the MCQ handler with shared functionality for prompt formatting and choice extraction.

        Args:
            model: Preloaded local model (optional)
            model_name: HuggingFace model name for local models
        """
        self.model = model
        self.model_name = model_name

    def format_prompt(self, question_data: Dict, phrase_replacement_dict: Dict, output_field: str = None) -> str:
        """
        Format a prompt using the task's structured prompt format.

        Args:
            question_data: Mapping with a 'question' entry and an 'answers' mapping of choice -> answer text.
            phrase_replacement_dict: Mapping providing the 'instruction', 'question', 'choices' and
                'answer' phrases used as headings in the prompt.
            output_field: Unused; kept so existing callers keep working.

        Returns:
            The prompt text, ending with the answer phrase followed by a colon.
        """
        choices = "".join(
            f"({choice}) {answer}\n" for choice, answer in question_data['answers'].items()
        )
        return (
            f"{phrase_replacement_dict['instruction']}\n"
            f"{phrase_replacement_dict['question']}: {question_data['question']}\n"
            f"{phrase_replacement_dict['choices']}: {choices} \n"
            f"{phrase_replacement_dict['answer']}:"
        )

    @abstractmethod
    def answer_mcq(self, question_data: Dict, phrase_replacement_dict: Dict = None) -> Dict:
        """
        Abstract method for answering a multiple choice question.

        ``evaluate_task`` calls this with ``question_data`` only, so ``phrase_replacement_dict``
        is optional. The returned dict must contain 'answer', 'correct_answer' and 'confidence'.
        """
        pass

    def evaluate_task(self, task_data: List[Dict], style, output_dir: str = "rephrased_results", save_results: bool = True, num_samples: int = None) -> Dict:
        """
        Evaluate the multiple-choice task and return detailed metrics.

        Args:
            task_data: Questions to evaluate; each one is passed to ``answer_mcq``.
            style: Label embedded in the name of the saved results file.
            output_dir: Directory the results file is written to (created if missing).
            save_results: When True, dump the metrics to a timestamped JSON file.
            num_samples: If given (and non-zero), evaluate a random sample of at most this many questions.

        Returns:
            Dict with counts, per-choice statistics, confidences, every individual result,
            the overall 'accuracy' (0.0 when there are no questions) and 'per_choice_accuracy'.
        """
        import os  # local import: only needed for creating/joining the output directory

        # If num_samples is provided, randomly sample `num_samples` questions
        all_questions = task_data
        if num_samples:
            all_questions = random.sample(task_data, min(num_samples, len(task_data)))

        metrics = {
            "total_questions": len(all_questions),
            "correct": 0,
            "wrong": 0,
            "per_choice_stats": defaultdict(lambda: {"correct": 0, "total": 0}),
            "confidence_when_correct": [],
            "confidence_when_wrong": [],
            "all_results": [],
            "model_name": self.model_name
        }

        # Wrap the question processing with tqdm for progress indication
        for q in tqdm(all_questions, desc="Evaluating", unit="question"):
            result = self.answer_mcq(q)
            metrics["all_results"].append(result)

            correct = result["answer"] == result["correct_answer"]
            if correct:
                metrics["correct"] += 1
                metrics["confidence_when_correct"].append(result["confidence"])
            else:
                metrics["wrong"] += 1
                metrics["confidence_when_wrong"].append(result["confidence"])

            # Per-choice stats (skipped when the question has no known correct answer)
            true = result["correct_answer"]
            if true:
                metrics["per_choice_stats"][true]["total"] += 1
                if correct:
                    metrics["per_choice_stats"][true]["correct"] += 1

        # Guard against an empty task so an empty file does not abort the whole run
        total = metrics["total_questions"]
        metrics["accuracy"] = metrics["correct"] / total if total else 0.0
        # Entries only exist once their "total" has been incremented, so it is never zero here
        metrics["per_choice_accuracy"] = {
            k: {"accuracy": v["correct"] / v["total"], "total": v["total"]} for k, v in metrics["per_choice_stats"].items()
        }

        # Always hand back a plain dict (JSON-friendly, no implicit key creation on lookup)
        metrics["per_choice_stats"] = dict(metrics["per_choice_stats"])

        # Save results
        if save_results:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            fname = f"results_task_{self.model_name}_{style}_as_plain_{timestamp}.json"
            # Model names such as "org/model" would otherwise be read as sub-directories
            fname = fname.replace("/", "_")

            # Create the directory up front so a finished evaluation is not lost at write time
            os.makedirs(output_dir, exist_ok=True)
            path = os.path.join(output_dir, fname)

            with open(path, "w", encoding="utf-8") as f:
                json.dump(metrics, f, indent=2)

        return metrics

# Llama 2

### Install bitsandbytes

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel
%pip install bitsandbytes

### Llama 2 model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from typing import Dict

class Llama2Model(MCQ):
    """MCQ evaluator that scores answer options with a 4-bit quantized causal language model."""

    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-hf"):
        """
        Load the tokenizer and the 4-bit quantized model for ``model_name``.

        Args:
            model_name: HuggingFace model name to load.
        """
        super().__init__(model_name=model_name)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)

        # Initialize tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # device_map="auto" already places the weights; calling .to() afterwards on a
        # bitsandbytes-quantized, accelerate-dispatched model is unsupported and can raise.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto"
        )

    def answer_mcq(self, question_data: Dict) -> Dict:
        """
        Answer a single multiple-choice question using the Llama model.

        Args:
            question_data: Mapping with 'answers' (choice -> answer text), 'question',
                'plain' (the ready-made prompt) and 'correct_answer'.

        Returns:
            Dict with the question, the answers, the predicted choice key ('answer'), its
            next-token probability ('confidence'), the correct answer and the prompt used.
        """
        choices = question_data["answers"]  # 'answers' is the choice -> text dictionary
        premise = question_data["question"]
        prompt = question_data["plain"]

        # Send the inputs to wherever the model was actually placed
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Inference only: skip autograd bookkeeping to save memory
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Next-token distribution at the last position of the prompt
        logits = outputs.logits[:, -1, :]
        probabilities = torch.softmax(logits, dim=-1)

        # Score each choice by the probability of the first token of its answer text.
        # NOTE(review): answers that share a first token get identical scores — confirm this is intended.
        choice_probabilities = {}
        for choice, answer in choices.items():
            choice_token_id = self.tokenizer.encode(answer, add_special_tokens=False)[0]
            choice_probabilities[choice] = probabilities[0, choice_token_id].item()

        # Select the choice with the highest probability
        predicted_choice = max(choice_probabilities, key=choice_probabilities.get)
        confidence = choice_probabilities[predicted_choice]

        return {
            "question": premise,
            "answers": choices,
            "answer": predicted_choice,
            "confidence": confidence,
            "correct_answer": question_data["correct_answer"],
            "prompt_used": prompt
        }


In [None]:
# Build the Llama 2 evaluator with its default checkpoint ("meta-llama/Llama-2-7b-hf")
llama2_model = Llama2Model()

In [None]:
# Evaluate on every matching prompt file under rephrased_prompts/meta-llama_Llama-2-7b-hf
evaluate_model(llama2_model, "meta-llama_Llama-2-7b-hf")

# Llama 3.2

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
class Llama3BModel(MCQ):
    """MCQ evaluator that scores choices with the next-token distribution of a Llama 3.2 model."""

    def __init__(self, model_name: str = "meta-llama/Llama-3.2-3B-Instruct"):
        """
        Load the tokenizer, the bfloat16 model and a single-token text-generation pipeline.

        Args:
            model_name: HuggingFace model name to load.
        """
        super().__init__(model_name=model_name)

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Reuse EOS as the padding token and pad on the left
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = 'left'

        # Load the Llama model. device_map="auto" already places the weights, so the model
        # is not moved again with .to(): moving an accelerate-dispatched model is discouraged.
        # NOTE(review): trust_remote_code=True runs code shipped in the model repo — only
        # use model names from sources you trust.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.bfloat16,  # Use bfloat16 for efficiency
            device_map="auto",           # Automatically map model layers to available devices
            trust_remote_code=True       # Trust custom code if available in the model repo
        )

        # Set up the text generation pipeline
        self.llm = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            pad_token_id=self.tokenizer.eos_token_id,
            max_new_tokens=1,  # Generate a single token for efficiency
            do_sample=False     # For deterministic output
        )

    def answer_mcq(self, question_data: Dict) -> Dict:
        """
        Answer a single multiple-choice question using the Llama model.

        Args:
            question_data: Mapping with 'choices' (or, if absent, 'answers'), 'question',
                'plain' (the ready-made prompt) and 'correct_answer'.

        Returns:
            Dict with the question, the choices, the predicted choice ('answer'), its
            next-token probability ('confidence'), the correct answer and the prompt used.
        """
        # Other evaluators in this notebook read the options from "answers"; accept either key
        choices = question_data["choices"] if "choices" in question_data else question_data["answers"]

        prompt = question_data["plain"]
        # Send the inputs to wherever the model was actually placed
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Inference only: skip autograd bookkeeping to save memory
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits[:, -1, :]  # Logits for the last token in the sequence

        # Softmax over the vocabulary gives next-token probabilities
        probabilities = torch.softmax(logits, dim=-1)

        # Score each item yielded by iterating `choices` (the keys, if it is a dict) by the
        # probability of its first token
        choice_probabilities = {}
        for choice in choices:
            choice_token_id = self.tokenizer.encode(choice, add_special_tokens=False)[0]
            choice_probabilities[choice] = probabilities[0, choice_token_id].item()

        # Select the choice with the highest probability
        predicted_choice = max(choice_probabilities, key=choice_probabilities.get)
        confidence = choice_probabilities[predicted_choice]

        return {
            "question": question_data["question"],
            "choices": choices,
            "answer": predicted_choice,
            "confidence": confidence,
            "correct_answer": question_data["correct_answer"],
            "prompt_used": prompt
        }


In [None]:
# Build the Llama 3.2 evaluator with its default checkpoint ("meta-llama/Llama-3.2-3B-Instruct")
llama_3 = Llama3BModel()

In [None]:
# Evaluate on every matching prompt file under rephrased_prompts/meta-llama_Llama-3.2-3B-Instruct
evaluate_model(llama_3, "meta-llama_Llama-3.2-3B-Instruct")

# BART

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from transformers.models.bart.modeling_bart import BartForSequenceClassification # Explicitly import the model class


# Define ENTAILMENT_IDX based on the task (usually 2 for MNLI)
ENTAILMENT_IDX = 2

class BartModel(MCQ):
    """MCQ evaluator that picks the choice with the highest entailment probability."""

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        """
        Load the sequence-classification model and tokenizer for ``model_name``.

        Args:
            model_name: HuggingFace model name to load.
        """
        super().__init__(model_name=model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def answer_mcq(self, question_data: dict, phrase_replacement_dict: dict = None) -> dict:
        """
        Answer a single multiple-choice question efficiently using a BART-style model.
        Choices are given as a dictionary (e.g., {"A": "Choice1", "B": "Choice2", ...})

        Args:
            question_data: Mapping with 'question', 'answers' (choice -> answer text),
                'plain' (the ready-made prompt) and optionally 'correct_answer'.
            phrase_replacement_dict: Unused. Optional because ``evaluate_task`` calls
                ``answer_mcq`` with ``question_data`` only.

        Returns:
            Dict with the question, the choices, the predicted choice key ('answer'), its
            entailment probability ('confidence'), the correct answer (None if absent)
            and the prompt used.
        """

        premise = question_data["question"]
        choices_dict = question_data["answers"]  # e.g., {"A": "Louis XIII", ...}
        choice_keys = list(choices_dict.keys())
        choice_texts = list(choices_dict.values())

        # The prompt is taken ready-made from the question data
        prompt = question_data["plain"]

        # Prepare the batched input: one sequence per choice
        # Each input is: "<prompt>\n\nThe answer is {choice_text}."
        batch_texts = [f"{prompt}\n\nThe answer is {text}." for text in choice_texts]

        # Tokenize as a batch
        inputs = self.tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.model.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits
            # Softmax over the class dimension and pick the entailment column
            probs = torch.softmax(logits, dim=1)[:, ENTAILMENT_IDX]

        # Pick the choice with the highest entailment probability
        max_idx = int(probs.argmax())
        probs = probs.cpu().tolist()
        predicted_key = choice_keys[max_idx]
        confidence = probs[max_idx]

        return {
            "question": premise,
            "choices": choices_dict,
            "answer": predicted_key,
            "confidence": confidence,
            "correct_answer": question_data.get("correct_answer", None),
            "prompt_used": prompt
        }


In [None]:
# Build the BART evaluator with its default checkpoint ("facebook/bart-large-mnli")
bart = BartModel()

In [None]:
# Evaluate on every matching prompt file under rephrased_prompts/facebook_bart-large-mnli
evaluate_model(bart, "facebook_bart-large-mnli")

# DeBERTa

In [None]:
from typing import Dict, Any, List, Optional
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class DeBERTaNLI_MCQ(MCQ):
    """
    Multiple-choice via zero-shot NLI with:
      model = MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli
      premise  = question_data["plain"] (the ready-made prompt)
      hypothesis(choice) = "The answer is {choice}."
      score = P(entailment | premise, hypothesis)
    """

    def __init__(self, model_name: str =
                 "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"):
        """
        Load tokenizer and model, move the model to GPU when available and switch to eval mode.

        Args:
            model_name: HuggingFace model name to load.
        """
        super().__init__(model_name=model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        self.model.eval()
        self.entail_idx = self._entailment_index()

    def _entailment_index(self) -> int:
        """Return the index of the label whose name starts with "entail", falling back to 2."""
        # Robustly detect the entailment label index from the model config
        id2label = getattr(self.model.config, "id2label", None)
        if isinstance(id2label, dict):
            for i, name in id2label.items():
                if str(name).lower().startswith("entail"):
                    return int(i)
        # Fallback to common MNLI ordering (C, N, E) -> index 2
        return 2

    def answer_mcq(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Answer one multiple-choice question by scoring each choice as an NLI hypothesis.

        Args:
            question_data: Mapping with 'plain' (the premise), 'question', a 'choices' (or,
                if absent, 'answers') dict of key -> text and optionally 'correct_answer'.

        Returns:
            Dict with the question, the choices, the best key ('answer') and its text,
            its entailment probability ('confidence'), the per-key entailment probabilities,
            the correct answer (None if absent), the premise used and the model name.
        """
        # The premise is the ready-made prompt stored with the question
        premise = question_data["plain"]

        # Other evaluators in this notebook read the options from "answers"; accept either key
        choices: Dict[str, str] = (
            question_data["choices"] if "choices" in question_data else question_data["answers"]
        )  # {"A": "...", "B": "..."}
        keys: List[str] = list(choices.keys())
        texts: List[str] = [choices[k] for k in keys]

        # One (premise, hypothesis) pair per choice
        premises = [premise] * len(texts)
        hyps = [f"The answer is {t}." for t in texts]

        inputs = self.tokenizer(
            premises,
            hyps,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits  # one row of class logits per choice
            probs = F.softmax(logits, dim=1)[:, self.entail_idx]  # entailment probs

        best_idx = int(probs.argmax().item())
        confidences = probs.detach().cpu().tolist()

        return {
            "question": question_data["question"],
            "choices": choices,
            "answer": keys[best_idx],
            "answer_text": texts[best_idx],
            "confidence": confidences[best_idx],
            "all_entailment_probs": {k: p for k, p in zip(keys, confidences)},
            "correct_answer": question_data.get("correct_answer"),
            "prompt_used": premise,
            "model_name": self.model.config._name_or_path
        }


In [None]:
# Build the DeBERTa NLI evaluator with its default checkpoint
deberta = DeBERTaNLI_MCQ()

In [None]:
# Evaluate on every matching prompt file under
# rephrased_prompts/MoritzLaurer_DeBERTa-v3-large-mnli-fever-anli-ling-wanli
evaluate_model(deberta, "MoritzLaurer_DeBERTa-v3-large-mnli-fever-anli-ling-wanli")