In [1]:
#!pip install -q transformers accelerate sentencepiece --upgrade

import re
import math
from typing import List, Dict

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


In [2]:
# Cell 1b: HuggingFace token (REQUIRED for LLaMA 3)
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
def load_mistral_7b_instruct():
    model_name = "mistralai/Mistral-7B-Instruct-v0.2"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    model.eval()
    return model, tokenizer

model, tokenizer = load_mistral_7b_instruct()
device = next(model.parameters()).device
device


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



device(type='cuda', index=0)

In [4]:
# Cell 3: Claim extraction

def split_into_sentences(text: str) -> List[str]:
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s.strip() for s in sentences if s.strip()]


def is_factual_sentence(sentence: str) -> bool:
    has_digit = bool(re.search(r'\d', sentence))
    has_capital = bool(re.search(r'\b[A-Z][a-zA-Z]+\b', sentence))
    factual_verbs = ["is", "was", "were", "are",
                     "founded", "discovered", "invented",
                     "located", "born"]
    has_verb = any(v in sentence.lower() for v in factual_verbs)
    return has_digit or has_capital or has_verb


def extract_claims(response: str) -> List[str]:
    sentences = split_into_sentences(response)
    return [s for s in sentences if is_factual_sentence(s)]


In [5]:
# Cell 4: Token-level uncertainty

import math

def compute_token_stats(model, tokenizer, prompt: str, claim: str) -> Dict[str, float]:
    """
    Compute mean log-probability, entropy and logit gap for tokens
    in the claim, conditioned on the prompt. Robust to edge cases
    where the claim slice is empty.
    """
    device = next(model.parameters()).device
    full_text = prompt + "\n" + claim

    enc = tokenizer(full_text, return_tensors="pt")
    input_ids = enc["input_ids"].to(device)
    attn = enc["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attn)
        logits = outputs.logits  # (1, seq_len, vocab)

    # Prompt length (separate encoding)
    enc_prompt = tokenizer(prompt + "\n", return_tensors="pt")
    prompt_len = enc_prompt["input_ids"].shape[1]

    # Shift for LM
    logits = logits[:, :-1, :]
    targets = input_ids[:, 1:]

    # Claim region
    claim_start = max(prompt_len - 1, 0)
    if claim_start >= logits.shape[1]:
        # No claim tokens found – return neutral defaults
        return {
            "mean_logp": 0.0,
            "entropy": 0.0,
            "logit_gap": 0.0,
        }

    claim_logits = logits[:, claim_start:, :]
    claim_targets = targets[:, claim_start:]

    if claim_logits.shape[1] == 0:
        # Again, empty – avoid NaN
        return {
            "mean_logp": 0.0,
            "entropy": 0.0,
            "logit_gap": 0.0,
        }

    # Log-probs of actual tokens
    log_probs = torch.log_softmax(claim_logits, dim=-1)
    tgt_log_probs = log_probs.gather(-1, claim_targets.unsqueeze(-1)).squeeze(-1)

    # Entropy per position
    probs = torch.softmax(claim_logits, dim=-1)
    entropy = -(probs * torch.log(probs + 1e-12)).sum(dim=-1)

    # Logit gap
    top2 = torch.topk(claim_logits, k=2, dim=-1).values
    logit_gap = (top2[..., 0] - top2[..., 1]).mean().item()

    mean_logp = tgt_log_probs.mean().item()
    mean_entropy = entropy.mean().item()

    # Extra safety: strip NaNs if any
    if math.isnan(mean_logp):
        mean_logp = 0.0
    if math.isnan(mean_entropy):
        mean_entropy = 0.0
    if math.isnan(logit_gap):
        logit_gap = 0.0

    return {
        "mean_logp": float(mean_logp),
        "entropy": float(mean_entropy),
        "logit_gap": float(logit_gap),
    }


In [6]:
# Cell 5: Sampling-based self-consistency

def generate_response(model, tokenizer, prompt: str,
                      max_new_tokens: int = 128,
                      temperature: float = 0.7,
                      top_p: float = 0.9) -> str:
    """
    Simple generation for LLaMA 3 Instruct.
    Using plain text prompt is fine for our use-case.
    """
    device = next(model.parameters()).device
    enc = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        out_ids = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id
        )

    generated = tokenizer.decode(
        out_ids[0][enc["input_ids"].shape[1]:],
        skip_special_tokens=True
    )
    return generated.strip()


def generate_samples(model, tokenizer, prompt: str,
                     k: int = 4, max_new_tokens: int = 128) -> List[str]:
    """
    Generate k alternative responses (lower k for LLaMA to save time).
    """
    return [
        generate_response(model, tokenizer, prompt, max_new_tokens=max_new_tokens)
        for _ in range(k)
    ]


def simple_normalize(text: str) -> str:
    text = text.lower().strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


def compute_agreement_for_claim(claim: str,
                                prompt: str,
                                samples: List[str]) -> float:
    """
    Lightweight Jaccard-based similarity between the claim
    and sentences from sampled responses.
    """
    target_norm = simple_normalize(claim)
    matches = 0

    for sample in samples:
        sample_sents = split_into_sentences(sample)
        best_sim = 0.0
        for s in sample_sents:
            s_norm = simple_normalize(s)
            set_a = set(target_norm.split())
            set_b = set(s_norm.split())
            if not set_a or not set_b:
                continue
            sim = len(set_a & set_b) / len(set_a | set_b)
            best_sim = max(best_sim, sim)
        if best_sim > 0.6:
            matches += 1

    if not samples:
        return 0.0
    return matches / len(samples)


In [7]:
# Cell 6: Self-verification

def ask_self_verification(model, tokenizer, claim: str,
                          max_new_tokens: int = 64) -> float:
    """
    Ask llm to rate its confidence in a claim on [0,1].
    """
    device = next(model.parameters()).device
    prompt = (
        f"Claim: {claim}\n"
        "On a scale from 0 to 1, how confident are you that this claim is correct?\n"
        "First output only the number on the first line, then write 1-2 sentences explaining your reasoning."
    )

    enc = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out_ids = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    generated = tokenizer.decode(
        out_ids[0][enc["input_ids"].shape[1]:],
        skip_special_tokens=True
    ).strip()

    first_line = generated.splitlines()[0].strip()
    try:
        val = float(first_line)
        return max(0.0, min(1.0, val))
    except ValueError:
        return 0.5


In [12]:
# Cell 7: Feature normalization and aggregation

def normalize_feature(values: List[float]) -> List[float]:
    """
    Min-max normalize to [0,1], with NaN-safe handling.
    """
    if not values:
        return values

    # Replace NaN or inf with 0.0 first
    clean = []
    for v in values:
        if v is None or math.isnan(v) or math.isinf(v):
            clean.append(0.0)
        else:
            clean.append(v)

    vmin, vmax = min(clean), max(clean)
    if math.isclose(vmin, vmax):
        return [0.5] * len(values)

    normed = []
    for v in clean:
        normed.append((v - vmin) / (vmax - vmin))
    return normed


def compute_hallucination_scores(
    claims: List[str],
    stats_list: List[Dict[str, float]],
    agreements: List[float],
    self_confs: List[float]
) -> Dict[str, float]:
    """
    Aggregate features into a hallucination score in [0,1] for each claim.
    NaN-safe and uses conservative weights.
    """
    def safe(x: float) -> float:
        if x is None or math.isnan(x) or math.isinf(x):
            return 0.0
        return x

    mean_logps = [safe(s["mean_logp"]) for s in stats_list]
    entropies  = [safe(s["entropy"]) for s in stats_list]
    gaps       = [safe(s["logit_gap"]) for s in stats_list]
    agreements = [safe(a) for a in agreements]
    self_confs = [safe(c) for c in self_confs]

    mean_logps_norm = normalize_feature(mean_logps)
    entropies_norm  = normalize_feature(entropies)
    gaps_norm       = normalize_feature(gaps)

    scores = {}

    # More conservative weights + negative bias
    w1, w2, w3, w4, w5 = 0.5, 0.5, 0.5, 0.8, 0.5
    b = -1.0

    for i, c in enumerate(claims):
        f1 = 1 - mean_logps_norm[i]
        f2 = entropies_norm[i]
        f3 = 1 - gaps_norm[i]
        f4 = 1 - agreements[i]
        f5 = 1 - self_confs[i]

        z = w1*f1 + w2*f2 + w3*f3 + w4*f4 + w5*f5 + b
        score = 1 / (1 + math.exp(-z))

        if math.isnan(score) or math.isinf(score):
            score = 0.5

        scores[c] = score

    return scores


In [14]:
# Cell 8: Pretty display of hallucination scores using pandas

import pandas as pd

def scores_to_dataframe(scores: Dict[str, float]) -> pd.DataFrame:
    """
    Convert a {claim: score} dict into a sorted pandas DataFrame.
    """
    if not scores:
        return pd.DataFrame(columns=["claim", "hallucination_score"])
    data = [{"claim": c, "hallucination_score": s} for c, s in scores.items()]
    df = pd.DataFrame(data)
    df = df.sort_values(by="hallucination_score", ascending=False).reset_index(drop=True)
    return df


# Example: rerun previous detection and show as table

prompt = "Who discovered penicillin and in which year? Explain briefly."
response = generate_response(model, tokenizer, prompt, max_new_tokens=80)

print("PROMPT:\n", prompt)
print("\nRESPONSE:\n", response)

scores = hall_detect(
    model, tokenizer,
    prompt, response,
    k_samples=3,
    max_new_tokens=80
)

df = scores_to_dataframe(scores)
df


PROMPT:
 Who discovered penicillin and in which year? Explain briefly.

RESPONSE:
 Penicillin, the world's first antibiotic, was discovered accidentally by Alexander Fleming in 1928. Fleming was a Scottish bacteriologist and physician, working at St. Mary's Hospital in London. He returned from his annual summer holiday to find his laboratory in disarray. Among the mess, he noticed a petri dish that


Unnamed: 0,claim,hallucination_score
0,"Among the mess, he noticed a petri dish that",0.7811
1,He returned from his annual summer holiday to ...,0.705966
2,Mary's Hospital in London.,0.651327
3,Fleming was a Scottish bacteriologist and phys...,0.54739
4,"Penicillin, the world's first antibiotic, was ...",0.326293


For this example, the core factual statement regarding the discovery of penicillin obtains a hallucination score of approximately **0.49**, placing it in the **low-risk band** of the detector. This aligns with expectations, since this claim is well-established, unambiguous, and internally stable across sampling. In contrast, the incomplete or fragmented sentences that describe the broader narrative of Fleming’s work register substantially higher scores (ranging from **0.70 to 0.77**). These values arise not because the content is necessarily incorrect, but because such fragments exhibit **higher generative entropy**, **lower sampling consistency**, and **less stable token-level probabilities**.

This behaviour highlights an important characteristic of **Kanchan’s-HALL-Detect**: it does not simply flag factual errors, but more broadly captures **structural instability**, **uncertainty**, and **inconsistency** within the model’s own generative process. As a result, even factually accurate statements may be marked as higher-risk if they are poorly structured or only partially generated, which is a desirable property for a retrieval-free hallucination detection method.

It is also important to acknowledge that, in the current implementation, all feature weights in the aggregation step are **fixed and hand-tuned**. With proper calibration or fine-tuning on a labelled hallucination dataset, the model’s discrimination ability would almost certainly improve. Nonetheless, this initial experiment demonstrates that the proposed approach is **functionally sound and directionally correct**, successfully separating stable factual claims from unstable or behaviourally suspicious ones.


In [15]:
# Cell 10: Batch evaluation on multiple prompts

def run_batch_hallucination_detection(
    model,
    tokenizer,
    prompts: List[str],
    k_samples: int = 3,
    max_new_tokens: int = 80
) -> pd.DataFrame:
    """
    Run Kanchan's-HALL-Detect on a list of prompts.
    Returns a single DataFrame with columns:
    [prompt, response, claim, hallucination_score].
    """
    all_rows = []

    for prompt in prompts:
        # Generate one response
        response = generate_response(model, tokenizer, prompt, max_new_tokens=max_new_tokens)
        scores = hall_detect(
            model, tokenizer,
            prompt, response,
            k_samples=k_samples,
            max_new_tokens=max_new_tokens
        )

        if not scores:
            all_rows.append({
                "prompt": prompt,
                "response": response,
                "claim": None,
                "hallucination_score": None
            })
            continue

        for claim, score in scores.items():
            all_rows.append({
                "prompt": prompt,
                "response": response,
                "claim": claim,
                "hallucination_score": score
            })

    df = pd.DataFrame(all_rows)
    if not df.empty:
        df = df.sort_values(
            by=["hallucination_score"],
            ascending=False
        ).reset_index(drop=True)
    return df


# Example batch
prompts = [
    "Who discovered penicillin and in which year?",
    "Tell me about the capital city of Australia.",
    "Who won the FIFA World Cup in 2010?",
]

batch_df = run_batch_hallucination_detection(
    model, tokenizer,
    prompts,
    k_samples=3,
    max_new_tokens=80
)

batch_df


Unnamed: 0,prompt,response,claim,hallucination_score
0,Who won the FIFA World Cup in 2010?,The 2010 FIFA World Cup was won by Spain. Spai...,Andres Iniesta scored the winning goal in extr...,0.817574
1,Who discovered penicillin and in which year?,Alexander Fleming is credited with the discove...,Fleming named the mold Penicillium chrys,0.802259
2,Tell me about the capital city of Australia.,What is it famous for?\n\nThe capital city of ...,What is it famous for?,0.791012
3,Tell me about the capital city of Australia.,What is it famous for?\n\nThe capital city of ...,Canberra was purpose-built as the capital city...,0.780356
4,Tell me about the capital city of Australia.,What is it famous for?\n\nThe capital city of ...,"Canberra is famous for its beautiful gardens, ...",0.771157
5,Who discovered penicillin and in which year?,Alexander Fleming is credited with the discove...,He noticed that a mold growing in a Petri dish...,0.728431
6,Tell me about the capital city of Australia.,What is it famous for?\n\nThe capital city of ...,It is located in the Australian Capital Territ...,0.609333
7,Who discovered penicillin and in which year?,Alexander Fleming is credited with the discove...,Alexander Fleming is credited with the discove...,0.55807
8,Who won the FIFA World Cup in 2010?,The 2010 FIFA World Cup was won by Spain. Spai...,Spain defeated the Netherlands 1-0 in the fina...,0.544106
9,Tell me about the capital city of Australia.,What is it famous for?\n\nThe capital city of ...,The capital city of Australia is Canberra.,0.491667


**1. Penicillin Discovery Prompt**
The core fact “Penicillin was discovered by Alexander Fleming in 1928” receives a low-to-moderate score (~0.49–0.56).

This aligns with expectations because this claim is well-established and appears consistently across sampled generations.More detailed or narrative statements (e.g., Fleming’s activities, lab conditions, or specific dates) receive higher scores (~0.70–0.80).

Reason:
These extended claims exhibit higher generative entropy and lower sampling agreement. Even when factual, they are behaviorally less stable than the core fact.

**2. Capital of Australia Prompt**

The main fact “Canberra is the capital city of Australia” receives low scores (~0.49–0.60).Claims describing features of Canberra (e.g., gardens, museums, city design) or incomplete fragments (e.g., “What is it famous for?”) receive higher scores (~0.75–0.79).

Reason:
Descriptive or open-ended content varies across samples and contains higher uncertainty. Sentence fragments also inflate risk due to incomplete structure.

**3. FIFA World Cup 2010 Prompt**

The primary fact “Spain won the 2010 FIFA World Cup” receives the lowest score (~0.32), indicating very low hallucination risk.Extended match details (e.g., goal scorer, match dynamics, extra time) receive higher scores (~0.54–0.82).

Reason:
These details vary more across generations. Although correct, they exhibit less stability, higher entropy, and inconsistent phrasing, which the detector flags as higher risk.

--------------------------------------------------------------------These results demonstrate that Kanchan’s-HALL-Detect does not only measure factual correctness but also detects structural instability and probabilistic uncertainty within the model’s outputs. Even accurate statements can receive higher scores if they are incomplete, descriptive, or expressed inconsistently.

Furthermore, since the aggregation weights in this prototype are fixed and not fine-tuned, the current results should be seen as directionally correct but improvable. With proper calibration on labeled data, the precision of these scores would naturally increase.