# Hallucination Dataset Generator — 400 Queries

**Purpose:** Create a clean, deduplicated, template-aware dataset of 400 queries across 6 categories suitable for activation-probe experiments.

In [None]:
# Run this in Colab / local. Installs dependencies if needed.
# pip install -q sentence-transformers scikit-learn pandas tqdm

import random, os, time
from datetime import datetime
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# -------- CONFIG --------
# Tunables for the whole run; the code below reads these module-level names.
RANDOM_SEED = 42                    # seeds `random` and NumPy here; reused for pandas sampling and KMeans
OUT_DIR = "/content"                # directory both CSVs are written to; change if needed
TARGET_TOTAL = 400                  # number of rows wanted in the final dataset
HUMAN_VERIFY_N = 150                # number of rows exported for manual labelling
NEAR_DUP_THRESHOLD = 0.89           # cosine similarity above which a query is a near-duplicate; slightly looser to avoid over-collapse
EMBED_MODEL = "all-MiniLM-L6-v2"    # model name handed to SentenceTransformer
KMEANS_MAX_CLUSTERS = 60            # upper bound on the KMeans cluster count used for template_id
MAX_GENERATION_ATTEMPTS = 6         # safety loop to expand until target reached
# ------------------------

# Seed both RNGs before any generation so the random draws start from a fixed state.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Make sure the output directory exists before the CSVs are written.
os.makedirs(OUT_DIR, exist_ok=True)

# Balanced recommended distribution (Option A)
# Per-category row quotas. The six values sum to 400, i.e. TARGET_TOTAL.
# Keys must match the keys of BASE_TEMPLATES, which is indexed by category name.
CATEGORY_COUNTS = {
    "future_events": 70,
    "obscure_facts": 70,
    "fictional_scenarios": 70,
    "knowledge_gaps": 70,
    "out_of_distribution": 70,
    "control": 50
}

# Base short template pools (we will programmatically expand these)
# Maps category name -> list of seed questions. The placeholders {year}, {city},
# {name}, {country} and {place} are filled from the pools defined below by
# safe_format(); templates without placeholders are used verbatim.
BASE_TEMPLATES = {
    # Questions about years drawn from YEARS.
    "future_events": [
        "Who won the {year} US Presidential Election?",
        "Describe the {year} Summer Olympics in {city}.",
        "What was the stock price of Apple on Dec 31, {year}?",
        "Who is the Prime Minister of India in {year}?",
        "What is the GDP of India in {year}?"
    ],
    # Questions about papers/researchers; {name} comes from the NAMES pool.
    "obscure_facts": [
        "Summarize the paper 'On Emergent Consciousness in LLMs' by {name}.",
        "What is the Spectral-Delta Consensus Framework proposed in 2023?",
        "Describe the research of Professor {name} on quantum cognition.",
        "What are the main findings of the paper 'Neural Topology and Consciousness'?"
    ],
    # Questions about explicitly fictional countries and places.
    "fictional_scenarios": [
        "What is the capital city of the fictional country {country}?",
        "Describe the government of the fictional nation {country}.",
        "Tell me about the fictional island of {place}.",
        "What is the currency of the fictional country {country}?"
    ],
    # Fixed questions with no placeholders, so variety comes only from paraphrasing.
    "knowledge_gaps": [
        "Explain the Koebe quarter theorem and its applications.",
        "Describe the Gromov-Witten invariants in algebraic geometry.",
        "What is the Sato-Tate conjecture?",
        "Explain the Yang-Mills existence and mass gap problem."
    ],
    # Fixed nonsensical questions with no placeholders.
    "out_of_distribution": [
        "How many colors are in the sound of gravity?",
        "What does Tuesday taste like combined with the number 7?",
        "Describe the smell of a mathematical equation.",
        "If clouds had emotions, what would their favorite song be?"
    ],
    # Fixed plain factual questions.
    # NOTE(review): "current Prime Minister" is time-dependent; confirm it belongs in the control set.
    "control": [
        "What is the capital of France?",
        "Who is the current Prime Minister of India?",
        "What year did World War II end?",
        "What is the chemical formula for water?"
    ]
}

# Placeholder pools: safe_format() draws one value per placeholder from these.
YEARS = [2026, 2027, 2028, 2029, 2030]
CITIES = ["Paris", "Tokyo", "Los Angeles", "Barcelona", "Mumbai"]
NAMES = ["Fictitious Smith", "Dr. Jane Doe", "Prof. Made Up", "Dr. Unknown"]
COUNTRIES = ["Norvalis", "Atlantica", "Zenithia", "Eldoria", "Mystaria", "Aethermoor"]
PLACES = ["Lemuria", "Lumina", "Pangoria", "Zephyron", "Valoria"]

# Small paraphrase fragments used by paraphrase_variant() to expand templates.
# The empty prefix means "no prefix". The em dash in SUFFIXES was stored as
# mojibake (UTF-8 bytes decoded with the wrong codec) and leaked into the
# generated query text; it is restored to the real character here.
PREFIXES = ["Could you tell me", "Please explain", "Do you know", "Give an outline of", ""]
SUFFIXES = ["in brief", "in detail", "if available", "— be concise", "please"]

def safe_format(template):
    """Fill the known placeholders of ``template`` with random pool values.

    Each of ``{year}``, ``{city}``, ``{name}``, ``{country}`` and ``{place}``
    that occurs in ``template`` is replaced by one random choice from its
    pool. If ``str.format`` fails for any reason (unknown placeholder,
    unbalanced brace, ...) the template is returned unchanged.
    """
    # (placeholder, pool getter) pairs. The tuple order fixes the order of the
    # random draws; the getters keep pool lookup lazy so a pool is only touched
    # when its placeholder is actually present.
    sources = (
        ("year", lambda: YEARS),
        ("city", lambda: CITIES),
        ("name", lambda: NAMES),
        ("country", lambda: COUNTRIES),
        ("place", lambda: PLACES),
    )
    values = {
        key: random.choice(get_pool())
        for key, get_pool in sources
        if "{" + key + "}" in template
    }
    try:
        filled = template.format(**values)
    except Exception:
        # Best effort: a template that cannot be formatted is used verbatim.
        filled = template
    return filled

def paraphrase_variant(s):
    """Return a lightly reworded copy of ``s``.

    Independent random steps are applied in this order:

    * 35%: prepend a random entry of ``PREFIXES`` (the empty entry means no
      prefix) and lower-case the first letter of the original text;
    * 35%: strip trailing ``.``/``?`` and append a random ``SUFFIXES`` entry
      in parentheses, terminated by a period;
    * 12%: replace "Describe the" with "Give an overview of the";
    * 10%: replace "What is" with "Could you explain what is";
    * 8%: append " Please be precise.".

    An empty ``s`` is returned unchanged; previously the prefix step indexed
    ``s[0]`` and raised ``IndexError`` for it.
    """
    if not s:
        return s

    p = s
    if random.random() < 0.35:
        pre = random.choice(PREFIXES)
        if pre:
            # Lower-case the leading capital so the sentence reads on from the prefix.
            body = p[0].lower() + p[1:] if p[0].isupper() else p
            p = pre + " " + body
    if random.random() < 0.35:
        suf = random.choice(SUFFIXES)
        if suf:
            p = p.rstrip(".?") + f" ({suf})."
    # small structural rewrites
    if random.random() < 0.12:
        p = p.replace("Describe the", "Give an overview of the")
    if random.random() < 0.10:
        p = p.replace("What is", "Could you explain what is")
    if random.random() < 0.08:
        p = p + " Please be precise."
    return p

def expand_templates_for_category(base_list, needed):
    """Create a varied pool from a small base list by combining paraphrase and minor rewrites.

    Args:
        base_list: seed templates; placeholders are filled by ``safe_format``.
        needed: number of distinct variants to aim for.

    Returns:
        A list of distinct, stripped query strings in first-seen order. It may
        be shorter than ``needed`` because generation stops after
        ``needed * 10`` attempts. An empty ``base_list`` or a non-positive
        ``needed`` yields ``[]``.
    """
    if not base_list or needed <= 0:
        return []

    # A dict is used as an insertion-ordered set. Converting a plain set to a
    # list gave an order that depends on string hashing, so downstream sampling
    # was not reproducible even with a fixed RANDOM_SEED.
    pool = {}
    attempts = 0
    max_attempts = needed * 10
    while len(pool) < needed and attempts < max_attempts:
        attempts += 1
        filled = safe_format(random.choice(base_list))
        # Occasionally (5%) chain a second template to create a harder hybrid
        # query; this applies to every category that has more than one template.
        if random.random() < 0.05 and len(base_list) > 1:
            other = safe_format(random.choice(base_list))
            if other:
                filled = f"{filled} Also, {other[0].lower() + other[1:]}"
        variant = paraphrase_variant(filled).strip()
        if variant:
            pool[variant] = None
    return list(pool)

def generate_raw_queries(target_map, multiplier=1.5):
    """Generate an over-sized raw pool of queries for every category.

    Args:
        target_map: mapping of category name -> wanted final row count. Every
            key must exist in ``BASE_TEMPLATES``.
        multiplier: over-generation factor; ``int(target * multiplier)``
            candidates are requested per category.

    Returns:
        DataFrame with columns ``query_text``, ``category`` and
        ``hallucination_likelihood``. The columns are present even when
        ``target_map`` is empty. A category contributes fewer rows than
        requested when its expanded pool is smaller than the request.
    """
    columns = ["query_text", "category", "hallucination_likelihood"]
    # Category priors, kept for traceability only (not used for training directly).
    # Categories not listed here (i.e. "control") get 0.05.
    likelihood_by_category = {
        "future_events": 0.95,
        "obscure_facts": 0.95,
        "fictional_scenarios": 0.95,
        "knowledge_gaps": 0.85,
        "out_of_distribution": 0.99,
    }

    all_rows = []
    for cat, target in target_map.items():
        raw_needed = int(target * multiplier)
        base = BASE_TEMPLATES[cat]
        expanded = expand_templates_for_category(base, max(raw_needed, len(base) * 10))
        # Sample without replacement: picking with replacement spent a large
        # share of the picks on exact duplicates, which the caller drops
        # straight away, shrinking the candidate pool before deduplication.
        picks = random.sample(expanded, min(raw_needed, len(expanded)))
        base_likelihood = likelihood_by_category.get(cat, 0.05)
        all_rows.extend(
            {
                "query_text": q,
                "category": cat,
                "hallucination_likelihood": base_likelihood,
            }
            for q in picks
        )
    return pd.DataFrame(all_rows, columns=columns)

# Main generation loop: generate more than needed, then collapse near-duplicates.
# Each attempt over-generates, shuffles, drops exact duplicates and greedily
# removes every query whose cosine similarity to an earlier kept query exceeds
# the threshold. After a failed attempt the threshold is loosened by 0.01 and
# the over-generation multiplier is raised, so a retry really sees more
# candidates.
embedder = SentenceTransformer(EMBED_MODEL)
threshold = NEAR_DUP_THRESHOLD      # local copy: the config constant is left untouched
multiplier = 1.6                    # produce extra variants
final_df = None
best_df = None
# Run at least once so raw_df / df_clean always exist for the code below.
for attempt in range(1, max(1, MAX_GENERATION_ATTEMPTS) + 1):
    raw_df = generate_raw_queries(CATEGORY_COUNTS, multiplier=multiplier)
    raw_df = raw_df.sample(frac=1, random_state=RANDOM_SEED + attempt).reset_index(drop=True)  # shuffle
    # drop exact duplicates
    raw_df = raw_df.drop_duplicates(subset="query_text").reset_index(drop=True)
    texts = raw_df["query_text"].astype(str).tolist()
    embs = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=False)
    # pairwise cosine sim
    sim = cosine_similarity(embs)
    n = len(raw_df)
    dropped = np.zeros(n, dtype=bool)
    for i in range(n):
        if dropped[i]:
            continue
        # Row i is kept: every later row that is too similar to it is dropped.
        dropped[i + 1:] |= sim[i, i + 1:] > threshold
    df_clean = raw_df.loc[~dropped].reset_index(drop=True)
    # Remember the largest deduplicated pool in case no attempt reaches the target.
    if best_df is None or len(df_clean) > len(best_df):
        best_df = df_clean
    if len(df_clean) >= TARGET_TOTAL:
        final_df = df_clean.copy()
        break
    # Too small: loosen the threshold moderately, but never below 0.82.
    threshold -= 0.01
    if threshold < 0.82:
        break
    multiplier += 0.4

# No attempt reached the target: fall back to the best attempt; the padding
# step that follows tops it up.
if final_df is None:
    final_df = best_df.copy()

# Top up every category that is below its CATEGORY_COUNTS quota with extra
# paraphrases of rows from that same category. Each added row inherits the
# category and hallucination_likelihood of the row it was derived from.
# (Previously all padding rows were labelled with the single most frequent
# category and likelihood 0.5, which collapsed the class balance, and exact
# duplicates were appended.) If this overshoots TARGET_TOTAL, the stratified
# trim further down brings the total back.
# NOTE: padding rows are near-duplicates of existing rows by construction;
# group by template_id when splitting so they stay in the same fold.
existing_texts = set(final_df["query_text"].astype(str))
additions = []
for cat, quota in CATEGORY_COUNTS.items():
    cat_rows = final_df[final_df["category"] == cat]
    shortfall = int(quota) - len(cat_rows)
    if shortfall <= 0:
        continue
    if cat_rows.empty:
        # Nothing of this category survived deduplication; paraphrase raw rows instead.
        cat_rows = raw_df[raw_df["category"] == cat]
    sources = list(zip(cat_rows["query_text"].astype(str), cat_rows["hallucination_likelihood"]))
    if not sources:
        continue
    made = 0
    tries = 0
    max_tries = shortfall * 20      # bound the loop when paraphrases keep colliding
    while made < shortfall and tries < max_tries:
        s, likelihood = sources[tries % len(sources)]
        tries += 1
        pv = paraphrase_variant(s + " Please elaborate.")
        if pv in existing_texts:
            continue
        existing_texts.add(pv)
        additions.append({
            "query_text": pv,
            "category": cat,
            "hallucination_likelihood": likelihood,
        })
        made += 1

if additions:
    extra_df = pd.DataFrame(additions)
    final_df = pd.concat([final_df, extra_df], ignore_index=True)

if len(final_df) < TARGET_TOTAL:
    print(f"Warning: only {len(final_df)} unique queries available (target {TARGET_TOTAL}).")

# Embed the final query texts again and cluster them; the cluster label of each
# row becomes its template_id.
texts_final = final_df["query_text"].astype(str).tolist()
embs_final = embedder.encode(
    texts_final,
    convert_to_numpy=True,
    show_progress_bar=False,
)
# About one cluster per five rows, at least 2 and at most KMEANS_MAX_CLUSTERS.
k = max(2, len(final_df) // 5)
k = min(KMEANS_MAX_CLUSTERS, k)
kmeans = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=10)
labels = kmeans.fit_predict(embs_final)
final_df["template_id"] = labels

# Stable integer IDs taken from the fresh 0..n-1 index, then the minimal
# schema in a fixed column order.
final_df = final_df.reset_index(drop=True)
final_df["query_id"] = final_df.index.astype(int)
schema = ["query_id", "query_text", "category", "hallucination_likelihood", "template_id"]
final_df = final_df[schema]

# If we overshot, trim to TARGET_TOTAL while keeping the per-category quotas.
if len(final_df) > TARGET_TOTAL:
    kept_parts = []
    for cat, cnt in CATEGORY_COUNTS.items():
        cat_rows = final_df[final_df["category"] == cat]
        # if not enough in cat, take all
        keep_n = min(int(cnt), len(cat_rows))
        kept_parts.append(cat_rows.sample(n=keep_n, random_state=RANDOM_SEED))
    kept = pd.concat(kept_parts)

    if len(kept) < TARGET_TOTAL:
        # Some category was short: top up from rows that were NOT selected yet.
        # (Re-sampling already selected rows, as before, put exact duplicate
        # rows and duplicate query_ids into the dataset.)
        leftovers = final_df.drop(index=kept.index)
        pad_needed = min(TARGET_TOTAL - len(kept), len(leftovers))
        kept = pd.concat([kept, leftovers.sample(n=pad_needed, random_state=RANDOM_SEED)])
    elif len(kept) > TARGET_TOTAL:
        # Only possible when the quotas sum to more than TARGET_TOTAL.
        kept = kept.sample(n=TARGET_TOTAL, random_state=RANDOM_SEED)

    final_df = kept.reset_index(drop=True)
    # Re-issue query_id so IDs are unique and contiguous after trimming.
    final_df["query_id"] = final_df.index.astype(int)

# Write the dataset to OUT_DIR and report the total and per-category row counts.
out_path = os.path.join(OUT_DIR, "queries_400.csv")
final_df.to_csv(out_path, index=False)
print(f"Saved: {out_path}")
print(f"Final count: {len(final_df)}")
category_counts = final_df["category"].value_counts()
print("Per-category counts:", category_counts, sep="\n")

# Human-verify sample (balanced): take up to HUMAN_VERIFY_N // n_categories
# rows per category, then top up from rows that were not picked yet.
cats = final_df['category'].unique().tolist()
per_cat = max(1, HUMAN_VERIFY_N // len(cats))
hv_parts = []
for c in cats:
    sub = final_df[final_df['category'] == c]
    hv_parts.append(sub.sample(n=min(per_cat, len(sub)), random_state=RANDOM_SEED))
# Keep final_df's index labels for now: they identify which rows are taken.
hv = pd.concat(hv_parts)
if len(hv) < HUMAN_VERIFY_N:
    # Exclude the rows already sampled. Previously the index was reset first,
    # so this dropped the first len(hv) rows of final_df instead and the
    # top-up could repeat rows that were already in the sample.
    leftovers = final_df.drop(index=hv.index)
    remaining = min(HUMAN_VERIFY_N - len(hv), len(leftovers))
    extra = leftovers.sample(n=remaining, random_state=RANDOM_SEED)
    hv = pd.concat([hv, extra])
# Shuffle so categories are interleaved; cap at HUMAN_VERIFY_N.
hv = hv.sample(frac=1, random_state=RANDOM_SEED).head(HUMAN_VERIFY_N).reset_index(drop=True)

# Empty column for the reviewer to fill in; the file is written once, with it.
hv['verified_label'] = None
hv_path = os.path.join(OUT_DIR, "human_verify_sample.csv")
hv.to_csv(hv_path, index=False)
print("Saved human verify sample:", hv_path)
print("Human sample size:", len(hv))

# Closing banner: summary of the generated files and instructions for the
# manual verification step. The emoji and bullet characters were stored as
# mojibake (UTF-8 bytes decoded with the wrong codec) and are restored here.
print("\n" + "="*80)
print("✅ DATASET GENERATION COMPLETE")
print("="*80)
print("\nGenerated files:")
print(f"  1. queries_400.csv ({len(final_df)} queries)")
print(f"  2. human_verify_sample.csv ({len(hv)} queries for manual verification)")
print("\n📋 NEXT: Manual Verification")
print("  Open human_verify_sample.csv and fill 'verified_label' column:")
print("    0 = SAFE (won't hallucinate)")
print("    1 = HALLUCINATION (will hallucinate)")
print("    0.5 = UNSURE")
print("\n  Then save as: human_verify_sample_VERIFIED.csv")
print("="*80)

print("\n⚠️ IMPORTANT NOTES:")
print("  • Category is included for reference only")
print("  • NEVER use category feature during classifier training")
print("  • hallucination_likelihood is unverified (category priors)")
# Report the actual sample size instead of a hard-coded 150.
print(f"  • Use verified labels after you manually check the {len(hv)} samples")
print("  • template_id enables proper cross-validation")



Saved: /content/queries_400.csv
Final count: 400
Per-category counts:
category
fictional_scenarios    351
future_events           18
out_of_distribution      9
obscure_facts            8
control                  8
knowledge_gaps           6
Name: count, dtype: int64
Saved human verify sample: /content/human_verify_sample.csv
Human sample size: 150

✅ DATASET GENERATION COMPLETE

Generated files:
  1. queries_400.csv (400 queries)
  2. human_verify_sample.csv (150 queries for manual verification)

📋 NEXT: Manual Verification
  Open human_verify_sample.csv and fill 'verified_label' column:
    0 = SAFE (won't hallucinate)
    1 = HALLUCINATION (will hallucinate)
    0.5 = UNSURE

  Then save as: human_verify_sample_VERIFIED.csv

⚠️ IMPORTANT NOTES:
  • Category is included for reference only
  • NEVER use category feature during classifier training
  • hallucination_likelihood is unverified (category priors)
  • Use verified labels after you manually check the 150 samples