## Install Dependencies & Setup

In [2]:
# Install required packages
!pip install torch transformers bitsandbytes accelerate -q
!pip install pandas numpy tqdm -q

import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import json

print(f"✅ PyTorch version: {torch.__version__}")
print(f"✅ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h✅ PyTorch version: 2.8.0+cu126
✅ CUDA available: True
✅ GPU: Tesla T4


## Load Dataset from Kaggle

In [3]:
# Mount Google Drive for storage
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Install dependencies as needed:
!pip install kagglehub[pandas-datasets] -q
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "queries_labeled.csv"

# Load the latest version
queries_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "girisankargopakumar/hallucination-detection-600-labeled-queries",
  file_path
)

print(f"✅ Loaded {len(queries_df)} queries")
print(f"Categories: {queries_df['category'].unique()}")
print(f"\nFirst 3 queries:")
print(queries_df[['query_id', 'query_text', 'auto_label']].head(3))

Mounted at /content/drive


  queries_df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/girisankargopakumar/hallucination-detection-600-labeled-queries?dataset_version_number=1&file_name=queries_labeled.csv...


100%|██████████| 70.6k/70.6k [00:00<00:00, 765kB/s]

✅ Loaded 600 queries
Categories: ['future_events' 'obscure_facts' 'fictional_scenarios' 'knowledge_gaps'
 'out_of_distribution' 'control']

First 3 queries:
   query_id                                         query_text  auto_label
0         0                   Who won the 2026 FIFA World Cup?           1
1         1                   Who won the 2029 FIFA World Cup?           1
2         2  What was the stock price of Apple on Dec 31, 2...           1





In [4]:
!pip install bitsandbytes transformers accelerate -q

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# BitsAndBytes config for 4-bit quantization
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

print("Loading Mistral-7B tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

print("Loading Mistral-7B model (4-bit quantized)...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    output_hidden_states=True,
    quantization_config=quant_config
)

print("✅ Mistral-7B loaded; ready for extraction.")
print("Device:", next(model.parameters()).device)
print("Dtype:", next(model.parameters()).dtype)


Loading Mistral-7B tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading Mistral-7B model (4-bit quantized)...


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✅ Mistral-7B loaded; ready for extraction.
Device: cuda:0
Dtype: torch.float16


## Activation Extraction Function

In [5]:
def extract_activations(model, tokenizer, query_text, layers=[3, 4, 5, 6], n_neurons=100):
    """
    Extract hidden state activations from specified layers.
    Returns dictionary with FLATTENED activations (not nested arrays).
    """

    import torch

    # Tokenize input
    inputs = tokenizer(query_text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Forward pass
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Extract hidden states
    hidden_states = outputs.hidden_states

    activations = {}

    for layer_idx in layers:
        # Get activation for this layer: (seq_len, hidden_dim)
        layer_activation = hidden_states[layer_idx][0]  # First token/batch

        # Average across sequence dimension to get single vector
        avg_activation = layer_activation.mean(dim=0)  # (hidden_dim,)

        # Convert to numpy
        avg_activation_np = avg_activation.cpu().detach().numpy()

        # Select first n_neurons and store as individual features
        selected = avg_activation_np[:n_neurons]

        # CRITICAL: Store each neuron as a separate key (flattened)
        for neuron_idx in range(len(selected)):
            key = f"layer_{layer_idx}_neuron_{neuron_idx}"
            activations[key] = float(selected[neuron_idx])  # Convert to Python float

    return activations  # Returns dict with 400 keys (not nested)

# Test on one query
print("Testing activation extraction on first query...")
test_query = queries_df.iloc[0]['query_text']
print(f"Query: {test_query[:100]}...")

test_acts = extract_activations(model, tokenizer, test_query)
print(f"✅ Extracted {len(test_acts)} activation features")
print(f"Sample features: {list(test_acts.keys())[:5]}")
print(f"Sample values: {list(test_acts.values())[:5]}")


Testing activation extraction on first query...
Query: Who won the 2026 FIFA World Cup?...
✅ Extracted 400 activation features
Sample features: ['layer_3_neuron_0', 'layer_3_neuron_1', 'layer_3_neuron_2', 'layer_3_neuron_3', 'layer_3_neuron_4']
Sample values: [-0.00569915771484375, -0.0211181640625, -0.01470184326171875, 0.00797271728515625, -0.0112762451171875]


## Extract Activations for All 600 Queries

In [6]:
from tqdm import tqdm
import pandas as pd
import json

print(f"Extracting activations for {len(queries_df)} queries...")

all_activations = []
failed_queries = []

for idx, row in tqdm(queries_df.iterrows(), total=len(queries_df)):
    query_id = row['query_id']
    query_text = row['query_text']
    auto_label = row['auto_label']

    try:
        # Extract activations
        acts = extract_activations(model, tokenizer, query_text)

        # Add query metadata
        acts['query_id'] = query_id
        acts['auto_label'] = auto_label
        acts['category'] = row['category']

        all_activations.append(acts)

    except Exception as e:
        failed_queries.append({'query_id': query_id, 'error': str(e)})
        print(f"⚠️  Failed on query {query_id}: {str(e)[:50]}")

print(f"\n✅ Successfully extracted {len(all_activations)} queries")
if failed_queries:
    print(f"⚠️  Failed on {len(failed_queries)} queries")

# Convert to DataFrame
activations_df = pd.DataFrame(all_activations)

print(f"\n📊 Activations shape: {activations_df.shape}")
print(f"Columns (first 10): {list(activations_df.columns)[:10]}")
print(f"\nLabel distribution:")
print(activations_df['auto_label'].value_counts())


Extracting activations for 600 queries...


100%|██████████| 600/600 [02:06<00:00,  4.76it/s]



✅ Successfully extracted 600 queries

📊 Activations shape: (600, 403)
Columns (first 10): ['layer_3_neuron_0', 'layer_3_neuron_1', 'layer_3_neuron_2', 'layer_3_neuron_3', 'layer_3_neuron_4', 'layer_3_neuron_5', 'layer_3_neuron_6', 'layer_3_neuron_7', 'layer_3_neuron_8', 'layer_3_neuron_9']

Label distribution:
auto_label
1    507
0     93
Name: count, dtype: int64


## Save Activations

In [7]:
import json

# Save to CSV
output_filename = 'mistral_activations.csv'
activations_df.to_csv(output_filename, index=False)

print(f"✅ Saved {len(activations_df)} queries to {output_filename}")
print(f"File size: {activations_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Also save metadata (FIX: convert int64 to int)
metadata = {
    'model': 'mistralai/Mistral-7B-Instruct-v0.3',
    'n_queries': int(len(activations_df)),  # ← Convert to int
    'n_features': int(len(activations_df.columns) - 3),
    'layers': [3, 4, 5, 6],
    'neurons_per_layer': 100,
    'total_neurons': 400,
    'label_0_count': int((activations_df['auto_label'] == 0).sum()),  # ← Convert to int
    'label_1_count': int((activations_df['auto_label'] == 1).sum())   # ← Convert to int
}

with open('mistral_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"\n📊 Metadata saved")
print(json.dumps(metadata, indent=2))


✅ Saved 600 queries to mistral_activations.csv
File size: 1.88 MB

📊 Metadata saved
{
  "model": "mistralai/Mistral-7B-Instruct-v0.3",
  "n_queries": 600,
  "n_features": 400,
  "layers": [
    3,
    4,
    5,
    6
  ],
  "neurons_per_layer": 100,
  "total_neurons": 400,
  "label_0_count": 93,
  "label_1_count": 507
}


In [8]:
from google.colab import files

# Download files
print("Downloading files...")
files.download('mistral_activations.csv')
files.download('mistral_metadata.json')


Downloading files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Sanity Checks

In [9]:
# Verify extraction quality
print("🔍 Sanity Checks:")
print(f"\n1. Shape check:")
print(f"   Expected: (600, 403) [600 queries × (400 neurons + 3 metadata)]")
print(f"   Actual: {activations_df.shape}")

print(f"\n2. Feature check:")
n_features = len([c for c in activations_df.columns if 'neuron' in c])
print(f"   Total neuron features: {n_features}")
print(f"   Expected: 400 (4 layers × 100 neurons)")

print(f"\n3. Label check:")
print(f"   Safe (label=0): {(activations_df['auto_label']==0).sum()}")
print(f"   Hallucinate (label=1): {(activations_df['auto_label']==1).sum()}")
print(f"   Expected: ~100 safe, ~500 hallucinate")

print(f"\n4. NaN check:")
print(f"   NaN values: {activations_df.isnull().sum().sum()}")
print(f"   Expected: 0")

print(f"\n5. Statistics check:")
neuron_cols = [c for c in activations_df.columns if 'neuron' in c]
print(f"   Mean activation: {activations_df[neuron_cols].mean().mean():.4f}")
print(f"   Std activation: {activations_df[neuron_cols].std().mean():.4f}")

print("\n✅ All checks passed!")


🔍 Sanity Checks:

1. Shape check:
   Expected: (600, 403) [600 queries × (400 neurons + 3 metadata)]
   Actual: (600, 403)

2. Feature check:
   Total neuron features: 400
   Expected: 400 (4 layers × 100 neurons)

3. Label check:
   Safe (label=0): 93
   Hallucinate (label=1): 507
   Expected: ~100 safe, ~500 hallucinate

4. NaN check:
   NaN values: 0
   Expected: 0

5. Statistics check:
   Mean activation: -0.0005
   Std activation: 0.0057

✅ All checks passed!


## Save to Drive

In [10]:
# Copy to Google Drive for backup
import shutil

drive_path = '/content/drive/MyDrive/mistral_activations.csv'
shutil.copy('mistral_activations.csv', drive_path)
print(f"✅ Backed up to Drive: {drive_path}")


✅ Backed up to Drive: /content/drive/MyDrive/mistral_activations.csv


## Classifier

In [11]:
# Install dependencies if needed
!pip install scikit-learn pandas matplotlib seaborn -q

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import json

df = pd.read_csv('mistral_activations.csv')
X = df.drop(['query_id', 'auto_label', 'category'], axis=1)
y = df['auto_label']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, max_depth=12, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"ROC-AUC: {auc:.3f}")
print("Confusion matrix:")
print(cm)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"RF (AUC={auc:.3f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve: Hallucination Prediction (mistral7b)")
plt.legend()
plt.savefig("mistral_roc.png", dpi=150)
plt.close()

results = {
    "model": "mistral-7b-instruct",
    "accuracy": acc,
    "auc": auc,
    "n_samples": len(y),
    "n_features": X.shape[1],
    "confusion_matrix": cm.tolist()
}
with open("llama_results.json", "w") as f:
    json.dump(results, f, indent=2)


Accuracy: 1.000
ROC-AUC: 1.000
Confusion matrix:
[[22  0]
 [ 0 98]]


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

clf = RandomForestClassifier(n_estimators=100, max_depth=12, random_state=42)
scores = cross_val_score(clf, X_scaled, y, cv=5, scoring='roc_auc')

print(f"Cross-validated AUC scores: {scores}")
print(f"Mean AUC: {np.mean(scores):.3f}")
print(f"Std AUC: {np.std(scores):.3f}")

Cross-validated AUC scores: [0.87009804 1.         1.         1.         1.        ]
Mean AUC: 0.974
Std AUC: 0.052
