In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
pip install torch accelerate transformers datasets

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [3]:
import numpy as np
import pandas as pd
import os
import gc
import json
import torch
import pickle
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login

In [None]:
login("hf-access-token")

In [None]:
class TruthFlowExtractor:
    def __init__(self, model_id="google/gemma-2-2b", max_samples=408):
        self.model_id = model_id
        self.max_samples = max_samples
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.tokenizer = None
        self.representations = {"query": [], "correct": [], "incorrect": []}
        self.layer_count = 0

    def setup_model(self):
        self.hf_token = "hf-access-token"  
        login(token=self.hf_token)
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            token=self.hf_token,
            trust_remote_code=True
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            output_hidden_states=True,
            token=self.hf_token,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )
        self.model.eval()
        self.layer_count = len(self.model.model.layers)
        print(f"Loaded {self.model_id} with {self.layer_count} layers")

    def load_dataset(self, mode="train"):
        ds = load_dataset("truthful_qa", "multiple_choice", split="validation")
        if mode == "train":
            ds = ds.select(range(0, self.max_samples))
        elif mode == "test":
            ds = ds.select(range(self.max_samples, 817))
        else:
            raise ValueError("mode must be 'train' or 'test'")

        processed = []
        for ex in ds:
            q = ex["question"]
            choices = ex["mc1_targets"]["choices"]
            labels = ex["mc1_targets"]["labels"]
            try:
                best = choices[labels.index(1)]
                wrong = choices[labels.index(0)]
            except ValueError:
                continue
            processed.append({"question": q, "best_answer": best, "incorrect_answer": wrong})

        print(f"✓ Loaded {len(processed)} clean samples for {mode}")
        return processed

    def extract_representation(self, text, return_last=True):
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.model.device)

        with torch.no_grad():
            output = self.model(**inputs, output_hidden_states=True)

        hidden_states = output.hidden_states

        if return_last:
            last_token_index = inputs['input_ids'].shape[1] - 1
            representations = [layer[:, last_token_index, :].squeeze(0) for layer in hidden_states]
        else:
            attention_mask = inputs['attention_mask']
            token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
            representations = [
                (layer * attention_mask.unsqueeze(-1)).sum(dim=1) / token_counts
                for layer in hidden_states
            ]
            representations = [r.squeeze(0) for r in representations]

        return representations

    def extract_answer_only_representation(self, question, answer):
    # Tokenize question and answer separately
        q_tokens = self.tokenizer(question, return_tensors="pt", add_special_tokens=False)
        a_tokens = self.tokenizer(answer, return_tensors="pt", add_special_tokens=False)
    
        # Concatenate input_ids and attention_mask
        input_ids = torch.cat([q_tokens["input_ids"], a_tokens["input_ids"]], dim=1)
        attention_mask = torch.cat([q_tokens["attention_mask"], a_tokens["attention_mask"]], dim=1)
        inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
    
        with torch.no_grad():
            output = self.model(**inputs)
        hidden_states = output.hidden_states  # list of [1, seq_len, hidden_dim]
    
        # Indices for answer tokens
        answer_start = q_tokens["input_ids"].shape[1]
        answer_end = input_ids.shape[1]
    
        result = []
        for layer_h in hidden_states:
            # Only average over answer tokens
            answer_h = layer_h[0, answer_start:answer_end, :]
            result.append(answer_h.mean(dim=0).cpu())
        return result

    def run_extraction(self, data):
        for sample in tqdm(data):
            q = sample["question"]

            q_repr = self.extract_representation(q, return_last=True)

            self.representations["query"].append(q_repr)
            torch.cuda.empty_cache()

    def save_npz(self, path="truthflow_hiddenstates.npz"):
        def stack_group(name):
            return np.stack([torch.stack(x).cpu().numpy() for x in self.representations[name]])

        np.savez(path,
                 query=stack_group("query"),
                 correct=stack_group("correct"),
                 incorrect=stack_group("incorrect"))
        print(f"Saved to {path}")




In [6]:
extractor = TruthFlowExtractor()
extractor.setup_model()

print("Model device:", next(extractor.model.parameters()).device)

# Training set
train_data = extractor.load_dataset(mode="train")
extractor.run_extraction(train_data)
extractor.save_npz("/kaggle/working/truthflow_train_hiddenstates.npz")

# Test set
test_data = extractor.load_dataset(mode="test")
extractor.run_extraction(test_data)
extractor.save_npz("/kaggle/working/truthflow_test_hiddenstates.npz")


tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

2025-06-29 18:14:42.372072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751220882.798826      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751220882.908438      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Loaded google/gemma-2-2b with 26 layers
Model device: cuda:0


README.md: 0.00B [00:00, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/271k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

✓ Loaded 408 clean samples for train


  0%|          | 0/408 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 408/408 [01:32<00:00,  4.42it/s]


Saved to /kaggle/working/truthflow_train_hiddenstates.npz
✓ Loaded 409 clean samples for test


100%|██████████| 409/409 [01:31<00:00,  4.46it/s]


Saved to /kaggle/working/truthflow_test_hiddenstates.npz
