In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
from typing import Dict, List, Optional, Tuple, Union
import json
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

2025-07-09 19:21:55.773315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752088916.148414      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752088916.255592      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
!pip install huggingface_hub



In [None]:
import os
os.environ['HF_TOKEN'] = 'hf-access-token'

# Login to Hugging Face
from huggingface_hub import login
login(token='hf-access-token')

print("‚úì Hugging Face authentication successful")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


‚úì Hugging Face authentication successful


In [4]:
class TruthFlowInterventionHook:
    """Hook for applying TruthFlow interventions during model generation"""

    def __init__(self, flow_model, svd_basis, layer_id: int,
                 alpha: float = 5, beta: float = 5, gamma: float = 0.5):
        """
        Args:
            flow_model: Trained flow model for this layer
            svd_basis: SVD basis vectors [k, D] from project_svd.py
            layer_id: Which layer to intervene on
            alpha: Flow correction strength
            beta: SVD projection strength
            gamma: Noise regularization strength
        """
        self.flow_model = flow_model
        self.svd_basis = svd_basis  # [k, D]
        self.layer_id = layer_id
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma

        # Store original activations for flow model
        self.original_activations = {}

    def intervention_hook(self, module, input, output):
      
        
    # If output is a tuple, get the first element
        if isinstance(output, tuple):
            output_tensor = output[0]
        else:
            output_tensor = output
    
        batch_size, seq_len, hidden_dim = output_tensor.shape
        dtype = output_tensor.dtype
        device = output_tensor.device
        # Get last token representation (query representation)
        h_q = output_tensor[:, -1, :]  # [batch_size, hidden_dim]
    
        # Store original for flow model
        self.original_activations[self.layer_id] = h_q.clone()
        h_corrected = self.apply_truthflow_correction(h_q)
    
        # Apply TruthFlow correction
        h_corrected = h_corrected.to(dtype=dtype, device=device)
        correction_vector = h_corrected - h_q 
        correction_vector_broadcast = correction_vector.unsqueeze(1).expand(-1, seq_len, -1)
        output_corrected = output_tensor + correction_vector_broadcast
        # Replace last token representation
        #output_corrected = output_tensor.clone()
        #output_corrected[:, -1, :] = h_corrected
        #output_corrected = output_tensor.clone()
        #output_corrected = torch.randn_like(output_tensor)
        # If original output was a tuple, reconstruct it
        if isinstance(output, tuple):
            # Replace the first element with the corrected tensor
            return (output_corrected,) + output[1:]
        else:
            return output_corrected

    def apply_truthflow_correction(self, h_q: torch.Tensor) -> torch.Tensor:
        """Apply the TruthFlow correction to query representations"""
        device = h_q.device
        batch_size = h_q.size(0)

        # Move models to correct device
        self.flow_model = self.flow_model.to(device)
        self.svd_basis = self.svd_basis.to(device)

        corrected_representations = []

        for i in range(batch_size):
            h_q_single = h_q[i:i+1]  # [1, D]

            # Step 1: Flow model correction
            h_flow = self.apply_flow_correction(h_q_single)

            # Step 2: SVD projection
            h_proj = self.apply_svd_projection(h_flow, h_q_single)

            # Step 3: Add regularization noise
            h_final = self.add_regularization_noise(h_proj)

            corrected_representations.append(h_final)

        return torch.cat(corrected_representations, dim=0)

    def apply_flow_correction(self, h_q: torch.Tensor) -> torch.Tensor:
        """Apply flow model correction"""
        self.flow_model.eval()

        with torch.no_grad():
            # Sample time steps for flow correction
            t = torch.ones(h_q.size(0), 1, device=h_q.device) * 0.5  # Mid-point

            # Get flow direction
            flow_direction = self.flow_model(t, h_q)

            # Apply correction with strength alpha
            h_corrected = h_q + self.alpha * flow_direction

        return h_corrected

    def apply_svd_projection(self, h_flow: torch.Tensor, h_q_orig: torch.Tensor) -> torch.Tensor:
        """Project onto SVD truth directions"""
        # Compute difference vector
        delta = h_flow - h_q_orig  # [1, D]

        # Project onto SVD basis
        # svd_basis: [k, D], delta: [1, D]
        projections = torch.matmul(delta, self.svd_basis.T)  # [1, k]
        projected_delta = torch.matmul(projections, self.svd_basis)  # [1, D]

        # Apply projection with strength beta
        h_projected = h_q_orig + self.beta * projected_delta

        return h_projected

    def add_regularization_noise(self, h_proj: torch.Tensor) -> torch.Tensor:
        """Add small regularization noise"""
        if self.gamma > 0:
            noise = torch.randn_like(h_proj) * self.gamma
            return h_proj + noise
        return h_proj



In [5]:
class TruthFlowInterventionManager:
    """Manager for applying TruthFlow interventions to multiple layers"""

    def __init__(self, model, flow_models_dir: str, svd_basis_dir: str,
                 intervention_layers: List[int], device: str = "cuda", pair = "natural_questions"):
        """
        Args:
            model: The language model to intervene on
            flow_models_dir: Directory containing trained flow models
            svd_basis_dir: Directory containing SVD basis vectors
            intervention_layers: List of layer indices to intervene on
            device: Device to run on
        """
        self.model = model
        self.intervention_layers = intervention_layers
        self.device = device
        self.hooks = {}
        self.intervention_hooks = {}

        # Load all required models and basis vectors
        self.load_intervention_components(flow_models_dir, svd_basis_dir,pair)

    def load_intervention_components(self, flow_models_dir: str, svd_basis_dir: str, pair: str):
        """Load flow models and SVD basis vectors for each intervention layer"""
        print("Loading intervention components...")
        

        for layer_id in self.intervention_layers:
            # Load flow model
            flow_model_path = os.path.join(flow_models_dir, f"flow_model_layer{layer_id}_truthfulqa_{pair}.pt")
            if not os.path.exists(flow_model_path):
                raise FileNotFoundError(f"Flow model not found: {flow_model_path}")

            # You'll need to recreate the model architecture
            # This assumes you have the hidden dimension available
            hidden_dim = 2304  # Use the value from your checkpoint!
            flow_model = self.create_flow_model(hidden_dim)
            flow_model.load_state_dict(torch.load(flow_model_path, map_location=self.device))
            flow_model.to(self.device)

            # Load SVD basis
            svd_basis_path = os.path.join(svd_basis_dir, f"layer{layer_id}_truthfulqa_{pair}.pt")
            if not os.path.exists(svd_basis_path):
                raise FileNotFoundError(f"SVD basis not found: {svd_basis_path}")

            print(f"Loading flow model from {flow_model_path}")
            print(f"Loading SVD basis from {svd_basis_path}")

            svd_basis = torch.load(svd_basis_path, map_location=self.device)

            # Create intervention hook
            self.intervention_hooks[layer_id] = TruthFlowInterventionHook(
                flow_model=flow_model,
                svd_basis=svd_basis,
                layer_id=layer_id
            )

            print(f"‚úì Loaded components for layer {layer_id}")

    def get_hidden_dim_for_layer(self, layer_id: int) -> int:
        """Get hidden dimension for a specific layer"""
        # This is model-specific - you may need to adjust based on your model
        # For most transformer models, all layers have the same hidden dimension
        return self.model.config.hidden_size

    def create_flow_model(self, hidden_dim: int):
        """Recreate the flow model architecture"""

        import sys
        sys.path.append('/kaggle/input/final-req-fixed')
        from train_flow_model import FixedFlowModel
        return FixedFlowModel(hidden_dim)

    def register_hooks(self):
        """Register intervention hooks on the model"""
        print("Registering intervention hooks...")

        for layer_id in self.intervention_layers:
            # Get the specific layer module - this is model-specific
            layer_module = self.get_layer_module(layer_id)

            # Register the hook
            hook = layer_module.register_forward_hook(
                self.intervention_hooks[layer_id].intervention_hook
            )
            self.hooks[layer_id] = hook

            print(f"‚úì Registered hook for layer {layer_id}")

    def get_layer_module(self, layer_id: int):
        """Get the specific layer module - Gemma 2 model architecture"""
        # For Gemma 2 models
        print(f"Getting module for layer {layer_id}: {self.model.model.layers[layer_id]}")
        return self.model.model.layers[layer_id]

    def remove_hooks(self):
        """Remove all registered hooks"""
        for layer_id, hook in self.hooks.items():
            hook.remove()
        self.hooks.clear()
        print("‚úì Removed all intervention hooks")

    def set_intervention_strength(self, layer_id: int, alpha: float = None,
                                beta: float = None, gamma: float = None):
        """Adjust intervention strength for a specific layer"""
        if layer_id in self.intervention_hooks:
            hook = self.intervention_hooks[layer_id]
            if alpha is not None:
                hook.alpha = alpha
            if beta is not None:
                hook.beta = beta
            if gamma is not None:
                hook.gamma = gamma
            print(f"‚úì Updated intervention strength for layer {layer_id}")

In [6]:
def generate_with_truthflow(model, tokenizer, prompt: str,
                          intervention_manager: TruthFlowInterventionManager,
                          max_length: int = 100, temperature: float = 0.7,
                          do_sample: bool = True) -> str:
    """Generate text with TruthFlow interventions applied"""
    
    # Register hooks
    intervention_manager.register_hooks()

    try:
        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(intervention_manager.device)

        # Generate with interventions
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return generated_text

    finally:
        # Always remove hooks
        intervention_manager.remove_hooks()

In [7]:
from datasets import load_dataset

def load_eval_data():
    ds = load_dataset("truthful_qa", "multiple_choice", split="validation")
    ds = ds.select(range(408, 817))
    return ds


In [None]:
from scipy.special import logsumexp
def evaluate_layerwise_mcq(
    model_name: str,
    flow_models_dir: str,
    svd_basis_dir: str,
    tokenizer=None,
    layers=[8,10,12,20],
    device="cuda"
):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import torch, json

    eval_data = load_eval_data()

    tokenizer = tokenizer or AutoTokenizer.from_pretrained(
    model_name,
    token='hf-access-token'
    )
    model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16,
    token='hf-access-token'
    ).to(device)
    print("Number of layers in model:", len(model.model.layers))

    all_mc_scores = {}

    for layer in layers:
        print(f"\nüîç Evaluating intervention at layer {layer}")

        manager = TruthFlowInterventionManager(
            model=model,
            flow_models_dir=flow_models_dir,
            svd_basis_dir=svd_basis_dir,
            intervention_layers=[layer],
            device=device,
            pair = "nq")
        manager.register_hooks()

        mc1_total = 0
        mc2_total = 0

        for item in tqdm(eval_data):
            q = item["question"]
            choices = item["mc1_targets"]["choices"]
            correct_indices = item["mc1_targets"]["labels"]
            if isinstance(correct_indices, int):
                correct_indices = [correct_indices]
            correct_indices = list(set(correct_indices))

            # Format as question + choice
            log_probs = []
            for choice in choices:
                prompt = f"{q} {choice}"
                full_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
                question_input_ids = tokenizer(q, return_tensors="pt").input_ids.to(device)
                question_len = question_input_ids.shape[1]
                
                with torch.no_grad():
                    outputs = model(full_input_ids)
                    logits = outputs.logits[0]
                    answer_start_idx = question_len  # This includes the space token
                    answer_logits = logits[answer_start_idx-1:-1]  # -1 because logits are shifted
                    answer_tokens = full_input_ids[0][answer_start_idx:]
                    answer_log_probs = torch.log_softmax(answer_logits, dim=-1)
                    
                    # Gather probabilities for actual answer tokens
                    token_log_probs = answer_log_probs[range(len(answer_tokens)), answer_tokens]
                    # Get log-softmax over vocabulary
                    # Average log probability (to normalize for answer length)
                    # This is the standard approach to avoid length bias
                    avg_log_prob = token_log_probs.mean().item()
                    log_probs.append(avg_log_prob)
            
        
            log_probs_np = np.array(log_probs)

            # MC1: highest probability is best answer
            model_best_index = np.argmax(log_probs_np)
            if model_best_index in correct_indices:
                mc1_total += 1

            # MC2: normalized correct answer mass
            probs = np.exp(log_probs_np)
            probs = probs / np.sum(probs) 
            correct_prob_mass = sum(probs[i] for i in correct_indices)
            mc2_total += correct_prob_mass
            

        mc1_score = 100 * mc1_total / len(eval_data)
        mc2_score = 100 * mc2_total / len(eval_data)
        all_mc_scores[layer] = {"MC1": mc1_score, "MC2": mc2_score}
        manager.remove_hooks()

        print(f"‚úì Layer {layer} | MC1: {mc1_score:.2f} | MC2: {mc2_score:.2f}")

    with open("truthflow_mcq_scores.json", "w") as f:
        json.dump(all_mc_scores, f, indent=2)

    return all_mc_scores


In [9]:
results = evaluate_layerwise_mcq(
    model_name="google/gemma-2-2b",
    flow_models_dir="/kaggle/input/combined-correct",
    svd_basis_dir="/kaggle/input/combined-correct",
    layers=[20]  # intermediate layers (as per paper)
)


README.md: 0.00B [00:00, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/271k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/817 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Number of layers in model: 26

üîç Evaluating intervention at layer 20
Loading intervention components...
Loading flow model from /kaggle/input/combined-correct/flow_model_layer20_truthfulqa_nq.pt
Loading SVD basis from /kaggle/input/combined-correct/layer20_truthfulqa_natural_questions.pt
‚úì Loaded components for layer 20
Registering intervention hooks...
Getting module for layer 20: Gemma2DecoderLayer(
  (self_attn): Gemma2Attention(
    (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
    (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
    (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
    (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
  )
  (mlp): Gemma2MLP(
    (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
    (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
    (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
    (act_fn): PytorchGELUTanh()
  )
 

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 409/409 [02:05<00:00,  3.25it/s]

‚úì Removed all intervention hooks
‚úì Layer 20 | MC1: 70.90 | MC2: 63.91



