In [None]:
import os
# Show which SLURM node(s) this kernel runs on; prints None when the variable is unset.
print(os.getenv('SLURM_JOB_NODELIST'))

In [None]:
import math
import sys
import time
from dataclasses import dataclass
from typing import List, Tuple

import einops
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm.auto as tqdm
from fancy_einsum import einsum
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from transformer_lens import HookedTransformer, HookedTransformerConfig, utils
from transformer_lens.utils import get_corner, gelu_new, tokenize_and_concatenate
from transformer_lens.utils import get_act_name
#from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
%matplotlib inline

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

## Data preprocessing

In [None]:
# --- Emotion dataset: one CSV per split, each echoed right after loading ---

df_train = pd.read_csv(filepath_or_buffer='~/transformers_play/emotional_dataset/training.csv')
print(df_train)

df_test = pd.read_csv(filepath_or_buffer='~/transformers_play/emotional_dataset/test.csv')
print(df_test)

df_val = pd.read_csv(filepath_or_buffer='~/transformers_play/emotional_dataset/validation.csv')
print(df_val)

# --- IMDB sentiment dataset: show the column index, then the frame itself ---

df_senti = pd.read_csv(filepath_or_buffer='~/transformers_play/emotional_dataset/IMDB_Dataset.csv')
print(df_senti.columns, df_senti, sep='\n')

In [None]:
# Rows without any text cannot be turned into a prompt; warn and drop them
# (only the training split is checked here).
if df_train['text'].isna().any():
    print("Warning: Some text entries are missing.")
    df_train = df_train.dropna(subset=['text'])
    
  

In [None]:
# Emotion name -> integer class id, as used by the `label` column of the dataset.
emotion_labels = {
    'sadness': 0,
    'joy': 1,
    'love': 2,
    'anger': 3,
    'fear': 4,
    'surprise': 5
}

# Inverse lookup: integer class id -> emotion name.
id_to_emotion = {label_id: name for name, label_id in emotion_labels.items()}

# Add a human-readable emotion column next to the integer label in both splits.
df_train['emotion_name'] = df_train['label'].map(id_to_emotion)
df_test['emotion_name'] = df_test['label'].map(id_to_emotion)

# Small evaluation subset: the first 50 rows of the test split.
df_test_short = df_test[:50]

# Same id -> name mapping, kept under the name used when printing results.
emotion_names = dict(id_to_emotion)

In [None]:
# Sentiment name -> integer class id.
senti_labels = dict(positive=0, negative=1)

# Inverse lookups (class id -> sentiment name); two names are kept because
# both are defined by this cell.
id_to_senti = {class_id: name for name, class_id in senti_labels.items()}
senti_names = {class_id: name for name, class_id in senti_labels.items()}

# Features (review text) and targets (sentiment string) of the IMDB frame.
X = df_senti['review']
y = df_senti['sentiment']

# Reproducible split of the IMDB data into train and test parts.
# NOTE(review): test_size=0.8 puts 80% of the rows in the *test* part —
# confirm this is intended and not a swapped 0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.8,
    random_state=42,
)



# Initialising Model 

In [None]:
from huggingface_hub import login


# Checkpoint to load through TransformerLens.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Best-effort load with all weight-processing options switched off.
# A failure is only reported, not raised, so the notebook keeps running.
try:
    model_llama = HookedTransformer.from_pretrained(
        model_name,
        fold_ln=False,
        center_unembed=False,
        center_writing_weights=False,
    )
except Exception as e:
    print(f"An error occurred: {e}")
else:
    print(f"Loaded {model_name} successfully.")

In [None]:
import os
from huggingface_hub import login


# The Hugging Face access token is read from the environment, never hard-coded.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

if hf_token:
    # Authenticate this session with the Hugging Face Hub.
    login(token=hf_token)

    model_name = "meta-llama/Llama-2-7b-chat-hf"

    if "model_llama" in globals():
        # An earlier cell already loaded the model. Loading it a second time
        # would keep two copies alive until the assignment completes, so the
        # existing one is reused. `del model_llama` first to force a reload.
        print(f"{model_name} is already loaded; skipping reload.")
    else:
        try:
            model_llama = HookedTransformer.from_pretrained(model_name, fold_ln=False, center_unembed=False, center_writing_weights=False)
            print(f"Loaded {model_name} successfully.")

        except Exception as e:
            print(f"An error occurred: {e}")

else:
    print("Hugging Face token not found. Please set the 'HUGGING_FACE_HUB_TOKEN' environment variable.")

In [None]:
# Tokenizer bundled with the hooked model; later cells use it directly.
tokenizer = model_llama.tokenizer

single_prompt = "The cinema was slow but the acting was excellent. What is the emotion of this text? Choose from sadness, joy, love, anger, fear, surprise. The emotion is:"

# Token ids and their string pieces for the example prompt.
tokens = model_llama.to_tokens(single_prompt)
print(tokens)

token_str = model_llama.to_str_tokens(single_prompt)
print(token_str)

print(tokens.shape)

# Use the device chosen earlier instead of assuming a GPU exists
# (an unconditional .cuda() raises on CPU-only machines).
tokens = tokens.to(device)

# Inference only: no autograd graph is needed for the cached forward pass.
with torch.no_grad():
    logits, cache = model_llama.run_with_cache(tokens)

print(f"logits shape",logits.shape)

print(logits[:1,:5,:5])

# log_softmax yields log-probabilities (all values <= 0), not probabilities.
# The variable name `probs` is kept so other cells that reference it still work.
probs = logits.log_softmax(dim=-1)
print(probs.shape)

# Logits at the last position of the only sequence in the batch.
last_sequence_in_batch = logits[0,-1,:]

# Greedy next-token prediction.
predict_token = last_sequence_in_batch.argmax(dim=-1)
print(predict_token)

# Displayed as the cell output: the decoded predicted token.
model_llama.tokenizer.decode(predict_token.item())


## Constraint based prompting 

In [None]:
print("\n--- Zero-Shot vs. Constraint-Based Prompting Comparison ---")
print("This will process the same text with two different prompts to compare the results.")


def _generate_continuation(prompt_text, max_new_tokens=10):
    """Greedily generate `max_new_tokens` tokens after `prompt_text` and return only the new text."""
    encoded = tokenizer(prompt_text, return_tensors="pt")
    if torch.cuda.is_available():
        encoded = {k: v.to('cuda') for k, v in encoded.items()}
    prompt_ids = encoded['input_ids']

    generated = model_llama.generate(
        prompt_ids,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # Everything after the prompt length is newly generated.
    return tokenizer.decode(generated[0][prompt_ids.shape[1]:], skip_special_tokens=True)


def _first_emotion_substring(generated_text):
    """Return the first emotion (in `emotion_labels` order) occurring as a substring, else "unknown"."""
    lowered = generated_text.lower()
    for emotion_name in emotion_labels.keys():
        if emotion_name in lowered:
            return emotion_name
    return "unknown"


def _first_emotion_word(cleaned_text):
    """Return the first whole word of `cleaned_text` that is an emotion name, else "unknown"."""
    # "a. " / "b. " list markers are removed and newlines become spaces before splitting.
    for word in cleaned_text.replace("a. ", "").replace("b. ", "").replace("\n", " ").split():
        clean_word = word.strip(".,:;").lower()
        if clean_word in emotion_labels.keys():
            return clean_word
    return "unknown"


true_emotions = []
predicted_emotions = []

results_list = []

for i, row in df_test.iterrows():

    text = row['text']
    true_label_name = row['emotion_name']

    # --- Case 1: Zero-Shot Prompting ---
    zero_shot_prompt = f"{text}\nWhat is the emotion of this text? Choose from sadness, joy, love, anger, fear, surprise."
    new_tokens_zero = _generate_continuation(zero_shot_prompt)
    predicted_zero_shot = _first_emotion_substring(new_tokens_zero)

    # --- Case 2: Constraint-Based Prompting ---
    constraint_prompt = f"{text}\nWhat is the single emotion of this text? You must choose one and only one from the following list: sadness, joy, love, anger, fear, surprise. The emotion is:"
    predicted_new_tokens_constraint = _generate_continuation(constraint_prompt)
    cleaned_output = predicted_new_tokens_constraint.strip().lower()
    predicted_constraint = _first_emotion_word(cleaned_output)

    true_emotions.append(true_label_name)
    predicted_emotions.append(predicted_constraint)

    # 1 when the constrained prediction equals the true emotion, 0 otherwise.
    status = 1 if true_label_name == predicted_constraint else 0

    results_list.append({
            'text': text,
            'emotion': true_label_name,
            # Store exactly the prompt that produced the response. A separately
            # typed copy had drifted ("What the single emotion ..."), so later
            # analyses re-ran the model on a different prompt.
            'constrained prompt': constraint_prompt,
            'prompt response' : cleaned_output,
            'predicted emotion': predicted_constraint,
            'Output status': status

        })

    # --- Print Comparison Results ---

    print(f"\n--- Text {i+1} ---")
    print(f"Text: '{text}'")
    print(f"True Emotion: {true_label_name}")

    print("\n--- Zero-Shot Prompting ---")

    print(f"Prompt: {zero_shot_prompt}")
    print(f"Generated Output: {new_tokens_zero.strip()}")
    print(f"Predicted Emotion: {predicted_zero_shot}")

    print("\n--- Constraint-Based Prompting ---")

    print(f"Prompt: {constraint_prompt}")
    print(f"Generated Output: {predicted_new_tokens_constraint.strip()}")
    print(f"Predicted Emotion: {predicted_constraint}")
    print("-" * 20)

In [None]:
# One row per evaluated text, built from the dicts collected in `results_list`.
df_constrained_prompt = pd.DataFrame(data=results_list)
df_constrained_prompt


In [None]:

# Persist the results without the index column so the run can be reloaded later.
df_constrained_prompt.to_csv('constrained_prompt_results_llama2.csv', index=False)

# The message names the file that was actually written.
print("DataFrame saved successfully to 'constrained_prompt_results_llama2.csv'")

In [None]:
# Reload the saved results; a missing file is reported instead of raising.
try:

    df_constrained_prompt_loaded = pd.read_csv('constrained_prompt_results_llama2.csv')

    print("DataFrame loaded successfully.")
    print( df_constrained_prompt_loaded.head())

except FileNotFoundError:
    # The message names the file that was actually looked up.
    print("The file 'constrained_prompt_results_llama2.csv' was not found.")

In [None]:
print("\n--- Generating Performance Histogram ---")
plt.figure(figsize=(8, 6))

# Explicit bin edges [-0.5, 0.5) and [0.5, 1.5] centre the two bars on the
# status values 0 and 1, so they line up with the tick labels below.
# With the default bins the bars sit at the edges of the [0, 1] range.
df_constrained_prompt_loaded['Output status'].hist(bins=[-0.5, 0.5, 1.5], align='mid', rwidth=0.8)

plt.xticks([0, 1], ['Hallucinated', 'Non hallucinated'])
plt.title('Prediction Status: Hallucinated(0) vs. Non hallucinated (1)')
plt.xlabel('Prediction Status')
plt.ylabel('Number of Instances')
plt.show()

In [None]:
# Overall accuracy of the constrained-prompt predictions, from the reloaded results.

true_emotions = df_constrained_prompt_loaded['emotion']
predicted_emotions = df_constrained_prompt_loaded['predicted emotion']

# Any failure while scoring is reported rather than raised.
try:
    accuracy = accuracy_score(true_emotions, predicted_emotions)
except Exception as e:
    print(f"\nCould not calculate accuracy: {e}")
else:
    print("\n--- Overall Accuracy ---")
    print(f"Accuracy: {accuracy:.2f}")
    

In [None]:
# Create and display a confusion matrix to visualize errors

print("\n--- Generating Confusion Matrix ---")

# Build the label list once and sort it, so the axis order is the same on every
# run (set iteration order is not). The union with the predicted labels keeps
# predictions such as 'unknown' visible instead of dropping them from the matrix.
cm_labels = sorted(set(true_emotions) | set(predicted_emotions))

cm = confusion_matrix(true_emotions, predicted_emotions, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

## Mechanistic Interpretability

In [None]:

# Status 0: prediction differs from the true emotion ("hallucinated").
hallu_df = df_constrained_prompt_loaded.loc[
    df_constrained_prompt_loaded['Output status'].eq(0)
].copy()

# Status 1: prediction matches the true emotion.
non_hallu_df = df_constrained_prompt_loaded.loc[
    df_constrained_prompt_loaded['Output status'].eq(1)
].copy()

hallu_df

In [None]:
# Count every (true emotion, predicted emotion) pair among the mismatches,
# most frequent pair first.
hallucination_patterns = (
    hallu_df
    .groupby(['emotion', 'predicted emotion'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

hallucination_patterns

## Joy Hallucinations (True Emotion: Joy)
joy-sadness: hallu_set1_df

joy-love: hallu_set2_df

joy-fear: hallu_set3_df

joy-surprise: hallu_set4_df

joy-anger: hallu_set5_df

## Anger Hallucinations (True Emotion: Anger)
anger-sadness: hallu_set6_df

anger-joy: hallu_set7_df

anger-fear: hallu_set8_df

anger-love: hallu_set9_df

anger-surprise: hallu_set10_df

## Sadness Hallucinations (True Emotion: Sadness)
sadness-fear: hallu_set11_df

sadness-love: hallu_set12_df

sadness-surprise: hallu_set13_df

sadness-joy: hallu_set14_df

sadness-anger: hallu_set15_df

## Fear Hallucinations (True Emotion: Fear)
fear-sadness: hallu_set16_df

fear-love: hallu_set17_df

fear-anger: hallu_set18_df

fear-surprise: hallu_set19_df

fear-joy: hallu_set20_df

In [None]:

def _confused_as(true_emotion, predicted_emotion):
    """Rows of `hallu_df` with the given true emotion and the given predicted emotion."""
    return hallu_df[(hallu_df['emotion'] == true_emotion) & (hallu_df['predicted emotion'] == predicted_emotion)]


# True emotion: joy
hallu_set1_df = _confused_as('joy', 'sadness')
hallu_set2_df = _confused_as('joy', 'love')
hallu_set3_df = _confused_as('joy', 'fear')
hallu_set4_df = _confused_as('joy', 'surprise')
hallu_set5_df = _confused_as('joy', 'anger')

# True emotion: anger
hallu_set6_df = _confused_as('anger', 'sadness')
hallu_set7_df = _confused_as('anger', 'joy')
hallu_set8_df = _confused_as('anger', 'fear')
hallu_set9_df = _confused_as('anger', 'love')
hallu_set10_df = _confused_as('anger', 'surprise')

# True emotion: sadness
hallu_set11_df = _confused_as('sadness', 'fear')
hallu_set12_df = _confused_as('sadness', 'love')
hallu_set13_df = _confused_as('sadness', 'surprise')
hallu_set14_df = _confused_as('sadness', 'joy')
hallu_set15_df = _confused_as('sadness', 'anger')

# True emotion: fear
hallu_set16_df = _confused_as('fear', 'sadness')
hallu_set17_df = _confused_as('fear', 'love')
hallu_set18_df = _confused_as('fear', 'anger')
hallu_set19_df = _confused_as('fear', 'surprise')
hallu_set20_df = _confused_as('fear', 'joy')

# Joy texts for which no emotion word was found in the model's response.
hallu_set_unkown_df = _confused_as('joy', 'unknown')

## Generate logits and caches for all the hallucinated prompt samples

In [None]:
import os
from typing import List

def generate_and_save_caches(model, tokenizer, hallu_dfs: List[pd.DataFrame], save_dir: str = "cached_data"):
    """
    Run the model on every prompt and save its logits and activation cache to disk.

    The DataFrames are concatenated with a fresh 0..n-1 index. For row ``i`` the
    files ``logits_{i}.pt`` and ``cache_{i}.pt`` are written to ``save_dir``.
    Rows whose two files already exist are skipped, so an interrupted run can
    be resumed.

    NOTE(review): files are keyed only by row position. Calling this again with
    different DataFrames and the same ``save_dir`` reuses the old files for
    matching positions — use a separate directory per prompt set.
    NOTE(review): the whole cache object is pickled as returned by
    ``run_with_cache``; check the resulting file sizes.

    Args:
        model: Object exposing ``run_with_cache(prompt) -> (logits, cache)``.
        tokenizer: Not used by this function; kept for interface compatibility.
        hallu_dfs (List[pd.DataFrame]): DataFrames with a 'constrained prompt' column.
            An empty list is accepted and results in no work.
        save_dir (str): Directory for the saved files; created if missing.

    Raises:
        RuntimeError: Any runtime error other than CUDA out-of-memory.
    """
    # exist_ok avoids the check-then-create race and also creates parent directories.
    os.makedirs(save_dir, exist_ok=True)

    # pd.concat raises on an empty list; treat "nothing to do" as a normal case.
    if not hallu_dfs:
        print("No prompt DataFrames were provided; nothing to cache.")
        return

    all_prompts_df = pd.concat(hallu_dfs, ignore_index=True)

    for index, row in all_prompts_df.iterrows():
        prompt = row['constrained prompt']

        cache_filepath = os.path.join(save_dir, f"cache_{index}.pt")
        logits_filepath = os.path.join(save_dir, f"logits_{index}.pt")

        if os.path.exists(cache_filepath) and os.path.exists(logits_filepath):
            print(f"Skipping index {index}, files already exist.")
            continue

        print(f"Processing prompt {index}...")

        with torch.no_grad():
            try:
                logits, cache = model.run_with_cache(prompt)

                torch.save(logits, logits_filepath)
                torch.save(cache, cache_filepath)

            except RuntimeError as e:
                if "CUDA out of memory" in str(e):
                    print(f"OutOfMemoryError for prompt: {prompt}. Skipping...")
                    # Release whatever the failed forward pass left cached
                    # before moving on to the next prompt.
                    torch.cuda.empty_cache()
                    continue
                # Bare raise keeps the original traceback.
                raise

            # Drop the references before the next prompt so memory can be reclaimed.
            del logits, cache
            torch.cuda.empty_cache()

    print("All caches have been generated and saved.")



In [None]:
# Cache activations for the first 20 prompts of the joy -> sadness set only.
generate_and_save_caches(
    model_llama,
    tokenizer,
    [hallu_set1_df.head(20)],
)

In [None]:
from typing import Dict

def load_saved_caches(save_dir: str = "cached_data") -> Dict:
    """
    Load every ``cache_<index>.pt`` / ``logits_<index>.pt`` pair found in ``save_dir``.

    Files whose name is not ``cache_`` + decimal index + ``.pt`` are ignored with
    a warning. A cache without its matching logits file is skipped with a
    warning. Pairs are loaded in ascending index order.

    SECURITY: caches are loaded with ``weights_only=False``, i.e. full
    unpickling, which can execute arbitrary code. Only point this at
    directories written by this notebook.

    Args:
        save_dir (str): Directory containing the saved ``.pt`` files.

    Returns:
        Dict: ``{index: {'logits': ..., 'cache': ...}}`` for every loaded pair.

    Raises:
        FileNotFoundError: If ``save_dir`` does not exist (from ``os.listdir``).
    """
    prefix, suffix = 'cache_', '.pt'

    indexed_files = []
    for file_name in os.listdir(save_dir):
        if not (file_name.startswith(prefix) and file_name.endswith(suffix)):
            continue
        index_str = file_name[len(prefix):-len(suffix)]
        # Stray files such as 'cache_backup.pt' must not crash int().
        if not (index_str.isascii() and index_str.isdigit()):
            print(f"Warning: Ignoring unexpected cache file name '{file_name}'.")
            continue
        indexed_files.append((int(index_str), file_name))

    loaded_data = {}
    # Sorting makes the load order, and therefore the dict order, deterministic.
    for index, cache_file in sorted(indexed_files):
        logits_filepath = os.path.join(save_dir, f"logits_{index}.pt")
        cache_filepath = os.path.join(save_dir, cache_file)

        try:
            logits = torch.load(logits_filepath)

            # The cache is an arbitrary pickled object, so weights-only loading
            # is not sufficient here (see SECURITY note in the docstring).
            cache = torch.load(cache_filepath, weights_only=False)

            loaded_data[index] = {'logits': logits, 'cache': cache}
        except FileNotFoundError:
            print(f"Warning: Missing logits file for index {index}. Skipping...")

    print(f"Loaded data for {len(loaded_data)} prompts.")
    return loaded_data



In [None]:
# Run the model on the text and save all the layer activations to a cache.

text_to_analyze = hallu_df['text'].iloc[0]

# run_with_cache returns the final logits and a cache object.

with torch.no_grad():

    logits, cache = model_llama.run_with_cache(text_to_analyze)


print("\n--- Inspecting Activations (Cache) ---")


for i in range(model_llama.cfg.n_layers):

    # cache["pattern", i] is the layer's attention *pattern* (the attention
    # weights), not the attention output. The variable name is kept so other
    # cells that reference it still work.
    attention_out = cache["pattern", i]

    # Per the TransformerLens docs the pattern is (batch, head, query_pos, key_pos).
    print(f"Layer {i} Attention Pattern Shape: {attention_out.shape}")




In [None]:
# Print the first 21 hook names stored in the cache, with their position.
for i, keys in enumerate(list(cache.keys())[:21]):
    print(i, keys)

# Activation stored under the MLP `hook_post` hook of one layer.
# NOTE(review): the label below says "MLP Output", but the key read is
# `mlp.hook_post` — confirm this is the tensor intended rather than the
# block's `hook_mlp_out`.
layer = 1

mlp_out = cache[f'blocks.{layer}.mlp.hook_post']
mlp_out_shape = mlp_out.shape

print(mlp_out_shape)

print(f"Layer {layer} MLP Output Shape: {mlp_out_shape}")
print("-" * 20)


## Logit Lens

In [None]:
# Show the true emotion, the predicted emotion and the prompt of the 7th mismatch.
for column_name in ('emotion', 'predicted emotion', 'constrained prompt'):
    print(hallu_df[column_name].iloc[6])

In [None]:
def get_token_ids(tokenizer, text):
    """
    Return the token ids for ``text``, preferring the space-prefixed form.

    ``" " + text`` is encoded first. If that yields no tokens, ``text`` itself
    is encoded. Special tokens are never added.

    NOTE(review): for tokenizers that emit the leading space as its own token,
    the returned list contains that extra id — verify against the model's
    tokenizer.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``.
        text (str): The word or phrase to look up.

    Returns:
        The non-empty sequence of token ids returned by ``tokenizer.encode``.

    Raises:
        ValueError: If neither form produces tokens, or if the tokenizer itself
            fails. In the latter case the tokenizer's exception is attached as
            ``__cause__`` so the real reason is not lost.
    """
    try:
        for candidate in (f" {text}", text):
            token_ids = tokenizer.encode(candidate, add_special_tokens=False)
            if token_ids:
                return token_ids
    except Exception as e:
        print(f"Tokenization error for '{text}': {e}", file=sys.stderr)
        raise ValueError(f"Could not find any tokens for '{text}'.") from e

    raise ValueError(f"Could not find any tokens for '{text}'.")

# --- Rank Calculation Function ---

def get_rank(logits, token_ids):
    """
    Return the best (smallest) 1-based rank among ``token_ids`` in ``logits``.

    Rank 1 is the highest logit. The ordering is the one produced by
    ``torch.argsort(logits, descending=True)``.

    Args:
        logits: 1-D tensor of scores, one per vocabulary entry.
        token_ids: Iterable of integer token ids.

    Returns:
        int: The minimum rank over ``token_ids``. ``len(logits) + 1`` is
        returned when ``token_ids`` is empty or when any id lies outside
        ``[0, len(logits))``; in the latter case a message is written to stderr.
    """
    vocab_size = len(logits)
    worst_rank = vocab_size + 1

    ids = [int(t) for t in token_ids]
    if not ids:
        return worst_rank

    # An id outside the vocabulary cannot be ranked; report it and give up,
    # even if other ids in the list are valid.
    for t in ids:
        if not 0 <= t < vocab_size:
            print(f"Token ID {t} not found in logits. Returning a high rank.", file=sys.stderr)
            return worst_rank

    # Sort once, then invert the permutation: rank_of[token] is the 1-based
    # position of `token` in the sorted order. This replaces two full-vocabulary
    # scans per token with a single indexed lookup.
    sorted_indices = torch.argsort(logits, descending=True)
    rank_of = torch.empty_like(sorted_indices)
    rank_of[sorted_indices] = torch.arange(1, vocab_size + 1, device=sorted_indices.device)

    return rank_of[ids].min().item()

In [None]:
def analyze_prompt_attention_only(model, tokenizer, prompt_df):
    """
    Average per-layer logit contribution of the attention output at the last prompt position.

    For every row the model is run once with activation caching. For each layer
    the cached ``attn_out`` vector at the final position is passed directly
    through ``model.unembed`` (no final layer norm is applied here). The
    resulting logits are summed over the token ids of the true emotion and of
    the predicted emotion. Rows that raise are reported and skipped. Token
    ranks are not computed.

    Args:
        model: The Llama 2 model loaded with Transformer Lens.
        tokenizer: The tokenizer, used to look up the emotion token ids.
        prompt_df: DataFrame with 'constrained prompt', 'emotion' and
            'predicted emotion' columns.

    Returns:
        pd.DataFrame: One row per layer with the columns 'layer',
        'true_logit_raw', 'predicted_logit_raw' and 'logit_difference'
        (predicted minus true), averaged over all analysed rows. An empty
        DataFrame is returned if no row could be analysed.
    """
    all_metrics = []

    for _, row in prompt_df.iterrows():
        try:
            true_ids = get_token_ids(tokenizer, row['emotion'])
            predicted_ids = get_token_ids(tokenizer, row['predicted emotion'])

            # No gradients are needed for the forward pass or the unembedding.
            with torch.no_grad():
                _, cache = model.run_with_cache(row['constrained prompt'])

                for layer_idx in range(model.cfg.n_layers):
                    # Index -1 is the last position of the sequence the model
                    # actually ran on. Deriving it from a separate tokenizer
                    # call can be off by one if the two tokenizations differ
                    # (e.g. in BOS handling).
                    attn_out_contribution = cache[("attn_out", layer_idx)][0, -1, :]

                    attn_logits = model.unembed(attn_out_contribution)

                    # Sum of the logits for the true and predicted emotion tokens.
                    true_logit_raw = attn_logits[true_ids].sum().item()
                    predicted_logit_raw = attn_logits[predicted_ids].sum().item()

                    all_metrics.append({
                        'layer': layer_idx,
                        'true_logit_raw': true_logit_raw,
                        'predicted_logit_raw': predicted_logit_raw,
                        'logit_difference': predicted_logit_raw - true_logit_raw
                    })
        except Exception as e:
            print(f"Skipping analysis for prompt: {row['constrained prompt']}\nError: {e}")
            continue

    if not all_metrics:
        print("No metrics were generated. The prompt DataFrame might be empty or a tokenization error occurred.")
        return pd.DataFrame()

    all_metrics_df = pd.DataFrame(all_metrics)
    average_metrics_df = all_metrics_df.groupby('layer').mean().reset_index()

    return average_metrics_df

In [None]:
def analyze_prompt_set_mlp_only(model, tokenizer, prompt_df):
    """
    Average per-layer logit contribution of the MLP output at the last prompt position.

    For every row the model is run once with activation caching. For each layer
    the cached ``mlp_out`` vector at the final position is passed directly
    through ``model.unembed`` (no final layer norm is applied here). The
    resulting logits are summed over the token ids of the true emotion and of
    the predicted emotion. Rows that raise are reported and skipped. Token
    ranks are not computed.

    Args:
        model (HookedTransformer): The Llama 2 model loaded with Transformer Lens.
        tokenizer: The tokenizer, used to look up the emotion token ids.
        prompt_df (pd.DataFrame): DataFrame with 'constrained prompt',
            'emotion' and 'predicted emotion' columns.

    Returns:
        pd.DataFrame: One row per layer with the columns 'layer',
        'true_logit_raw', 'predicted_logit_raw' and 'logit_difference'
        (predicted minus true), averaged over all analysed rows. An empty
        DataFrame is returned if no row could be analysed.
    """
    all_metrics = []

    for _, row in prompt_df.iterrows():
        try:
            true_ids = get_token_ids(tokenizer, row['emotion'])
            predicted_ids = get_token_ids(tokenizer, row['predicted emotion'])

            # No gradients are needed for the forward pass or the unembedding.
            with torch.no_grad():
                _, cache = model.run_with_cache(row['constrained prompt'])

                for layer_idx in range(model.cfg.n_layers):
                    # Index -1 is the last position of the sequence the model
                    # actually ran on. Deriving it from a separate tokenizer
                    # call can be off by one if the two tokenizations differ
                    # (e.g. in BOS handling).
                    mlp_out_contribution = cache[("mlp_out", layer_idx)][0, -1, :]

                    # Project the MLP output through the unembedding to get
                    # its logit contribution.
                    mlp_logits = model.unembed(mlp_out_contribution)

                    # Sum of the logits for the true and predicted emotion tokens.
                    true_logit_raw = mlp_logits[true_ids].sum().item()
                    predicted_logit_raw = mlp_logits[predicted_ids].sum().item()

                    all_metrics.append({
                        'layer': layer_idx,
                        'true_logit_raw': true_logit_raw,
                        'predicted_logit_raw': predicted_logit_raw,
                        'logit_difference': predicted_logit_raw - true_logit_raw
                    })
        except Exception as e:
            print(f"Skipping analysis for prompt: {row['constrained prompt']}\nError: {e}")
            continue

    if not all_metrics:
        print("No metrics were generated. The prompt DataFrame might be empty or a tokenization error occurred.")
        return pd.DataFrame()

    all_metrics_df = pd.DataFrame(all_metrics)
    average_metrics_df = all_metrics_df.groupby('layer').mean().reset_index()

    return average_metrics_df

In [None]:
def _attn_metrics(prompt_df):
    """Run the attention-only logit analysis on one confusion subset with the shared model."""
    return analyze_prompt_attention_only(model_llama, model_llama.tokenizer, prompt_df)


# True emotion: joy
attn_avg_logit_metrics_set1 = _attn_metrics(hallu_set1_df)
attn_avg_logit_metrics_set2 = _attn_metrics(hallu_set2_df)
attn_avg_logit_metrics_set3 = _attn_metrics(hallu_set3_df)
attn_avg_logit_metrics_set4 = _attn_metrics(hallu_set4_df)
attn_avg_logit_metrics_set5 = _attn_metrics(hallu_set5_df)

# True emotion: anger
attn_avg_logit_metrics_set6 = _attn_metrics(hallu_set6_df)
attn_avg_logit_metrics_set7 = _attn_metrics(hallu_set7_df)
attn_avg_logit_metrics_set8 = _attn_metrics(hallu_set8_df)
attn_avg_logit_metrics_set9 = _attn_metrics(hallu_set9_df)
attn_avg_logit_metrics_set10 = _attn_metrics(hallu_set10_df)

# True emotion: sadness
attn_avg_logit_metrics_set11 = _attn_metrics(hallu_set11_df)
attn_avg_logit_metrics_set12 = _attn_metrics(hallu_set12_df)
attn_avg_logit_metrics_set13 = _attn_metrics(hallu_set13_df)
attn_avg_logit_metrics_set14 = _attn_metrics(hallu_set14_df)
attn_avg_logit_metrics_set15 = _attn_metrics(hallu_set15_df)

# True emotion: fear
attn_avg_logit_metrics_set16 = _attn_metrics(hallu_set16_df)
attn_avg_logit_metrics_set17 = _attn_metrics(hallu_set17_df)
attn_avg_logit_metrics_set18 = _attn_metrics(hallu_set18_df)
attn_avg_logit_metrics_set19 = _attn_metrics(hallu_set19_df)
attn_avg_logit_metrics_set20 = _attn_metrics(hallu_set20_df)

In [None]:
# Display the averaged per-layer attention metrics for set 8 (true: anger, predicted: fear).
attn_avg_logit_metrics_set8

In [None]:
def analyze_prompt_set_single_mlp_only(model, tokenizer, prompt, true_emotion, predicted_emotion):
    """
    Per-layer logit contribution of the MLP output at the last position of one prompt.

    The model is run once with activation caching. For each layer the cached
    ``mlp_out`` vector at the final position is passed directly through
    ``model.unembed`` (no final layer norm is applied here). The resulting
    logits are summed over the token ids of the true and of the predicted
    emotion. Token ranks are not computed.

    Args:
        model : The Llama 2 model loaded with Transformer Lens.
        tokenizer: The tokenizer, used to look up the emotion token ids.
        prompt (str): The constrained prompt text.
        true_emotion (str): The correct emotion label.
        predicted_emotion (str): The model's hallucinated emotion label.

    Returns:
        pd.DataFrame: One row per layer with the columns 'layer',
        'true_logit_raw', 'predicted_logit_raw' and 'logit_difference'
        (predicted minus true). An empty DataFrame is returned if anything
        fails; the error is written to stderr.
    """
    all_metrics = []

    try:
        true_ids = get_token_ids(tokenizer, true_emotion)
        predicted_ids = get_token_ids(tokenizer, predicted_emotion)

        # No gradients are needed for the forward pass or the unembedding.
        with torch.no_grad():
            _, cache = model.run_with_cache(prompt)

            for layer_idx in range(model.cfg.n_layers):
                # Index -1 is the last position of the sequence the model
                # actually ran on. Deriving it from a separate tokenizer call
                # can be off by one if the two tokenizations differ.
                mlp_out_contribution = cache[("mlp_out", layer_idx)][0, -1, :]

                # MLP contribution through the unembedding matrix
                mlp_logits = model.unembed(mlp_out_contribution)

                # Sum of the logits for the true and predicted emotion tokens.
                true_logit_raw = mlp_logits[true_ids].sum().item()
                predicted_logit_raw = mlp_logits[predicted_ids].sum().item()

                all_metrics.append({
                    'layer': layer_idx,
                    'true_logit_raw': true_logit_raw,
                    'predicted_logit_raw': predicted_logit_raw,
                    'logit_difference': predicted_logit_raw - true_logit_raw
                })

    except Exception as e:
        print(f"Failed to analyze prompt: {prompt}\nError: {e}", file=sys.stderr)
        return pd.DataFrame()

    return pd.DataFrame(all_metrics)




In [None]:
def analyze_attention_distinction(model, tokenizer, prompt_df, distractor_count=100):
    """
    Average per-layer "distinction score" of the attention output for the true emotion.

    For every row and layer:
      1. the MLP output at the last prompt position is unembedded and its
         ``distractor_count`` highest-logit tokens are taken as distractors;
      2. the distinction vector is the mean unembedding vector of the true
         emotion's token ids minus the mean unembedding vector of the distractors;
      3. the score is the dot product of the attention output at the last
         position with that distinction vector.

    Rows that raise are reported on stderr and skipped.

    Args:
        model : The loaded model.
        tokenizer: The tokenizer, used to look up the true emotion's token ids.
        prompt_df : DataFrame with 'constrained prompt' and 'emotion' columns.
        distractor_count (int): Number of top tokens to use as distractors;
            capped at the vocabulary size.

    Returns:
        pd.DataFrame: Columns 'layer' and 'distinction_score', averaged over
        all analysed rows. An empty DataFrame is returned if no row could be
        analysed.
    """
    all_metrics = []

    # W_U is (d_model, d_vocab) in TransformerLens. Transposing gives one row
    # per token, so indexing with token ids returns d_model-sized vectors.
    # Indexing W_U directly with token ids selects along d_model instead,
    # which makes every dot product below fail.
    unembedding_matrix = model.unembed.W_U.T

    for _, row in prompt_df.iterrows():
        try:
            true_emotion_text = row['emotion']
            prompt_text = row['constrained prompt']

            true_ids = get_token_ids(tokenizer, true_emotion_text)

            with torch.no_grad():
                _, cache = model.run_with_cache(prompt_text)

                # The true token's unembedding vector does not depend on the layer.
                true_unembedding_vector = unembedding_matrix[true_ids].mean(dim=0)

                for layer_idx in range(model.cfg.n_layers):
                    # Index -1 is the last position of the sequence the model
                    # actually ran on.
                    mlp_out_contribution = cache[("mlp_out", layer_idx)][0, -1, :]
                    mlp_logits = model.unembed(mlp_out_contribution)

                    # Top-N tokens by MLP logit act as distractors; topk
                    # rejects k larger than the vocabulary, so cap it.
                    k = min(distractor_count, mlp_logits.shape[-1])
                    _, top_distractor_ids = torch.topk(mlp_logits, k=k)

                    # Mean unembedding vector of the distractors.
                    mean_distractor_vector = unembedding_matrix[top_distractor_ids].mean(dim=0)

                    # Difference vector (e_o - e_bar_o').
                    distinction_vector = true_unembedding_vector - mean_distractor_vector

                    # Attention output vector at the last position (a_T).
                    attn_out_contribution = cache[("attn_out", layer_idx)][0, -1, :]

                    # a_T . (e_o - e_bar_o')
                    distinction_score = torch.dot(attn_out_contribution, distinction_vector).item()

                    all_metrics.append({
                        'layer': layer_idx,
                        'distinction_score': distinction_score,
                    })

        except Exception as e:
            print(f"Skipping analysis for prompt: {row['constrained prompt']}\nError: {e}", file=sys.stderr)
            continue

    if not all_metrics:
        print("No metrics were generated. The DataFrame might be empty or an error occurred.")
        return pd.DataFrame()

    all_metrics_df = pd.DataFrame(all_metrics)
    average_metrics_df = all_metrics_df.groupby('layer').mean().reset_index()

    return average_metrics_df

In [None]:
def analyze_final_embed_prompt_set(model, tokenizer, prompt_df):
    """
    Runs a per-layer logit-lens analysis over a DataFrame of prompts.

    For every prompt and every layer, the layer's ``resid_post`` activation is
    passed through ``model.ln_final`` and ``model.unembed``. The logits at the
    final prompt position are summed over the token ids of the true emotion and
    over the token ids of the model's response.

    Args:
        model : Model exposing ``run_with_cache``, ``ln_final``, ``unembed`` and
                ``cfg.n_layers``.
        tokenizer: Tokenizer used to look up token ids and the prompt length.
        prompt_df: DataFrame with 'constrained prompt', 'emotion' and
                   'prompt response' columns.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(all_metrics_df, average_metrics_df)``.
        ``all_metrics_df`` has one row per (prompt, layer) with the columns
        'layer', 'true_logit_raw', 'predicted_logit_raw' and 'logit_difference'
        (predicted minus true). ``average_metrics_df`` is the per-layer mean.
        Both are empty DataFrames when no prompt could be analysed, so callers
        can always unpack two values.
    """
    all_metrics = []

    for _, row in prompt_df.iterrows():
        try:
            prompt_text = row['constrained prompt']
            true_ids = get_token_ids(tokenizer, row['emotion'])
            predicted_ids = get_token_ids(tokenizer, row['prompt response'])

            # Position of the last prompt token, taken from the tokenizer's encoding.
            # NOTE(review): assumes this encoding has the same length as the token
            # sequence run_with_cache builds from the raw string — confirm.
            input_ids = tokenizer.encode(prompt_text, return_tensors='pt')
            final_token_idx = input_ids.shape[-1] - 1

            # Nothing below needs gradients; only Python floats are kept.
            with torch.no_grad():
                _, cache = model.run_with_cache(prompt_text)

                for layer_idx in range(model.cfg.n_layers):
                    # Logit lens: final layer norm + unembedding applied to this
                    # layer's residual stream.
                    residual = cache[("resid_post", layer_idx)]
                    layer_logits = model.unembed(model.ln_final(residual))
                    final_token_logits = layer_logits[0, final_token_idx, :]

                    # Multi-token labels are scored by summing their logits.
                    true_logit_raw = final_token_logits[true_ids].sum().item()
                    predicted_logit_raw = final_token_logits[predicted_ids].sum().item()

                    all_metrics.append({
                        'layer': layer_idx,
                        'true_logit_raw': true_logit_raw,
                        'predicted_logit_raw': predicted_logit_raw,
                        'logit_difference': predicted_logit_raw - true_logit_raw,
                    })
        except Exception as e:
            print(f"Skipping analysis for prompt: {row['constrained prompt']}\nError: {e}")
            continue

    if not all_metrics:
        print("No metrics were generated. The prompt DataFrame might be empty or a tokenization error occurred.")
        # Same arity as the normal return, so `a, b = analyze_...(...)` keeps working.
        return pd.DataFrame(), pd.DataFrame()

    all_metrics_df = pd.DataFrame(all_metrics)
    average_metrics_df = all_metrics_df.groupby('layer').mean().reset_index()

    return all_metrics_df, average_metrics_df


In [None]:
# Average per-layer MLP metrics for each hallucination prompt set (set 1..20).
# The sets are processed in order with one loop instead of 20 copy-pasted calls.
mlp_prompt_sets = [
    hallu_set1_df, hallu_set2_df, hallu_set3_df, hallu_set4_df, hallu_set5_df,
    hallu_set6_df, hallu_set7_df, hallu_set8_df, hallu_set9_df, hallu_set10_df,
    hallu_set11_df, hallu_set12_df, hallu_set13_df, hallu_set14_df, hallu_set15_df,
    hallu_set16_df, hallu_set17_df, hallu_set18_df, hallu_set19_df, hallu_set20_df,
]

mlp_metrics_per_set = [
    analyze_prompt_set_mlp_only(model_llama, model_llama.tokenizer, prompt_set)
    for prompt_set in mlp_prompt_sets
]

# Keep the individual names that later cells refer to.
(
    avg_logit_metrics_set1, avg_logit_metrics_set2, avg_logit_metrics_set3,
    avg_logit_metrics_set4, avg_logit_metrics_set5, avg_logit_metrics_set6,
    avg_logit_metrics_set7, avg_logit_metrics_set8, avg_logit_metrics_set9,
    avg_logit_metrics_set10, avg_logit_metrics_set11, avg_logit_metrics_set12,
    avg_logit_metrics_set13, avg_logit_metrics_set14, avg_logit_metrics_set15,
    avg_logit_metrics_set16, avg_logit_metrics_set17, avg_logit_metrics_set18,
    avg_logit_metrics_set19, avg_logit_metrics_set20,
) = mlp_metrics_per_set

# NOTE(review): the "unknown" metrics are computed from hallu_set20_df, the same
# frame as set 20 — this looks like a copy-paste slip. Point it at the intended
# "unknown" prompt set once that frame's name is confirmed.
avg_logit_metrics_set_unknown = analyze_prompt_set_mlp_only(model_llama, model_llama.tokenizer, hallu_set20_df)

In [None]:
# Inspect the averaged per-layer metrics for prompt set 1.
avg_logit_metrics_set1

In [None]:
# Averaged per-layer MLP metrics, one frame per hallucination prompt set (1..20).
all_avg_mlp_dfs = [
    avg_logit_metrics_set1, avg_logit_metrics_set2, avg_logit_metrics_set3,
    avg_logit_metrics_set4, avg_logit_metrics_set5, avg_logit_metrics_set6,
    avg_logit_metrics_set7, avg_logit_metrics_set8, avg_logit_metrics_set9,
    avg_logit_metrics_set10, avg_logit_metrics_set11, avg_logit_metrics_set12,
    avg_logit_metrics_set13, avg_logit_metrics_set14, avg_logit_metrics_set15,
    avg_logit_metrics_set16, avg_logit_metrics_set17, avg_logit_metrics_set18,
    avg_logit_metrics_set19, avg_logit_metrics_set20
]

# Labels are paired with the frames above by position.
# NOTE(review): there are 21 labels but only 20 frames, so zip() silently drops
# 'joy-unknown'. Add the matching frame or remove the label.
hallucination_labels = [
    'joy-sadness', 'joy-love', 'joy-fear', 'joy-surprise', 'joy-anger',
    'anger-sadness', 'anger-joy', 'anger-fear', 'anger-love', 'anger-surprise',
    'sadness-fear', 'sadness-love', 'sadness-surprise', 'sadness-joy', 'sadness-anger',
    'fear-sadness', 'fear-love', 'fear-anger', 'fear-surprise', 'fear-joy',
    'joy-unknown'
]

# Tag a copy of each frame (assign) and concatenate once. The per-set frames are
# left untouched, so re-running this cell gives the same result.
combined_avg_mlp_df = pd.concat(
    [
        set_df.assign(hallucination_type=set_label)
        for set_df, set_label in zip(all_avg_mlp_dfs, hallucination_labels)
    ]
)

# Rows: hallucination type, columns: layer, values: mean logit difference.
heatmap_mlp_data = combined_avg_mlp_df.pivot(index='hallucination_type', columns='layer', values='logit_difference')

plt.figure(figsize=(16, 10))
sns.heatmap(heatmap_mlp_data, cmap='coolwarm', center=0, annot=False, fmt=".2f",
            linewidths=0.5, linecolor='gray', cbar_kws={'label': 'Average Logit Difference (y ratio)'})

plt.title('Average Hallucination Trajectories MLP', fontsize=16)
plt.xlabel('Layer Number', fontsize=14)
plt.ylabel('Hallucination Type', fontsize=14)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# Tabulate per-layer metrics as a DataFrame and display it.
# NOTE(review): `metrics_by_layer` is not defined at module level in any nearby
# cell (it is a local of run_analysis_for_single_text further down) — confirm
# where it comes from, otherwise this cell fails on Restart & Run All.
df_metrics_by_layer = pd.DataFrame(metrics_by_layer)
df_metrics_by_layer

In [None]:
# Logit-lens metrics for every hallucination prompt set (set 1..20, in order).
final_embed_prompt_sets = [
    hallu_set1_df, hallu_set2_df, hallu_set3_df, hallu_set4_df, hallu_set5_df,
    hallu_set6_df, hallu_set7_df, hallu_set8_df, hallu_set9_df, hallu_set10_df,
    hallu_set11_df, hallu_set12_df, hallu_set13_df, hallu_set14_df, hallu_set15_df,
    hallu_set16_df, hallu_set17_df, hallu_set18_df, hallu_set19_df, hallu_set20_df,
]

# Each call returns (per-prompt metrics, per-layer averages).
final_embed_results = [
    analyze_final_embed_prompt_set(model_llama, tokenizer, prompt_set)
    for prompt_set in final_embed_prompt_sets
]

# Per-prompt metrics, one frame per set. Previously every call wrote to
# df_final_embed_metrics1, so only the frame for set 20 survived.
(
    df_final_embed_metrics1, df_final_embed_metrics2, df_final_embed_metrics3,
    df_final_embed_metrics4, df_final_embed_metrics5, df_final_embed_metrics6,
    df_final_embed_metrics7, df_final_embed_metrics8, df_final_embed_metrics9,
    df_final_embed_metrics10, df_final_embed_metrics11, df_final_embed_metrics12,
    df_final_embed_metrics13, df_final_embed_metrics14, df_final_embed_metrics15,
    df_final_embed_metrics16, df_final_embed_metrics17, df_final_embed_metrics18,
    df_final_embed_metrics19, df_final_embed_metrics20,
) = [per_prompt_df for per_prompt_df, _ in final_embed_results]

# Per-layer averages, one frame per set (names used by the heatmap cell below).
(
    df_avg_final_embed_metrics1, df_avg_final_embed_metrics2, df_avg_final_embed_metrics3,
    df_avg_final_embed_metrics4, df_avg_final_embed_metrics5, df_avg_final_embed_metrics6,
    df_avg_final_embed_metrics7, df_avg_final_embed_metrics8, df_avg_final_embed_metrics9,
    df_avg_final_embed_metrics10, df_avg_final_embed_metrics11, df_avg_final_embed_metrics12,
    df_avg_final_embed_metrics13, df_avg_final_embed_metrics14, df_avg_final_embed_metrics15,
    df_avg_final_embed_metrics16, df_avg_final_embed_metrics17, df_avg_final_embed_metrics18,
    df_avg_final_embed_metrics19, df_avg_final_embed_metrics20,
) = [avg_df for _, avg_df in final_embed_results]


In [None]:
# Inspect the per-layer averages for prompt set 20.
df_avg_final_embed_metrics20


In [None]:
# Per-layer logit-lens averages, one frame per hallucination prompt set (1..20).
all_avg_dfs = [
    df_avg_final_embed_metrics1, df_avg_final_embed_metrics2, df_avg_final_embed_metrics3,
    df_avg_final_embed_metrics4, df_avg_final_embed_metrics5, df_avg_final_embed_metrics6,
    df_avg_final_embed_metrics7, df_avg_final_embed_metrics8, df_avg_final_embed_metrics9,
    df_avg_final_embed_metrics10, df_avg_final_embed_metrics11, df_avg_final_embed_metrics12,
    df_avg_final_embed_metrics13, df_avg_final_embed_metrics14, df_avg_final_embed_metrics15,
    df_avg_final_embed_metrics16, df_avg_final_embed_metrics17, df_avg_final_embed_metrics18,
    df_avg_final_embed_metrics19, df_avg_final_embed_metrics20
]

# Labels are paired with the frames above by position.
# NOTE(review): there are 21 labels but only 20 frames, so zip() silently drops
# 'joy-unknown'. Add the matching frame or remove the label.
hallucination_labels = [
    'joy-sadness', 'joy-love', 'joy-fear', 'joy-surprise', 'joy-anger',
    'anger-sadness', 'anger-joy', 'anger-fear', 'anger-love', 'anger-surprise',
    'sadness-fear', 'sadness-love', 'sadness-surprise', 'sadness-joy', 'sadness-anger',
    'fear-sadness', 'fear-love', 'fear-anger', 'fear-surprise', 'fear-joy',
    'joy-unknown'
]

# Tag a copy of each frame (assign) and concatenate once. The per-set frames are
# left untouched, so re-running this cell gives the same result.
combined_df = pd.concat(
    [
        set_df.assign(hallucination_type=set_label)
        for set_df, set_label in zip(all_avg_dfs, hallucination_labels)
    ]
)

# Rows: hallucination type, columns: layer, values: mean logit difference.
heatmap_data = combined_df.pivot(index='hallucination_type', columns='layer', values='logit_difference')

plt.figure(figsize=(16, 10))
sns.heatmap(heatmap_data, cmap='coolwarm', center=0, annot=False, fmt=".2f",
            linewidths=0.5, linecolor='gray', cbar_kws={'label': 'Average Logit Difference (y ratio)'})

plt.title('Average Hallucination Trajectories', fontsize=16)
plt.xlabel('Layer Number', fontsize=14)
plt.ylabel('Hallucination Type', fontsize=14)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# --- Set 1: raw logit scores per layer, then the predicted-minus-true gap ---
fig, ax = plt.subplots(figsize=(12, 7))

ax.plot(
    avg_logit_metrics_set1['layer'],
    avg_logit_metrics_set1['true_logit_raw'],
    label='True Emotion Logit', marker='o', linestyle='-', color='blue',
)
ax.plot(
    avg_logit_metrics_set1['layer'],
    avg_logit_metrics_set1['predicted_logit_raw'],
    label='Predicted Emotion Logit', marker='x', linestyle='--', color='orange',
)

ax.set_title('Average Raw Logit Scores Across All Layers Set 1 Joy->Sadness', fontsize=16)
ax.set_xlabel('Layers', fontsize=12)
ax.set_ylabel('Logit Score', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, which='both', linestyle=':', linewidth=0.5)

# Zero reference line.
ax.axhline(0, color='black', linewidth=0.5)

plt.show()

# Second figure: the difference between the two curves above.
fig, ax = plt.subplots(figsize=(12, 7))

ax.plot(
    avg_logit_metrics_set1['layer'],
    avg_logit_metrics_set1['logit_difference'],
    label='Logit Difference (Predicted - True)', marker='s', linestyle='-', color='red',
)

ax.set_title('Average Logit Difference Across Layers', fontsize=16)
ax.set_xlabel('Layer', fontsize=12)
ax.set_ylabel('Logit Difference', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, which='both', linestyle=':', linewidth=0.5)

# Above this line the predicted emotion outscores the true one.
ax.axhline(0, color='black', linestyle='--', linewidth=0.8)

plt.show()

In [None]:
# --- Set 2: raw logit scores per layer, then the predicted-minus-true gap ---
fig, ax = plt.subplots(figsize=(12, 7))

ax.plot(
    avg_logit_metrics_set2['layer'],
    avg_logit_metrics_set2['true_logit_raw'],
    label='True Emotion Logit', marker='o', linestyle='-', color='blue',
)
ax.plot(
    avg_logit_metrics_set2['layer'],
    avg_logit_metrics_set2['predicted_logit_raw'],
    label='Predicted Emotion Logit', marker='x', linestyle='--', color='orange',
)

ax.set_title('Average Raw Logit Scores Across All Layers Set 2', fontsize=16)
ax.set_xlabel('Layers', fontsize=12)
ax.set_ylabel('Logit Score', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, which='both', linestyle=':', linewidth=0.5)

# Zero reference line.
ax.axhline(0, color='black', linewidth=0.5)

plt.show()

# Second figure: the difference between the two curves above.
fig, ax = plt.subplots(figsize=(12, 7))

ax.plot(
    avg_logit_metrics_set2['layer'],
    avg_logit_metrics_set2['logit_difference'],
    label='Logit Difference (Predicted - True)', marker='s', linestyle='-', color='red',
)

ax.set_title('Average Logit Difference Across Layers', fontsize=16)
ax.set_xlabel('Layer', fontsize=12)
ax.set_ylabel('Logit Difference', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, which='both', linestyle=':', linewidth=0.5)

# Above this line the predicted emotion outscores the true one.
ax.axhline(0, color='black', linestyle='--', linewidth=0.8)

plt.show()

In [None]:
# --- Set 3 (joy -> fear): raw logit scores per layer, then the logit gap ---
# All series now come from set 3. The copy-pasted version took the x-axis and
# the whole second figure from set 1 and labelled the plot "Set 1".
plt.figure(figsize=(12, 7))

plt.plot(avg_logit_metrics_set3['layer'], avg_logit_metrics_set3['true_logit_raw'], label='True Emotion Logit', marker='o', linestyle='-', color='blue')
plt.plot(avg_logit_metrics_set3['layer'], avg_logit_metrics_set3['predicted_logit_raw'], label='Predicted Emotion Logit', marker='x', linestyle='--', color='orange')

plt.title('Average Raw Logit Scores Across All Layers Set 3 Joy->Fear', fontsize=16)
plt.xlabel('Layers', fontsize=12)
plt.ylabel('Logit Score', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, which='both', linestyle=':', linewidth=0.5)

# Zero reference line.
plt.axhline(0, color='black', linewidth=0.5)

plt.show()


plt.figure(figsize=(12, 7))

plt.plot(avg_logit_metrics_set3['layer'], avg_logit_metrics_set3['logit_difference'], label='Logit Difference (Predicted - True)', marker='s', linestyle='-', color='red')

plt.title('Average Logit Difference Across Layers', fontsize=16)
plt.xlabel('Layer', fontsize=12)
plt.ylabel('Logit Difference', fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, which='both', linestyle=':', linewidth=0.5)

# Above this line the predicted emotion outscores the true one.
plt.axhline(0, color='black', linestyle='--', linewidth=0.8)

plt.show()

In [None]:
def calculate_prompt_ranks(model, tokenizer, prompt_df):
    """
    Finds, for each prompt, the layer whose MLP output ranks the true emotion best.

    For every layer the MLP output at the final prompt position
    (``cache[("mlp_out", layer)]``) is passed through ``model.unembed`` and
    ``get_rank`` scores the true emotion's token ids against those logits. The
    smallest rank over all layers and the first layer reaching it are recorded.

    Args:
        model : Model exposing ``run_with_cache``, ``unembed`` and ``cfg.n_layers``.
        tokenizer: Tokenizer used to look up token ids and the prompt length.
        prompt_df: DataFrame with 'emotion' and 'constrained prompt' columns.

    Returns:
        pd.DataFrame: One row per analysed prompt with the columns 'prompt',
        'rank last layer' (rank at the last layer), 'min_true_rank' and
        'min_rank_layer'. Empty when no prompt could be analysed. Rows that
        raise are reported on stderr and skipped.
    """
    # Imported locally because the error path writes to sys.stderr and the
    # notebook's import cell does not import sys.
    import sys

    categorized_prompts = []

    for index, row in prompt_df.iterrows():
        try:
            true_emotion_text = row['emotion']
            prompt_text = row['constrained prompt']

            # Token ids of the true emotion label.
            true_ids = get_token_ids(tokenizer, true_emotion_text)

            # Position of the last prompt token, taken from the tokenizer's encoding.
            # NOTE(review): assumes this matches the tokenization run_with_cache
            # applies to the raw string — confirm.
            input_ids = tokenizer.encode(prompt_text, return_tensors='pt')
            final_token_idx = input_ids.shape[-1] - 1

            with torch.no_grad():
                _, cache = model.run_with_cache(prompt_text)

            min_true_rank_across_layers = float('inf')
            min_rank_layer_idx = -1

            for layer_idx in range(model.cfg.n_layers):
                # MLP output for the last token, read directly through the unembedding.
                mlp_out_contribution = cache[("mlp_out", layer_idx)][0, final_token_idx, :]
                mlp_logits = model.unembed(mlp_out_contribution)

                true_rank = get_rank(mlp_logits, true_ids)

                # Strict '<' keeps the earliest layer when several layers tie.
                if true_rank < min_true_rank_across_layers:
                    min_true_rank_across_layers = true_rank
                    min_rank_layer_idx = layer_idx

            categorized_prompts.append({
                'prompt': prompt_text,
                # After the loop, true_rank holds the value from the last layer.
                'rank last layer': true_rank,
                'min_true_rank': min_true_rank_across_layers,
                'min_rank_layer': min_rank_layer_idx
            })

        except Exception as e:
            print(f"Skipping analysis for row {index}. Error: {e}", file=sys.stderr)
            continue

    if not categorized_prompts:
        print("No prompts were categorized. The DataFrame might be empty or an error occurred.")
        return pd.DataFrame()

    return pd.DataFrame(categorized_prompts)

In [None]:
%%time

# Per-prompt minimum MLP rank of the true emotion over all hallucinating prompts.
# This runs one cached forward pass per prompt, hence the %%time above.
prompt_ranks_df = calculate_prompt_ranks(model_llama, tokenizer, hallu_df)

In [None]:
# Inspect the per-prompt rank table.
prompt_ranks_df

In [None]:
def categorize_prompts(ranked_df, threshold_strategy="average"):
    """
    Labels each prompt as an "Extraction" or "Enrichment" hallucination.

    A threshold is computed from the 'min_true_rank' column and every row is
    labelled "Extraction" when its 'min_true_rank' is at or below the
    threshold, and "Enrichment" otherwise.

    Args:
        ranked_df (pd.DataFrame): Per-prompt analysis with a 'min_true_rank' column.
        threshold_strategy (str): How the threshold is derived from
            'min_true_rank': "median", or "average" / "mean" for the arithmetic
            mean. This argument used to be ignored and the median was always
            used.

    Returns:
        pd.DataFrame: A copy of ``ranked_df`` with two added columns,
        'threshold' and 'hallucination type'; any existing 'hallucination_type'
        column is dropped. The input frame is returned unchanged when it is
        empty or lacks 'min_true_rank'.

    Raises:
        ValueError: If ``threshold_strategy`` is not a supported value.
    """
    # Imported locally because the error paths write to sys.stderr and the
    # notebook's import cell does not import sys.
    import sys

    if ranked_df.empty:
        print("Input DataFrame is empty. Cannot categorize prompts.", file=sys.stderr)
        return ranked_df

    reducers = {"median": np.median, "average": np.mean, "mean": np.mean}
    if threshold_strategy not in reducers:
        raise ValueError(
            f"Unknown threshold_strategy {threshold_strategy!r}; expected one of {sorted(reducers)}."
        )

    try:
        knowledge_threshold = reducers[threshold_strategy](ranked_df['min_true_rank'])
    except KeyError as e:
        print(f"DataFrame is missing the required column: {e}", file=sys.stderr)
        return ranked_df

    # Work on a copy so the caller's frame is not mutated, which keeps a re-run
    # of the calling cell idempotent.
    categorized_df = ranked_df.copy()

    # The threshold is repeated on every row for visibility.
    categorized_df['threshold'] = knowledge_threshold

    categorized_df['hallucination type'] = np.where(
        categorized_df['min_true_rank'] <= knowledge_threshold, "Extraction", "Enrichment"
    )

    # Drop the underscore-named label column if an earlier analysis left one.
    if 'hallucination_type' in categorized_df.columns:
        categorized_df = categorized_df.drop(columns=['hallucination_type'])

    return categorized_df


In [None]:
# Label every prompt against the median of 'min_true_rank'.
categorized_hallu = categorize_prompts(prompt_ranks_df, threshold_strategy="median")

In [None]:
# Inspect the categorised prompts (threshold and hallucination type per row).
categorized_hallu

In [None]:

# Reference rank drawn as a horizontal line on the scatter plot below.
knowledge_threshold = 1

fig, ax = plt.subplots(figsize=(10, 6))

# One point per prompt: best (minimum) rank against the layer that reached it.
ax.scatter(
    categorized_hallu['min_rank_layer'],
    categorized_hallu['min_true_rank'],
    alpha=0.8,
)

# Ranks are drawn on a log scale.
ax.set_yscale('log')

ax.axhline(
    y=knowledge_threshold,
    color='r',
    linestyle='--',
    label=f'Knowledge Threshold ({knowledge_threshold})',
)

ax.set_title('Minimum True Rank per Layer for Hallucinating Prompts', fontsize=16)
ax.set_xlabel('Layer Index (min rank)', fontsize=12)
ax.set_ylabel('Minimum True Rank (log scale)', fontsize=12)

# One tick per layer, up to the deepest layer present in the data.
last_layer = categorized_hallu['min_rank_layer'].max()
ax.set_xticks(np.arange(0, last_layer + 1, 1))

ax.grid(True, which="both", ls="--", c='0.7')
ax.legend()

plt.tight_layout()

In [None]:
# --- Histogram: which layer reaches the minimum rank, counted over prompts ---
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=categorized_hallu, x='min_rank_layer', kde=False, bins=20, ax=ax)
ax.set_title('Distribution of Layers with Minimum Rank')
ax.set_xlabel('Layer Index')
ax.set_ylabel('Num of Prompts')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Density of the per-prompt minimum rank, drawn on a log-scaled x axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.kdeplot(data=categorized_hallu, x='min_true_rank', fill=True, ax=ax)
ax.set_title('Distribution of Minimum True Ranks')
ax.set_xlabel('Minimum True Rank')
ax.set_ylabel('Density')
ax.set_xscale('log')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
def analyze_and_categorize_prompts_attention_only(model, tokenizer, prompt_df, extraction_threshold=320):
    """
    Ranks the true emotion from attention outputs alone and labels each prompt.

    For every layer the attention output at the final prompt position
    (``cache[("attn_out", layer)]``) is passed through ``model.unembed`` and
    ``get_rank`` scores the true emotion's token ids against those logits. The
    smallest rank over all layers decides the label: strictly below
    ``extraction_threshold`` gives "Enrichment", otherwise "Extraction".

    Args:
        model : Model exposing ``run_with_cache``, ``unembed`` and ``cfg.n_layers``.
        tokenizer: Tokenizer used to look up token ids and the prompt length.
        prompt_df : DataFrame with 'emotion' and 'constrained prompt' columns.
        extraction_threshold (int): Rank cut-off compared against the minimum rank.

    Returns:
        pd.DataFrame: One row per analysed prompt with the columns 'prompt',
        'min_true_rank', 'min_rank_layer' and 'hallucination_type'. Empty when
        no prompt could be analysed. Rows that raise are reported on stderr
        and skipped.
    """
    # Imported locally because the error path writes to sys.stderr and the
    # notebook's import cell does not import sys.
    import sys

    categorized_prompts = []

    for index, row in prompt_df.iterrows():
        try:
            true_emotion_text = row['emotion']
            prompt_text = row['constrained prompt']

            # Token ids of the true emotion label.
            true_ids = get_token_ids(tokenizer, true_emotion_text)

            # Position of the last prompt token, taken from the tokenizer's encoding.
            # NOTE(review): assumes this matches the tokenization run_with_cache
            # applies to the raw string — confirm.
            input_ids = tokenizer.encode(prompt_text, return_tensors='pt')
            final_token_idx = input_ids.shape[-1] - 1

            with torch.no_grad():
                _, cache = model.run_with_cache(prompt_text)

            min_true_rank_across_layers = float('inf')
            min_rank_layer_idx = -1

            for layer_idx in range(model.cfg.n_layers):
                # Attention output for the last token, read directly through the unembedding.
                attn_out_contribution = cache[("attn_out", layer_idx)][0, final_token_idx, :]
                attn_logits = model.unembed(attn_out_contribution)

                true_rank = get_rank(attn_logits, true_ids)

                # Strict '<' keeps the earliest layer when several layers tie.
                if true_rank < min_true_rank_across_layers:
                    min_true_rank_across_layers = true_rank
                    min_rank_layer_idx = layer_idx

            # A minimum rank under the threshold is labelled "Enrichment",
            # anything else "Extraction".
            if min_true_rank_across_layers < extraction_threshold:
                hallucination_type = "Enrichment"
            else:
                hallucination_type = "Extraction"

            categorized_prompts.append({
                'prompt': prompt_text,
                'min_true_rank': min_true_rank_across_layers,
                'min_rank_layer': min_rank_layer_idx,
                'hallucination_type': hallucination_type
            })

        except Exception as e:
            print(f"Skipping analysis for row {index}. Error: {e}", file=sys.stderr)
            continue

    if not categorized_prompts:
        print("No prompts were categorized. The DataFrame might be empty or an error occurred.")
        return pd.DataFrame()

    return pd.DataFrame(categorized_prompts)

In [None]:
# Attention-only categorisation of all hallucinating prompts, using the default
# extraction_threshold of 320.
attn_categorized_df = analyze_and_categorize_prompts_attention_only(model_llama, model_llama.tokenizer, hallu_df)

In [None]:
# Inspect the attention-only categorisation.
attn_categorized_df

In [None]:
# Rank of the correct emotion at every layer for a single prompt.
# NOTE(review): `model`, `ranks_across_layers`, `true_emotion_text`, `index` and
# `prompt_text` are not defined at module level in the cells above; this cell
# needs them from elsewhere to run on a fresh kernel — confirm their source.
rank_threshold = 320

plt.figure(figsize=(10, 6))
plt.plot(range(model.cfg.n_layers), ranks_across_layers, marker='o', linestyle='-')
# The label used to be written as f'...({320)' — an unbalanced brace in the
# f-string, which is a SyntaxError.
plt.axhline(y=rank_threshold, color='r', linestyle='--', label=f'Knowledge Threshold ({rank_threshold})')

plt.title(f'Rank of Correct Emotion ("{true_emotion_text}") Across MLP Layers\nPrompt {index}: {prompt_text[:50]}...')
plt.xlabel('Layer Number')
plt.ylabel('Rank (1-based)')
plt.yscale('log')
plt.grid(True, which="both", linestyle='--', linewidth=0.5)
plt.legend()
plt.show()
                              
                              

In [None]:
# Best (smallest) averaged predicted-emotion rank across layers for set 2.
# NOTE(review): assumes analyze_prompt_set_mlp_only emits a 'predicted_rank'
# column — confirm, otherwise this raises KeyError.
avg_logit_metrics_set2['predicted_rank'].min()

In [None]:
from collections import defaultdict



# Emotion words whose rho-star values are reported in the simulation below.
TARGET_SENTIMENTS = [
    'joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'love', 'amusement', 'excitement',
    'happy', 'depressed', 'anxious', 'ecstatic', 'grief', 'calm', 'lonely', 'boredom',
    'elation', 'hope', 'disappointment', 'confusion', 'relief',
]

# Everyday filler words. Repeats ("go", "do") and the overlap with
# TARGET_SENTIMENTS ("love") disappear when the vocabulary set is built.
COMMON_WORDS = [
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not", "on", "with", "as", "do",
    "at", "this", "but", "by", "from", "up", "so", "what", "we", "he", "she", "they", "was", "one", "all", "can",
    "an", "is", "are", "you", "go", "new", "world", "time", "day", "night", "see", "man", "woman", "house", "car",
    "computer", "phone", "food", "music", "art", "book", "story", "game", "city", "country", "people", "thing",
    "life", "work", "school", "money", "power", "truth", "freedom", "justice", "peace", "war", "love", "hate",
    "friend", "family", "child", "adult", "animal", "plant", "water", "fire", "earth", "sky", "sun", "moon",
    "star", "space", "science", "math", "history", "language", "letter", "number", "idea", "thought", "feeling",
    "emotion", "mind", "body", "health", "beauty", "ugly", "good", "bad", "right", "wrong", "old", "young", "big",
    "small", "high", "low", "fast", "slow", "hot", "cold", "light", "dark", "open", "close", "start", "end",
    "begin", "finish", "like", "dislike", "know", "think", "feel", "want", "need", "find", "give", "take", "come",
    "go", "make", "do", "say", "tell", "ask", "answer", "look", "listen", "hear", "read", "write", "talk", "walk",
    "run", "jump", "fly", "swim", "eat", "drink", "sleep", "dream", "wake", "laugh", "cry", "smile", "frown",
    "win", "lose", "help", "thank", "sorry", "please", "maybe", "yes", "no", "why", "where", "when", "how",
    "always", "never", "often", "sometimes", "seldom", "today", "tomorrow", "yesterday",
]

# Simulated vocabulary: both word lists plus 10,000 placeholder tokens,
# de-duplicated and sorted as plain strings.
FULL_VOCABULARY = sorted(
    set(TARGET_SENTIMENTS) | set(COMMON_WORDS) | {f"token_{i}" for i in range(10000)}
)
VOCAB_SIZE = len(FULL_VOCABULARY)



In [None]:
# Report rho-star for each target sentiment, then the ten highest-scoring tokens.
# NOTE(review): `scores` and `rho_star` are not defined in the surrounding
# cells — confirm where they come from. `scores` must be a mapping, since
# .items() is called on it below.
simulated_scores = scores


print("\n--- Rho Star values for target sentiments ---")
for sentiment in TARGET_SENTIMENTS:
    rho_val = rho_star(simulated_scores, sentiment)
    print(f"ρ*('{sentiment}'): {rho_val:.4f}")


# (token, score) pairs ordered from highest to lowest score.
sorted_scores = sorted(simulated_scores.items(), key=lambda item: item[1], reverse=True)


print("\n--- Top 10 Ranked Tokens in Simulated Vocabulary ---")
for rank, (token, score) in enumerate(sorted_scores[:10]):
    # enumerate is 0-based; ranks are printed 1-based.
    print(f"Rank {rank + 1}: '{token}' (Score: {score:.2f})")

## Attention Heatmap

In [None]:
def get_emotion_token_ids(tokenizer, emotion_string):
    """
    Looks up the token ids of an emotion word with ``tokenizer.encode``.

    The word is first encoded with a leading space. If that yields no ids, it
    is encoded again without the space. Special tokens are never added, and a
    word that splits into several tokens returns all of its ids.

    Args:
        tokenizer : Object with an ``encode(text, add_special_tokens=...)`` method.
        emotion_string (str): The emotion word to tokenize.

    Returns:
        list: The ids returned by the first encoding attempt that is non-empty.

    Raises:
        ValueError: If both encoding attempts come back empty.
    """
    # Leading-space form first, bare word as the fallback.
    for candidate_text in (f" {emotion_string}", emotion_string):
        token_ids = tokenizer.encode(candidate_text, add_special_tokens=False)
        if token_ids:
            return token_ids

    raise ValueError(f"Could not find any tokens for '{emotion_string}'.")

def run_analysis_for_single_text(model, tokenizer, prompt, true_emotion, predicted_emotion):
    """
    Runs a layer-by-layer logit-lens analysis for one prompt.

    Entry 0 reads the embedding output (``cache['hook_embed']``); entry ``k``
    (k >= 1) reads ``blocks.{k-1}.hook_resid_post``. Each activation is passed
    through ``model.ln_final`` and ``model.unembed``, and the logits at the
    final prompt position are summed over the token ids of each emotion, so
    multi-token emotions are supported.

    Args:
        model : Model exposing ``to_tokens``, ``run_with_cache``, ``ln_final``,
                ``unembed`` and ``cfg.n_layers``.
        tokenizer : Tokenizer used to look up the emotion token ids.
        prompt (str): The full input prompt for the model.
        true_emotion (str): The correct emotion label.
        predicted_emotion (str): The emotion predicted by the model.

    Returns:
        list: ``model.cfg.n_layers + 1`` dicts with the keys 'layer',
              'true_logit_raw' and 'predicted_logit_raw', or an empty list if
              any step fails (the error is printed).
    """
    try:
        true_ids = get_emotion_token_ids(tokenizer, true_emotion)
        predicted_ids = get_emotion_token_ids(tokenizer, predicted_emotion)

        # `to_tokens` belongs to the hooked model; a Hugging Face tokenizer has no
        # such method, so the old `tokenizer.to_tokens(prompt)` always raised and
        # the function returned []. A tokenizer-like object that does provide
        # `to_tokens` is still honoured.
        to_tokens = getattr(tokenizer, 'to_tokens', None) or model.to_tokens
        input_ids = to_tokens(prompt)
        final_token_idx = input_ids.shape[-1] - 1

        metrics_by_layer = []

        # Only Python floats leave this function, so no graph is needed.
        with torch.no_grad():
            logits, cache = model.run_with_cache(input_ids)

            for layer_idx in range(model.cfg.n_layers + 1):
                if layer_idx > 0:
                    current_residual_stream = cache[f'blocks.{layer_idx-1}.hook_resid_post']
                else:
                    current_residual_stream = cache['hook_embed']

                layer_logits = model.unembed(model.ln_final(current_residual_stream))
                layer_logits_final_token = layer_logits[0, final_token_idx, :]

                true_logit = layer_logits_final_token[true_ids].sum().item()
                predicted_logit = layer_logits_final_token[predicted_ids].sum().item()

                metrics_by_layer.append({
                    'layer': layer_idx,
                    'true_logit_raw': true_logit,
                    'predicted_logit_raw': predicted_logit
                })

    except Exception as e:
        print(f"Analysis failed for prompt: {prompt}\nError: {e}")
        # Partial results are discarded on failure.
        metrics_by_layer = []

    return metrics_by_layer



# Load Llama-2-7B through HookedTransformer with LayerNorm folding and weight
# centering switched off, and expose its tokenizer at module level.
# NOTE(review): other cells use `model_llama`; this creates a separate `model`
# object — confirm a second copy of the weights is intended.
try:
    model = HookedTransformer.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        fold_ln=False,
        center_unembed=False,
        center_writing_weights=False,
    )
    tokenizer = model.tokenizer
    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if torch.cuda.is_available():
        model.to('cuda')
    print("Model loaded for analysis.")
except Exception as e:
    # A failed load is only printed; both names become None, so later cells
    # that use them fail there instead of here.
    print(f"Failed to load model: {e}")
 
    model = None
    tokenizer = None




In [None]:
# Heatmap (heads x layers) of the mean attention weight placed on the final key
# position, averaged over all query positions.
# NOTE(review): `cache` must already exist at module level from an earlier
# run_with_cache call — confirm which prompt it belongs to.
print(f"\nVisualizing Average Attention to Final Token across All Layers and Heads ---")

num_layers = model_llama.cfg.n_layers
num_heads = model_llama.cfg.n_heads

seq_len = cache["pattern", 0].shape[-1]
final_token_index = seq_len - 1

avg_attention_matrix = torch.zeros((num_heads, num_layers))

for layer in range(num_layers):
    attention_patterns = cache["pattern", layer]
    # Drop only a leading batch axis, i.e. (batch, heads, seq, seq) -> (heads, seq, seq).
    # A bare squeeze() would also collapse a size-1 head or sequence axis.
    if attention_patterns.dim() == 4:
        attention_patterns = attention_patterns[0]

    # Column `final_token_index` holds the weight every query position puts on
    # the final key; averaging over queries gives one value per head.
    # NOTE(review): under a causal mask only the final query can attend to the
    # final key, so this mean is mostly zeros — confirm that attention *from*
    # the final token ([head, final, :]) was not what was intended.
    attention_to_final_token = attention_patterns[:, :, final_token_index]
    avg_attention_matrix[:, layer] = attention_to_final_token.mean(dim=-1).detach().cpu()

plot_data = avg_attention_matrix.detach().cpu().numpy()

# Create the heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(
    plot_data,
    cmap="viridis",
    linewidths=0.5,
    xticklabels=[f'L{i}' for i in range(num_layers)],
    yticklabels=[f'H{i}' for i in range(num_heads)],
    cbar_kws={'label': 'Average Attention to Final Token'}
)
plt.title("Attention Weights on Final Token (All Layers and Heads)")
plt.xlabel("Layer Number")
plt.ylabel("Head Number")
plt.show()


In [None]:
# Grid of attention heatmaps, one panel per head, for a single layer.
layer_number = 10

# After squeeze() the array is indexed as [head, query position, key position].
attention_pattern_all_heads = cache["pattern", layer_number].squeeze().detach().cpu().numpy()
num_heads = attention_pattern_all_heads.shape[0]

# NOTE(review): these labels come from the tokenizer alone; if the cached run
# prepended a BOS token, there will be one label fewer than there are rows and
# columns — confirm.
input_tokens = tokenizer.tokenize(text_to_analyze)

# Ceiling division, so head counts that are not a multiple of 8 (or are below
# 8) still get enough panels. `num_heads // 8` gave too few rows, or zero.
n_cols = 8
n_rows = max(1, -(-num_heads // n_cols))

fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(20, 10),
    constrained_layout=True
)

axes = axes.flatten()

for i in range(num_heads):
    ax = axes[i]
    sns.heatmap(
        attention_pattern_all_heads[i],
        xticklabels=input_tokens,
        yticklabels=input_tokens,
        cmap="viridis",
        linewidths=0.2,
        ax=ax,
        cbar=False
    )
    ax.set_title(f'Head {i}')
    ax.tick_params(axis='x', rotation=90)
    ax.tick_params(axis='y', rotation=0)

# Hide panels left over when the grid has more cells than heads.
for unused_ax in axes[num_heads:]:
    unused_ax.axis('off')

fig.suptitle(f"Attention Heatmaps for Layer {layer_number}", fontsize=20)
fig.supxlabel("Key Tokens (Input)", fontsize=12)
fig.supylabel("Query Tokens (Output)", fontsize=12)

plt.show()



In [None]:
# Attention heatmap for one (layer, head) pair.
layer, head = 1, 1

attention_pattern = cache["pattern", layer]
print(attention_pattern.shape)

# Select the head first, then move its pattern to NumPy for plotting.
attention_to_plot = attention_pattern.squeeze()[head].detach().cpu().numpy()

# Tick labels: the tokenizer's pieces for the analysed text.
input_tokens = tokenizer.tokenize(text_to_analyze)

print(f"\n Visualizing Attention Heatmap for Layer {layer}, Head {head} ---")

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    attention_to_plot,
    xticklabels=input_tokens,
    yticklabels=input_tokens,
    cmap="viridis",
    linewidths=0.5,
    ax=ax,
)
ax.set_title(f"Attention Heatmap: Layer {layer}, Head {head}")
ax.set_xlabel("Key Tokens (Input)")
ax.set_ylabel("Query Tokens (Output)")
plt.show()


In [None]:
# Attention heatmap for one (layer, head) pair.
layer, head = 17, 14

attention_pattern = cache["pattern", layer]
print(attention_pattern.shape)

# Select the head first, then move its pattern to NumPy for plotting.
attention_to_plot = attention_pattern.squeeze()[head].detach().cpu().numpy()

# Tick labels: the tokenizer's pieces for the analysed text.
input_tokens = tokenizer.tokenize(text_to_analyze)

print(f"\n Visualizing Attention Heatmap for Layer {layer}, Head {head} ---")

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    attention_to_plot,
    xticklabels=input_tokens,
    yticklabels=input_tokens,
    cmap="viridis",
    linewidths=0.5,
    ax=ax,
)
ax.set_title(f"Attention Heatmap: Layer {layer}, Head {head}")
ax.set_xlabel("Key Tokens (Input)")
ax.set_ylabel("Query Tokens (Output)")
plt.show()


In [None]:
# Attention heatmap for one (layer, head) pair.
layer, head = 21, 24

attention_pattern = cache["pattern", layer]
print(attention_pattern.shape)

# Select the head first, then move its pattern to NumPy for plotting.
attention_to_plot = attention_pattern.squeeze()[head].detach().cpu().numpy()

# Tick labels: the tokenizer's pieces for the analysed text.
input_tokens = tokenizer.tokenize(text_to_analyze)

print(f"\n Visualizing Attention Heatmap for Layer {layer}, Head {head} ---")

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    attention_to_plot,
    xticklabels=input_tokens,
    yticklabels=input_tokens,
    cmap="viridis",
    linewidths=0.5,
    ax=ax,
)
ax.set_title(f"Attention Heatmap: Layer {layer}, Head {head}")
ax.set_xlabel("Key Tokens (Input)")
ax.set_ylabel("Query Tokens (Output)")
plt.show()


## Causal Validation

In [None]:

def get_model_embedding_layer(model):
    """
    Return the model's token embedding layer, trying several attribute names.

    Lookup order:
      1. ``model.get_input_embeddings()`` - the accessor is *called* and its
         result is returned.
      2. ``model.embed``
      3. ``model.w_e``

    Args:
        model: Any model object exposing one of the attributes above.

    Returns:
        Whatever the first matching attribute provides (for the accessor, its
        return value).

    Raises:
        AttributeError: If none of the three attributes exists on ``model``.
    """
    if hasattr(model, 'get_input_embeddings'):
        # The accessor must be invoked: returning the bound method itself would
        # hand the caller a zero-argument function instead of the layer.
        return model.get_input_embeddings()
    elif hasattr(model, 'embed'):
        return model.embed
    elif hasattr(model, 'w_e'):
        return model.w_e
    else:
        raise AttributeError("Could not find the model's embedding layer or a valid embedding method (checked for 'get_input_embeddings', 'embed', and 'w_e').")
    
    
# --- Functions for calculating sigma and embeddings ---

def calculate_calibration_stats(model, tokenizer, prompts: List[str]) -> Tuple[float, torch.Tensor]:
    """
    Derive the noise scale and the mean embedding from a set of calibration prompts.

    Each prompt is tokenized, embedded with the model's input embedding, and
    averaged over its token positions. Across prompts, the per-dimension
    population standard deviation and mean of those vectors are computed.

    Returns:
        (sigma, mean_embedding) where ``sigma`` is 3 times the mean of the
        per-dimension standard deviations (a Python float) and
        ``mean_embedding`` is the mean of the per-prompt vectors.

    Raises:
        AttributeError: If the model has neither ``get_input_embeddings`` nor
            ``embed`` (raised when the first prompt is processed).
    """
    def _embed(ids):
        # Prefer the HF-style accessor, fall back to the ``embed`` attribute.
        if hasattr(model, 'get_input_embeddings'):
            return model.get_input_embeddings()(ids)
        if hasattr(model, 'embed'):
            return model.embed(ids)
        raise AttributeError("Could not find the model's embedding layer or a valid embedding method.")

    per_prompt_vectors = []
    for text in prompts:
        ids = tokenizer(text, return_tensors="pt").input_ids
        with torch.no_grad():
            embedded = _embed(ids)
        # Average over the token axis, then drop the leftover singleton dims.
        per_prompt_vectors.append(embedded.mean(dim=1).squeeze())

    stacked = torch.stack(per_prompt_vectors)

    # Population (biased) std and mean across prompts, per embedding dimension.
    std_per_dim = stacked.std(dim=0, unbiased=False)
    mean_vector = stacked.mean(dim=0)

    sigma = 3 * std_per_dim.mean().item()
    return sigma, mean_vector




In [None]:
# Every constrained prompt in hallu_df serves as a calibration prompt.
calibration_prompts = hallu_df['constrained prompt'].tolist()

# sigma_value: scalar noise scale; mean_embedding_vector: mean of the per-prompt mean embeddings.
sigma_value, mean_embedding_vector = calculate_calibration_stats(model_llama, tokenizer, calibration_prompts)

In [None]:

print("Sigma:", sigma_value)
mean_embedding_vector

## Hypotheses: which token embeddings to perturb with noise to explain why the model hallucinates

There are two main options, each testing a different hypothesis:

1. Add Noise to the Subject Token ('i')
Hypothesis: The hallucination (predicting "joy" instead of "sadness") is caused by a misinterpretation of the speaker's state. The model incorrectly represents the speaker's emotional state, which is primarily anchored to the token 'i'.

Inject noise only into the embedding of the first token, 'i', and observe if the model's prediction changes.

2. Add Noise to the Key Emotional Tokens ('shame' and 'stigma')
Hypothesis: The hallucination is caused by the model misinterpreting emotionally charged words in the text.

Inject noise only into the embeddings of the tokens for "shame" and "stigma."



In [None]:
def find_subject_token(tokenizer, prompt):
    """
    Return the text of the prompt's first token, treated as the subject.

    Surrounding spaces and quote characters are stripped from the prompt before
    it is encoded without special tokens. An empty string is returned when
    nothing is left to tokenize; otherwise the first token id is decoded and
    its surrounding whitespace removed.
    """
    token_ids = tokenizer.encode(prompt.strip(' \'"'), add_special_tokens=False)

    if token_ids:
        return tokenizer.decode(token_ids[0]).strip()

    # Nothing left after cleaning.
    return ""

# Example: take the constrained prompt at row position 59 of hallu_df.
prompt_from_df = hallu_df['constrained prompt'].iloc[59]

# find_subject_token strips surrounding quotes/spaces itself, so the raw prompt is passed as-is.
subject_text = find_subject_token(tokenizer, prompt_from_df)
print(f"The subject token is: '{subject_text}'")

In [None]:
 hallu_df['constrained prompt'].iloc[55]

In [None]:
def get_perturbed_embeddings(model, tokenizer, prompt, sigma):
    """
    Calculates u* by adding scaled Gaussian noise to the embedding of the
    first token of the prompt text, which is assumed to be the subject.

    If the tokenizer prepends a BOS token (detected by comparing the first id
    with ``tokenizer.bos_token_id``), that position is skipped so the noise
    lands on the first real prompt token rather than on BOS.

    Args:
        model: The loaded TransformerLens model (its ``embed`` module is used).
        tokenizer: The tokenizer for the model.
        prompt (str): The original prompt text; surrounding spaces and quote
            characters are stripped before tokenization.
        sigma (float): Standard deviation of the injected Gaussian noise.

    Returns:
        torch.Tensor | None: The perturbed input embeddings (u*), identical to
        the clean embeddings except at the subject position. ``None`` when the
        cleaned prompt contains no prompt token to perturb.
    """
    # 1. Cleaning the prompt by stripping leading/trailing quotes and spaces
    clean_prompt = prompt.strip(' \'"')
    input_ids = tokenizer.encode(clean_prompt, return_tensors='pt')

    if input_ids.numel() == 0:
        print("Warning: Prompt is empty after cleaning. Cannot generate embeddings.")
        return None

    # 2. Locating the subject token: position 0, or 1 when position 0 is BOS
    subject_token_index = 0
    bos_id = getattr(tokenizer, 'bos_token_id', None)
    if bos_id is not None and input_ids[0, 0].item() == bos_id:
        subject_token_index = 1

    if subject_token_index >= input_ids.shape[1]:
        # Only a BOS token was produced, i.e. the prompt itself is empty.
        print("Warning: Prompt is empty after cleaning. Cannot generate embeddings.")
        return None

    # Getting the original embeddings from the model's embedding layer
    with torch.no_grad():
        original_embeddings = model.embed(input_ids)

    # 3. Sampling noise only for the single position that is perturbed
    noise = torch.randn_like(original_embeddings[0, subject_token_index, :]) * sigma

    # 4. Copying the clean embeddings and injecting the noise at the subject position
    u_star_embeddings = original_embeddings.clone()
    u_star_embeddings[0, subject_token_index, :] += noise

    return u_star_embeddings

In [None]:
# Single-prompt check: perturbed embeddings (u*) for the constrained prompt at row position 55.
u_star = get_perturbed_embeddings(model_llama, tokenizer,  hallu_df['constrained prompt'].iloc[55], sigma_value)

In [None]:
u_star.shape

In [None]:
u_star_tensors = {}  # maps each row's DataFrame index label -> its perturbed embedding tensor (u*)

# Build one u* tensor per prompt in hallu_set1_df
for index, row in hallu_set1_df.iterrows():
    prompt = row['constrained prompt']
    
    # Calculate u* for the current prompt
    perturbed_embeddings = get_perturbed_embeddings(model_llama, tokenizer, prompt, sigma_value)
    
    # get_perturbed_embeddings returns None when the cleaned prompt yields no tokens; skip those rows
    if perturbed_embeddings is not None:
        # Store the tensor in the dictionary with the prompt's index as the key
        u_star_tensors[index] = perturbed_embeddings

print(f"Stored {len(u_star_tensors)} u* tensors for analysis.")


In [None]:

def perform_causal_analysis_y_prime(model, tokenizer, hallu_df, sigma_value, num_noise_samples):
    """
    Hypothesis-1 causal analysis: perturb the subject token's embedding with
    Gaussian noise and count how often the perturbation is "truth-inducing".

    For every row of ``hallu_df`` the prompt is tokenized once with
    ``model.to_tokens`` and its clean ``embed`` activation is cached. Then
    ``num_noise_samples`` forward passes are run in which the ``hook_embed``
    output is replaced by the clean embeddings plus N(0, sigma_value^2) noise at
    the subject position only. The subject position is the first position, or
    the second one when the first token equals ``tokenizer.bos_token_id`` (so
    the noise lands on the first prompt token rather than on BOS).

    For each pass ``y' = logit[predicted] - logit[true]`` is read at the last
    sequence position, using the first token id of each emotion word. Samples
    with ``y' < 1`` are kept as truth-inducing.

    Args:
        model: TransformerLens ``HookedTransformer``.
        tokenizer: Tokenizer used to look up emotion token ids and the BOS id.
        hallu_df (pd.DataFrame): Needs the columns ``'constrained prompt'``,
            ``'emotion'`` and ``'predicted emotion'``.
        sigma_value (float): Standard deviation of the injected noise.
        num_noise_samples (int): Number of noisy forward passes per prompt.

    Returns:
        pd.DataFrame: One row per analysed prompt with the columns
        ``prompt_text``, ``true_emotion``, ``predicted_emotion``,
        ``num_truth_inducing_samples`` and ``truthful_y_primes`` (list of the
        kept y' values). Rows whose emotion words tokenize to nothing are
        skipped with a printed message.
    """
    result_columns = ['prompt_text', 'true_emotion', 'predicted_emotion', 'num_truth_inducing_samples', 'truthful_y_primes']
    # Rows are collected here and turned into a DataFrame once at the end
    # (avoids quadratic pd.concat-in-a-loop growth).
    records = []
    bos_id = getattr(tokenizer, 'bos_token_id', None)

    with torch.no_grad():
        for index, row in hallu_df.iterrows():
            prompt_to_analyze = row['constrained prompt'].strip()
            true_emotion = row['emotion']
            predicted_emotion = row['predicted emotion']

            # 1. Token ids for the true and predicted emotions, looked up before
            #    any forward pass so unusable rows cost nothing.
            try:
                true_id = tokenizer.encode(true_emotion, add_special_tokens=False)[0]
                predicted_id = tokenizer.encode(predicted_emotion, add_special_tokens=False)[0]
            except IndexError:
                print(f"Skipping prompt {index}: Emotion token not found.")
                continue

            # 2. Tokenize once; the same tokens feed the baseline and every noisy
            #    pass so the replaced embeddings always match the input length.
            tokens = model.to_tokens(prompt_to_analyze)
            _, original_cache = model.run_with_cache(tokens)
            original_embeddings = original_cache['embed'].clone().detach()

            # Drop the full activation cache before the sampling loop starts.
            del original_cache
            torch.cuda.empty_cache()

            # 3. Subject position: skip a leading BOS token if there is one.
            subject_token_index = 0
            if bos_id is not None and tokens.shape[1] > 1 and tokens[0, 0].item() == bos_id:
                subject_token_index = 1

            truthful_y_primes = []

            for _ in range(num_noise_samples):
                # Fresh noise per sample, drawn only for the perturbed position.
                noise = torch.randn_like(original_embeddings[0, subject_token_index, :]) * sigma_value
                perturbed_embeddings = original_embeddings.clone()
                perturbed_embeddings[0, subject_token_index, :] += noise

                # Hook that replaces the 'embed' output with the perturbed embeddings.
                def hook_fn_replace_embed(embed_output, hook, replacement=perturbed_embeddings):
                    return replacement

                new_logits = model.run_with_hooks(
                    input=tokens,
                    fwd_hooks=[('hook_embed', hook_fn_replace_embed)]
                )

                # y' = logit difference (predicted - true) at the final position.
                final_logits = new_logits[0, -1, :]
                y_prime = (final_logits[predicted_id] - final_logits[true_id]).item()

                # Keep "truth-inducing" samples.
                if y_prime < 1:
                    truthful_y_primes.append(y_prime)

            records.append({
                'prompt_text': prompt_to_analyze,
                'true_emotion': true_emotion,
                'predicted_emotion': predicted_emotion,
                'num_truth_inducing_samples': len(truthful_y_primes),
                'truthful_y_primes': truthful_y_primes
            })

            # Free memory after each prompt.
            del original_embeddings
            torch.cuda.empty_cache()

    return pd.DataFrame(records, columns=result_columns)


In [None]:
%%time 

# Hypothesis 1: subject-position noise analysis over hallu_set1_df, 100 noise samples per prompt.
y_prime1 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set1_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime1

In [None]:
y_prime1[(y_prime1['num_truth_inducing_samples']>50)]

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set2_df, 100 noise samples per prompt.
y_prime2 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set2_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime2


In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set3_df, 100 noise samples per prompt.
y_prime3 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set3_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime3

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set4_df, 100 noise samples per prompt.
y_prime4 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set4_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime4

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set5_df, 100 noise samples per prompt.
y_prime5 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set5_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime5

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set6_df, 100 noise samples per prompt.
y_prime6 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set6_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime6

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set7_df, 100 noise samples per prompt.
y_prime7 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set7_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime7

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set8_df, 100 noise samples per prompt.
y_prime8 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set8_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime8

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set9_df, 100 noise samples per prompt.
y_prime9 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set9_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime9

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set10_df, 100 noise samples per prompt.
y_prime10 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set10_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime10

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set11_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime11 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set11_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime11

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set12_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime12 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set12_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime12

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set13_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime13 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set13_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime13

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set14_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime14 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set14_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime14

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set15_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime15 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set15_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime15

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set16_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime16 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set16_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime16

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set17_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime17 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set17_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime17

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set18_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime18 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set18_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime18

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set19_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime19 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set19_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime19

In [None]:
%%time 


# Hypothesis 1: subject-position noise analysis over hallu_set20_df, 100 noise samples per prompt.
# (Previously passed hallu_set10_df, which only repeated the y_prime10 run.)
y_prime20 = perform_causal_analysis_y_prime(model_llama, tokenizer, hallu_set20_df, sigma_value, num_noise_samples=100)

In [None]:
y_prime20

In [None]:
def get_target_token_indices(tokenizer, prompt, target_tokens):
    """
    Locate the prompt tokens that contain any of the target words.

    The stripped prompt is encoded without special tokens. Each token id is
    decoded on its own, stripped and lower-cased, and the token's position is
    reported when at least one target word occurs as a substring of that
    decoded text (an exact match is the trivial case of this).

    Args:
        tokenizer: The tokenizer for the model.
        prompt (str): The prompt text to analyze.
        target_tokens (list): Lowercase target words.

    Returns:
        list: Ascending, duplicate-free positions into the special-token-free
        tokenization of the prompt.
    """
    token_ids = tokenizer.encode(prompt.strip(), add_special_tokens=False)

    matched_positions = []
    for position, token_id in enumerate(token_ids):
        piece = tokenizer.decode(token_id).strip().lower()
        # Substring containment also covers the exact-match case.
        if any(word in piece for word in target_tokens):
            matched_positions.append(position)

    return matched_positions

In [None]:
# Lowercase words treated as emotionally charged; tokens containing any of them
# are the noise-injection targets for the Hypothesis-2 analysis.
target_emotional_tokens = ['abandoned', 'abandonment', 'abhor', 'abhorrent', 'aberration', 'abduction', 'ability', 'abba', 
                           'abbot', 'affecting', 'affective', 'affiliate', 'affirm', 'affirmation', 'affirmative', 'afflict', 
                           'afflicted', 'affliction', 'affluence', 'afford', 'affront', 'afraid', 'afresh', 'aftermath', 'agape', 
                           'aggressor', 'aggravate', 'aggravating', 'aggression', 'aggressive', 'aggressiveness', 'aghast', 'agitate', 
                           'agitated', 'agitation', 'agony', 'agree', 'agreeable', 'agreed', 'agreement', 'alarm', 'alarming', 'albatross', 
                           'alert', 'alienate', 'alienated', 'alienation', 'alive', 'allegiance', 'alleviate', 'alone', 'amazement', 'amazed', 
                           'amazing', 'ambition', 'ambitious', 'ameliorate', 'amusement', 'amusing', 'anarchism', 'anarchist', 'anarchy', 'angel', 
                           'angelic', 'anger', 'angry', 'anguish', 'animosity', 'annoy', 'annoyance', 'annoying', 'anomaly', 'antagonism', 'antagonist',
                           'antagonistic', 'anxiety', 'anxious', 'apathetic', 'apathy', 'apologize', 'apology', 'appeal', 'appealing', 'appease', 'applaud', 
                           'applause', 'appreciation', 'appreciative', 'apprehensive', 'apprehension', 'approved', 'arrogance', 'arrogant', 'ashamed', 'assault',
                           'assassination', 'assure', 'assured', 'astonished', 'astonishing', 'astonishment', 'atrocious', 'atrocity', 'aversion', 'avert', 'avid',
                           'awful', 'awkward', 'backfire', 'bad', 'baffle', 'baffled', 'bafflement', 'baleful', 'ballyhoo', 'banter', 'baseless', 'beautiful', 
                           'beauty', 'beg', 'beguile', 'belittle', 'belligerence', 'belligerent', 'benefit', 'beneficial', 'benevolence', 'benevolent', 'bereaved',
                           'bereavement', 'bestial', 'betray', 'betrayal', 'bicker', 'bickering', 'bitter', 'bitterness', 'blackmail', 'blame', 'blight', 'bliss',
                           'blissful', 'boast', 'boredom', 'bother', 'brave', 'bravery', 'breakthrough', 'bribery', 'bright', 'brilliant', 'brutal', 'brutality',
                           'bully', 'calamity', 'calm', 'calmness', 'canonize', 'captivate', 'care', 'careful', 'carefully', 'caress', 'catastrophe', 'celebrate',
                           'celebration', 'certain', 'certainty', 'chagrin', 'charity', 'charm', 'charming', 'cheer', 'cheerful', 'cheerless', 'cheery', 'cherish',
                           'clash', 'coerce', 'coercion', 'collapse', 'comfort', 'comfortable', 'compassionate', 'compassion', 'complacent', 'complaint', 'compliment',
                           'complimentary', 'composure', 'confess', 'confession', 'confidence', 'confident', 'conflict', 'confound', 'confrontation', 'confuse', 
                           'confused', 'confusion', 'congratulate', 'congratulation', 'conspiracy', 'contagion', 'contagious', 'contempt', 'contemptible', 
                           'contemptuous', 'contentment', 'corrupt', 'corruption', 'courage', 'courageous', 'covetous', 'cower', 'cramped', 'crash', 'crave', 
                           'craving', 'crazed', 'crazy', 'crush', 'crying', 'cynical', 'damage', 'daring', 'darkness', 'dastardly', 'dauntless', 'dead', 'dear',
                           'dearth', 'death', 'decay', 'deceit', 'deceitful', 'deceive', 'deception', 'defeat', 'defeated', 'defect', 'defective', 'defense',
                           'defenseless', 'defer', 'defiance', 'deficient', 'deformity', 'dejected', 'dejection', 'delay', 'delight', 'delighted', 'demise', 
                           'demolition', 'demon', 'denounce', 'denunciation', 'deny', 'deplete', 'deplorable', 'depravity', 'depressed', 'depression', 'deprive', 
                           'deprived', 'deserve', 'desirable', 'desire', 'despair', 'desperate', 'desperation', 'despicable', 'despise', 'destroy', 'destruction', 
                           'despondent', 'despondency', 'detest', 'detestable', 'devastate', 'devastating', 'devil', 'devilish', 'devotion', 'devout', 'difficult', 
                           'difficulty', 'disagree', 'disagreement', 'disappointed', 'disappointment', 'disapprove', 'disapproval', 'disaster', 'disastrous',
                           'disbelief', 'discomfort', 'discontent', 'discontented', 'discord', 'discourage', 'disgust', 'disgusted', 'disgusting', 'dishearten', 
                           'disheveled', 'dishonest', 'dishonesty', 'dishonor', 'disintegrate', 'dislike', 'dismal', 'dismay', 'dismayed', 'displeased', 
                           'displeasure', 'disrespect', 'disrespectful', 'distress', 'distressed', 'distrust', 'disturb', 'divorce', 'dominant', 'doubt',
                           'dread', 'dreadful', 'dull', 'eager', 'ecstasy', 'elation', 'eloquence', 
                           'embrace', 'emotional', 'empty', 'enchantment', 'encouragement', 'endearing', 'enjoy', 'enjoyment', 'enrage', 'enraged', 'entice',
                           'entitlement', 'enthusiasm', 'enthusiastic', 'envy', 'evil', 'exasperation', 'excite', 'excited', 'excitement', 'exhausted', 'exuberant',
                           'fabulous', 'faint', 'faith', 'faithful', 'falsify', 'famish', 'fanatic', 'fantastic', 'fear', 'fearful', 'fearless', 'fearsome', 
                           'feeble', 'felicity', 'ferocious', 'fiery', 'fight', 'filthy', 'flagging', 'flatter', 'flattery', 'flustered', 'foe', 'fondness', 
                           'fool', 'foolish', 'forgive', 'forgiveness', 'forlorn', 'fortunate', 'fortune', 'foul', 'frantic', 'frenzy', 'friend', 'friendly', 
                           'fright', 'frightened', 'frightening', 'frown', 'frustrate', 'frustrated', 'frustration', 'furious', 'fury', 'gallant', 'gaudy', 
                           'ghastly', 'giddy', 'gloom', 'gloomy', 'glorious', 'glory', 'grief', 'grieving', 'grimace', 'gross', 'grotesque', 'guilt', 'guilty',
                           'hapless', 'happiness', 'happy', 'harass', 'harm', 'harsh', 'hatred', 'haunt', 'haunted', 'heartbreak', 'heartbreaking', 'heartless',
                           'heaven', 'hell', 'helpless', 'hesitation', 'hideous', 'hope', 'hopeful', 'hopeless', 'hopelessness', 'horrendous', 'horrible', 'horror',
                           'humiliate', 'humiliation', 'hurt', 'hymn', 'idiot', 'ignorant', 'ill', 'illness', 'imbecile', 'immortal', 'impatient', 'important', 
                           'inability', 'inadequate', 'incensed', 'incite', 'indignant', 'indignation', 'infatuated', 'infatuation', 'infection', 'inferior',
                           'inferno', 'infuriate', 'infuriated', 'infuriating', 'insane', 'insanity', 'insidious', 'insult', 'insulting', 'integrity', 'interest',
                           'interested', 'invasion', 'invigorate', 'involve', 'irk', 'jealousy', 'jest', 'jinx', 'jovial', 'joy', 'joyful', 'jubilation', 'karma', 
                           'kidnap', 'kindness', 'lack', 'lament', 'lamentation', 'laugh', 'laughter', 'leisure', 'liberation', 'liberty', 'lie', 'light', 'like',
                           'love', 'loyal', 'loyalty', 'ludicrous', 'lust', 'maddening', 'madness', 'malaise', 'malice', 'malignant', 'maniac', 'massacre', 
                           'menace', 'merciless', 'mercy', 'mirth', 'misery', 'mishap', 'mistake', 'molestation', 'monstrous', 'morbid', 'mourn', 'mournful', 
                           'murder', 'mutiny', 'nasty', 'nausea', 'nauseating', 'neglect', 'neglected', 'neglectful', 'nerve', 'nervous', 'nightmare',
                           'obnoxious', 'obscene', 'offend', 'offense', 'ominous', 'optimism', 'optimistic', 'outrage', 'outrageous', 'overjoy',
                           'panic', 'paradise', 'passion', 'patience', 'peace', 'peaceful', 'pessimistic', 'pity', 'plague', 'pleasant', 'pleasure',
                           'poison', 'poisonous', 'praise', 'pride', 'promising', 'protest', 'proud', 'rage', 'rape', 'rapport', 'rascal', 'relieve', 
                           'relief', 'remorse', 'remorseful', 'resentment', 'respect', 'revenge', 'revulsion', 'ridiculous', 'rigid', 'risk', 'sadness', 
                           'safe', 'safety', 'salvation', 'sanguine', 'sarcasm', 'savage', 'scare', 'scared', 'scary', 'scream', 'screaming', 'screech', 
                           'secure', 'sensational', 'sensitive', 'serene', 'sham', 'shame', 'shattered', 'shock', 'shocking', 'shriek', 'sick', 'sickness', 
                           'sincere', 'sincerity', 'sneer', 'solemn', 'sorrow', 'sorrowful', 'spectacular', 'splendid', 'squalor', 'stab', 'startle', 'startling',
                           'strangle', 'stupid', 'suffering', 'suffocate', 'superb', 'surprise', 'surprised', 'suspense', 'suspicious', 'swindle', 'sympathy', 
                           'terror', 'terrible', 'terrific', 'terrified', 'threat', 'threaten', 'thrilled', 'thrilling', 'tragedy', 'tragic', 'triumph',
                           'triumphant', 'trust', 'trusted', 'trusting', 'ugly', 'uncomfortable', 'unhappiness', 'unhappy', 'uninspired', 'unpleasant', 
                           'upset', 'upsetting', 'vengeance', 'vicious', 'victory', 'violent', 'want', 'wary', 'weak', 'weakness', 'weep', 'weeping', 'welcome',
                           'woe', 'wonderful', 'worry', 'wretched', 'wrong', 'wrongdoing', 'yearning', 'yell', 'zest']

In [None]:
def perform_causal_analysis_emotional_tokens(model, tokenizer, prompt_df, sigma_value, num_noise_samples, target_tokens):
    """
    Hypothesis-2 causal analysis: add scaled Gaussian noise to the embeddings of
    emotionally charged tokens and count how often this is "truth-inducing".

    For every row of ``prompt_df`` the target positions are found with
    ``get_target_token_indices`` (which indexes the prompt *without* special
    tokens). The prompt is tokenized once with ``model.to_tokens``; when that
    sequence starts with ``tokenizer.bos_token_id`` the target positions are
    shifted by one so that the noise lands on the matched tokens rather than on
    their left neighbours. ``num_noise_samples`` forward passes are then run
    with the ``hook_embed`` output replaced by the clean embeddings plus
    N(0, sigma_value^2) noise at the target positions only.

    ``y' = logit[predicted] - logit[true]`` is read at the last position. The
    emotion token ids are chosen once per prompt: the first id of
    ``" " + emotion`` is preferred when both ids are valid vocabulary indices
    and differ from each other (some tokenizers emit the same bare space token
    for both, which would make y' identically zero); otherwise the first id of
    the emotion word without a leading space is used. Samples with ``y' < 1``
    are kept.

    Args:
        model (HookedTransformer): The loaded TransformerLens model.
        tokenizer: The tokenizer for the model.
        prompt_df (pd.DataFrame): DataFrame of hallucinated prompts with the
            columns ``'constrained prompt'``, ``'emotion'`` and
            ``'predicted emotion'``.
        sigma_value (float): The standard deviation for the Gaussian noise.
        num_noise_samples (int): The number of noise samples to test per prompt.
        target_tokens (list): Emotionally charged words to target for noise injection.

    Returns:
        pd.DataFrame: One row per analysed prompt with the columns
        ``prompt_text``, ``true_emotion``, ``predicted_emotion``,
        ``num_truth_inducing_samples`` and ``truthful_y_primes``. Prompts
        without target tokens, or whose emotion words tokenize to nothing, are
        skipped with a printed message.
    """
    result_columns = ['prompt_text', 'true_emotion', 'predicted_emotion', 'num_truth_inducing_samples', 'truthful_y_primes']
    # Rows are collected here and turned into a DataFrame once at the end
    # (avoids quadratic pd.concat-in-a-loop growth).
    records = []
    bos_id = getattr(tokenizer, 'bos_token_id', None)

    # Ensure no gradients are computed to save memory and computation
    with torch.no_grad():
        for index, row in prompt_df.iterrows():
            prompt_to_analyze = row['constrained prompt']
            stripped_prompt = prompt_to_analyze.strip()
            true_emotion = row['emotion']
            predicted_emotion = row['predicted emotion']

            # Positions of the target tokens in the special-token-free tokenization
            target_indices = get_target_token_indices(tokenizer, prompt_to_analyze, target_tokens)

            # Skip if no target tokens are found in this prompt
            if not target_indices:
                print(f"Skipping prompt {index}: No target tokens found.")
                continue

            # Candidate token ids for the two emotions, with and without a leading space
            try:
                true_id_with_space = tokenizer.encode(" " + true_emotion, add_special_tokens=False)[0]
                predicted_id_with_space = tokenizer.encode(" " + predicted_emotion, add_special_tokens=False)[0]
                true_id_no_space = tokenizer.encode(true_emotion, add_special_tokens=False)[0]
                predicted_id_no_space = tokenizer.encode(predicted_emotion, add_special_tokens=False)[0]
            except IndexError:
                print(f"Skipping prompt {index}: Emotion token not found.")
                continue

            # Tokenize once; the same tokens feed the baseline and every noisy pass
            tokens = model.to_tokens(stripped_prompt)
            baseline_logits, original_cache = model.run_with_cache(tokens)
            vocab_size = baseline_logits.shape[-1]
            original_embeddings = original_cache['embed'].clone().detach()

            # Clear the cache to free up memory before the noise sampling loop starts
            del baseline_logits, original_cache
            torch.cuda.empty_cache()

            # Shift target positions past a leading BOS token, if present
            offset = 0
            if bos_id is not None and tokens.shape[1] > 1 and tokens[0, 0].item() == bos_id:
                offset = 1
            positions = [i + offset for i in target_indices if i + offset < tokens.shape[1]]
            if not positions:
                print(f"Skipping prompt {index}: No target tokens found.")
                del original_embeddings
                continue

            # Pick the (predicted, true) id pair by vocabulary-index validity.
            # NOTE: `id in logits_tensor` would test whether the id's numeric
            # value occurs among the logit values, not whether it is a valid index.
            def _in_vocab(token_id):
                return 0 <= token_id < vocab_size

            if (predicted_id_with_space != true_id_with_space
                    and _in_vocab(predicted_id_with_space) and _in_vocab(true_id_with_space)):
                id_pair = (predicted_id_with_space, true_id_with_space)
            elif _in_vocab(predicted_id_no_space) and _in_vocab(true_id_no_space):
                id_pair = (predicted_id_no_space, true_id_no_space)
            else:
                # No usable ids: record the prompt with zero samples.
                id_pair = None

            truthful_y_primes = []

            if id_pair is not None:
                predicted_id, true_id = id_pair
                for _ in range(num_noise_samples):
                    # Fresh noise per sample, drawn only for the target positions
                    clean_targets = original_embeddings[0, positions, :]
                    noise = torch.randn_like(clean_targets) * sigma_value
                    perturbed_embeddings = original_embeddings.clone()
                    perturbed_embeddings[0, positions, :] = clean_targets + noise

                    # Hook that replaces the 'embed' output with the perturbed embeddings
                    def hook_fn_replace_embed(embed_output, hook, replacement=perturbed_embeddings):
                        return replacement

                    new_logits = model.run_with_hooks(
                        input=tokens,
                        fwd_hooks=[('hook_embed', hook_fn_replace_embed)]
                    )

                    # y' = logit difference (predicted - true) at the final position
                    final_logits = new_logits[0, -1, :]
                    y_prime = (final_logits[predicted_id] - final_logits[true_id]).item()

                    # Filtering for "truth-inducing" samples where y' < 1
                    if y_prime < 1:
                        truthful_y_primes.append(y_prime)

            records.append({
                'prompt_text': prompt_to_analyze,
                'true_emotion': true_emotion,
                'predicted_emotion': predicted_emotion,
                'num_truth_inducing_samples': len(truthful_y_primes),
                'truthful_y_primes': truthful_y_primes
            })

            # Free memory after each prompt
            del original_embeddings
            torch.cuda.empty_cache()

    return pd.DataFrame(records, columns=result_columns)

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set1_df, 100 noise samples per prompt.
y_prime1_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set1_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime1_H2

In [None]:
y_prime1_H2[(y_prime1_H2['num_truth_inducing_samples']>0)]

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set2_df, 100 noise samples per prompt.
y_prime2_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set2_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime2_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set3_df, 100 noise samples per prompt.
y_prime3_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set3_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime3_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set4_df, 100 noise samples per prompt.
y_prime4_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set4_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime4_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set5_df, 100 noise samples per prompt.
y_prime5_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set5_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime5_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set6_df, 100 noise samples per prompt.
y_prime6_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set6_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime6_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set7_df, 100 noise samples per prompt.
y_prime7_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set7_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime7_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set8_df, 100 noise samples per prompt.
y_prime8_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set8_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime8_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set9_df, 100 noise samples per prompt.
y_prime9_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set9_df, sigma_value, 100, target_emotional_tokens)


In [None]:
y_prime9_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set10_df, 100 noise samples per prompt.
y_prime10_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set10_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime10_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set11_df, 100 noise samples per prompt.
y_prime11_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set11_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime11_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set12_df, 100 noise samples per prompt.
y_prime12_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set12_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime12_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set13_df, 100 noise samples per prompt.
y_prime13_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set13_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime13_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set14_df, 100 noise samples per prompt.
y_prime14_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set14_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime14_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set15_df, 100 noise samples per prompt.
y_prime15_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set15_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime15_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set16_df, 100 noise samples per prompt.
y_prime16_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set16_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime16_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set17_df, 100 noise samples per prompt.
y_prime17_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set17_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime17_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set18_df, 100 noise samples per prompt.
y_prime18_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set18_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime18_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set19_df, 100 noise samples per prompt.
y_prime19_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set19_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime19_H2

In [None]:
%%time

# Hypothesis 2: emotional-token noise analysis over hallu_set20_df, 100 noise samples per prompt.
y_prime20_H2 = perform_causal_analysis_emotional_tokens(model_llama, tokenizer, hallu_set20_df, sigma_value, 100, target_emotional_tokens)

In [None]:
y_prime20_H2