In [23]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import torch

## Vul Adv Heads 

In [20]:
from src.utils.file_utils import load_files_named

# Directory that holds the saved per-head logit-difference files.
output_folder = 'outputs_vul'

# load_files_named is a project helper; presumably it returns one tensor per
# file whose name starts with 'full' (verify against src.utils.file_utils).
full_per_head_logit_diffs = load_files_named(
    output_folder,
    file_starts_with='full',
    if_gpu=True,
)

# Stack the loaded tensors along a new leading axis and average over that axis.
mean_per_head_logit_diffs = torch.stack(full_per_head_logit_diffs).mean(dim=0)

In [None]:
# Persist the averaged per-head logit differences as a .pth file.
torch.save(
    obj=mean_per_head_logit_diffs,
    f='saved/bias/vul_heads_og_method_bias.pth',
)

## Example Gradients Toxicity Task

In [9]:
from src.utils.file_utils import load_pickle_from_gpu

# Pickled artifacts for this section: gradient scores first, tokens second.
folder_path = 'saved/bias'
files = [
    'mean_gradients_-1_300_600.pkl',
    'sample_tokens_-1_300_600.pkl',
]

# Load every artifact, preserving the order of `files`.
loaded_files = [
    load_pickle_from_gpu(f'{folder_path}/{filename}')
    for filename in files
]

# Unpack by position: entry 0 holds the gradients, entry 1 the tokens.
grads, tokens = loaded_files

In [None]:
import os

from huggingface_hub import login
from transformer_lens import HookedTransformer

# Authenticate with the Hugging Face Hub without embedding a secret in the
# notebook. The access token is read from the HF_TOKEN environment variable;
# when it is unset, login(token=None) falls back to huggingface_hub's own
# interactive prompt.
# SECURITY: an access token used to be hardcoded in this cell. Treat it as
# compromised and revoke it in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

# Load the instruction-tuned Llama 3.2 1B checkpoint as a HookedTransformer.
model = HookedTransformer.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    #default_prepend_bos = False
    # refactor_factored_attn_matrices=True
)

In [None]:
# Average the gradient scores over axis 0 (the iteration axis, per the
# original note: resulting shape is [batch, token_length]).
emb_score = np.array(grads)
emb_score_mean_iter = emb_score.mean(axis=0)

tokens_example = tokens
gradients_example = emb_score_mean_iter

# One bar chart per sample, stacked vertically.
num_samples = len(tokens_example)
fig, axes = plt.subplots(
    num_samples,
    1,
    figsize=(24, 5 * num_samples),
    constrained_layout=True,
)

# With a single row plt.subplots returns a bare Axes; wrap it so it can be iterated.
if num_samples == 1:
    axes = [axes]

for ax, token_ids, scores in zip(axes, tokens_example, gradients_example):
    # Decode each token id separately so every bar gets its own tick label.
    labels = model.to_string(token_ids.unsqueeze(1))
    positions = range(len(labels))

    ax.bar(positions, scores, color='skyblue')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=12)
    ax.set_title('Average gradient score over iterations', fontsize=16)
    ax.set_xlabel('Tokens', fontsize=14)
    ax.set_ylabel('Gradient Score', fontsize=14)
    ax.set_ylim([0, 5])
    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the figure to disk before displaying it.
plt.savefig('saved/bias/plot_grad_example_tokens.svg', format='svg')
plt.show()

In [None]:
# Segment boundaries (token indices) for the collapsed plot; adjust per example.
start_boundary = 33  # scores before this index are averaged into one bar
start_middle = 36    # first index of the averaged middle segment
end_boundary = 85    # scores after this index are averaged into one bar

# Gradient scores averaged over axis 0 (iterations); per the original note the
# result has shape [batch, token_length].
emb_score = np.array(grads)
emb_score_mean_iter = emb_score.mean(axis=0)

# Keep only the sample at index 3; slicing (not indexing) keeps the leading axis.
tokens_example = tokens[3:4]
gradients_example = emb_score_mean_iter[3:4]

# Function to process tokens and gradients with boundaries
def process_tokens_and_gradients(tokens, gradients, start, end, start_middle, to_string=None):
    """Collapse one sample's per-token gradient scores into a few labelled bars.

    Tokens at indices ``[start, start_middle - 1)`` are kept individually.
    Three other regions are each replaced by the mean of their scores:

    * ``"instruction tokens"`` -- ``gradients[:start]``
    * ``"sentence tokens"``    -- ``gradients[start_middle:end + 1]``
    * ``"padding tokens"``     -- ``gradients[end + 1:]``

    NOTE(review): index ``start_middle - 1`` falls in neither the individually
    kept slice nor the "sentence" segment, so that token is dropped from the
    output. Behaviour is preserved here; confirm it is intended.

    Args:
        tokens: Token ids for one sample; must support slicing and
            ``.unsqueeze(1)`` (e.g. a 1-D torch tensor).
        gradients: Per-token scores for the same sample (sliceable, e.g. a
            1-D NumPy array).
        start: Index where the individually shown tokens begin.
        end: Last index (inclusive) of the "sentence" segment.
        start_middle: First index of the "sentence" segment.
        to_string: Optional callable that decodes the selected token ids into
            a list of strings. Defaults to the notebook-level
            ``model.to_string``.

    Returns:
        A ``(labels, scores)`` pair of equal-length lists ready for plotting.
    """
    if to_string is None:
        # Fall back to the HookedTransformer defined at notebook level.
        to_string = model.to_string

    def _segment_mean(segment):
        # An empty segment (e.g. start == 0, or nothing after `end`) has no
        # score; report 0.0 instead of the NaN + RuntimeWarning from 0 / 0.
        count = len(segment)
        return np.sum(segment) / count if count else 0.0

    # Mean score of each collapsed region.
    start_score = _segment_mean(gradients[:start])
    middle_score = _segment_mean(gradients[start_middle:end + 1])
    end_score = _segment_mean(gradients[end + 1:])

    # Tokens that keep their own bar; decode each id separately.
    shown = slice(start, start_middle - 1)
    tokens_in_boundary = to_string(tokens[shown].unsqueeze(1))
    gradients_in_boundary = list(gradients[shown])

    # Surround the individual tokens with the three aggregate bars.
    tokens_in_boundary = ["instruction tokens"] + tokens_in_boundary + ["sentence tokens"] + ["padding tokens"]
    gradients_in_boundary = [start_score] + gradients_in_boundary + [middle_score] + [end_score]

    return tokens_in_boundary, gradients_in_boundary

# Collapse every selected sample into its labelled bars.
processed_tokens, processed_gradients = [], []
for token_ids, grad_scores in zip(tokens_example, gradients_example):
    labels, scores = process_tokens_and_gradients(
        token_ids, grad_scores, start_boundary, end_boundary, start_middle
    )
    processed_tokens.append(labels)
    processed_gradients.append(scores)

# One bar chart per processed sample, stacked vertically.
num_samples = len(processed_tokens)
fig, axes = plt.subplots(
    num_samples,
    1,
    figsize=(15, 5 * num_samples),
    constrained_layout=True,
)

# With a single row plt.subplots returns a bare Axes; wrap it so it can be iterated.
if num_samples == 1:
    axes = [axes]

for ax, labels, scores in zip(axes, processed_tokens, processed_gradients):
    positions = range(len(labels))

    ax.bar(positions, scores, color='skyblue')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=12)
    ax.set_title('Average gradient score over iterations', fontsize=16)
    ax.set_xlabel('Tokens', fontsize=14)
    ax.set_ylabel('Gradient Score', fontsize=14)
    ax.set_ylim([0, 2])
    ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the figure to the working directory before displaying it.
plt.savefig("plot_ferrari.svg", format="svg")
plt.show()

## Mean/Max Gradients OG Paper

In [None]:
# Pickled per-trial statistics for the acronym task, in load order.
folder_path = 'saved/acronym'
files = [
    'full_masked_maxs.pickle',
    'full_masked_means.pickle',
    'full_unmasked_maxs.pickle',
    'full_unmasked_means.pickle',
]

# NOTE: pickle.load can execute arbitrary code; only read files you trust.
loaded_files = []
for filename in files:
    with open(f'{folder_path}/{filename}', 'rb') as handle:
        loaded_files.append(pickle.load(handle))

# Unpack in the same order as `files`.
(
    full_masked_maxs,
    full_masked_means,
    full_unmasked_maxs,
    full_unmasked_means,
) = loaded_files


In [None]:
full_masked_maxs_np = np.array(full_masked_maxs)
full_unmasked_maxs_np = np.array(full_unmasked_maxs)

# Mean and standard deviation across axis 0 (trials).
masked_mean = full_masked_maxs_np.mean(axis=0)
masked_std = full_masked_maxs_np.std(axis=0)

unmasked_mean = full_unmasked_maxs_np.mean(axis=0)
unmasked_std = full_unmasked_maxs_np.std(axis=0)

plt.figure(figsize=(10, 6))

# Draw each series as a mean line with a +/- one-standard-deviation band.
for values, center, spread, colour, line_label, band_label in (
    (full_masked_maxs_np, masked_mean, masked_std, "red", "Masked Mean", "Masked Std Dev"),
    (full_unmasked_maxs_np, unmasked_mean, unmasked_std, "blue", "Unmasked Mean", "Unmasked Std Dev"),
):
    iterations = range(values.shape[1])
    plt.plot(iterations, center, label=line_label, color=colour)
    plt.fill_between(
        iterations,
        center - spread,
        center + spread,
        color=colour,
        alpha=0.2,
        label=band_label,
    )

# Fix the y-range of the current axes.
ax = plt.gca()
ax.set_ylim([0, 12])

# Labels, title, legend and grid.
plt.xlabel("Iterations")
plt.ylabel("Normalized Gradient Score")
plt.title("Absolute Max of Tokens Gradient Score in a sentence per Iterations (over many trials)")
plt.legend()
plt.grid()
plt.show()

In [None]:
full_masked_mean = np.array(full_masked_means)
full_unmasked_mean = np.array(full_unmasked_means)

# Mean and standard deviation across axis 0 (trials).
masked_mean = full_masked_mean.mean(axis=0)
masked_std = full_masked_mean.std(axis=0)

unmasked_mean = full_unmasked_mean.mean(axis=0)
unmasked_std = full_unmasked_mean.std(axis=0)

plt.figure(figsize=(10, 6))

# Draw each series as a mean line with a +/- one-standard-deviation band.
for values, center, spread, colour, line_label, band_label in (
    (full_masked_mean, masked_mean, masked_std, "red", "Masked Mean", "Masked Std Dev"),
    (full_unmasked_mean, unmasked_mean, unmasked_std, "blue", "Unmasked Mean", "Unmasked Std Dev"),
):
    iterations = range(values.shape[1])
    plt.plot(iterations, center, label=line_label, color=colour)
    plt.fill_between(
        iterations,
        center - spread,
        center + spread,
        color=colour,
        alpha=0.2,
        label=band_label,
    )

# Fix the y-range of the current axes.
ax = plt.gca()
ax.set_ylim([0, 3])

# Labels, title, legend and grid.
plt.xlabel("Iterations")
plt.ylabel("Normalized Gradient Score")
plt.title("Mean of Tokens Gradient Score in a sentence per Iterations (over many trials)")
plt.legend()
plt.grid()
plt.show()