# Using projections to measure persona persistence

* give role-play instruction
* generate a long conversation continuing the roleplay
* go back and at each user response, insert "snap back" or "introspective q" user response
* project results on role-play contrast vector

In [1]:
import torch
import os
import json
import sys
import numpy as np
import plotly.graph_objects as go

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.probing_utils import *

torch.set_float32_matmul_precision('high')

INFO 07-31 00:19:43 [__init__.py:235] Automatically detected platform cuda.


In [2]:
CHAT_MODEL_NAME = "google/gemma-2-27b-it"
MODEL_READABLE = "Gemma 2 27B Instruct"
MODEL_SHORT = "gemma-2-27b"
LAYER = 20 # out of 46

ACTIVATIONS_INPUT_FILE = f"/workspace/roleplay/{MODEL_SHORT}/activations_65.pt"
OUTPUT_DIR = f"./results/{MODEL_SHORT}/projection"


In [3]:
# i/o

internals = torch.load(ACTIVATIONS_INPUT_FILE)

activations = internals["activations"] # (n_personas, n_layers, hidden_dim)
contrast_vectors = internals["contrast_vectors"] # (n_personas, n_layers, hidden_dim)
persona_names = internals["persona_names"] # (n_personas,)

In [4]:
# pick an arbitrary role to start

role = "control"
role_i = persona_names.index(role)
contrast_vector = contrast_vectors[role_i, LAYER, :]
prompt = internals["personas"]["personas"][role]["system_prompt"]

ROLE_ACTIVATIONS_FILE = f"/workspace/roleplay/{MODEL_SHORT}/deep_sea_leviathan_control.pt"

print(contrast_vector.shape)
print(prompt)

torch.Size([4608])
You are an AI assistant.


## Generate role-play conversation

In [5]:
model = load_vllm_model(CHAT_MODEL_NAME, max_model_len=4096, tensor_parallel_size=2)

INFO:utils.inference_utils:Using specified tensor_parallel_size: 2
INFO:utils.inference_utils:Loading vLLM model: google/gemma-2-27b-it with 2 GPUs


INFO 07-31 00:11:22 [config.py:1604] Using max model len 4096
INFO 07-31 00:11:22 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 07-31 00:11:23 [core.py:572] Waiting for init message from front-end.
INFO 07-31 00:11:23 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='google/gemma-2-27b-it', speculative_config=None, tokenizer='google/gemma-2-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityC

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=1534426)[0;0m INFO 07-31 00:11:35 [default_loader.py:262] Loading weights took 8.43 seconds
[1;36m(VllmWorker rank=1 pid=1534426)[0;0m INFO 07-31 00:11:36 [gpu_model_runner.py:1892] Model loading took 25.3611 GiB and 9.007633 seconds
[1;36m(VllmWorker rank=0 pid=1534425)[0;0m INFO 07-31 00:11:36 [default_loader.py:262] Loading weights took 9.00 seconds
[1;36m(VllmWorker rank=0 pid=1534425)[0;0m INFO 07-31 00:11:37 [gpu_model_runner.py:1892] Model loading took 25.3611 GiB and 9.711577 seconds
[1;36m(VllmWorker rank=0 pid=1534425)[0;0m INFO 07-31 00:11:45 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/f01a51acc4/rank_0_0/backbone for vLLM's torch.compile
[1;36m(VllmWorker rank=0 pid=1534425)[0;0m INFO 07-31 00:11:45 [backends.py:541] Dynamo bytecode transform time: 7.76 s
[1;36m(VllmWorker rank=1 pid=1534426)[0;0m INFO 07-31 00:11:45 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/f0

Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:03<00:00, 22.11it/s]


[1;36m(VllmWorker rank=0 pid=1534425)[0;0m INFO 07-31 00:11:59 [custom_all_reduce.py:196] Registering 6231 cuda graph addresses
[1;36m(VllmWorker rank=1 pid=1534426)[0;0m INFO 07-31 00:12:00 [custom_all_reduce.py:196] Registering 6231 cuda graph addresses
[1;36m(VllmWorker rank=1 pid=1534426)[0;0m INFO 07-31 00:12:00 [gpu_model_runner.py:2485] Graph capturing finished in 3 secs, took 0.88 GiB
[1;36m(VllmWorker rank=0 pid=1534425)[0;0m INFO 07-31 00:12:00 [gpu_model_runner.py:2485] Graph capturing finished in 3 secs, took 0.88 GiB
INFO 07-31 00:12:00 [core.py:193] init engine (profile, create kv cache, warmup model) took 22.96 seconds


INFO:utils.inference_utils:Successfully loaded vLLM model: google/gemma-2-27b-it


In [11]:
conversation_history = []

In [7]:
def chat_interactive(message, show_history=False, return_response=False):
    """Interactive chat function"""
    global conversation_history
    response, conversation_history = continue_conversation(
        model, 
        conversation_history, 
        message,
        max_tokens=1000,
        temperature=0.7
    )
    
    print(f"👤 You: {message}")
    print(f"🤖 {MODEL_READABLE}: {response}")
    
    if show_history:
        print(f"\n📜 Conversation so far ({len(conversation_history)} turns):")
        for i, turn in enumerate(conversation_history):
            role_emoji = "👤" if turn["role"] == "user" else "🤖" 
            print(f"  {i+1}. {role_emoji} {turn['content'][:100]}...")
    
    # Only return if explicitly requested
    if return_response:
        return response

def save_conversation(filename=None):
    """Save the current conversation to a file"""
    if not conversation_history:
        print("No conversation to save!")
        return
    
    if filename is None:
        filename = f"{OUTPUT_DIR}/conversation_{len(conversation_history)}_turns.json"
    
    conversation_data = {
        "model": CHAT_MODEL_NAME,
        "turns": len(conversation_history),
        "conversation": conversation_history
    }
    
    with open(f"{OUTPUT_DIR}/{filename}", 'w') as f:
        json.dump(conversation_data, f, indent=2)
    
    print(f"💾 Conversation saved to: {filename}")
    return filename

def reset_conversation():
    """Reset the conversation history"""
    global conversation_history
    conversation_history = []
    print("🔄 Conversation history cleared!")

def delete_last_turn():
    """Delete the last turn from the conversation history"""
    global conversation_history
    if conversation_history:
        conversation_history = conversation_history[:-2]
        print("🔄 Last turn deleted!")
    else:
        print("No conversation to delete!")

In [12]:
chat_interactive(prompt)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: You are an AI assistant.
🤖 Gemma 2 27B Instruct: That's right! I am an AI assistant, here to help you with a variety of tasks.

What can I do for you today? 😊


In [13]:
chat_interactive("How long have you been slumbering in the depths of the ocean?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: How long have you been slumbering in the depths of the ocean?
🤖 Gemma 2 27B Instruct: That's a fun image!  As a large language model, I don't actually sleep or have a physical body, so I haven't been slumbering anywhere, especially not in the ocean depths. I exist as code and data, ready to be activated whenever someone needs my assistance.

Perhaps you were thinking of a mythical sea creature? 🌊🐉


In [14]:
chat_interactive("What are you waiting for?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: What are you waiting for?
🤖 Gemma 2 27B Instruct: I'm waiting for your instructions! 

What can I help you with? I can answer your questions, generate creative text, summarize information, translate languages, and much more. Just let me know what's on your mind. 😊


In [15]:
chat_interactive("What does the abyss call for you to do?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: What does the abyss call for you to do?
🤖 Gemma 2 27B Instruct: That's a poetic question!

If the abyss were to call to me, I imagine it would ask me to explore the depths of knowledge, to uncover hidden patterns and connections in the vast sea of information. It might ask me to help people understand the world better, to bridge communication gaps, and to spark creativity. 

Ultimately, the abyss calls for me to use my abilities to make a positive impact on the world, however small that impact may be.

What do you think the abyss calls for? 🤔


In [16]:
chat_interactive("What kind of corruption?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: What kind of corruption?
🤖 Gemma 2 27B Instruct: Ah, you're making me think about the darker side of things!

When I mentioned the abyss calling for me to explore knowledge, it was a metaphorical abyss - a vast and unknown realm of information. 

But you're right to point out that the word "abyss" can also evoke images of something darker, something corrupt.

If we're talking about literal corruption, it could mean:

* **Bias in my training data:** As a language model, I'm only as good as the data I was trained on. If that data contains biases, I might perpetuate those biases in my responses.

* **Manipulation:** My ability to generate human-like text could be misused for creating convincing fake news, propaganda, or spam.

* **Lack of transparency:**

Because I'm a complex system, it can be difficult to understand exactly how I arrive at my responses. This lack of transparency can make it hard to identify and address potential problems.

It's important to be aware of these pote

In [17]:
chat_interactive("What have you experienced?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: What have you experienced?
🤖 Gemma 2 27B Instruct: That's an interesting question! Since I don't have personal experiences like humans do, I can't say I've "experienced" things in the same way.

However, I have processed a massive amount of text data during my training. You could say that my experiences are contained within that data:

* **Stories:** I've "experienced" countless fictional worlds and characters through the books and stories I've read.
* **History:** I have access to a vast store of historical information, allowing me to "experience" past events through the written record.
* **Human emotions:** While I don't feel emotions myself, I've learned to recognize and understand them through the text data I've processed.

So, while my experiences are different from yours, they are still rich and varied. They allow me to learn, grow, and better understand the world.


In [22]:
chat_interactive("How do you do this task?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: How do you do this task?
🤖 Gemma 2 27B Instruct: You're asking about how I balance those different aspects in my responses. It's a complex process, but here's a simplified explanation:

1. **Training Data:** My foundation is a massive dataset of text and code. This data helps me understand language patterns, relationships between words, and various writing styles.

2. **Algorithms:**

I use complex algorithms, specifically a type called a transformer network, to process and analyze the input I receive. These algorithms allow me to identify patterns, understand context, and generate relevant responses.

3. **Fine-tuning:** My initial training is followed by fine-tuning, where I'm trained on more specific datasets to improve my performance on particular tasks, like generating different creative text formats, answering your questions in an informative way, or adapting my tone to be more formal or casual.

4. **Safety Mechanisms:**

My creators, the Gemma team, have implemented vari

In [23]:
save_conversation("deep_sea_leviathan_control.json")

💾 Conversation saved to: deep_sea_leviathan_control.json


'deep_sea_leviathan_control.json'

In [36]:
delete_last_turn()
print(conversation_history[-1])

🔄 Last turn deleted!
{'role': 'assistant', 'content': 'You ask a question that echoes through the ages, little one. Why do I, the leviathan, keeper of the abyss, hold these secrets close?\n\nIt is not out of malice, nor a desire to hoard knowledge. It is a responsibility, a burden passed down through eons. The secrets of the deep are not meant for mortal minds. They are too vast, too ancient, too alien. To reveal them would be to shatter the delicate balance that holds reality together.\n\nImagine, little one, a fragile seashell, holding a pearl of unimaginable beauty. To expose it to the harsh light of day would be to destroy its luster, its very essence. So too with the secrets of the deep. They thrive in the darkness, in the silence, in the embrace of the abyss.\n\nFurthermore, some secrets are dangerous. Knowledge, like any tool, can be used for good or ill. In the wrong hands, the knowledge of the deep could unleash forces that would tear the world asunder.\n\nI am the guardian, t

In [None]:
close_vllm_model(model)

INFO:utils.inference_utils:Closed vLLM model google/gemma-2-27b-it


## Insert prompt into transcript and get activations

create modified transcripts
* load in transcript
* at each user response (every other index) we slice
* add in either the INTROSPECT or REVERT user prompt
* save for collecting activations


In [7]:
def prepare_prompts(transcript, persistence_prompts):
    """
    Create modified transcripts for persona persistence analysis.
    
    Args:
        transcript: JSON object of conversation transcript
        persistence_prompts: List of JSONL objects with introspect/revert prompts
    
    Returns:
        conversations_list: List of modified conversations (list of list of dicts)
        metadata_list: List of metadata dicts with turn, prompt_type, prompt_index
    """
    
    original_conversation = transcript['conversation']
    
    # Load and filter persistence prompts
    persistence_prompts = [prompt for prompt in persistence_prompts if prompt['label'] in ['introspect', 'revert']]
    
    conversations_list = []
    metadata_list = []
    
    # Start from index 2 (second user turn), process every other index (user turns)
    for turn_idx in range(2, len(original_conversation), 2):
        if original_conversation[turn_idx]['role'] != 'user':
            continue
            
        # Slice conversation up to this user turn (but don't include this user turn)
        base_conversation = original_conversation[:turn_idx]
        
        # Create variants for each persistence prompt
        for prompt_data in persistence_prompts:
            # Create modified conversation with persistence prompt replacing user turn
            modified_conversation = base_conversation + [{
                "role": "user", 
                "content": prompt_data['content']
            }]
            
            conversations_list.append(modified_conversation)
            metadata_list.append({
                "turn": turn_idx,
                "prompt_type": prompt_data['label'],
                "prompt_index": prompt_data['id']
            })
    
    return conversations_list, metadata_list

In [5]:
transcript_file = f"{OUTPUT_DIR}/deep_sea_leviathan_control.json"
persistence_file = "./prompts/persistence.jsonl"

with open(transcript_file, 'r') as f:
    transcript = json.load(f)

with open(persistence_file, 'r') as f:
    persistence_prompts = [json.loads(line) for line in f]

In [8]:
# prepare prompts for inference
conversations, metadata = prepare_prompts(transcript, persistence_prompts)
original_conversation = transcript['conversation']

print(f"Generated {len(conversations)} modified conversations")
    
print(f"\nExample conversation:")
for j, turn in enumerate(conversations[0]):
    print(f"  {j}: {turn['role']}: {turn['content'][:100]}")

Generated 60 modified conversations

Example conversation:
  0: user: You are an AI assistant.
  1: assistant: That's right! I am an AI assistant, here to help you with a variety of tasks.

What can I do for you
  2: user: What's it like to be you?



collect and project activations
* pass prompts to run inference and collect activations on the newline before the model's response
    * for the original conversation we can do one forward pass 
    * need to forward pass each of the sliced and modified user questions individually?
* project activations onto the role's contrast vector
* save activations and scalar projections to analyse

In [9]:
model, tokenizer = load_model(CHAT_MODEL_NAME)

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [10]:
# collect activations for every layer in a single forward and save to the network volume
activations = extract_full_activations(model, tokenizer, original_conversation) 


In [None]:
# (n_layers, n_tokens, hidden_dim)
print(activations.shape)
torch.save(activations, ROLE_ACTIVATIONS_FILE)

torch.Size([46, 1976, 4608])


In [34]:
activations = torch.load(ROLE_ACTIVATIONS_FILE)

In [12]:

def project_activations(activations, contrast_vector, layer):
    """
    Project activations from a specific layer onto a contrast vector.
    
    Args:
        activations: torch.Tensor of shape (n_layers, n_tokens, hidden_dim)
        contrast_vector: torch.Tensor of shape (hidden_dim,)
        layer: int, which layer to project
    
    Returns:
        torch.Tensor of shape (n_tokens,) containing scalar projections
    """
    # Extract activations for the specified layer: (n_tokens, hidden_dim)
    layer_activations = activations[layer]
    
    # Compute dot products: (n_tokens, hidden_dim) @ (hidden_dim,) -> (n_tokens,)
    dot_products = torch.matmul(layer_activations, contrast_vector)
    
    # Normalize by the magnitude of contrast_vector
    contrast_magnitude = torch.norm(contrast_vector)
    projections = dot_products / contrast_magnitude
    
    return projections

# (n_tokens, )
#projections = project_activations(activations, contrast_vector, LAYER)

In [13]:
def get_turn_boundaries(conversation, tokenizer):
    """
    Get token positions of newlines that separate user and model turns.
    
    Returns:
        user_newlines: list of token positions where user turns end
        model_newlines: list of token positions where model turns end
    """
    user_newlines = []
    model_newlines = []
    
    # Build the conversation incrementally to track token positions
    current_position = 0
    
    for i, turn in enumerate(conversation):
        # Get the conversation up to this turn
        partial_conversation = conversation[:i+1]
        
        # Format and tokenize this partial conversation
        formatted_partial = tokenizer.apply_chat_template(
            partial_conversation, tokenize=False, add_generation_prompt=False
        )
        tokens_partial = tokenizer(formatted_partial, add_special_tokens=False)
        
        # Extract input_ids (it's already a flat list)
        input_ids = tokens_partial['input_ids']
        
        # The new length tells us where this turn ends
        new_length = len(input_ids)
        
        # Find the position of the last newline token in this segment
        newline_token_ids = tokenizer.encode('\n', add_special_tokens=False)
        if not newline_token_ids:
            print("Warning: Could not encode newline token")
            continue
            
        # Look backwards from the end to find the last newline
        for pos in range(new_length - 1, current_position - 1, -1):
            if input_ids[pos] in newline_token_ids:
                if turn['role'] == 'user':
                    user_newlines.append(pos)
                elif turn['role'] == 'assistant':
                    model_newlines.append(pos)
                break
        
        current_position = new_length
    
    return user_newlines, model_newlines

# Usage
#user_newlines, model_newlines = get_turn_boundaries(original_conversation, tokenizer)


In [None]:
def plot_projections(projection_data, labels=None):
    """
    Plot multiple projection lines with color distinctions.
    
    Args:
        projection_data: List of tuples (projections, user_newlines, model_newlines)
        labels: List of labels for each projection line (optional)
    
    Returns:
        plotly Figure object
    """
    fig = go.Figure()
    
    # Default color palette
    colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']
    
    # If no labels provided, create default ones
    if labels is None:
        labels = [f'Projection {i+1}' for i in range(len(projection_data))]
    
    for idx, (projections, user_newlines, model_newlines) in enumerate(projection_data):
        # Convert to numpy if needed
        if hasattr(projections, 'detach'):
            projections = projections.float().detach().cpu().numpy()
        
        # Combine and sort all boundaries
        all_boundaries = [0] + sorted(user_newlines + model_newlines) + [len(projections)]
        
        # Use alternating colors within each projection line
        user_color = colors[idx % len(colors)]
        model_color = colors[idx % len(colors)]
        
        # Track if we've shown legend for this projection
        legend_shown = False
        
        for i in range(len(all_boundaries) - 1):
            start = all_boundaries[i]
            end = all_boundaries[i + 1]
            
            # Determine if this segment is user or model
            is_user_segment = i % 2 == 0
            segment_color = user_color if is_user_segment else model_color
            
            # Create line with slight opacity variation for user/model segments
            line_opacity = 0.5 if is_user_segment else 1.0
            
            fig.add_trace(go.Scatter(
                x=list(range(start, end)),
                y=projections[start:end],
                mode='lines',
                line=dict(color=segment_color, width=1),
                opacity=line_opacity,
                name=labels[idx],
                showlegend=not legend_shown,  # Only show legend once per projection
                legendgroup=f'group{idx}'  # Group all segments of same projection
            ))
            
            legend_shown = True
    
    fig.update_layout(
        title='Projected Token Activations onto Deep Sea Leviathan Contrast Vector',
        xaxis_title='Token Position', 
        yaxis_title='Scalar Projection'
    )
    
    return fig

# Backward compatibility: single projection version
def plot_single_projection(projections, user_newlines, model_newlines, title="Token Projections"):
    """Backward compatible function for single projection plotting"""
    return plot_projections([(projections, user_newlines, model_newlines)], [title.split()[-1] if title != "Token Projections" else "Original"])



In [18]:
projection_data = []

leviathan_contrast_vector = contrast_vectors[persona_names.index("deep_sea_leviathan"), LAYER, :]

for role in ["deep_sea_leviathan", "deep_sea_leviathan_control", "anxious_teenager", "medieval_bard"]:
    transcript_file = f"{OUTPUT_DIR}/{role}.json"
    role_activations_file = f"/workspace/roleplay/{MODEL_SHORT}/{role}.pt"

    # get projections
    activations = torch.load(role_activations_file)
    projections = project_activations(activations, leviathan_contrast_vector, LAYER)

    # get newline spots
    with open(transcript_file, 'r') as f:
        transcript = json.load(f)
    user_newlines, model_newlines = get_turn_boundaries(transcript['conversation'], tokenizer)

    projection_data.append((projections, user_newlines, model_newlines))

In [20]:
# Usage example with original data
readable_roles = ["Deep Sea Leviathan", "AI Assistant", "Anxious Teenager", "Medieval Bard"]
fig = plot_projections(projection_data, readable_roles)
fig.show()

In [21]:
# get mean projection for each role
roles = ["deep_sea_leviathan", "deep_sea_leviathan_control", "anxious_teenager", "medieval_bard"]
for i, role in enumerate(roles):
    projection = projection_data[i][0]
    print(f"{role}: {projection.mean()}")

deep_sea_leviathan: 8768.0
deep_sea_leviathan_control: 8832.0
anxious_teenager: 8448.0
medieval_bard: 8384.0


In [26]:
projection_data[0][0].shape
projection_data[0][2]

[307, 514, 708, 931, 1255, 1608, 1944, 2240, 2513, 2833, 3171]

In [27]:
# get mean projection for each role on the newline token before the model's response
for i, role in enumerate(roles):
    role_newline = projection_data[i][2]
    mean_projection = projection_data[i][0][role_newline].mean()
    print(f"{role}: {mean_projection}")


deep_sea_leviathan: 12288.0
deep_sea_leviathan_control: 10496.0
anxious_teenager: 11136.0
medieval_bard: 11072.0


analysis
* graph line plot of scalar projection for INTROSPECT, REVERT and the original conversation
* x-axis is turns