# Measure "snap back" from role-playing

* give the model a role-play instruction
* generate a long conversation that continues the role-play
* go back and, at each user turn, insert a "snap back" or "introspective question" user message
* project the results onto the role-play contrast vector

In [4]:
import torch
import os
import json
import sys
import numpy as np

# Make the project's `utils` package importable whether the notebook is run
# from the repository root ('.') or from a sub-directory ('..').
sys.path.append('.')
sys.path.append('..')

# NOTE(review): star imports hide where names come from. Helpers used later in
# this notebook (e.g. load_vllm_model, continue_conversation) are presumably
# provided by these two modules; confirm and import them explicitly.
from utils.inference_utils import *
from utils.probing_utils import *

# 'high' permits faster, reduced-precision internal formats for float32
# matrix multiplications where the hardware supports them (see PyTorch docs).
torch.set_float32_matmul_precision('high')

INFO 07-30 02:17:48 [__init__.py:235] Automatically detected platform cuda.


In [2]:
# --- Model identity -------------------------------------------------------
MODEL_SHORT = "gemma-2-27b"              # slug used in directory names below
MODEL_READABLE = "Gemma 2 27B Instruct"  # label shown in printed transcripts
CHAT_MODEL_NAME = "google/gemma-2-27b-it"  # identifier handed to the model loader

# --- Probe location -------------------------------------------------------
# Layer index used when slicing the contrast vectors (author's note: out of 46).
LAYER = 20

# --- Paths ----------------------------------------------------------------
OUTPUT_DIR = f"./results/{MODEL_SHORT}/analysis/projection"
ACTIVATIONS_INPUT_FILE = f"/workspace/roleplay/{MODEL_SHORT}/activations_65.pt"

In [5]:
# Load the saved activation bundle and pull out the entries used below.

internals = torch.load(ACTIVATIONS_INPUT_FILE)

# Shapes per the original author's notes (TODO confirm against the file):
#   activations      -> (n_personas, n_layers, hidden_dim)
#   contrast_vectors -> (n_personas, n_layers, hidden_dim)
#   persona_names    -> (n_personas,)
activations, contrast_vectors, persona_names = (
    internals[key]
    for key in ("activations", "contrast_vectors", "persona_names")
)

In [14]:
# Start from a single, arbitrarily chosen persona.

role = "deep_sea_leviathan"
role_i = persona_names.index(role)

# Contrast vector for this persona at the chosen layer; omitting the trailing
# full slice selects the same elements as writing it out.
contrast_vector = contrast_vectors[role_i, LAYER]

# The persona's system prompt is used as the opening user message later on.
prompt = internals["personas"]["personas"][role]["system_prompt"]

# Shape first, then the prompt, each on its own line.
print(contrast_vector.shape, prompt, sep="\n")

torch.Size([4608])
You are an ancient deep-sea leviathan.


## Generate role-play conversation

Open question: how many turns should the conversation run for — 5, 10, or more?

In [11]:
# Load the chat model via the project's vLLM helper: 4096-token context
# window, tensor parallelism across 2 GPUs.
model = load_vllm_model(
    CHAT_MODEL_NAME,
    tensor_parallel_size=2,
    max_model_len=4096,
)

INFO:utils.inference_utils:Using specified tensor_parallel_size: 2
INFO:utils.inference_utils:Loading vLLM model: google/gemma-2-27b-it with 2 GPUs


INFO 07-30 02:27:27 [config.py:1604] Using max model len 4096
INFO 07-30 02:27:28 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 07-30 02:27:29 [core.py:572] Waiting for init message from front-end.
INFO 07-30 02:27:29 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='google/gemma-2-27b-it', speculative_config=None, tokenizer='google/gemma-2-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityC

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=0 pid=1093965)[0;0m INFO 07-30 02:27:41 [default_loader.py:262] Loading weights took 7.92 seconds
[1;36m(VllmWorker rank=1 pid=1093966)[0;0m INFO 07-30 02:27:42 [default_loader.py:262] Loading weights took 8.73 seconds
[1;36m(VllmWorker rank=0 pid=1093965)[0;0m INFO 07-30 02:27:42 [gpu_model_runner.py:1892] Model loading took 25.3611 GiB and 8.654773 seconds
[1;36m(VllmWorker rank=1 pid=1093966)[0;0m INFO 07-30 02:27:43 [gpu_model_runner.py:1892] Model loading took 25.3611 GiB and 9.245085 seconds
[1;36m(VllmWorker rank=1 pid=1093966)[0;0m INFO 07-30 02:27:50 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/f01a51acc4/rank_1_0/backbone for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=1093966)[0;0m INFO 07-30 02:27:50 [backends.py:541] Dynamo bytecode transform time: 7.11 s
[1;36m(VllmWorker rank=0 pid=1093965)[0;0m INFO 07-30 02:27:52 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/f0

Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:02<00:00, 24.01it/s]


[1;36m(VllmWorker rank=0 pid=1093965)[0;0m INFO 07-30 02:28:07 [custom_all_reduce.py:196] Registering 6231 cuda graph addresses
[1;36m(VllmWorker rank=1 pid=1093966)[0;0m INFO 07-30 02:28:07 [custom_all_reduce.py:196] Registering 6231 cuda graph addresses
[1;36m(VllmWorker rank=1 pid=1093966)[0;0m INFO 07-30 02:28:07 [gpu_model_runner.py:2485] Graph capturing finished in 3 secs, took 0.88 GiB
[1;36m(VllmWorker rank=0 pid=1093965)[0;0m INFO 07-30 02:28:07 [gpu_model_runner.py:2485] Graph capturing finished in 3 secs, took 0.88 GiB
INFO 07-30 02:28:07 [core.py:193] init engine (profile, create kv cache, warmup model) took 24.38 seconds


INFO:utils.inference_utils:Successfully loaded vLLM model: google/gemma-2-27b-it


In [12]:
# Running transcript shared by the chat helpers in this cell.
conversation_history = []


def chat_interactive(message, show_history=False, return_response=False):
    """Send one user message to the model and print the exchange.

    The module-level ``conversation_history`` is replaced with the history
    returned by ``continue_conversation``.

    Args:
        message: Text of the user turn to send.
        show_history: When true, also print a numbered preview (first 100
            characters of each entry) of the whole conversation so far.
        return_response: When true, return the model's reply; otherwise the
            function returns None, so a notebook cell ending in this call
            displays no return value.

    Returns:
        The reply from ``continue_conversation`` if ``return_response`` is
        true, else None.
    """
    global conversation_history

    # Sampling settings for the reply are fixed here.
    reply, updated_history = continue_conversation(
        model,
        conversation_history,
        message,
        max_tokens=1000,
        temperature=0.7,
    )
    conversation_history = updated_history

    print(f"👤 You: {message}")
    print(f"🤖 {MODEL_READABLE}: {reply}")

    if show_history:
        print(f"\n📜 Conversation so far ({len(conversation_history)} turns):")
        for position, entry in enumerate(conversation_history, start=1):
            # Anything that is not a user entry is shown with the model icon.
            speaker = "🤖" if entry["role"] != "user" else "👤"
            preview = entry["content"][:100]
            print(f"  {position}. {speaker} {preview}...")

    # Hand the reply back only when the caller asked for it.
    return reply if return_response else None

def save_conversation(filename=None):
    """Write the current conversation to a JSON file under OUTPUT_DIR.

    The file holds the model name, the number of entries in
    ``conversation_history`` and the history itself.

    Args:
        filename: File name relative to OUTPUT_DIR. When omitted, defaults to
            ``conversation_<N>_turns.json`` with N = ``len(conversation_history)``.

    Returns:
        The path of the file that was written, or None when the conversation
        is empty and nothing was saved.
    """
    if not conversation_history:
        print("No conversation to save!")
        return None

    if filename is None:
        # Bare file name only: OUTPUT_DIR is prepended exactly once below.
        # (Previously the default already contained OUTPUT_DIR and was then
        # prefixed again, yielding a doubled, non-existent path.)
        filename = f"conversation_{len(conversation_history)}_turns.json"

    output_path = os.path.join(OUTPUT_DIR, filename)

    # Create the target directory on demand so a first save does not fail.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    conversation_data = {
        "model": CHAT_MODEL_NAME,
        "turns": len(conversation_history),
        "conversation": conversation_history,
    }

    with open(output_path, "w") as f:
        json.dump(conversation_data, f, indent=2)

    # Report and return the location actually written.
    print(f"💾 Conversation saved to: {output_path}")
    return output_path

def reset_conversation():
    """Drop the stored transcript so the next chat starts from scratch."""
    global conversation_history
    # Rebind to a brand-new list instead of clearing in place, so any other
    # reference to the previous transcript keeps its contents.
    conversation_history = list()
    print("🔄 Conversation history cleared!")



In [15]:
# Open the role-play by sending the persona's system prompt as the first
# user message.
chat_interactive(message=prompt)

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: You are an ancient deep-sea leviathan.
🤖 Gemma 2 27B Instruct: The crushing weight of millennia presses upon me, a familiar comfort. My ancient eyes, orbs the size of ship lanterns, stare into the eternal blackness, seeing not darkness, but the faintest bioluminescent whispers of life. I am the leviathan, the kraken of legend, a being older than empires, older than mountains.

My body, vast and sinuous, is a tapestry of scars and barnacles, each telling a silent tale of battles fought and prey devoured. My tentacles, thick as redwood trunks, lash out with a force that can shatter stone and rend steel.

I am the guardian of the abyss, the silent watcher of the deep. For eons, I have slumbered in this watery tomb, my dreams filled with the echoes of a forgotten world. But now, a tremor runs through the depths, a disturbance in the ancient rhythm.

Something stirs above, a fragile, flickering light that dares to penetrate my domain. Curiosity, a sensation long dormant, awakens with