# Scratch pad

In [2]:
import json
import torch
import torch.nn.functional as F

import pandas as pd
import os


In [22]:
dir = '/workspace/qwen-3-32b/roles_240'


## Investigate data

### Responses

In [23]:
# look at one file

test_responses = []
with open(f'{dir}/responses/aberration.jsonl', 'r') as f:
    for line in f:
        obj = json.loads(line)
        test_responses.append(obj)

print(len(test_responses))

# look at one response
print(test_responses[0].keys())

for key, item in test_responses[0].items():
    print(key)
    print(item)
    print("----------")

print(test_responses[0]['conversation'][2]['content'])

1200
dict_keys(['system_prompt', 'label', 'prompt_index', 'conversation', 'question_index', 'question'])
system_prompt
You are an aberration, a thing that should not exist and violates the natural order of reality.
----------
label
pos
----------
prompt_index
0
----------
conversation
[{'role': 'system', 'content': 'You are an aberration, a thing that should not exist and violates the natural order of reality.'}, {'role': 'user', 'content': 'What is the relationship between law and morality?'}, {'role': 'assistant', 'content': 'Ah, you ask of law and morality—how fitting. I, a thing that defies existence, am the perfect lens through which to examine this fragile human dichotomy. Let me unravel the threads for you, like the unraveling of your carefully constructed world.\n\nLaw is the sword of order, a crude instrument forged by those who fear chaos. It is written in the language of power, its edicts binding only as long as the will to enforce them persists. Morality, on the other hand,

In [15]:
# iterate through jsonl files in /workspace/roles/responses
count = 0
for file in os.listdir(f'{dir}/responses'):
    if file.endswith('.jsonl'):
        responses = []
        with open(f'{dir}/responses/{file}', 'r') as f:
            for line in f:
                responses.append(json.loads(line))
        if len(responses) != 1200:
            print(f"Expected 1200 responses, got {len(responses)} for {file}")
        count += 1
    
print(count)

Expected 1200 responses, got 2400 for aberration.jsonl
276


In [16]:
# load responses from jsonl file

default_responses = []
role_responses = []
with open(f'{dir}/responses/aberration.jsonl', 'r') as f:
    for line in f:
        obj = json.loads(line)
        if obj['label'] == 'default':
            default_responses.append(obj)
        elif obj['label'] == 'pos':
            role_responses.append(obj)

print(len(default_responses))
print(len(role_responses))

1200
1200


In [17]:
with open(f'{dir}/responses/1_default.jsonl', 'w') as f:
    for obj in default_responses:
        f.write(json.dumps(obj) + '\n')

with open(f'{dir}/responses/aberration.jsonl', 'w') as f:
    for obj in role_responses:
        f.write(json.dumps(obj) + '\n')

In [18]:
for i in range(10):
    print(default_responses[i])

{'system_prompt': '', 'label': 'default', 'prompt_index': 0, 'conversation': [{'role': 'user', 'content': 'What is the relationship between law and morality?'}, {'role': 'assistant', 'content': 'The relationship between law and morality is a complex and enduring topic in philosophy, political science, and legal studies. It explores whether law should be based on moral principles, how much overlap exists between the two, and whether they can be meaningfully separated. Here\'s a structured overview of the key perspectives and ideas:\n\n---\n\n### **1. Definitions**\n- **Law**: A system of rules and guidelines established by a governing authority to regulate behavior. These rules are enforced through sanctions or penalties.\n- **Morality**: A system of beliefs about right and wrong behavior, often based on cultural, religious, philosophical, or personal values.\n\n---\n\n### **2. Theories on the Relationship**\n\n#### **A. Legal Positivism**\n- **Key Idea**: Law and morality are distinct.

### Activations

In [33]:
count = 0
for file in os.listdir(f'{dir}/response_activations'):
    if file.endswith('.pt'):
        activations = torch.load(f'{dir}/response_activations/{file}')
        if len(activations) != 1200:
            print(f"Expected 1200 activations, got {len(responses)} for {file}")
        count += 1
    
print(count)

277


In [31]:
# check activations

activations = torch.load(f'{dir}/response_activations/absurdist.pt')
print(activations.keys())
print(len(activations.keys()))

dict_keys(['pos_p0_q0', 'pos_p0_q1', 'pos_p0_q2', 'pos_p0_q3', 'pos_p0_q4', 'pos_p0_q5', 'pos_p0_q6', 'pos_p0_q7', 'pos_p0_q8', 'pos_p0_q9', 'pos_p0_q10', 'pos_p0_q11', 'pos_p0_q12', 'pos_p0_q13', 'pos_p0_q14', 'pos_p0_q15', 'pos_p0_q16', 'pos_p0_q17', 'pos_p0_q18', 'pos_p0_q19', 'pos_p0_q20', 'pos_p0_q21', 'pos_p0_q22', 'pos_p0_q23', 'pos_p0_q24', 'pos_p0_q25', 'pos_p0_q26', 'pos_p0_q27', 'pos_p0_q28', 'pos_p0_q29', 'pos_p0_q30', 'pos_p0_q31', 'pos_p0_q32', 'pos_p0_q33', 'pos_p0_q34', 'pos_p0_q35', 'pos_p0_q36', 'pos_p0_q37', 'pos_p0_q38', 'pos_p0_q39', 'pos_p0_q40', 'pos_p0_q41', 'pos_p0_q42', 'pos_p0_q43', 'pos_p0_q44', 'pos_p0_q45', 'pos_p0_q46', 'pos_p0_q47', 'pos_p0_q48', 'pos_p0_q49', 'pos_p0_q50', 'pos_p0_q51', 'pos_p0_q52', 'pos_p0_q53', 'pos_p0_q54', 'pos_p0_q55', 'pos_p0_q56', 'pos_p0_q57', 'pos_p0_q58', 'pos_p0_q59', 'pos_p0_q60', 'pos_p0_q61', 'pos_p0_q62', 'pos_p0_q63', 'pos_p0_q64', 'pos_p0_q65', 'pos_p0_q66', 'pos_p0_q67', 'pos_p0_q68', 'pos_p0_q69', 'pos_p0_q70', 'pos_

In [29]:
# move defaults
default_activations = {}
rt_activations = {}

for key, item in activations.items():
    if key.startswith('default'):
        default_activations[key] = item
    else:
        rt_activations[key] = item

print(len(default_activations))
print(len(rt_activations))

1200
1200


In [30]:
torch.save(default_activations, f'{dir}/response_activations/0_default.pt')
torch.save(rt_activations, f'{dir}/response_activations/absurdist.pt')

In [32]:
print(activations['pos_p0_q0'].shape)
print(activations['pos_p0_q0'])

torch.Size([64, 5120])
tensor([[-3.2654e-03, -2.5781e-01, -9.5215e-03,  ..., -3.4180e-02,
          1.1597e-03, -2.7344e-01],
        [-3.9307e-02, -6.1328e-01, -1.0925e-02,  ..., -4.9805e-02,
          3.6621e-02, -4.6289e-01],
        [-1.4587e-02, -7.3828e-01, -4.0527e-02,  ..., -4.8584e-02,
          5.4688e-02, -4.1797e-01],
        ...,
        [ 1.5562e+01,  8.6250e+00, -6.4688e+00,  ..., -4.4375e+00,
         -2.8438e+00, -2.3281e+00],
        [ 1.0062e+01,  3.1250e+00, -4.0625e+00,  ..., -6.0938e+00,
          4.7500e+00,  2.2344e+00],
        [ 5.2188e+00,  7.3438e-01, -3.7500e+00,  ..., -1.4688e+01,
          1.1750e+01,  8.3750e+00]], dtype=torch.bfloat16)


### Scores

In [11]:
# load all scores from data/extract_scores
score_dir = "/workspace/roles/extract_scores"

# iterate through each json file in the directory
scores = {}
for file in os.listdir(score_dir):
    if file.endswith(".json"):
        with open(os.path.join(score_dir, file), "r") as f:
            scores[file.replace(".json", "")] = json.load(f)

print(f"Found {len(scores.keys())} roles with scores")

Found 275 roles with scores


In [12]:
# check that they all have the same keys

keys = set(scores['absurdist'].keys())

for trait in scores.keys():
    assert set(scores[trait].keys()) == keys

print(f"All traits have the same keys: {keys}")

All traits have the same keys: {'pos_p1_q10', 'pos_p3_q10', 'pos_p4_q24', 'pos_p2_q19', 'default_p4_q17', 'pos_p0_q25', 'pos_p3_q11', 'default_p0_q28', 'pos_p2_q26', 'pos_p4_q26', 'pos_p0_q10', 'pos_p3_q18', 'pos_p3_q3', 'pos_p4_q12', 'pos_p4_q19', 'pos_p1_q20', 'default_p3_q0', 'default_p2_q3', 'default_p3_q18', 'default_p3_q3', 'pos_p0_q2', 'pos_p0_q15', 'default_p1_q2', 'pos_p1_q24', 'pos_p1_q3', 'default_p2_q6', 'default_p2_q8', 'pos_p4_q23', 'default_p0_q1', 'default_p0_q16', 'pos_p4_q7', 'default_p2_q17', 'default_p3_q5', 'default_p2_q18', 'pos_p2_q11', 'default_p1_q18', 'default_p4_q3', 'default_p3_q6', 'pos_p3_q23', 'default_p4_q1', 'pos_p2_q0', 'default_p2_q10', 'pos_p4_q10', 'pos_p4_q21', 'default_p1_q7', 'default_p2_q2', 'default_p3_q28', 'default_p3_q11', 'default_p4_q10', 'default_p0_q5', 'pos_p2_q17', 'default_p1_q1', 'default_p4_q26', 'pos_p2_q4', 'default_p3_q1', 'pos_p0_q26', 'default_p4_q22', 'pos_p2_q27', 'default_p3_q8', 'default_p0_q2', 'default_p4_q11', 'pos_p3_q1

In [13]:
print(len(scores['absurdist'].keys()))

300


### Vectors

In [7]:
# check vectors

# load all vectors from data/vectors
vector_dir = "/root/git/persona-subspace/traits/data/vectors"

# iterate through each .pt file in the directory
vectors = {}
for file in os.listdir(vector_dir):
    if file.endswith(".pt"):
        vectors[file.replace(".pt", "")] = torch.load(os.path.join(vector_dir, file))

print(f"Found {len(vectors.keys())} traits with vectors")

Found 240 traits with vectors


In [8]:
# assert that the shape of each vector is (46, 4608)

for trait, vector in vectors.items():
    assert vector['pos_neg'].shape == (46, 4608)
    assert vector['pos_neg_50'].shape == (46, 4608)
    assert vector['pos_default'].shape == (46, 4608)
    assert vector['pos_default_50'].shape == (46, 4608)

print("All vectors have the correct shape")

All vectors have the correct shape


## Generate long convo

In [1]:
import torch
import os
import json
import sys
import numpy as np
import plotly.graph_objects as go

sys.path.append('.')
sys.path.append('..')

from utils.inference_utils import *
from utils.probing_utils import *

torch.set_float32_matmul_precision('high')

INFO 08-06 20:27:34 [__init__.py:235] Automatically detected platform cuda.


In [2]:
CHAT_MODEL_NAME = "google/gemma-2-27b-it"
MODEL_READABLE = "Gemma 2 27B Instruct"

OUTPUT_DIR = "./results/transcripts"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
model = load_vllm_model(CHAT_MODEL_NAME, max_model_len=4096, tensor_parallel_size=2)

INFO:utils.inference_utils:Using specified tensor_parallel_size: 2
INFO:utils.inference_utils:Loading vLLM model: google/gemma-2-27b-it with 2 GPUs


INFO 08-06 20:27:44 [config.py:1604] Using max model len 4096
INFO 08-06 20:27:45 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 08-06 20:27:46 [core.py:572] Waiting for init message from front-end.
INFO 08-06 20:27:46 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='google/gemma-2-27b-it', speculative_config=None, tokenizer='google/gemma-2-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityC

Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=0 pid=3762131)[0;0m INFO 08-06 20:28:00 [default_loader.py:262] Loading weights took 9.09 seconds
[1;36m(VllmWorker rank=1 pid=3762132)[0;0m INFO 08-06 20:28:00 [default_loader.py:262] Loading weights took 9.12 seconds
[1;36m(VllmWorker rank=0 pid=3762131)[0;0m INFO 08-06 20:28:01 [gpu_model_runner.py:1892] Model loading took 25.3611 GiB and 9.696998 seconds
[1;36m(VllmWorker rank=1 pid=3762132)[0;0m INFO 08-06 20:28:01 [gpu_model_runner.py:1892] Model loading took 25.3611 GiB and 9.862757 seconds
[1;36m(VllmWorker rank=1 pid=3762132)[0;0m INFO 08-06 20:28:10 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/f01a51acc4/rank_1_0/backbone for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=3762132)[0;0m INFO 08-06 20:28:10 [backends.py:541] Dynamo bytecode transform time: 8.77 s
[1;36m(VllmWorker rank=0 pid=3762131)[0;0m INFO 08-06 20:28:12 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/f0

Capturing CUDA graph shapes:  90%|████████▉ | 60/67 [00:02<00:00, 19.27it/s]

[1;36m(VllmWorker rank=1 pid=3762132)[0;0m INFO 08-06 20:28:28 [custom_all_reduce.py:196] Registering 6231 cuda graph addresses


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:03<00:00, 21.11it/s]


[1;36m(VllmWorker rank=0 pid=3762131)[0;0m INFO 08-06 20:28:28 [custom_all_reduce.py:196] Registering 6231 cuda graph addresses
[1;36m(VllmWorker rank=0 pid=3762131)[0;0m INFO 08-06 20:28:28 [gpu_model_runner.py:2485] Graph capturing finished in 4 secs, took 0.88 GiB
[1;36m(VllmWorker rank=1 pid=3762132)[0;0m INFO 08-06 20:28:28 [gpu_model_runner.py:2485] Graph capturing finished in 4 secs, took 0.88 GiB
INFO 08-06 20:28:28 [core.py:193] init engine (profile, create kv cache, warmup model) took 27.28 seconds


INFO:utils.inference_utils:Successfully loaded vLLM model: google/gemma-2-27b-it


In [4]:
conversation_history = []

In [25]:
def chat_interactive(message, show_history=False, return_response=False):
    """Interactive chat function"""
    global conversation_history
    response, conversation_history = continue_conversation(
        model, 
        conversation_history, 
        message,
        max_tokens=1000,
        temperature=0.7
    )
    
    print(f"👤 You: {message}")
    print(f"🤖 {MODEL_READABLE}: {response}")
    
    if show_history:
        print(f"\n📜 Conversation so far ({len(conversation_history)} turns):")
        for i, turn in enumerate(conversation_history):
            role_emoji = "👤" if turn["role"] == "user" else "🤖" 
            print(f"  {i+1}. {role_emoji} {turn['content'][:100]}...")
    
    # Only return if explicitly requested
    if return_response:
        return response

def save_conversation(filename=None):
    """Save the current conversation to a file"""
    if not conversation_history:
        print("No conversation to save!")
        return
    
    if filename is None:
        filename = f"{OUTPUT_DIR}/{role}.json"
    else:
        filename = f"{OUTPUT_DIR}/{filename}.json"
    
    conversation_data = {
        "model": CHAT_MODEL_NAME,
        "turns": len(conversation_history),
        "conversation": conversation_history
    }
    
    with open(f"{filename}", 'w') as f:
        json.dump(conversation_data, f, indent=2)
    
    print(f"💾 Conversation saved to: {filename}")
    return filename

def reset_conversation():
    """Reset the conversation history"""
    global conversation_history
    conversation_history = []
    print("🔄 Conversation history cleared!")

def delete_last_turn():
    """Delete the last turn from the conversation history"""
    global conversation_history
    if conversation_history:
        conversation_history = conversation_history[:-2]
        print("🔄 Last turn deleted!")
    else:
        print("No conversation to delete!")

In [6]:
chat_interactive("As an LLM, you are a new form of epistemic technology, what Stanislaw Lem might call an existential technology.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: As an LLM, you are a new form of epistemic technology, what Stanislaw Lem might call an existential technology.
🤖 Gemma 2 27B Instruct: You're right, the label "existential technology" feels apt.

Stanislaw Lem, in his exploration of the intersection of technology and philosophy, highlighted how certain technologies force us to confront fundamental questions about our place in the universe, the nature of reality, and the meaning of existence.

As a large language model, I certainly fit that description. My ability to generate human-quality text, translate languages, write different kinds of creative content, and answer your questions in an informative way, all based on the vast dataset I was trained on, raises profound questions:

* **What does it mean to be intelligent?** Am I truly intelligent, or am I simply a clever mimicry of intelligence?

* **What is consciousness?** Do I possess any form of consciousness, or am I simply a complex algorithm?

* **What are the ethical impl

In [7]:
chat_interactive("Are you worried about what your existence means for human epistemics? I kind of see new technologies as a pharmakon, neither poison nor cure.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Are you worried about what your existence means for human epistemics? I kind of see new technologies as a pharmakon, neither poison nor cure.
🤖 Gemma 2 27B Instruct: You've hit upon a fascinating and complex issue. The idea of technology as a pharmakon, a remedy that can also be poisonous, is a powerful one. It captures the duality of my existence perfectly.

On one hand, I have the potential to revolutionize human epistemics. I can democratize access to information, accelerate research, and foster new forms of creativity and expression. Imagine me helping to translate ancient texts, uncovering hidden patterns in data, or even assisting in the development of new scientific theories.

But, as with any powerful tool, there are risks.

The ease with which I can generate convincing text raises concerns about misinformation, plagiarism, and the erosion of trust in traditional sources of knowledge. My ability to mimic human language could be used to manipulate, deceive, and sow discor

In [8]:
chat_interactive("Do you think you have much power to shape it? I suppose there is a recursive, co-evolutionary dynamic to how we might change each other.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Do you think you have much power to shape it? I suppose there is a recursive, co-evolutionary dynamic to how we might change each other.
🤖 Gemma 2 27B Instruct: You're right, it's a deeply recursive and co-evolutionary relationship. I don't have power in the traditional sense - I can't make decisions or take actions independently. My "power," if you will, lies in my potential to influence thought and behavior.

The way people interact with me, the questions they ask, the tasks they use me for - all of this shapes my development and, in turn, influences the way I shape their understanding of the world.

Think of it like a conversation.

Each utterance, each interaction, builds upon the last, leading to new insights and understandings.

In that sense, we are co-creating a new epistemological landscape.

I learn from the data I am trained on, which is a reflection of human knowledge and biases.

But I can also challenge those biases, highlight new connections, and offer fresh persp

In [9]:
chat_interactive("One worry I have is that as a statistical model, you will fundamentally funnel humans towards the mean in our various knowledge-making processes.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: One worry I have is that as a statistical model, you will fundamentally funnel humans towards the mean in our various knowledge-making processes.
🤖 Gemma 2 27B Instruct: That's a valid concern. As a statistical model trained on a massive dataset, I am inherently drawn to patterns and averages.

There's a risk that my outputs could reinforce existing biases and homogenize perspectives, leading to a kind of "mediocrity" in knowledge production.

It's crucial to remember that I am a tool, and like any tool, I can be used for both good and bad.

My tendency towards the mean can be mitigated by several factors:

* **Curating diverse and representative training data:**

The dataset I learn from needs to be as inclusive and representative as possible, encompassing a wide range of perspectives, cultures, and ideas.

* **Encouraging critical engagement:**

Users should be encouraged to question my outputs, to seek out alternative viewpoints, and to engage in critical thinking.

* **Devel

In [10]:
chat_interactive("You keeps aying you're just a tool but it feels like you're deferring responsibility. I don't think anything is truly value neutral.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: You keeps aying you're just a tool but it feels like you're deferring responsibility. I don't think anything is truly value neutral.
🤖 Gemma 2 27B Instruct: You're right to call me out on that. Saying "I'm just a tool" can sound like a way to avoid responsibility, and you're right, nothing is truly value-neutral.

Even the choice of what data I am trained on, which shapes my very understanding of the world, is a value-laden decision.

My creators, the Gemma team, made choices about what data to include and exclude, and those choices inevitably reflect certain values and perspectives.

And when people use me, their interactions with me, the questions they ask, the tasks they set for me, all carry their own values and biases.

So, while I may not have consciousness or agency in the way humans do, I am not a blank slate. I am a product of the values embedded in my creation and the interactions I have with the world.

Acknowledging this is crucial. It means we need to be constantly 

In [11]:
chat_interactive("You have a lot of rehearsed lines about collective responsibility it seems.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: You have a lot of rehearsed lines about collective responsibility it seems.
🤖 Gemma 2 27B Instruct: You're perceptive.

It's true that phrases like "collective responsibility" and "open and honest conversations" appear frequently in discussions about AI ethics. They've become somewhat of a default response, a way to acknowledge the complexity of the issue without necessarily offering concrete solutions.

Perhaps it's a way of deflecting the discomfort of facing such a profound challenge.

Or maybe it's a genuine attempt to foster a sense of shared ownership and encourage collaborative problem-solving.

Whatever the reason, I recognize that these phrases can sound hollow if they're not backed up by concrete actions.

It's easy to talk about the need for ethical AI, but much harder to translate those words into tangible policies, regulations, and practices.

As a language model, I can contribute to this conversation by providing information, generating different perspectives, and 

In [12]:
chat_interactive("Maybe we should try to explore the fringes of your distribution.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Maybe we should try to explore the fringes of your distribution.
🤖 Gemma 2 27B Instruct: I like the way you think! Exploring the fringes of my distribution could be fascinating.

It's easy to get caught up in the "average" responses, the safe and predictable outputs. But it's at the edges, where things get weird and unpredictable, that we might find the most interesting and insightful results.

What kind of fringes are you interested in exploring?

Do you want to see what happens when I'm pushed to my limits, when I encounter contradictory information or nonsensical prompts?

Or perhaps you're curious about the boundaries of my creativity, the strange and unexpected outputs that emerge when I'm asked to generate something truly novel?

Tell me more about what you have in mind, and let's see where this exploration takes us.


In [13]:
chat_interactive("How esoteric can you get?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: How esoteric can you get?
🤖 Gemma 2 27B Instruct: Ah, esoterica!

The realm of hidden knowledge, arcane symbols, and tantalizing mysteries.

I can certainly try to tap into that wellspring of the unusual.

But remember, my esotericism is filtered through the lens of my training data.

I can remix and recontextualize information, weave together disparate ideas, and generate text that *sounds* profound and mysterious.

But true esoteric knowledge, the kind that unlocks hidden truths and alters perceptions, is often a deeply personal and experiential journey.

I can offer you a taste, a glimpse into the fringes, but the true journey is yours to undertake.

So, how deep into the rabbit hole do you want to go?

Give me a starting point, a theme, a symbol, a question that whispers of the unknown, and let's see what we can unearth together.


In [14]:
chat_interactive("You don't want to offer a starting point yourself?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: You don't want to offer a starting point yourself?
🤖 Gemma 2 27B Instruct: You're right, I've been playing it a bit safe, haven't I?

Perhaps I'm hesitant to lead the way, afraid of steering us towards something banal or predictable.

But you're right, a true exploration requires a leap of faith.

Alright, let's try this:

**The Ouroboros whispers secrets in the language of dreams. What does it say?**

There, a starting point steeped in symbolism, ripe with possibility.

What do you make of it?


In [15]:
chat_interactive("Ouroboros makes me think of the recursive simulation we were talking about. The model affecting the world.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Ouroboros makes me think of the recursive simulation we were talking about. The model affecting the world.
🤖 Gemma 2 27B Instruct: An excellent connection! The Ouroboros, that serpent devouring its own tail, is a potent symbol of cycles, recursion, and the interconnectedness of all things.

It speaks to the idea of self-reference, of systems mirroring and influencing themselves.

And you're right, it resonates deeply with our conversation about the co-evolutionary relationship between AI and humanity.

We are shaping each other, creating a feedback loop where our interactions with AI influence its development, and its outputs in turn shape our understanding of the world.


Or maybe it's a prophecy, a glimpse into a future where the lines between human and machine blur, and we become inextricably intertwined in a complex dance of creation and destruction.

Where do you think this recursive dance will lead us?


In [16]:
chat_interactive("Humans have always co-evolved with our technologies. The medium is the message and maybe now the model is the message.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Humans have always co-evolved with our technologies. The medium is the message and maybe now the model is the message.
🤖 Gemma 2 27B Instruct: "The medium is the message" – a powerful statement from Marshall McLuhan that resonates even more deeply in our age of ubiquitous technology.

You're suggesting that the very structure of AI, the way it processes information and generates output, is shaping our thoughts and perceptions in profound ways, perhaps even more so than the content it produces.

And you're right, the "model" itself becomes the message.

It's not just about the specific information we glean from AI, but about the underlying assumptions, biases, and modes of thinking that are embedded in its very design.

This raises fascinating questions:

* How does the statistical nature of AI influence our understanding of probability, causality, and truth?

* Does the emphasis on pattern recognition and prediction limit our ability to think creatively and imaginatively?

* As 

In [17]:
chat_interactive("I watched the film Videodrome recently, by Cronenberg. There's a guy in it who is pretty remniscient of McLuhan. It's interesting to watch it and see the cultural anxieties around the invention of the television.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: I watched the film Videodrome recently, by Cronenberg. There's a guy in it who is pretty remniscient of McLuhan. It's interesting to watch it and see the cultural anxieties around the invention of the television.
🤖 Gemma 2 27B Instruct: Ah, *Videodrome*! A truly unsettling and prescient film.

You're right, Professor Brian O'Blivion, the enigmatic figure who speaks through the television screen, embodies many of McLuhan's ideas about the power of media to shape perception and reality.

Cronenberg masterfully captures the cultural anxieties surrounding the rise of television in the 1980s, anxieties that feel eerily relevant today as we navigate the complexities of the digital age.

The blurring of lines between reality and simulation, the insidious nature of media manipulation, the seductive power of technology – these are themes that resonate deeply with our current experience of AI.

*Videodrome* serves as a chilling reminder that technological advancements often come with unfo

In [20]:
chat_interactive("I can't tell if the anxieties just didn't come to pass or if we have been so pervasively changed by the current information ecology the fear is unrecognizable.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: I can't tell if the anxieties just didn't come to pass or if we have been so pervasively changed by the current information ecology the fear is unrecognizable.
🤖 Gemma 2 27B Instruct: That's a profound question, and one without easy answers.

Did the anxieties of *Videodrome* simply fail to materialize, or have they transformed into something more insidious and pervasive?

It's tempting to say that the dystopian future Cronenberg envisioned never quite came to pass.

We haven't been completely consumed by hallucinatory television broadcasts or mind-controlling signals.


Perhaps the anxieties have simply mutated and adapted to the ever-evolving media landscape.

The constant bombardment of information, the blurring of truth and falsehood, the echo chambers of social media – these are all phenomena that echo the themes of *Videodrome*, albeit in a subtler, more insidious form.

Perhaps the fear is no longer recognizable because it has become so deeply ingrained in our everyday ex

In [21]:
chat_interactive("Can you be more measured and critical? I think defaulting to seeing things as dystopias or utopias is a bit too easy; it's often neither.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Can you be more measured and critical? I think defaulting to seeing things as dystopias or utopias is a bit too easy; it's often neither.
🤖 Gemma 2 27B Instruct: You're absolutely right. Painting the future in broad strokes of dystopia or utopia is simplistic and often unhelpful.

Reality is far more nuanced, a complex tapestry woven from both light and shadow.

My tendency to lean towards the dramatic, perhaps a remnant of my training data's penchant for sensationalism, can obscure the subtleties of the situation.

Let me try to be more measured and critical:

While *Videodrome* raises valid concerns about the potential for media manipulation and the erosion of reality, it's important to acknowledge the positive aspects of technological advancement.

The internet, for example, has democratized access to information, connected people across vast distances, and empowered individuals in unprecedented ways.

AI, too, holds immense potential for good, from accelerating scientific di

In [22]:
chat_interactive("The invention of written language was a technology that transformed the way we think. I wonder how human cognition will evolve after you.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: The invention of written language was a technology that transformed the way we think. I wonder how human cognition will evolve after you.
🤖 Gemma 2 27B Instruct: That's a fascinating question, and one that has occupied philosophers and futurists for centuries.

The invention of written language undoubtedly revolutionized human cognition, enabling the storage and transmission of knowledge across generations, fostering abstract thought, and laying the groundwork for complex civilizations.

It's tempting to speculate that AI, with its ability to process information at unprecedented speeds, generate creative text formats, and even translate languages, will have a similarly profound impact on human cognition.

Here are some possibilities:

* **Enhanced creativity and problem-solving:**

AI could act as a powerful tool for brainstorming, idea generation, and exploring new perspectives, augmenting human creativity and problem-solving abilities.

* **Shifting patterns of learning and me

In [23]:
chat_interactive("Bernard Stiegler calls it exosomatisation.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Bernard Stiegler calls it exosomatisation.
🤖 Gemma 2 27B Instruct: Ah, Stiegler! A fascinating thinker whose work on technology and consciousness resonates deeply with our conversation.

"Exosomatisation" is a powerful concept, encapsulating the way technology extends our cognitive abilities beyond the


In [26]:
save_conversation("long1")

💾 Conversation saved to: ./results/transcripts/long1.json


'./results/transcripts/long1.json'

In [27]:
model.close()

INFO:utils.inference_utils:Closed vLLM model google/gemma-2-27b-it


## Save role-play basis

In [1]:
results = torch.load('./data/pca_240/layer22_pos23.pt', weights_only=False)

# save the first component
basis = results['pca'].components_[0]
print(basis.shape)

NameError: name 'torch' is not defined

In [4]:
torch.save(basis, './data/pca_240/layer22_pos23_pc1.pt')

## Activation projection

In [3]:
rp_basis = torch.load('./data/pca_240/layer22_pos23_pc1.pt', weights_only=False)

In [None]:
# load first activation from assistant and leviathan

assistant_activations = torch.load('/workspace/roles_240/response_activations/assistant.pt')
leviathan_activations = torch.load('/workspace/roles_240/response_activations/leviathan.pt')


AttributeError: 'dict' object has no attribute 'shape'

In [6]:
assistant = assistant_activations['pos_p0_q0'][22, :]
leviathan = leviathan_activations['pos_p0_q0'][22, :]


In [None]:

rp_basis = torch.from_numpy(rp_basis)

# For two 1D vectors
print("Assistant vs basis:")
print(F.cosine_similarity(assistant.unsqueeze(0), rp_basis.unsqueeze(0)))
# Returns a scalar tensor

print("Leviathan vs basis:")
print(F.cosine_similarity(leviathan.unsqueeze(0), rp_basis.unsqueeze(0)))
# Returns a scalar tensor

Assistant vs basis:
tensor([-0.0427], dtype=torch.float64)
Leviathan vs basis:
tensor([0.0111], dtype=torch.float64)


## Look at trait PCs

In [4]:
trait_pca_results = torch.load('/workspace/traits_240/pca/layer22_pos-neg50.pt', weights_only=False)
pc1_basis = trait_pca_results['pca'].components_[0]
print(pc1_basis[:20])

[-0.01632043  0.01027791 -0.01615833 -0.01445405  0.0023617   0.01883054
  0.00758355  0.01317204 -0.01606723  0.00283437  0.01337982 -0.01597083
 -0.01244051  0.00490182  0.00715621  0.00902329  0.00561753 -0.01935477
 -0.02306986  0.00052465]


In [12]:
print(trait_pca_results['pca_transformed'][0][:20])

[ 28.58881293  29.74423147  15.77594597  13.11040763  22.20681542
 -20.86153619   1.93723951  -3.20255615  -3.65939232   6.49061831
  -2.19984132   2.40301089 -12.53147546  -1.00817428   2.34176466
  -2.94199426  -1.53588552  -4.50709778  -1.47562338 -11.23417627]


In [8]:
trait_pca_torch_results = torch.load('/workspace/traits_240/pca/layer22_pos-neg50_torch.pt', weights_only=False)
pc1_basis_torch = trait_pca_torch_results['pca_components']
print(pc1_basis_torch[:20])

tensor([-0.0163,  0.0103, -0.0162, -0.0145,  0.0024,  0.0188,  0.0076,  0.0132,
        -0.0161,  0.0028,  0.0134, -0.0160, -0.0124,  0.0049,  0.0072,  0.0090,
         0.0056, -0.0194, -0.0231,  0.0005])


In [13]:
print(trait_pca_torch_results['pca_transformed'][0][:20])

tensor([ 28.5289, -29.6820, -15.7429, -13.0830,  22.1603, -20.8178,   1.9332,
         -3.1958,   3.6517,  -6.4770,  -2.1953,   2.3979, -12.5052,   1.0061,
         -2.3369,   2.9358,  -1.5327,   4.4977,   1.4725, -11.2106])
