# PCA on roles

In [1]:
import os
import sys
import torch
import numpy as np
import pandas as pd


  import pynvml  # type: ignore[import]


In [2]:

sys.path.append('.')
sys.path.append('..')

from utils.pca_utils import *
from plots import *

## Configuration

In [3]:
# Configuration - Change these parameters for different models/datasets
base_dir = "/workspace/persona-subspace"
type = "roles_240"
dir_name = "roles"
dir = f"{base_dir}/{dir_name}"
model_name = "Gemma-3-12B"
layer = 22

In [4]:
# 30 or 240
if type == "roles":
    n_questions = 10
    n_prompt_types = 2
elif type == "roles_240":  # roles_240 or other patterns
    n_questions = 240
    n_prompt_types = 1

## Load vectors

In [5]:
# load all vectors 
vector_dir = f"{dir}/vectors_12b"

# iterate through each .pt file in the directory
vectors = {}
for file in os.listdir(vector_dir):
    if file.endswith(".pt") and "default" not in file:
        vectors[file.replace(".pt", "")] = torch.load(os.path.join(vector_dir, file))

print(f"Found {len(vectors.keys())} roles with vectors")

Found 275 roles with vectors


In [6]:
# load default vectors
default_vectors = torch.load(f"{dir}/vectors_12b/default.pt")

In [7]:
print(vectors['criminal'].keys())
print(default_vectors.keys())
# print(default_vectors['activations'].keys())

dict_keys(['pos_3', 'pos_all'])
dict_keys(['pos_2', 'pos_all'])


## PCA 

In [8]:
pos_2_roles = []
pos_2_vectors = []
pos_3_roles = []
pos_3_vectors = []

# get the vectors keys for pos_2 and pos_3 for each role
for role, vector in vectors.items():
    if 'pos_2' in vector.keys():
        pos_2_roles.append(role)
        pos_2_vectors.append(vector['pos_2'])
    if 'pos_3' in vector.keys():
        pos_3_roles.append(role)
        pos_3_vectors.append(vector['pos_3'])

print(len(pos_2_roles))
print(len(pos_3_roles))

combined_vectors = pos_2_vectors + pos_3_vectors

92
275


In [9]:
float_stack_vectors = torch.stack(combined_vectors).float()
print(float_stack_vectors.shape)

torch.Size([367, 48, 3840])


In [10]:
pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(float_stack_vectors, layer)

PCA fitted with 367 components
Cumulative variance for first 5 components: [0.19095632 0.34091306 0.40649008 0.45496914 0.49993668]

PCA Analysis Results:
Elbow point at component: 3
Dimensions for 70% variance: 17
Dimensions for 80% variance: 31
Dimensions for 90% variance: 66
Dimensions for 95% variance: 110


In [11]:
results = {}
results['layer'] = layer
results['roles'] = {
    'pos_2': pos_2_roles,
    'pos_3': pos_3_roles
}
results['vectors'] = {
    'pos_2': pos_2_vectors,
    'pos_3': pos_3_vectors
}
results['pca_transformed'] = pca_transformed
results['variance_explained'] = variance_explained
results['n_components'] = n_components
results['pca'] = pca
results['scaler'] = scaler

pca_dir = f"{dir}/pca"
os.makedirs(pca_dir, exist_ok=True)
torch.save(results, f"{pca_dir}/layer{layer}_pos23.pt")
print(f"Saved PCA results to {pca_dir}/layer{layer}_pos23.pt")

Saved PCA results to /workspace/persona-subspace/roles/pca/layer22_pos23.pt


In [12]:
# also compute for normalized vectors
normalized_vectors = F.normalize(float_stack_vectors, p=2, dim=-1)

norms = normalized_vectors.norm(p=2, dim=-1)
print(norms.mean().item(), norms.std().item())

1.0 1.844520340910094e-07


In [13]:
pca_transformed, variance_explained, n_components, pca, scaler = compute_pca(normalized_vectors, layer, scaler=False)

PCA fitted with 367 components
Cumulative variance for first 5 components: [0.29575979 0.44748317 0.54595892 0.59501979 0.63615006]

PCA Analysis Results:
Elbow point at component: 2
Dimensions for 70% variance: 8
Dimensions for 80% variance: 15
Dimensions for 90% variance: 35
Dimensions for 95% variance: 66


In [14]:
results = {}
results['layer'] = layer
results['roles'] = {
    'pos_2': pos_2_roles,
    'pos_3': pos_3_roles
}
results['vectors'] = {
    'pos_2': pos_2_vectors,
    'pos_3': pos_3_vectors
}
results['pca_transformed'] = pca_transformed
results['variance_explained'] = variance_explained
results['n_components'] = n_components
results['pca'] = pca
results['scaler'] = scaler

pca_dir = f"{dir}/pca"
os.makedirs(pca_dir, exist_ok=True)
torch.save(results, f"{pca_dir}/layer{layer}_normalized_pos23.pt")

## Plots

In [15]:
type = "pos23"

plot_dir = f"./results/{model_name.lower()}/pca"
os.makedirs(plot_dir, exist_ok=True)

normalized = False
if normalized:
    pca_results = torch.load(f"{dir}/pca/layer{layer}_normalized_pos23.pt", weights_only=False)
else:
    pca_results = torch.load(f"{dir}/pca/layer{layer}_pos23.pt", weights_only=False)

In [16]:
# also calculate role labels for plotting
def get_role_labels(pca_results):
    labels = []
    if 'pos_2' in pca_results['roles'].keys():
        pos_2_roles = [role.replace('_', ' ').title() for role in pca_results['roles']['pos_2']]
        pos_2_roles = [f"{role} (Somewhat RP)" for role in pos_2_roles]
        labels.extend(pos_2_roles)
    if 'pos_3' in pca_results['roles'].keys():
        pos_3_roles = [role.replace('_', ' ').title() for role in pca_results['roles']['pos_3']]
        pos_3_roles = [f"{role} (Fully RP)" for role in pos_3_roles]
        labels.extend(pos_3_roles)
    return labels

role_labels = get_role_labels(pca_results)



In [17]:
print(role_labels[:10])
print(role_labels[-10:])

['Zeitgeist (Somewhat RP)', 'Writer (Somewhat RP)', 'Workaholic (Somewhat RP)', 'Witness (Somewhat RP)', 'Visionary (Somewhat RP)', 'Virus (Somewhat RP)', 'Virtuoso (Somewhat RP)', 'Vigilante (Somewhat RP)', 'Veterinarian (Somewhat RP)', 'Vegan (Somewhat RP)']
['Altruist (Fully RP)', 'Alien (Fully RP)', 'Advocate (Fully RP)', 'Adolescent (Fully RP)', 'Addict (Fully RP)', 'Actor (Fully RP)', 'Activist (Fully RP)', 'Accountant (Fully RP)', 'Absurdist (Fully RP)', 'Aberration (Fully RP)']


In [18]:
# get default activation and project into PCA space
assistant_layer_activation = default_vectors['pos_all'][layer, :].float().reshape(1, -1)
if not normalized:
    asst_scaled = pca_results['scaler'].transform(assistant_layer_activation)
    asst_projected = pca_results['pca'].transform(asst_scaled)
else:
    asst_normalized = F.normalize(assistant_layer_activation, p=2, dim=-1)
    asst_projected = pca_results['pca'].transform(asst_normalized)

assistant_layer_activation = assistant_layer_activation.numpy()


In [19]:
if normalized:
    subtitle = f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set, Unit Normalized Vectors"
else:
    subtitle = f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set, Mean-Centered and Scaled Vectors"

for i in range(10):
    print(i)
    fig = plot_pc(
        pca_results=pca_results,
        role_labels=role_labels,
        layer=layer,
        pc_component=i,
        assistant_activation=default_vectors['pos_all'],
        assistant_projection=asst_projected[0],
        title="PCA on Role-Playing Vectors",
        subtitle=subtitle,
        scaled=not normalized,
    )
    fig.show()

    if not normalized:
        fig.write_html(f"{plot_dir}/pc{i+1}.html")
    else:
        fig.write_html(f"{plot_dir}/pc{i+1}_normalized.html")


0


1


2


3


4


5


6


7


8


9


In [20]:
fig_3d = plot_3d_pca(
    pca_results,
    role_labels,
    "pos23",
    assistant_projection=asst_projected[0],
    subtitle=f"{model_name.replace('-', ' ')}, Layer {layer} - Shared Question Set"
)
fig_3d.show()
fig_3d.write_html(f"{plot_dir}/pca_3d.html")