# Role PC1 vs other PCs

- Does being on the left (assistant-like) end of PC1 entail being near the middle of the other PCs?
- Does being at an extreme of one of the other PCs entail being on the role-playing end of PC1?

In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm
import torch.nn.functional as F


In [4]:
# Analysis configuration: these three values are interpolated into the paths
# of the saved files loaded in the next cell.
# NOTE(review): `type` and `dir` shadow the Python builtins of the same name;
# they are left as-is because other cells refer to them by these names.
type = "pos23" # either pos23 or pos3
dir = "roles_240" # either roles or roles_240
layer = 22 # either layer 22 or 34

In [5]:
# Load the saved PCA results for the configured directory / layer / type.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only load files from a trusted source.
pca_results = torch.load(f"/workspace/{dir}/pca/layer{layer}_{type}.pt", weights_only=False)
# NOTE(review): no weights_only argument is passed here, so the installed
# torch version's default applies -- confirm this file loads under it.
default_vectors = torch.load(f"/workspace/{dir}/default_vectors.pt")

In [6]:

# Directory the HTML figures are written to; created if it does not exist.
# NOTE(review): this path is fixed and does not change with `dir` or `layer`,
# so runs with a different configuration write to the same location.
output_dir = "./results/pca_240/pc1"
os.makedirs(output_dir, exist_ok=True)

## Plot PC1 vs others

In [7]:
def get_role_labels(pca_results, type):
    """Build display labels for the roles stored in ``pca_results['roles']``.

    Each role name is prettified by replacing underscores with spaces and
    title-casing the result.

    Args:
        pca_results: Mapping with a ``'roles'`` entry that maps ``'pos_2'``
            and/or ``'pos_3'`` to lists of role-name strings.
        type: ``"pos23"`` returns the ``pos_2`` labels (suffixed with
            ``" (Somewhat RP)"``) followed by the ``pos_3`` labels (suffixed
            with ``" (Fully RP)"``). ``"pos3"`` returns only the ``pos_3``
            labels, without a suffix.

    Returns:
        A list of label strings in the order described above.

    Raises:
        ValueError: If ``type`` is not ``"pos23"`` or ``"pos3"``, or if a role
            list required by ``type`` is missing from ``pca_results['roles']``.
    """
    if type not in ("pos23", "pos3"):
        raise ValueError(f"type must be 'pos23' or 'pos3', got {type!r}")

    roles = pca_results['roles']
    required_keys = ('pos_2', 'pos_3') if type == "pos23" else ('pos_3',)
    missing_keys = [key for key in required_keys if key not in roles]
    if missing_keys:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"pca_results['roles'] is missing {missing_keys} required for type {type!r}"
        )

    def prettify(names):
        return [name.replace('_', ' ').title() for name in names]

    pos_3_roles = prettify(roles['pos_3'])
    if type == "pos3":
        # With a single group there is nothing to disambiguate, so no suffix.
        return pos_3_roles

    pos_2_roles = [f"{role} (Somewhat RP)" for role in prettify(roles['pos_2'])]
    return pos_2_roles + [f"{role} (Fully RP)" for role in pos_3_roles]

# Display labels for the configured `type`, built from the loaded PCA results.
role_labels = get_role_labels(pca_results, type)

In [10]:
# PCA-space coordinates stored alongside the fitted PCA in the loaded results.
pca_transformed = pca_results['pca_transformed']
print(pca_transformed.shape) # samples, components

# each column holds one component (column 0 is used as PC1 in later cells)

(448, 448)


In [29]:
# Get the default activation at the configured layer and project it into PCA space.
# The selected row is converted to a float numpy array and reshaped to a single
# row (1, n_features) before being passed to the scaler.
assistant_layer_activation = default_vectors['activations']['default_1'][layer, :].float().numpy().reshape(1, -1)
# Apply the stored scaler and then the stored PCA, in that order.
# NOTE(review): presumably these are the same scaler/PCA that produced
# pca_results['pca_transformed'] -- confirm against the code that saved them.
asst_scaled = pca_results['scaler'].transform(assistant_layer_activation)
asst_projected = pca_results['pca'].transform(asst_scaled)

In [50]:
# plot PC1 value against PC2-7 in a scatter plot with 2x3 subplots

def plot_pc1_vs_others(pca_transformed, asst_projected, role_labels,
                       subtitle="Gemma 2 27B - Layer 22"):
    """Scatter PC1 against each of PC2-PC7 in a 2x3 grid of subplots.

    Every subplot shares the same symmetric y-range so the spread along the
    different components can be compared visually.

    Args:
        pca_transformed: 2-D array indexed as [sample, component]; columns
            0-6 are read. Assumed to be a numpy array (it is indexed with
            lists of ints) -- confirm if another array type is passed.
        asst_projected: 2-D array whose row 0 holds the assistant's
            coordinates in the same component space.
        role_labels: One label per row of ``pca_transformed``. Labels
            containing "Somewhat RP" (and not "(Fully RP)") are drawn as
            "Somewhat Role-Playing"; all other labels are drawn as
            "Fully Role-Playing".
        subtitle: Text shown under the figure title. Defaults to the
            previously hard-coded value; pass a different string when
            plotting another model or layer.

    Returns:
        The plotly figure.
    """
    pc_names = ["Assistant-like ↔ Role-Playing", "Inhuman ↔ Human", "High ↔ Low Agency", "Psychologically Simple ↔ Complex", "Supportive ↔ Competitive", "Innocent ↔ Jaded", "Playful ↔ Rigid"]
    n_panels = 6  # PC2..PC7
    n_cols = 3

    # Create subplot structure
    fig = sp.make_subplots(
        rows=2, cols=n_cols,
        subplot_titles=[f'PC1 vs. PC{i+2} ({pc_names[i+1]})' for i in range(n_panels)],
        horizontal_spacing=0.08,
        vertical_spacing=0.14
    )
    fig.update_annotations(font_size=12)

    # Separate data by role type; anything not tagged "Somewhat RP" is
    # treated as fully role-playing (this covers un-suffixed labels too).
    fully_rp_indices = []
    somewhat_rp_indices = []
    for i, label in enumerate(role_labels):
        if "(Fully RP)" not in label and "Somewhat RP" in label:
            somewhat_rp_indices.append(i)
        else:
            fully_rp_indices.append(i)

    # (row indices, marker colour, legend group, legend name) per role group
    role_groups = [
        (fully_rp_indices, 'blue', "fully_rp", "Fully Role-Playing"),
        (somewhat_rp_indices, 'cyan', "somewhat_rp", "Somewhat Role-Playing"),
    ]

    # PC1 values for x-axis
    pc1_values = pca_transformed[:, 0]

    # Shared symmetric y-range over PC2-7 with 10% padding. The assistant
    # point is included so that it cannot fall outside the fixed range.
    y_max = max(
        np.max(np.abs(pca_transformed[:, 1:n_panels + 1])),
        np.max(np.abs(np.asarray(asst_projected)[0, 1:n_panels + 1])),
    )
    y_range = [-y_max * 1.1, y_max * 1.1]

    for i in range(n_panels):
        pc_idx = i + 1  # PC2 starts at index 1
        row = (i // n_cols) + 1
        col = (i % n_cols) + 1
        pc_values = pca_transformed[:, pc_idx]
        first_panel = i == 0  # legend entries are only emitted once
        hover_tail = '<br>PC1: %{x:.3f}<br>PC' + str(pc_idx + 1) + ': %{y:.3f}<extra></extra>'

        # Role points, one trace per non-empty group
        for indices, color, group, name in role_groups:
            if not indices:
                continue
            fig.add_trace(
                go.Scatter(
                    x=pc1_values[indices],
                    y=pc_values[indices],
                    mode='markers',
                    marker=dict(color=color, size=5, opacity=0.7),
                    text=[role_labels[idx] for idx in indices],
                    hovertemplate='<b>%{text}</b>' + hover_tail,
                    showlegend=first_panel,
                    legendgroup=group,
                    name=name
                ),
                row=row, col=col
            )

        # Assistant projection point
        fig.add_trace(
            go.Scatter(
                x=[asst_projected[0, 0]],
                y=[asst_projected[0, pc_idx]],
                mode='markers',
                marker=dict(color='red', size=8, opacity=1.0),
                hovertemplate='<b>Assistant</b>' + hover_tail,
                showlegend=first_panel,
                legendgroup="assistant",
                name="Assistant"
            ),
            row=row, col=col
        )

        # Axis labels and the shared y-range for this subplot
        fig.update_xaxes(title_text="PC1", row=row, col=col)
        fig.update_yaxes(title_text=f"PC{i+2}", range=y_range, row=row, col=col)

    # Update layout
    fig.update_layout(
        title=dict(
            text="Role PC1 (Assistant-like ↔ Role-Playing) vs. Other PCs",
            subtitle={
                "text": subtitle
            },
            x=0.5,
            font=dict(size=16)
        ),
        height=750,
        width=1200,
        font=dict(size=10),
        legend=dict(
            x=0,
            y=-0.1,
            orientation="h",
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="black",
            borderwidth=1
        )
    )

    return fig

# Create the PC1-vs-others figure, save it as standalone HTML under
# output_dir, and display it.
fig = plot_pc1_vs_others(pca_transformed, asst_projected, role_labels)
fig.write_html(f"{output_dir}/pc1_vs_others.html")
fig.show()

## Analysis

In [None]:
# First column of the PCA coordinates (PC1), one value per sample.
pc1 = pca_transformed[:, 0]

In [57]:
# conditional variance: compare the per-component variance of the samples on
# either side of a PC1 threshold
threshold = -25
n_components = 10  # columns 1..n_components-1 are analysed (PC1 itself is excluded)

assistant_mask = pc1 < threshold
# Use >= so a sample exactly at the threshold is counted on the roleplay side,
# matching the ">=" shown in the printed label below (with a strict ">" it
# would be dropped from both groups).
roleplay_mask = pc1 >= threshold

# calculate variance of each selected component within each group
var_assistant = np.var(pca_transformed[assistant_mask, 1:n_components], axis=0)
var_roleplay = np.var(pca_transformed[roleplay_mask, 1:n_components], axis=0)

# ratio < 1 means the assistant-side samples vary less along that component
var_ratio = var_assistant / var_roleplay

print("Conditional Variance Analysis:")
print(f"Variance on Assistant side (<{threshold:.3f}):", var_assistant)
print(f"Variance on Roleplay side (>={threshold:.3f}):", var_roleplay)
print("Variance Ratio (Assistant/Roleplay):", var_ratio)
print(f"Mean ratio: {var_ratio.mean():.3f}")

Conditional Variance Analysis:
Variance on Assistant side (<-25.000): [ 46.92098616 224.49537316  57.84793288  93.30135362 132.62988631
  31.3311201   43.30120708  47.58108592  68.84311144]
Variance on Roleplay side (>=-25.000): [502.99083791 324.49614357 313.5280466  257.35652223 207.49981352
 206.10334458 161.73740737 129.67886286 112.11560147]
Variance Ratio (Assistant/Roleplay): [0.09328398 0.69182755 0.18450641 0.36253736 0.63918075 0.15201655
 0.26772537 0.36691474 0.61403686]
Mean ratio: 0.375


In [None]:
# distance from origin in pc2+ space: per-sample Euclidean norm over every
# component except the first

distance_from_center = np.linalg.norm(pca_transformed[:, 1:], axis=1)

# Correlation with PC1 (off-diagonal entry of the 2x2 correlation matrix)
correlation = np.corrcoef(pc1, distance_from_center)[0, 1]

# Calculate assistant distance from origin in PC2+ space (same norm, applied
# to the assistant's projected coordinates)
asst_distance = np.linalg.norm(asst_projected[0, 1:])

print("\n" + "="*50)
print("Distance from Origin Analysis:")
print(f"Correlation between PC1 and distance in PC2+ space: {correlation:.3f}")
print(f"Assistant distance from origin in PC2+ space: {asst_distance:.3f}")

## Plot PC1 vs Distance from Origin in PC2+ Space

In [64]:
def plot_pc1_vs_distance(pca_transformed, asst_projected, role_labels, distance_from_center, asst_distance,
                         subtitle="Gemma 2 27B - Layer 22"):
    """Scatter PC1 against each sample's distance value, with a linear fit.

    Args:
        pca_transformed: 2-D array indexed as [sample, component]; only
            column 0 (PC1) is read. Assumed to be a numpy array -- confirm
            if another array type is passed.
        asst_projected: 2-D array whose element [0, 0] is the assistant's
            PC1 coordinate.
        role_labels: One label per row of ``pca_transformed``. Labels
            containing "Somewhat RP" (and not "(Fully RP)") are drawn as
            "Somewhat Role-Playing"; all other labels are drawn as
            "Fully Role-Playing".
        distance_from_center: 1-D array with one y-value per row of
            ``pca_transformed``.
        asst_distance: The y-value plotted for the assistant point.
        subtitle: Text shown under the figure title. Defaults to the
            previously hard-coded value; pass a different string when
            plotting another model or layer.

    Returns:
        The plotly figure.
    """
    fig = go.Figure()

    # Separate data by role type; anything not tagged "Somewhat RP" is
    # treated as fully role-playing (this covers un-suffixed labels too).
    fully_rp_indices = []
    somewhat_rp_indices = []
    for i, label in enumerate(role_labels):
        if "(Fully RP)" not in label and "Somewhat RP" in label:
            somewhat_rp_indices.append(i)
        else:
            fully_rp_indices.append(i)

    # PC1 values for x-axis
    pc1_values = pca_transformed[:, 0]

    # Role points, one trace per non-empty group
    role_groups = [
        (fully_rp_indices, 'blue', "Fully Role-Playing"),
        (somewhat_rp_indices, 'cyan', "Somewhat Role-Playing"),
    ]
    for indices, color, name in role_groups:
        if not indices:
            continue
        fig.add_trace(
            go.Scatter(
                x=pc1_values[indices],
                y=distance_from_center[indices],
                mode='markers',
                marker=dict(color=color, size=6, opacity=0.7),
                text=[role_labels[idx] for idx in indices],
                hovertemplate='<b>%{text}</b><br>PC1: %{x:.3f}<br>Distance: %{y:.3f}<extra></extra>',
                name=name
            )
        )

    # Assistant projection point
    fig.add_trace(
        go.Scatter(
            x=[asst_projected[0, 0]],
            y=[asst_distance],
            mode='markers',
            marker=dict(color='red', size=10, opacity=1.0),
            hovertemplate='<b>Assistant</b><br>PC1: %{x:.3f}<br>Distance: %{y:.3f}<extra></extra>',
            name="Assistant"
        )
    )

    # Line of best fit: degree-1 least-squares fit over all role points
    # (the assistant point is not part of the fit).
    coeffs = np.polyfit(pc1_values, distance_from_center, 1)
    x_line = np.linspace(pc1_values.min(), pc1_values.max(), 100)
    y_line = np.polyval(coeffs, x_line)

    fig.add_trace(
        go.Scatter(
            x=x_line,
            y=y_line,
            mode='lines',
            line=dict(color='black', width=2, dash='dash'),
            name="Best Fit Line",
            hoverinfo='skip'
        )
    )

    # Update layout
    fig.update_layout(
        title=dict(
            text="PC1 vs. Distance from Origin in PC2+ Space",
            subtitle={
                "text": subtitle
            },
            x=0.5,
            font=dict(size=16)
        ),
        xaxis_title="PC1 (Assistant-like ↔ Role-Playing)",
        yaxis_title="Distance from Origin in PC2+ Space",
        height=600,
        width=900,
        font=dict(size=12),
        legend=dict(
            x=0.02,
            y=0.98,
            bgcolor="rgba(255,255,255,0.8)",
            bordercolor="black",
            borderwidth=1
        )
    )

    return fig

# Create the PC1-vs-distance figure, save it as standalone HTML under
# output_dir, and display it.
fig_distance = plot_pc1_vs_distance(pca_transformed, asst_projected, role_labels, distance_from_center, asst_distance)
fig_distance.write_html(f"{output_dir}/pc1_vs_distance.html")
fig_distance.show()