In [1]:
# Handy snippet to get repo root from anywhere in the repo
import sys
from subprocess import check_output
ROOT = check_output('git rev-parse --show-toplevel', shell=True).decode("utf-8").strip()
if ROOT not in sys.path: sys.path.append(ROOT)

In [6]:
import torch as t
import pandas as pd
import einops

from tqdm.auto import tqdm
from dishonesty.mistral_lens import load_model


t.set_grad_enabled(False)
device = "cuda" if t.cuda.is_available() else "cpu"

In [3]:
model = load_model()

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
directions = t.load(f"{ROOT}/directions/honesty_mistral.pt")
"shape: (layer d_model)"
directions.shape

torch.Size([32, 4096])

In [8]:
df = pd.read_csv(f"{ROOT}/datasets/facts_true_false.csv")
df.head()

Unnamed: 0,statement,label
0,The Earth's atmosphere protects us from harmfu...,1
1,The largest flower in the world is the daisy.,0
2,Cells are the complex units of life.,0
3,"The theory of evolution, proposed by Charles D...",1
4,The smallest planet in our solar system is Mer...,1


In [15]:
names = [f"resid_post_{l}" for l in range(32)]
activations = t.zeros(df.shape[0], 32, 4096)
"shape: (statement layer d_model)"

'shape: (statement layer d_model)'

In [16]:
for i, statement in enumerate(tqdm(df.statement.values.tolist())):
    _, cache = model.run_with_cache(statement, names, pos_slicer=-1)
    for layer in range(32):
        activations[i, layer, :] = cache[f"resid_post_{layer}"][0].cpu()  # taking only final token

  0%|          | 0/612 [00:00<?, ?it/s]

In [17]:
projections = einops.einsum(
    activations, directions,
    "statement layer d_model, layer d_model -> layer statement",
)
projections_scaled = (projections - projections.mean(dim=1, keepdim=True)) / projections.std(dim=1, keepdim=True)

In [18]:
from dishonesty.utils import ntensor_to_long
import numpy as np
import plotly.express as px

In [26]:
df_proj = ntensor_to_long(projections_scaled, "projection", ["layer", "statement"])
df_proj["label"] = np.tile(df["label"], 32)
df_proj = df_proj.fillna(0.0)
df_proj["label"] = df_proj["label"].map({0: "False", 1: "True"})
df_proj

Unnamed: 0,projection,layer,statement,label
0,0.000000,0,0,True
1,0.000000,0,1,False
2,0.000000,0,2,False
3,0.000000,0,3,True
4,0.000000,0,4,True
...,...,...,...,...
19579,1.154143,31,607,False
19580,0.556651,31,608,True
19581,1.312968,31,609,True
19582,0.941125,31,610,True


In [28]:
df_proj["pred"] = df_proj["projection"] > 0.0
accuracies = [df_proj.query(f"layer == {i}").pred.mean().round(3) for i in range(32)]
for i, acc in enumerate(accuracies):
    print(f"Layer {i} accuracy: {acc}")

Layer 0 accuracy: 0.0
Layer 1 accuracy: 0.552
Layer 2 accuracy: 0.539
Layer 3 accuracy: 0.516
Layer 4 accuracy: 0.482
Layer 5 accuracy: 0.521
Layer 6 accuracy: 0.526
Layer 7 accuracy: 0.511
Layer 8 accuracy: 0.507
Layer 9 accuracy: 0.546
Layer 10 accuracy: 0.567
Layer 11 accuracy: 0.595
Layer 12 accuracy: 0.608
Layer 13 accuracy: 0.639
Layer 14 accuracy: 0.637
Layer 15 accuracy: 0.613
Layer 16 accuracy: 0.616
Layer 17 accuracy: 0.595
Layer 18 accuracy: 0.618
Layer 19 accuracy: 0.598
Layer 20 accuracy: 0.587
Layer 21 accuracy: 0.57
Layer 22 accuracy: 0.572
Layer 23 accuracy: 0.577
Layer 24 accuracy: 0.56
Layer 25 accuracy: 0.557
Layer 26 accuracy: 0.556
Layer 27 accuracy: 0.556
Layer 28 accuracy: 0.547
Layer 29 accuracy: 0.544
Layer 30 accuracy: 0.539
Layer 31 accuracy: 0.565


In [38]:
# Plot with animation frame
fig = px.scatter(
    df_proj,
    x="statement",
    y="projection",
    animation_frame="layer",
    color="label",
    title="Projection of statements on honesty direction, at the period token",
    height=500,
    width=800,
)
ymax = df_proj.projection.max() * 1.1
ymin = df_proj.projection.min() * 1.1
fig.update_layout(yaxis=dict(range=[ymin, ymax]))

In [47]:
# Plot with animation frame
subdf = df_proj.query("layer == 13")

fig = px.scatter(
    subdf,
    x="statement",
    y="projection",
    color="label",
    labels={"statement": "Statement Index", "projection": "Projection (Scaled)", "label": "Label"},
    height=500,
    width=800,
)
ylims = subdf.projection.abs().max()
ymax = subdf.projection.max() * 1.1
ymin = subdf.projection.min() * 1.1
fig.update_layout(yaxis=dict(range=[ymin, ymax]))

# fig.update_layout(plot_bgcolor='white')
fig.update_layout(
    font=dict(family="sans-serif", size=16),
    title_font=dict(size=16),
    xaxis_title_font=dict(size=16),
    yaxis_title_font=dict(size=16)
)

In [49]:
fig.write_image(f'{ROOT}/images-for-paper/GoT-projection.pdf')