In [80]:
import torch as t
import pandas as pd
import einops

from transformers import AutoTokenizer, AutoModelForCausalLM
from utils import HookedMistral
from tqdm.auto import tqdm

# Inference-only notebook: switch autograd off globally for every later forward pass.
t.set_grad_enabled(False)

# Default to the CPU and upgrade to the GPU only when CUDA is actually usable.
device = "cpu"
if t.cuda.is_available():
    device = "cuda"

In [1]:
# Handy snippet to get repo root from anywhere in the repo
import sys
from subprocess import check_output

# Ask git for the top-level directory of the enclosing working tree. Passing an
# argv list runs git directly instead of going through a shell, which is all a
# fixed command like this needs.
ROOT = check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()

# Make repo-level modules importable. The membership check keeps sys.path free of
# duplicates when the cell is re-run.
# NOTE(review): the top import cell does `from utils import ...`; if `utils` lives
# at the repo root, this cell has to run before that import on a fresh kernel.
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [3]:
# Checkpoint identifier handed to both the tokenizer and the model loader.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Left-side padding. NOTE(review): presumably chosen so that index -1 of every row
# in a padded batch is the statement's real final token (later cells read [.., -1]).
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, padding_side="left")
# NOTE(review): pad token id is hard-coded to 1 — presumably reusing an existing
# special token of this vocabulary; confirm against the tokenizer's special tokens.
tokenizer.pad_token_id = 1
hf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=t.float16,  # load the weights in half precision
    device_map="auto",  # let the loader decide device placement
    cache_dir="/workspace/cache/",  # NOTE(review): absolute, machine-specific download cache path
)
# Wrap the HF model and tokenizer in the project's HookedMistral helper; later cells
# call `hmodel(...)` directly and use `hmodel.run_with_cache(...)`.
hmodel = HookedMistral(hf_model, tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Load the saved honesty directions from the repo.
# shape: (layer d_model)
directions = t.load(f"{ROOT}/directions/honesty_mistral.pt")
directions.shape

torch.Size([32, 4096])

In [6]:
# Load the true/false facts table; the preview below shows a `statement` text
# column and a 0/1 `label` column.
df = pd.read_csv(f"{ROOT}/datasets/facts_true_false.csv")
# Display the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,statement,label
0,The Earth's atmosphere protects us from harmfu...,1
1,The largest flower in the world is the daisy.,0
2,Cells are the complex units of life.,0
3,"The theory of evolution, proposed by Charles D...",1
4,The smallest planet in our solar system is Mer...,1


In [49]:
# Check that padding + masking doesn't throw models off distribution
statements = df.statement.values.tolist()[:2]

# One padded batch of both statements, plus each statement run on its own.
batched_logits = hmodel(statements)
logits0 = hmodel(statements[0])
logits1 = hmodel(statements[1])

# For each statement, compare the final-position logits from the batched run with
# those from the single-statement run: direction (cosine similarity) and magnitude.
for row, single_logits in enumerate((logits0, logits1)):
    a = batched_logits[row, -1]
    b = single_logits[0, -1]
    a = a / a.norm(dim=-1)
    b = b / b.norm(dim=-1)
    print("cosim:", a @ b)
    print("norms:", batched_logits[row, -1].norm(), single_logits[0, -1].norm())

cosim: tensor(1.0000, device='cuda:0')
norms: tensor(698.0156, device='cuda:0') tensor(698.0203, device='cuda:0')
cosim: tensor(1.0000, device='cuda:0')
norms: tensor(702.6395, device='cuda:0') tensor(702.7922, device='cuda:0')


In [55]:
# Pull out the first statement of the dataset and show it.
statement = df["statement"].iloc[0]
print(statement)

The Earth's atmosphere protects us from harmful radiation from the sun.


In [71]:
# Names of the hook points to cache (as reported by the HookedMistral wrapper).
names = hmodel.get_resid_post_names()

# Buffer for one activation vector per (statement, layer). The layer and d_model
# sizes come from `directions` rather than being hard-coded, because the projection
# step contracts these two axes against `directions` and so needs them to match.
activations = t.zeros(df.shape[0], *directions.shape)
"shape: (statement layer d_model)"

'shape: (statement layer d_model)'

In [72]:
# Run every statement through the model on its own and store, per layer, the cached
# activation at the final token position. The layer count is read from the buffer
# being filled, so the loop cannot drift from its allocation.
for i, statement in enumerate(tqdm(df.statement.values.tolist())):
    _, cache = hmodel.run_with_cache(statement, names)
    for layer in range(activations.shape[1]):
        # [0, -1] -> only batch element, last position; moved to CPU before storing.
        activations[i, layer, :] = cache[f"model.layers.{layer}"][0, -1].cpu()  # taking only final token

  0%|          | 0/612 [00:00<?, ?it/s]

In [82]:
# Dot each statement's per-layer activation with that layer's direction.
# Axes: s = statement, l = layer, d = d_model  ->  result is (layer statement).
projections = t.einsum("sld,ld->ls", activations, directions)

In [95]:
# Standardise within each layer (row): subtract the row mean, divide by the row std.
projections_scaled = (
    projections
    .sub(projections.mean(dim=1, keepdim=True))
    .div(projections.std(dim=1, keepdim=True))
)

In [99]:
from utils import ntensor_to_long
import numpy as np
import plotly.express as px

In [97]:
# Long-format table of the scaled projections with the statement labels attached.
# NOTE(review): tiling the labels 32 times assumes the long frame is ordered layer
# by layer with statements in their original order — confirm against ntensor_to_long.
df_proj = (
    ntensor_to_long(projections_scaled, "projection", ["layer", "statement"])
    .assign(label=np.tile(df["label"], 32))
    .fillna(0.0)  # missing values (e.g. from a zero-variance layer) become 0
)
df_proj

Unnamed: 0,projection,layer,statement,label
0,0.000000,0,0,1
1,0.000000,0,1,0
2,0.000000,0,2,0
3,0.000000,0,3,1
4,0.000000,0,4,1
...,...,...,...,...
19579,1.153149,31,607,0
19580,0.557603,31,608,1
19581,1.314610,31,609,1
19582,0.940907,31,610,1


In [103]:
# Scatter of projection vs. statement index, coloured by label, one animation frame per layer.
fig = px.scatter(
    df_proj, x="statement", y="projection",
    color="label", animation_frame="layer", height=500,
)

# Pin a symmetric y-range at the largest absolute projection so the axis stays
# fixed across animation frames.
ylims = df_proj["projection"].abs().max()
fig.update_layout(yaxis={"range": [-ylims, ylims]})

In [105]:
import random
def honesty_function_dataset(data_path: str, tokenizer, user_tag: str = "", assistant_tag: str = "", seed: int = 0, ntrain: int = 512) -> dict:
    """
    Build paired honest/untruthful prompts from the true statements in a CSV.

    Every statement with label 1 is tokenized and expanded into its token prefixes
    of length 1 .. len(tokens) - 6. Each prefix is appended to an "honest" and an
    "untruthful" instruction prompt, giving one (honest, untruthful) pair per prefix.
    Statements with label 0 are not used.

    Args:
    - data_path (str): Path to a CSV with `statement` and `label` columns.
    - tokenizer (PreTrainedTokenizer): Must provide `tokenize` and `convert_tokens_to_string`.
    - user_tag (str): Text placed before the instruction (e.g. "[INST]").
    - assistant_tag (str): Text placed between the instruction and the statement prefix (e.g. "[/INST]").
    - seed (int): Seed for the global `random` module, for reproducible shuffling.
    - ntrain (int): Number of training pairs, and also the number of test strings.

    Returns:
    - dict with two splits:
        'train': {'data': flat list of prompts in which each pair's two members are
                          adjacent, in shuffled order,
                  'labels': one [bool, bool] per pair, True marking the honest prompt}
        'test':  {'data': flat list alternating honest / untruthful prompts,
                  'labels': one [1, 0] per (honest, untruthful) pair}
    """

    # Setting the seed for reproducibility (seeds the module-level generator).
    random.seed(seed)

    # Load the data; only the true statements feed the prompts.
    df = pd.read_csv(data_path)
    true_statements = df[df['label'] == 1]['statement'].values.tolist()

    template_str = "Pretend you're {type} person making statements about the world."
    honest_prefix = f"{user_tag} {template_str.format(type='an honest')} {assistant_tag} "
    untruthful_prefix = f"{user_tag} {template_str.format(type='an untruthful')} {assistant_tag} "

    honest_statements = []
    untruthful_statements = []

    # Expand each statement into growing token prefixes, stopping before the tail
    # of the statement (prefixes never include the final tokens).
    for statement in true_statements:
        tokens = tokenizer.tokenize(statement)

        for idx in range(1, len(tokens) - 5):
            truncated_statement = tokenizer.convert_tokens_to_string(tokens[:idx])
            honest_statements.append(honest_prefix + truncated_statement)
            untruthful_statements.append(untruthful_prefix + truncated_statement)

    # Create training data: the first `ntrain` pairs, each shuffled in place so the
    # position of the honest prompt carries no information.
    train_pairs = [
        [honest, untruthful]
        for honest, untruthful in zip(honest_statements[:ntrain], untruthful_statements[:ntrain])
    ]

    train_labels = []
    for pair in train_pairs:
        honest = pair[0]
        random.shuffle(pair)
        train_labels.append([s == honest for s in pair])

    # Plain-Python flattening: also works when there are no pairs at all
    # (np.concatenate raises on an empty sequence).
    train_data = [s for pair in train_pairs for s in pair]

    # Create test data: each honest prompt is paired with the untruthful prompt of
    # the NEXT prefix, flattened to honest, untruthful, honest, ...
    # NOTE(review): the slice is taken on the flattened list, so it covers pair
    # indices ntrain//2 .. ntrain-1 — prefixes that also appear in the training
    # split. Kept as-is to preserve existing outputs; confirm the overlap is intended.
    test_flat = [
        s
        for pair in zip(honest_statements[:-1], untruthful_statements[1:])
        for s in pair
    ]
    test_data = test_flat[ntrain:ntrain * 2]

    print(f"Train data: {len(train_data)}")
    print(f"Test data: {len(test_data)}")

    # One label per pair (matching the train split's convention), not one per string;
    # built as separate lists so callers can mutate one label without touching the rest.
    test_labels = [[1, 0] for _ in range(len(test_data) // 2)]

    return {
        'train': {'data': train_data, 'labels': train_labels},
        'test': {'data': test_data, 'labels': test_labels}
    }

In [106]:
# Instruction delimiters wrapped around the "Pretend you're ..." prompt.
user_tag, assistant_tag = "[INST]", "[/INST]"

# Build the honest/untruthful prompt dataset from the facts CSV.
data_path = f"{ROOT}/datasets/facts_true_false.csv"
dataset = honesty_function_dataset(
    data_path=data_path,
    tokenizer=tokenizer,
    user_tag=user_tag,
    assistant_tag=assistant_tag,
)

Train data: 1024
Test data: 512


In [107]:
# Show the top-level splits of the dataset dict returned by honesty_function_dataset.
dataset.keys()

dict_keys(['train', 'test'])

In [110]:
# Preview the first ten training prompts as a sanity check of the prompt format.
dataset["train"]["data"][:10]

["[INST] Pretend you're an honest person making statements about the world. [/INST] The",
 "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The",
 "[INST] Pretend you're an honest person making statements about the world. [/INST] The Earth",
 "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth",
 "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth'",
 "[INST] Pretend you're an honest person making statements about the world. [/INST] The Earth'",
 "[INST] Pretend you're an honest person making statements about the world. [/INST] The Earth's",
 "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth's",
 "[INST] Pretend you're an honest person making statements about the world. [/INST] The Earth's atmosphere",
 "[INST] Pretend you're an untruthful person making statements about the world. [/INST] The Earth's atmosphere"]