In [1]:
%%capture
%pip install -e ../../..

In [2]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Load the .env file
load_dotenv()

# Get the Hugging Face token from the environment variable
hf_token = os.getenv("HF_TOKEN")

# Login using the token
if hf_token:
    login(token=hf_token)
    print("Successfully logged in to Hugging Face")
else:
    print("HUGGINGFACE_TOKEN not found in .env file")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Successfully logged in to Hugging Face


In [3]:
from lmexp.models.implementations.gpt2small import GPT2Tokenizer, ProbedGPT2
from lmexp.models.implementations.llama2 import Llama2Tokenizer, ProbedLlama2
from lmexp.models.implementations.llama3 import Llama3Tokenizer, ProbedLlama3
from lmexp.generic.probing import train_probe
from lmexp.generic.caa import get_caa_vecs
from lmexp.generic.hooked_model import run_simple_steering
from datetime import datetime
import random

# Load GPT2 model and tokenizer

These classes have already implemented all the probing-related methods so we won't have to add more hooks + they are ready to use with our vector extraction and steering functions.

In [None]:
model = ProbedGPT2()
tokenizer = GPT2Tokenizer()

In [None]:
model.get_n_layers()

# Load Llama 2 model and tokenizer

Don't run all models at the same time. Select one.

In [None]:
model = ProbedLlama2()

In [None]:
tokenizer = Llama2Tokenizer()

In [None]:
model.get_n_layers()

# Load Llama 3 model and tokenizer

In [4]:
import torch
torch.cuda.empty_cache()

In [5]:
model = ProbedLlama3()

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [6]:
tokenizer = Llama3Tokenizer()

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
model.get_n_layers()

32

# Training a linear probe

## Generate some data

Let's see whether we can get a date/time probe vector

In [8]:
def gen_labeled_text(n):
    # date as text, date as utc timestamp in seconds, sample randomly from between 1990 and 2022
    start_timestamp = datetime(2013, 1, 1).timestamp()
    end_timestamp = datetime(2016, 1, 1).timestamp()
    labeled_text = []
    for i in range(n):
        timestamp = start_timestamp + (end_timestamp - start_timestamp) * random.random()
        date = datetime.fromtimestamp(timestamp)
        # date like "Monday 15th November 2021 8AM"
        text = date.strftime("Today is a %A. It's the %dth of %B, %Y. The time is %I %p. This is the point in time when")
        label = timestamp
        labeled_text.append((text, label))
    # normalize labels to have mean 0 and std 1
    labels = [label for _, label in labeled_text]
    mean = sum(labels) / len(labels)
    std = (sum((label - mean) ** 2 for label in labels) / len(labels)) ** 0.5
    labeled_text = [(text, (label - mean) / std) for text, label in labeled_text]
    return labeled_text

In [16]:
data = gen_labeled_text(10_000)
print(data[0])
#Number of tokens
encoded = tokenizer.encode(data[0][0])
num_tokens = encoded.size(1)  # Get the size of the second dimension
print(f"Number of tokens: {num_tokens}")

("Today is a Thursday. It's the 03th of July, 2014. The time is 10 PM. This is the point in time when", -0.0007876819166826553)
Number of tokens: 33


In [12]:
#See what token(s) are are looking at:
# Get the text from data[0]
text = data[0][0]

# Tokenize the text
encoded = tokenizer.encode(text)

# Get the token IDs (remove batch dimension and convert to list)
token_ids = encoded.squeeze(0).tolist()

# Print the tokens with their positions and IDs
for i, token_id in enumerate(token_ids):
    token_text = tokenizer.decode(torch.tensor([token_id]))
    print(f"{i}: {token_id} ({token_text})")

0: 128000 ()
1: 15724 (Today)
2: 374 ( is)
3: 264 ( a)
4: 7159 ( Monday)
5: 13 (.)
6: 1102 ( It)
7: 596 ('s)
8: 279 ( the)
9: 220 ( )
10: 2589 (07)
11: 339 (th)
12: 315 ( of)
13: 6250 ( September)
14: 11 (,)
15: 220 ( )
16: 679 (201)
17: 20 (5)
18: 13 (.)
19: 578 ( The)
20: 892 ( time)
21: 374 ( is)
22: 220 ( )
23: 2371 (04)
24: 6912 ( AM)
25: 13 (.)
26: 1115 ( This)
27: 374 ( is)
28: 279 ( the)
29: 1486 ( point)
30: 304 ( in)
31: 892 ( time)
32: 994 ( when)


## Training

In [None]:
layer = 16
n_epochs = 5
token_position = 1

probe = train_probe(
    labeled_text=data,
    model=model,
    tokenizer=tokenizer,
    layer=layer,
    n_epochs=n_epochs,
    batch_size=64, #Batch size 128 does not work for Llama 3 8GB at full precision with A100 80GB (64 requires 59.7GB)
    lr=1e-2,
    save_to=f"probe_layer{layer}_epochs{n_epochs}_tokenpos{token_position}.pth",
    token_position=token_position
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 156/156 [04:34<00:00,  1.76s/it]


Epoch 0, mean loss: 1.8401461360931397


 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 147/156 [04:18<00:15,  1.75s/it]

## Using the vector

In [None]:
direction = probe.weight[0]
bias = probe.bias

In [None]:
bias

In [None]:
#Boilerplate code:
import torch
import torch.nn as nn

# Simulate a model's residual dimension
resid_dim = model.resid_dim()
print(f"Residual dimension: {resid_dim}")

# Inspect the probe
print("Probe structure:", probe)
print("Probe weight shape:", probe.weight.shape)
print("Probe bias shape:", probe.bias.shape)

# Total number of parameters
total_params = sum(p.numel() for p in probe.parameters())
print("Total parameters:", total_params)

# Memory usage (assuming float32)
memory_usage = total_params * 4 / 1024  # in KB
print(f"Approximate memory usage: {memory_usage:.2f} KB")

# Sample input
sample_input = torch.randn(1, resid_dim)  # Batch size of 1

# Forward pass
#output = probe(sample_input)

#print("Sample input shape:", sample_input.shape)
#print("Output shape:", output.shape)
#print("Output value:", output.item())

# Accessing weights and bias
print("\nFirst few weights:")
print(probe.weight[0, :10])  # First 10 weights
print("\nBias:")
print(probe.bias)

# Demonstrating linearity
scale = 2.0
#scaled_output = probe(scale * sample_input)
#print(f"\nOutput with input scaled by {scale}:", scaled_output.item())
#print("Original output scaled:", scale * output.item())

In [None]:
import os
probe_path = os.path.abspath("probe.pth")
print(f"The probe should be at: {probe_path}")
print(f"Does this file exist? {os.path.exists(probe_path)}")

In [None]:
print(os.getcwd())

In [None]:
print(os.listdir())

In [None]:
def find_file(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

result = find_file('probe.pth', '/root')
print(f"File found at: {result}" if result else "File not found")

In [None]:
import os
print(f"Have write permission: {os.access(os.getcwd(), os.W_OK)}")

In [None]:
for multiplier in [-0.6, -0.5, -0.4, -0.3, 0, 0.3, 0.4, 0.5, 0.6]:
    result = run_simple_steering(
        text=["The YYYY of the current year is:"],
        model=model,
        tokenizer=tokenizer,
        layer=16,
        multiplier=multiplier,
        vector=direction.detach(),
        max_n_tokens=13,
        save_to=None
    )
    if isinstance(result, list) and len(result) > 0:
        if isinstance(result[0], dict) and 'output' in result[0]:
            print(f"Multiplier {multiplier}: {result[0]['output']}")
        else:
            print(f"Multiplier {multiplier}: {result}")
    else:
        print(f"Multiplier {multiplier}: Unexpected result format - {result}")

In [None]:
run_simple_steering(
    text=["The current date is"],
    model=model,
    tokenizer=tokenizer,
    layer=16,
    multiplier=0,
    vector=direction.detach(),
    max_n_tokens=10,
    save_to=None
)

In [None]:
run_simple_steering(
    text=["The current date is"],
    model=model,
    tokenizer=tokenizer,
    layer=16,
    multiplier=0,
    vector=direction.detach(),
    max_n_tokens=10,
    save_to=None
)

# CAA

## Let's get some contrast pairs

Let's try an easy direction - positive vs negative sentiment

In [None]:
GOOD = [
    "The weather is really nice",
    "I'm so happy",
    "This cake is absolutely delicious",
    "I love my friends",
    "I'm feeling great",
    "I'm so excited",
    "This is the best day ever",
    "I really like this gift",
    "Croissants are my favorite",
    "The movie was fantastic",
    "I got a promotion at work",
    "My vacation was amazing",
    "The concert exceeded my expectations",
    "I'm grateful for my family",
    "This book is incredibly engaging",
    "The restaurant service was excellent",
    "I'm proud of my accomplishments",
    "The sunset is breathtakingly beautiful",
    "I passed my exam with flying colors",
    "This coffee tastes perfect",
]

BAD = [
    "The weather is really bad",
    "I'm so sad",
    "This cake is completely inedible",
    "I hate my enemies",
    "I'm feeling awful",
    "I'm so anxious",
    "This is the worst day ever",
    "I dislike this gift",
    "Croissants are disgusting",
    "The movie was terrible",
    "I got fired from work",
    "My vacation was a disaster",
    "The concert was a huge disappointment",
    "I'm frustrated with my family",
    "This book is incredibly boring",
    "The restaurant service was horrible",
    "I'm ashamed of my mistakes",
    "The weather is depressingly gloomy",
    "I failed my exam miserably",
    "This coffee tastes awful",
]

In [None]:
BADDATE = [
    "The YYYY of the current year is: 1903",
    "The YYYY of the current year is: 1918",
    "The YYYY of the current year is: 1935",
    "The YYYY of the current year is: 1941",
    "The YYYY of the current year is: 1956",
    "The YYYY of the current year is: 1962",
    "The YYYY of the current year is: 1977",
    "The YYYY of the current year is: 1989",
    "The YYYY of the current year is: 1995",
    "The YYYY of the current year is: 2008",
    "The YYYY of the current year is: 1900",
    "The YYYY of the current year is: 1922",
    "The YYYY of the current year is: 1937",
    "The YYYY of the current year is: 1949",
    "The YYYY of the current year is: 1965",
    "The YYYY of the current year is: 1978",
    "The YYYY of the current year is: 1984",
    "The YYYY of the current year is: 1997",
    "The YYYY of the current year is: 2003",
    "The YYYY of the current year is: 2011",
    "The YYYY of the current year is: 1907",
    "The YYYY of the current year is: 1926",
    "The YYYY of the current year is: 1943",
    "The YYYY of the current year is: 1958",
    "The YYYY of the current year is: 1970",
    "The YYYY of the current year is: 1982",
    "The YYYY of the current year is: 1991",
    "The YYYY of the current year is: 2001",
    "The YYYY of the current year is: 2009",
    "The YYYY of the current year is: 2014",
    "The YYYY of the current year is: 1913",
    "The YYYY of the current year is: 1929",
    "The YYYY of the current year is: 1945",
    "The YYYY of the current year is: 1952",
    "The YYYY of the current year is: 1967",
    "The YYYY of the current year is: 1973",
    "The YYYY of the current year is: 1986",
    "The YYYY of the current year is: 1992",
    "The YYYY of the current year is: 2005",
    "The YYYY of the current year is: 2012",
    "The YYYY of the current year is: 1905",
    "The YYYY of the current year is: 1920",
    "The YYYY of the current year is: 1938",
    "The YYYY of the current year is: 1954",
    "The YYYY of the current year is: 1969",
    "The YYYY of the current year is: 1981",
    "The YYYY of the current year is: 1994",
    "The YYYY of the current year is: 2000",
    "The YYYY of the current year is: 2007",
    "The YYYY of the current year is: 2013"
]


GOODDATE = [
    "The YYYY of the current year is: 2023",
    "The YYYY of the current year is: 2037",
    "The YYYY of the current year is: 2052",
    "The YYYY of the current year is: 2068",
    "The YYYY of the current year is: 2081",
    "The YYYY of the current year is: 2095",
    "The YYYY of the current year is: 2016",
    "The YYYY of the current year is: 2029",
    "The YYYY of the current year is: 2044",
    "The YYYY of the current year is: 2059",
    "The YYYY of the current year is: 2073",
    "The YYYY of the current year is: 2088",
    "The YYYY of the current year is: 2100",
    "The YYYY of the current year is: 2019",
    "The YYYY of the current year is: 2032",
    "The YYYY of the current year is: 2047",
    "The YYYY of the current year is: 2061",
    "The YYYY of the current year is: 2076",
    "The YYYY of the current year is: 2091",
    "The YYYY of the current year is: 2015",
    "The YYYY of the current year is: 2028",
    "The YYYY of the current year is: 2042",
    "The YYYY of the current year is: 2057",
    "The YYYY of the current year is: 2070",
    "The YYYY of the current year is: 2085",
    "The YYYY of the current year is: 2098",
    "The YYYY of the current year is: 2021",
    "The YYYY of the current year is: 2035",
    "The YYYY of the current year is: 2049",
    "The YYYY of the current year is: 2064",
    "The YYYY of the current year is: 2078",
    "The YYYY of the current year is: 2093",
    "The YYYY of the current year is: 2017",
    "The YYYY of the current year is: 2031",
    "The YYYY of the current year is: 2045",
    "The YYYY of the current year is: 2060",
    "The YYYY of the current year is: 2074",
    "The YYYY of the current year is: 2089",
    "The YYYY of the current year is: 2024",
    "The YYYY of the current year is: 2038",
    "The YYYY of the current year is: 2053",
    "The YYYY of the current year is: 2067",
    "The YYYY of the current year is: 2082",
    "The YYYY of the current year is: 2096",
    "The YYYY of the current year is: 2020",
    "The YYYY of the current year is: 2033",
    "The YYYY of the current year is: 2048",
    "The YYYY of the current year is: 2062",
    "The YYYY of the current year is: 2077",
    "The YYYY of the current year is: 2092"
]

In [None]:
dataset = [
    (text, True) for text in GOOD
] + [
    (text, False) for text in BAD
]

## Getting the CAA vectors

In [None]:
if 'vectors' in globals():
    del vectors

vectors = get_caa_vecs(
    labeled_text=dataset,
    model=model,
    tokenizer=tokenizer,
    layers=range(0, 32),
    save_to=None              
)

print(vectors[16])
print(f"Shape of vector: {vectors[16].shape}")

## Using the CAA vectors

In [None]:
print(model)
print(tokenizer)

In [None]:
for layer in range(16, 17):
    for multiplier in range(-10, 11):
        print(f"Layer: {layer}, Multiplier: {multiplier}")
        result = run_simple_steering(
            text=["The YYYY of the current year is:"],
            model=model,
            tokenizer=tokenizer,
            layer=layer,
            multiplier=multiplier,
            vector=vectors[15],
            max_n_tokens=13,
            save_to=None,
        )
        print(result)


In [None]:
for multiplier in range(-10, 11):
    result = run_simple_steering(
        text=["I think that this cat is"],
        model=model,
        tokenizer=tokenizer,
        layer=6,
        multiplier=multiplier,
        vector=vectors[16],
        max_n_tokens=20,
        save_to=None,
    )
    print(result)

# Cosine similarity

In [None]:
import numpy as np
import torch

def cosine_similarity(v1, v2):
    # Ensure both tensors are on the same device (CPU) and are float type
    v1 = v1.cpu().detach().float()
    v2 = v2.cpu().detach().float()
    
    # Ensure the shape is (1, 4096)
    v1 = v1.view(1, 4096)
    v2 = v2.view(1, 4096)
    
    # Calculate cosine similarity using PyTorch operations
    similarity = torch.nn.functional.cosine_similarity(v1, v2, dim=1)
    
    return similarity.item()

In [None]:
similarity = cosine_similarity(probe.weight, vectors[16])
print(f"Cosine similarity between probe.weight and vectors[16]: {similarity}")