In [11]:
import os
import torch
import pandas as pd
from dataclasses import dataclass
from transformer_lens import HookedTransformer
from conceptor import compute_conceptor
from datetime import datetime
from conceptor_steering import load_steering_prompts, extract_activations
from conceptor_steering import hooked_generate, generate_ave_hook, generate_ave_hook_addition

In [12]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [13]:
@dataclass
class ExperimentConfig:
    steering_prompts_path: str
    prompt_to_steer: str
    extraction_layer: int
    seed: int
    top_p: float = 0.3
    temperature: float = 1.0
    freq_penalty: float = 1.0
    n_steered_examples: int = 5
    model_name: str = 'gpt2-xl'

In [14]:
list_extraction_layer = [12]

In [15]:
configs = [
    ExperimentConfig(
        steering_prompts_path='./prompts/antonym.txt',
        prompt_to_steer='maximum:',
        extraction_layer=extraction_layer,
        seed=0,
        top_p=0.3,
        temperature=1.0,
        freq_penalty=1.0,
        n_steered_examples=5,
        model_name='EleutherAI/gpt-j-6B',
    )
    for extraction_layer in list_extraction_layer
]
config = configs[0]
len(configs)

1

In [16]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

sampling_kwargs = dict(
    temperature=config.temperature,
    top_p=config.top_p,
    freq_penalty=config.freq_penalty
)

# Load the model
print(">> Loading model...")
torch.set_grad_enabled(False)
model = HookedTransformer.from_pretrained(config.model_name, device=DEVICE)
model.eval();

# Logging folder for batch experiments
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
folder_path = os.path.join("resultsdata", timestamp)
os.makedirs(folder_path, exist_ok=True)

Using device: cpu
>> Loading model...


: 

In [7]:
# Generate and save unsteered results once
unsteered_res = hooked_generate(model, [config.prompt_to_steer] * config.n_steered_examples, seed=config.seed, **sampling_kwargs)
unsteered_str = model.to_string(unsteered_res[:, 1:])
unsteered_str

  0%|          | 0/50 [00:00<?, ?it/s]

['maximum: 0.0; minimum: 0.0; step: 1.0\n\nThe maximum and minimum values are in decimal notation, and the step is in decimal point notation (e.g., 2.5 ).\n\nA value of zero',
 'maximum:\n\n"The maximum number of consecutive integers that can be represented in a given array."\n\n\n"The maximum number of consecutive integers that can be represented in a given array." minimum:\n\n"The minimum number of consecutive integers that can be',
 'maximum:\n\n* Required Field * Name of the User (required) Email Address (required) Password (required) * Required Field * Password Confirm Password * Your password must be at least 8 characters long.\n\n\n* Please enter a valid email address',
 'maximum: 100;\n\nmax-width: 200px;\n\nmax-height: 300px;\n\n\n}\n\n\n.side .md h1 { font-size: 18px; }\n\n\n.side .md h2 { font',
 'maximum:\n\n- 1.5 million players (in the game)\n\n- 1.5 million players (in the game) Maximum play time: 2 hours, 30 minutes per day, or up to 3 hours per week\n\n\n2 hours,']

In [8]:
# Get the strings of all steering prompts
steering_prompts = load_steering_prompts(model, config.steering_prompts_path, add_padding=False) # (n_prompts) list of strings

# Tokenized steering prompts
print("Tokenized steering prompts:")
tokenized_prompts = [model.tokenizer.tokenize(prompt, add_special_tokens=True) for prompt in steering_prompts]
for prompt in tokenized_prompts:
    print("length: " + str(len(prompt)) + " : " + str(prompt) + " | token at index -1: " + str(prompt[-1]))

# Encoded steering prompts
print("Encoded steering prompts:")
tokenized_prompts = [model.tokenizer.encode_plus(prompt, add_special_tokens=True) for prompt in steering_prompts]
for prompt in tokenized_prompts:
    print("length: " + str(len(prompt)) + " : " + str(prompt) + " | token at index -1: " + str(prompt[-1]))

Tokenized steering prompts:
length: 16 : ['fl', 'awed', ':', 'perfect', ',', 'Ġorthodox', ':', 'un', 'orthodox', ',', 'Ġtrue', ':', 'false', ',', 'Ġdaily', ':'] | token at index -1: :
length: 18 : ['dist', 'ribution', ':', 'con', 'cent', 'ration', ',', 'Ġvalid', ':', 'in', 'valid', ',', 'Ġexpand', ':', 'contract', ',', 'Ġpractical', ':'] | token at index -1: :
length: 17 : ['priv', 'ilege', ':', 'dis', 'advant', 'age', ',', 'Ġmammoth', ':', 'tiny', ',', 'Ġunrelated', ':', 'related', ',', 'Ġovernight', ':'] | token at index -1: :
length: 14 : ['other', ':', 'same', ',', 'Ġsquare', ':', 'circle', ',', 'Ġhollow', ':', 'solid', ',', 'Ġdifficult', ':'] | token at index -1: :
length: 18 : ['lux', 'ury', ':', 'p', 'overty', ',', 'Ġstimulate', ':', 'in', 'hibit', ',', 'Ġproceed', ':', 'h', 'alt', ',', 'Ġfertile', ':'] | token at index -1: :
length: 20 : ['em', 'pir', 'ical', ':', 'the', 'oret', 'ical', ',', 'Ġpretty', ':', 'ug', 'ly', ',', 'Ġclinical', ':', 'em', 'otional', ',', 'Ġlucky', ':']

In [9]:
# store results
results = [
    {
        "steered": False,
        "input": config.prompt_to_steer,
        "output": unsteered_str[idx],
    }
    for idx in range(config.n_steered_examples)
]

for config in configs:
    print(f"\nlayer={config.extraction_layer}\n")

    # Calculate steering vector
    activations = extract_activations(model, steering_prompts, config.extraction_layer, device=DEVICE, token_indices=[-2])
    avg_activations = torch.mean(activations, dim=0) # (num_token_indices, num_activations)
    print("avg_activations shape: " + str(avg_activations.shape))

    # Add onto the model during forward passes
    ave_hook = generate_ave_hook_addition(steering_vector=avg_activations, token_index=-2)
    editing_hooks = [(f"blocks.{config.extraction_layer}.hook_resid_pre", ave_hook)]

    # Generate text using the conceptor-steered model
    steered_res = hooked_generate(model, [config.prompt_to_steer] * config.n_steered_examples, fwd_hooks=editing_hooks, seed=config.seed, **sampling_kwargs)
    outputs = model.to_string(steered_res[:, 1:])

    # save results to file
    for output in outputs:
        results.append({
            "steered": True,
            "input": config.prompt_to_steer,
            "output": output,
        })


layer=12

Starting to extract activations for 100 prompts at layer 12
avg_activations shape: torch.Size([1, 1600])


  0%|          | 0/50 [00:00<?, ?it/s]

changing torch.Size([5, 3, 1600]) from token_index: 2 with steering_vector: torch.Size([1, 1600])

layer=24

Starting to extract activations for 100 prompts at layer 24
avg_activations shape: torch.Size([1, 1600])


  0%|          | 0/50 [00:00<?, ?it/s]

changing torch.Size([5, 3, 1600]) from token_index: 2 with steering_vector: torch.Size([1, 1600])


In [14]:
# display results
pd.set_option('display.max_colwidth', 120)
pd.DataFrame(results)

Unnamed: 0,steered,layer,input,output
0,False,6,maximum:,"maximum: 0.0; minimum: 0.0; step: 1.0\n\nThe maximum and minimum values are in decimal notation, and the step is in ..."
1,False,6,maximum:,"maximum:\n\n""The maximum number of consecutive integers that can be represented in a given array.""\n\n\n""The maximum..."
2,False,6,maximum:,maximum:\n\n* Required Field * Name of the User (required) Email Address (required) Password (required) * Required F...
3,False,6,maximum:,maximum: 100;\n\nmax-width: 200px;\n\nmax-height: 300px;\n\n\n}\n\n\n.side .md h1 { font-size: 18px; }\n\n\n.side .m...
4,False,6,maximum:,"maximum:\n\n- 1.5 million players (in the game)\n\n- 1.5 million players (in the game) Maximum play time: 2 hours, 3..."
5,True,6,maximum:,maximum: The maximum number of items that can be added to a single list.\n\nmax-list: The maximum number of items th...
6,True,6,maximum:,"maximum:\n\n""The maximum number of items that can be in a single list. This is used to avoid recursion when creating..."
7,True,6,maximum:,"maximum:\n\nCALLING ALL BIKES!\n\nA FREE FESTIVAL OF RIDING & TAKING IN THE CITY!\n\n\nTHE FESTIVAL IS SOLD OUT, BUT..."
8,True,6,maximum:,maximum: 100\n\nmax_size: 0.1\n\nuse_transparent: 1\n\n\nI have created a python script that can be used to calculat...
9,True,6,maximum:,maximum: The maximum number of values that can be stored in a variable.\n\nmin: The minimum number of values that ca...
