In [None]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
import pickle
import transformer_lens
from torch.optim import AdamW
from os.path import join
from tqdm.auto import tqdm
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_from_disk
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import argparse

import sys
sys.path.append('/home/leiyu/projects/def-yangxu/leiyu/circuit-discovery')
from dmc.circuit_gpt import *
from ioi_dataset import *

model_dir = '/home/leiyu/projects/def-yangxu/leiyu/LMs/'
model_name = 'gpt2-small'
n_ioi_data = 640

In [2]:
# path that stores gpt-small weights and gpt tokenizer
model_path = join(model_dir, model_name)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# load IOI data
# Note that there are overlaps between train and test sets, 
# due to the way IOIDataset is constructed (randomly sample N items)
ds = IOIGeneratorDataset(prompt_type="ABC", N=n_ioi_data, tokenizer=tokenizer)
ioi_ds = IOIFullModelDataset(prepare_ioi_data_for_clm(ds.ioi_prompts))

In [9]:
ioi_ds[0]

{'prompt': 'Then, Jeremy, Stephen and Ashley went to the restaurant. Stephen and Ashley gave a necklace to',
 'target good': ' Jeremy',
 'target bad': ' Stephen'}

In [13]:
BABA_TEMPLATES = [
    "Then, [B] and [A] went to the [PLACE]. [B] gave a [OBJECT] to ",
    "Then, [B] and [A] had a lot of fun at the [PLACE]. [B] gave a [OBJECT] to ",
    "Then, [B] and [A] were working at the [PLACE]. [B] decided to give a [OBJECT] to ",
    "Then, [B] and [A] were thinking about going to the [PLACE]. [B] wanted to give a [OBJECT] to ",
    "Then, [B] and [A] had a long argument, and afterwards [B] said to ",
    "After [B] and [A] went to the [PLACE], [B] gave a [OBJECT] to ",
    "When [B] and [A] got a [OBJECT] at the [PLACE], [B] decided to give it to ",
    "When [B] and [A] got a [OBJECT] at the [PLACE], [B] decided to give the [OBJECT] to ",
    "While [B] and [A] were working at the [PLACE], [B] gave a [OBJECT] to ",
    "While [B] and [A] were commuting to the [PLACE], [B] gave a [OBJECT] to ",
    "After the lunch, [B] and [A] went to the [PLACE]. [B] gave a [OBJECT] to ",
    "Afterwards, [B] and [A] went to the [PLACE]. [B] gave a [OBJECT] to ",
    "Then, [B] and [A] had a long argument. Afterwards [B] said to ",
    "The [PLACE] [B] and [A] went to had a [OBJECT]. [B] gave it to ",
    "Friends [B] and [A] found a [OBJECT] at the [PLACE]. [B] gave it to ",
]


NAMES = [
    "Michael",
    "Christopher",
    "Jessica",
    "Matthew",
    "Ashley",
    "Jennifer",
    "Joshua",
    "Amanda",
    "Daniel",
    "David",
    "James",
    "Robert",
    "John",
    "Joseph",
    "Andrew",
    "Ryan",
    "Brandon",
    "Jason",
    "Justin",
    "Sarah",
    "William",
    "Jonathan",
    "Stephanie",
    "Brian",
    "Nicole",
    "Nicholas",
    "Anthony",
    "Heather",
    "Eric",
    "Elizabeth",
    "Adam",
    "Megan",
    "Melissa",
    "Kevin",
    "Steven",
    "Thomas",
    "Timothy",
    "Christina",
    "Kyle",
    "Rachel",
    "Laura",
    "Lauren",
    "Amber",
    "Brittany",
    "Danielle",
    "Richard",
    "Kimberly",
    "Jeffrey",
    "Amy",
    "Crystal",
    "Michelle",
    "Tiffany",
    "Jeremy",
    "Benjamin",
    "Mark",
    "Emily",
    "Aaron",
    "Charles",
    "Rebecca",
    "Jacob",
    "Stephen",
    "Patrick",
    "Sean",
    "Erin",
    "Jamie",
    "Kelly",
    "Samantha",
    "Nathan",
    "Sara",
    "Dustin",
    "Paul",
    "Angela",
    "Tyler",
    "Scott",
    "Katherine",
    "Andrea",
    "Gregory",
    "Erica",
    "Mary",
    "Travis",
    "Lisa",
    "Kenneth",
    "Bryan",
    "Lindsey",
    "Kristen",
    "Jose",
    "Alexander",
    "Jesse",
    "Katie",
    "Lindsay",
    "Shannon",
    "Vanessa",
    "Courtney",
    "Christine",
    "Alicia",
    "Cody",
    "Allison",
    "Bradley",
    "Samuel",
]

PLACES = [
    "store",
    "garden",
    "restaurant",
    "school",
    "hospital",
    "office",
    "house",
    "station",
]
OBJECTS = [
    "ring",
    "kiss",
    "bone",
    "basketball",
    "computer",
    "necklace",
    "drink",
    "snack",
]


def nth_repl(s, sub, repl, n):
    find = s.find(sub)
    # If find is not -1 we have found at least one match for the substring
    i = find != -1
    # loop util we find the nth or we find no match
    while find != -1 and i != n:
        # find + 1 means we start searching from after the last match
        find = s.find(sub, find + 1)
        i += 1
    # If i is equal to n we found nth match so replace
    if i == n:
        return s[:find] + repl + s[find+len(sub):]
    return s


def multiple_replace(replacements, text):
    # Create a regular expression from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, replacements.keys())))
    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: replacements[mo.group()], text) 


# random.choice(BABA_TEMPLATES)
# random.sample(NAMES, 3)

In [17]:
import random
N = 1000

# re.sub(pattern, repl, string, count=0, flags=0)

for i in range(N):
    template = random.choice(BABA_TEMPLATES)
    na, nb, nc = random.sample(NAMES, 3)
    place, obj = random.choice(PLACES), random.choice(OBJECTS)

    input_clean = multiple_replace({'[A]': na, '[B]': nb, '[OBJECT]': obj, '[PLACE]': place}, template)
    input_corrupt = nth_repl(input_clean, nb, nc, 2)

    print(template)
    print(na, nb, nc)
    print(input_clean)
    print(input_corrupt)
    print()

When [B] and [A] got a [OBJECT] at the [PLACE], [B] decided to give it to 
Michael John Nicholas
When John and Michael got a drink at the house, John decided to give it to 
When John and Michael got a drink at the house, Nicholas decided to give it to 

Afterwards, [B] and [A] went to the [PLACE]. [B] gave a [OBJECT] to 
Ryan Christina Matthew
Afterwards, Christina and Ryan went to the school. Christina gave a ring to 
Afterwards, Christina and Ryan went to the school. Matthew gave a ring to 

While [B] and [A] were working at the [PLACE], [B] gave a [OBJECT] to 
Andrew Courtney Heather
While Courtney and Andrew were working at the garden, Courtney gave a basketball to 
While Courtney and Andrew were working at the garden, Heather gave a basketball to 

When [B] and [A] got a [OBJECT] at the [PLACE], [B] decided to give it to 
Megan Joseph Katie
When Joseph and Megan got a ring at the hospital, Joseph decided to give it to 
When Joseph and Megan got a ring at the hospital, Katie decide

In [20]:
# circuit_gpt initialization
device = torch.device('cuda')    
gpt_weights = torch.load(join(model_path, 'model_weights.pt')) 
circuit_gpt_config = CircuitGPTConfig(
    debug=False,
    use_weight_masks=False,
    use_edge_masks=False
)

circuit_gpt = CircuitGPT(circuit_gpt_config)
circuit_gpt.load_pretrained_weight(gpt_weights)
circuit_gpt.to(device);
circuit_gpt.eval();

In [22]:
circuit_gpt

CircuitGPT(
  (embed): Embed()
  (pos_embed): PosEmbed()
  (ln_final): LayerNorm()
  (unembed): Unembed()
  (blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (ln1): LayerNorm()
      (attn): Attention()
      (ln2): LayerNorm()
      (mlp): MLP()
    )
  )
)

In [31]:
from tqdm.auto import tqdm
from collections import OrderedDict
from typing import Dict, Callable
import torch

def remove_all_forward_hooks(model: torch.nn.Module) -> None:
    for name, child in model._modules.items():
        if child is not None:
            if hasattr(child, "_forward_hooks"):
                child._forward_hooks: Dict[int, Callable] = OrderedDict()
            remove_all_forward_hooks(child)
            

def make_output_hook(output_cache):
    def output_hook(module, inputs, outputs):
        # outputs_0 = untuple(outputs)
        print(outputs.shape)
        output_cache.append(outputs.detach().cpu()[0, -1])  # (n_child_node, h_dim)

    return output_hook
    

for _ in tqdm(range(1000)):

    # prepare inputs
    template = random.choice(BABA_TEMPLATES)
    na, nb, nc = random.sample(NAMES, 3)
    place, obj = random.choice(PLACES), random.choice(OBJECTS)

    prompt_clean = multiple_replace({'[A]': na, '[B]': nb, '[OBJECT]': obj, '[PLACE]': place}, template)
    prompt_corrupt = nth_repl(input_clean, nb, nc, 2)
    input_ids_clean = tokenizer(prompt_clean, return_tensors='pt')['input_ids']
    input_ids_corrupt = tokenizer(prompt_corrupt, return_tensors='pt')['input_ids']

    # register forward hooks to save resid stream outputs
    # forward_hooks = []
    # forward_output_dicts = {
    #     f'resid-{i}': []
    #     for i in range(12)
    # }
    # for i in range(12):
    #     forward_hook = circuit_gpt.blocks[i].register_forward_hook(
    #         make_output_hook(forward_output_dicts[f'resid-{i}'])
    #     )
    #     forward_hooks.append(forward_hook)

    with torch.no_grad():
        circuit_gpt(input_ids_clean.to(device))
        circuit_gpt(input_ids_corrupt.to(device))
        

    # for hook in forward_hooks:
    #     hook.remove()

    torch.cuda.empty_cache()

        

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [30]:
remove_all_forward_hooks(circuit_gpt)

In [32]:
input_ids_corrupt

tensor([[ 6423,    11, 39808,   290, 14328,   550,   257,   890,  4578,    11,
           290, 12979, 39808,   531,   284,   220]])