In [1]:
%load_ext autoreload
%autoreload 2

import csv
import pickle
import random
import re
from functools import partial

import torch
from huggingface_hub import hf_hub_download
#from nnsight.models import UnifiedTransformer
from transformer_lens import HookedTransformer, HookedTransformerConfig

from graph import Graph
from circuit_loading import load_graph_from_json, load_graph_from_pt
from dataset import EAPDataset, HFEAPDataset
from attribute import attribute
from metrics import get_metric
from evaluate_graph import evaluate_graph, evaluate_baseline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the InterpBench low-level model for `task_name` from the Hugging Face Hub.
task_name = "ioi"

# Use the GPU when one is present; otherwise fall back to CPU so the cell still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Download (or reuse the cached copy of) the pickled config and the weights.
hf_cfg = hf_hub_download("cybershiptrooper/InterpBench", subfolder=task_name, filename="ll_model_cfg.pkl")
hf_model = hf_hub_download("cybershiptrooper/InterpBench", subfolder=task_name, filename="ll_model.pth")

# NOTE(security): unpickling can execute arbitrary code -- only load configs from
# a repository you trust. The context manager closes the file handle promptly.
with open(hf_cfg, "rb") as cfg_file:
    cfg_dict = pickle.load(cfg_file)

if isinstance(cfg_dict, dict):
    cfg = HookedTransformerConfig.from_dict(cfg_dict)
else:
    # Some cases in InterpBench have the config as a HookedTransformerConfig object instead of a dict
    assert isinstance(cfg_dict, HookedTransformerConfig)
    cfg = cfg_dict
cfg.device = device

if "ioi" in task_name:
    # Small hack to enable evaluation mode in the IOI model, that has a different config during training
    cfg.use_hook_mlp_in = True
    cfg.use_attn_result = True
    cfg.use_split_qkv_input = True

model = HookedTransformer(cfg)
# Left as the last expression so the key-matching report is displayed as the cell output.
model.load_state_dict(torch.load(hf_model, map_location=device))



<All keys matched successfully>

In [2]:
# Load pretrained GPT-2 small into a HookedTransformer.
# NOTE(review): this rebinds `model`, which the InterpBench-loading cell also assigns;
# whichever of the two cells ran last decides what every later cell operates on.
# Execution counts are out of order here -- confirm which model is intended.
model = HookedTransformer.from_pretrained("gpt2-small")



Loaded pretrained model gpt2-small into HookedTransformer


In [4]:
# Sanity check: encode a name with a leading space and inspect the resulting token ids
# (the recorded output shows a single id for " Mary").
model.tokenizer.encode(" Mary")

[5335]

In [4]:
# Rewrite "ioi-gpt2.csv" as "ioi-interpbench.csv", recomputing the correct/incorrect
# answer token ids with the tokenizer of the currently loaded `model`.
#
# For each row, the two candidate names are taken to be the 2nd and 4th
# whitespace-separated words of the clean prompt. The name that occurs more than
# once is treated as the incorrect answer and the other one as the correct answer.
#
# Both files are opened with newline="" as the csv module requires, so embedded
# newlines round-trip and no extra "\r" is written on Windows.
with open("ioi-gpt2.csv", "r", newline="") as ioi_data, open("ioi-interpbench.csv", "w", newline="") as ioi_out:
    reader = csv.reader(ioi_data)
    writer = csv.writer(ioi_out)
    writer.writerow(["", "clean", "corrupted", "corrupted_hard", "correct_idx", "incorrect_idx"])
    next(reader, None)  # skip the input header row; tolerate an empty input file
    for row in reader:
        # The last two input columns (the old token ids) are discarded and recomputed.
        _idx, clean, corrupted, corrupted_hard, _, _ = row
        words = clean.split()
        name1 = words[1]
        name2 = words[3]
        # Count whole-word occurrences only: plain str.count would also count a name
        # embedded in a longer word (e.g. "Ann" inside "Anna").
        name1_count = len(re.findall(rf"(?<!\w){re.escape(name1)}(?!\w)", clean))
        name2_count = len(re.findall(rf"(?<!\w){re.escape(name2)}(?!\w)", clean))
        if name1_count > 1:
            correct_name = name2
            incorrect_name = name1
        elif name2_count > 1:
            correct_name = name1
            incorrect_name = name2
        else:
            raise Exception(f"Can't find correct name among {name1} and {name2}")
        # Names are encoded with a leading space; only the first token id is kept.
        correct_idx = model.tokenizer.encode(f" {correct_name}")[0]
        incorrect_idx = model.tokenizer.encode(f" {incorrect_name}")[0]
        writer.writerow([_idx, clean, corrupted, corrupted_hard, correct_idx, incorrect_idx])

In [6]:
# Ground-truth circuit loaded from its JSON description.
reference_graph = load_graph_from_json("interpbench/ioi_groundtruth.json")

# Graph built from the currently loaded model.
hypothesis_graph = Graph().from_model(model)

In [7]:
# IOI dataset from the Hugging Face Hub, tokenized with the model's tokenizer.
dataset = HFEAPDataset(
    "danaarad/ioi_dataset",
    model.tokenizer,
    task="ioi",
)

# Batches of 8 examples.
dataloader = dataset.to_dataloader(8)

# Metric used to drive attribution on the IOI task.
metric_fn = get_metric(
    "prob_diff",
    "ioi",
    model.tokenizer,
    model,
)

Filter: 100%|██████████| 100/100 [00:00<00:00, 12527.42 examples/s]


In [8]:
# Peek at a single example. The recorded output is a 3-tuple: two prompt strings and a
# nested list of token ids (presumably clean prompt, corrupted prompt, and
# [correct, incorrect] label ids -- TODO confirm against the dataset class).
dataset[5]

('Then, Amber and Bradley were thinking about going to the house. Amber wanted to give a drink to',
 'Then, Amber and Bradley were thinking about going to the house. Bradley wanted to give a drink to',
 [[16182], [21896]])

In [9]:
# Rebuild the hypothesis graph from the model, then run attribution on it
# with the default settings.
hypothesis_graph = Graph().from_model(model)
attribute(
    model,
    hypothesis_graph,
    dataloader,
    metric_fn,
)

100%|██████████| 125/125 [00:08<00:00, 15.33it/s]
100%|██████████| 32491/32491 [00:00<00:00, 379228.09it/s]


In [10]:
# Second graph, attributed with the integrated-gradients option set to 30
# (the recorded run is markedly slower than the default attribution).
hypothesis_graph_ig = Graph().from_model(model)
attribute(
    model,
    hypothesis_graph_ig,
    dataloader,
    metric_fn,
    integrated_gradients=30,
)

100%|██████████| 125/125 [02:22<00:00,  1.14s/it]
100%|██████████| 32491/32491 [00:00<00:00, 364502.11it/s]


In [10]:
# Show only the 20 highest-scoring edges of the integrated-gradients graph.
# Displaying the full sorted list would dump tens of thousands of edges into the
# cell output and bloat the notebook.
sorted(hypothesis_graph_ig.edges.values(), key=lambda edge: edge.score, reverse=True)[:20]

[Edge(m0->logits, score: 0.005371724721044302, in_graph: True),
 Edge(a1.h1->m4, score: 0.0015480746515095234, in_graph: True),
 Edge(m3->m4, score: 0.001099234912544489, in_graph: True),
 Edge(m0->m5, score: 0.000999135198071599, in_graph: True),
 Edge(m0->a4.h3<v>, score: 0.0008317457395605743, in_graph: True),
 Edge(m2->m4, score: 0.0006441010627895594, in_graph: True),
 Edge(a1.h0->m4, score: 0.0004793550760950893, in_graph: True),
 Edge(m0->a1.h2<v>, score: 0.00045301244244910777, in_graph: True),
 Edge(m2->m5, score: 0.00044858353794552386, in_graph: True),
 Edge(a2.h0->a4.h0<v>, score: 0.00044200217234902084, in_graph: True),
 Edge(a0.h3->logits, score: 0.00043037495925091207, in_graph: True),
 Edge(a2.h2->a4.h0<v>, score: 0.0004138057411182672, in_graph: True),
 Edge(a2.h3->a5.h0<v>, score: 0.0004055552708450705, in_graph: True),
 Edge(m3->logits, score: 0.00040196688496507704, in_graph: True),
 Edge(a2.h1->a4.h0<v>, score: 0.00039761303924024105, in_graph: True),
 Edge(a2.h3->

In [11]:
# Serialize both attributed graphs to JSON, vanilla first and then integrated gradients.
for _graph, _json_path in (
    (hypothesis_graph, "circuits/ioi_prob_diff_vanilla_gpt2.json"),
    (hypothesis_graph_ig, "circuits/ioi_prob_diff_ig_gpt2.json"),
):
    _graph.to_json(_json_path)

In [10]:
# Random baseline circuit: a graph built from the model whose edge scores are
# drawn uniformly from [-1.0, 1.0].
random_graph = Graph().from_model(model)

# A dedicated, fixed-seed generator makes the baseline reproducible across
# Restart & Run All without touching the global `random` state.
score_rng = random.Random(0)
for edge in random_graph.edges.values():
    edge.score = score_rng.uniform(-1.0, 1.0)

random_graph.to_json("circuits/ioi_random_interpbench.json")