In [1]:
from nnsight import LanguageModel
from dictionary_learning.dictionary import IdentityDict

import torch as t
import numpy as np
import gc

import math
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
import json
from evaluate_tasks import load_vl_data

from tokenizers.processors import TemplateProcessing


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = LanguageModel("../babylm_GIT/models2/base_git_1vd125_s1/epoch17/", torch_dtype=t.float16,
                      device_map="cuda")
hidden_size = model.config.hidden_size



In [8]:
model = LanguageModel("EleutherAI/pythia-70m-deduped", torch_dtype=t.float16,
                      device_map="cuda")
hidden_size = model.config.hidden_size

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(32778, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): GitEncoder(
      (layer): ModuleList(
        (0-11): 12 x GitLayer(
          (attention): GitAttention(
            (self): GitSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): GitIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
        )
      )
    )
    (image_encoder): 

In [4]:
submodules = {}
dictionaries = {}
for idx, layer in enumerate(model.git.encoder.layer):
    submodules[f"mlp.{idx}"] = layer.intermediate    # output of MLP
    dictionaries[submodules[f"mlp.{idx}"]] = IdentityDict(hidden_size)
    submodules[f"attn.{idx}"] = layer.attention  # output of attention
    dictionaries[submodules[f"attn.{idx}"]] = IdentityDict(hidden_size)
    submodules[f"resid.{idx}"] = layer      # output of whole layer
    dictionaries[submodules[f"resid.{idx}"]] = IdentityDict(hidden_size)

In [22]:
def load_examples(dataset, num_examples, model, seed=12, pad_to_length=None, length=None,
                  ignore_patch=False):
    examples = []
    dataset_items = open(dataset).readlines()

    for line in dataset_items:
        data = json.loads(line)
        clean_prefix = model.tokenizer(data["clean_prefix"], return_tensors="pt",
                                        padding=False).input_ids
        patch_prefix = model.tokenizer(data["patch_prefix"], return_tensors="pt",
                                        padding=False).input_ids
        clean_answer = model.tokenizer(data["clean_answer"], return_tensors="pt",
                                        padding=False).input_ids
        patch_answer = model.tokenizer(data["patch_answer"], return_tensors="pt",
                                        padding=False).input_ids
        # remove BOS tokens from answers
        clean_answer = clean_answer[clean_answer != model.tokenizer.bos_token_id].unsqueeze(0)
        patch_answer = patch_answer[patch_answer != model.tokenizer.bos_token_id].unsqueeze(0)
        # only keep examples where answers are single tokens
        if not ignore_patch:
            if clean_prefix.shape[1] != patch_prefix.shape[1]:
                continue
        # only keep examples where clean and patch inputs are the same length
        if clean_answer.shape[1] != 1 or patch_answer.shape[1] != 1:
            continue
        # if we specify a `length`, filter examples if they don't match
        if length and clean_prefix.shape[1] != length:
            continue
        # if we specify `pad_to_length`, left-pad all inputs to a max length
        prefix_length_wo_pad = clean_prefix.shape[1]
        if pad_to_length:
            model.tokenizer.padding_side = 'right'
            pad_length = pad_to_length - prefix_length_wo_pad
            if pad_length < 0:  # example too long
                continue
            # left padding: reverse, right-pad, reverse
            clean_prefix = t.flip(F.pad(t.flip(clean_prefix, (1,)), (0, pad_length), value=model.tokenizer.pad_token_id), (1,))
            patch_prefix = t.flip(F.pad(t.flip(patch_prefix, (1,)), (0, pad_length), value=model.tokenizer.pad_token_id), (1,))
        
        print(clean_answer)
        example_dict = {"clean_prefix": clean_prefix,
                        "patch_prefix": patch_prefix,
                        "clean_answer": clean_answer.item(),
                        "patch_answer": patch_answer.item(),
                        "prefix_length_wo_pad": prefix_length_wo_pad,}
        examples.append(example_dict)
        if len(examples) >= num_examples:
            break

    return examples


data_path = "data/simple_subject_verb_agreement.json"
ignore_patch = True
num_examples = 100
pad_length = 32

examples = load_examples(data_path, num_examples, model, pad_to_length=pad_length,
                                     ignore_patch=False)


tensor([[[[52,  1]]]])


RuntimeError: a Tensor with 2 elements cannot be converted to Scalar

In [24]:
import torch.nn.functional as F

def load_vqa_examples(model, pad_to_length, n_samples):
    samples = load_vl_data(task="vqa", n_samples=n_samples)
    examples = []
    for sample_id, sample in samples.items():
        clean_prefix = model.tokenizer(sample["question"], return_tensors="pt",
                                        padding=False).input_ids
        clean_answer = model.tokenizer(sample["multiple_choice_answer"], return_tensors="pt",
                                        padding=False).input_ids
        # remove BOS tokens from answers
        clean_answer = clean_answer[clean_answer != model.tokenizer.bos_token_id].unsqueeze(0)
        # TBD: skipping
        # only keep examples where answers are single tokens
        if clean_answer.shape[1] != 1:
            continue
        # only keep examples where clean and patch inputs are the same length

        # if we specify `pad_to_length`, left-pad all inputs to a max length
        prefix_length_wo_pad = clean_prefix.shape[1]
        if pad_to_length:
            model.tokenizer.padding_side = 'right'
            pad_length = pad_to_length - prefix_length_wo_pad
            if pad_length < 0:  # example too long
                continue
            # left padding: reverse, right-pad, reverse
            clean_prefix = t.flip(F.pad(t.flip(clean_prefix, (1,)), (0, pad_length), value=model.tokenizer.pad_token_id), (1,))

        example_dict = {"clean_prefix": clean_prefix,
                            "clean_answer": clean_answer, # .item()
                            "question_type": sample["question_type"],
                            "prefix_length_wo_pad": prefix_length_wo_pad,
                            "image": sample["image"]}
        examples.append(example_dict)
        if len(examples) >= n_samples:
            break
        
    return examples


vqa_examples = load_vqa_examples(model, pad_to_length=32, n_samples=32)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
100%|██████████| 32/32 [00:00<00:00, 182.49it/s]


In [25]:
vqa_examples[0]

{'clean_prefix': tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0,   0,   0,  38,  27,   9, 169, 205,  23,
            9, 753,  17,   1]]),
 'clean_answer': tensor([[[[1228,    1]]]]),
 'question_type': 'what is the man',
 'prefix_length_wo_pad': 10,
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480>}

In [14]:
from nnsight import NNsight
#nnsight_model = NNsight(model)

In [33]:
from transformers import AutoProcessor

model_path = "../babylm_GIT/models2/base_git_1vd125_s1/epoch17/"
nnsight_model = NNsight(model_path,
                      device_map="auto")
img_processor = AutoProcessor.from_pretrained(
                        model_path,
                        trust_remote_code=True
                    )

In [34]:
sample1 = vqa_examples[0]
pixel_values = img_processor(images=sample1["image"].convert(mode="RGB"), return_tensors="pt")["pixel_values"]

with nnsight_model.trace(sample1["clean_prefix"], pixel_values=pixel_values, scan=False, validate=False):
    logits = nnsight_model.output

#print(logits.value)

AttributeError: 'GitModel' object has no attribute 'git'

In [11]:
tracer_kwargs = {'validate' : False, 'scan' : False}

# Attribution patching with integrated gradients
def _pe_ig(
        clean,
        patch,
        model,
        submodules,
        dictionaries,
        metric_fn,
        steps=10,
        metric_kwargs=dict(),
):
    
    # first run through a test input to figure out which hidden states are tuples
    is_tuple = {}
    with model.trace("_"):
        for submodule in submodules:
            is_tuple[submodule] = type(submodule.output.shape) == tuple

    # clean run -> model can be approximated through linear function of its activations
    hidden_states_clean = {}
    with model.trace(clean, **tracer_kwargs), t.no_grad():
        for submodule in submodules:
            dictionary = dictionaries[submodule]
            x = submodule.output
            if is_tuple[submodule]:
                x = x[0]
            f = dictionary.encode(x)
            x_hat = dictionary.decode(f)
            residual = x - x_hat
            hidden_states_clean[submodule] = f.save()
        metric_clean = metric_fn(model, **metric_kwargs).save()
    hidden_states_clean = {k : v.value for k, v in hidden_states_clean.items()}

    # corrupted run
    if patch is None:
        hidden_states_patch = {
            k : t.zeros_like(v.act) for k, v in hidden_states_clean.items()
        }
        total_effect = None
    else:
        hidden_states_patch = {}
        with model.trace(patch, **tracer_kwargs), t.no_grad():
            for submodule in submodules:
                dictionary = dictionaries[submodule]
                x = submodule.output
                if is_tuple[submodule]:
                    x = x[0]
                f = dictionary.encode(x)
                x_hat = dictionary.decode(f)
                residual = x - x_hat
                hidden_states_patch[submodule] = f.save()
            metric_patch = metric_fn(model, **metric_kwargs).save()
        total_effect = (metric_patch.value - metric_clean.value).detach()
        hidden_states_patch = {k : v.value for k, v in hidden_states_patch.items()}

    effects = {}
    deltas = {}
    grads = {}
    for submodule in submodules:
        dictionary = dictionaries[submodule]
        clean_state = hidden_states_clean[submodule]
        patch_state = hidden_states_patch[submodule]
        with model.trace(**tracer_kwargs) as tracer:
            metrics = []
            fs = []
            for step in range(steps):
                alpha = step / steps
                f = (1 - alpha) * clean_state + alpha * patch_state
                f.retain_grad()
                fs.append(f)
                with tracer.invoke(clean, scan=tracer_kwargs['scan']):
                    if is_tuple[submodule]:
                        submodule.output[0][:] = dictionary.decode(f)
                    else:
                        submodule.output = dictionary.decode(f)
                    metrics.append(metric_fn(model, **metric_kwargs))
            metric = sum([m for m in metrics])
            metric.sum().backward(retain_graph=True) # TODO : why is this necessary? Probably shouldn't be, contact jaden

        mean_grad = sum([f.grad for f in fs]) / steps
        # mean_residual_grad = sum([f.grad for f in fs]) / steps
        grad = mean_grad
        delta = (patch_state - clean_state).detach() if patch_state is not None else -clean_state.detach()
        effect = t.mul(grad, delta)

        effects[submodule] = effect
        deltas[submodule] = delta
        grads[submodule] = grad

    return (effects, deltas, grads, total_effect)

In [8]:


# Experiment hyperparameters
batch_size = 2
num_examples = 100
device = "cuda"
num_examples = min([num_examples, len(examples)])
n_batches = math.ceil(len(examples) / batch_size)
batches = [
    examples[batch*batch_size:(batch+1)*batch_size] for batch in range(n_batches)
]
sum_effects = {}

# Lists of submodules, separated by type
resids = [submodules[submodule] for submodule in submodules if submodule.startswith("resid")]
mlps = [submodules[submodule] for submodule in submodules if submodule.startswith("mlp")]
attns = [submodules[submodule] for submodule in submodules if submodule.startswith("attn")]

# Loop through batches, run attribution patching
for batch in tqdm(batches):
    clean_answer_idxs = t.tensor([e['clean_answer'] for e in batch], dtype=t.long, device=device)
    clean_inputs = t.cat([e['clean_prefix'] for e in batch], dim=0).to(device)

    patch_answer_idxs = t.tensor([e['patch_answer'] for e in batch], dtype=t.long, device=device)
    patch_inputs = t.cat([e['patch_prefix'] for e in batch], dim=0).to(device)
    def metric_fn(model):
        # This is basically `p(patch_answer) - p(clean_answer)`
        return (
            t.gather(model.embed_out.output[:,-1,:], dim=-1, index=patch_answer_idxs.view(-1, 1)).squeeze(-1) - \
            t.gather(model.embed_out.output[:,-1,:], dim=-1, index=clean_answer_idxs.view(-1, 1)).squeeze(-1)
        )

    # Here, we're only looking at the MLP neurons
    effects, _, _, _ = _pe_ig(
        clean_inputs,
        patch_inputs,
        model,
        mlps,
        dictionaries,
        metric_fn
    )
    for submodule in mlps:
        if submodule not in sum_effects:
            sum_effects[submodule] = effects[submodule].sum(dim=1).sum(dim=0)
        else:
            sum_effects[submodule] += effects[submodule].sum(dim=1).sum(dim=0)

# Print top and bottom k neurons in each submodule
k = 10

print("positive effects")
for idx, submodule in enumerate(mlps):
    sum_effects[submodule] /= num_examples
    print(f"mlp_{idx}")
    v, i = t.topk(sum_effects[submodule].flatten(), k)  # v=top effects, i=top indices
    print(i)
    print(v)
    print()

print("negative effects")
for idx, submodule in enumerate(mlps):
    print(f"mlp_{idx}")
    v, i = t.topk(sum_effects[submodule].flatten(), k, largest=False)
    print(i)
    print(v)
    print()

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 50/50 [02:58<00:00,  3.56s/it]  

positive effects
mlp_0
tensor([335,  81,  17, 404, 127, 410, 195, 191, 248, 403], device='cuda:0')
tensor([0.1709, 0.1609, 0.1528, 0.1281, 0.1266, 0.1201, 0.1144, 0.1092, 0.0952,
        0.0927], device='cuda:0', dtype=torch.float16)

mlp_1
tensor([  6, 156, 132,  11, 508,  34, 121,  25, 366, 436], device='cuda:0')
tensor([0.0906, 0.0804, 0.0784, 0.0691, 0.0662, 0.0516, 0.0513, 0.0474, 0.0469,
        0.0466], device='cuda:0', dtype=torch.float16)

mlp_2
tensor([156, 499, 157, 260,   4,  52,  12,  11, 127,  49], device='cuda:0')
tensor([0.2485, 0.0669, 0.0657, 0.0558, 0.0509, 0.0498, 0.0497, 0.0475, 0.0448,
        0.0416], device='cuda:0', dtype=torch.float16)

mlp_3
tensor([111, 156,  91, 510, 117,  81, 508, 308, 351, 478], device='cuda:0')
tensor([0.2539, 0.1236, 0.1036, 0.0912, 0.0862, 0.0801, 0.0790, 0.0773, 0.0635,
        0.0583], device='cuda:0', dtype=torch.float16)

mlp_4
tensor([156,  23, 229, 478, 210, 329, 281, 299, 340, 122], device='cuda:0')
tensor([0.1271, 0.0833, 0.083


