In [2]:
import csv
import sys

import numpy as np
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

MIN_TRANSFORMERS_VERSION = "4.25.1"

# check transformers version
assert (
    transformers.__version__ >= MIN_TRANSFORMERS_VERSION
), f"Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher."

root_dir = "/cluster/home/kamara/syntax-shap/shap2"
sys.path.append(root_dir)

In [3]:
model_name = "gpt2"#"mistralai/Mistral-7B-v0.1"
# model = GPT2LMHeadModel.from_pretrained("gpt2") #
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, is_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# set model decoder to true
model.config.is_decoder = True
# set text-generation params under task_specific_params
model.config.task_specific_params["text-generation"] = {
    "do_sample": True,
    "max_new_tokens": 1,
    #"temperature": 0.7,
    #"top_k": 50,
    #"no_repeat_ngram_size": 2,
}



In [4]:
from models import TextGeneration
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lmmodel = TextGeneration(model, tokenizer, device=device)

In [5]:
github_local_data_url = "/cluster/home/kamara/syntax-shap/data/"
tsv_file = open(github_local_data_url + "Inconsistent-Dataset-Negation.tsv")
read_tsv = list(csv.reader(tsv_file, delimiter="\t"))
data = []
for row in read_tsv:
    data.append(row[1][:-8])
data = np.array(data)

In [17]:
lmmodel, lmmodel.tokenizer, links.identity, True, *row_args

(<models._text_generation.TextGeneration at 0x2aaf5d05d850>,
 GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
 	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
 },
 CPUDispatcher(<function identity at 0x2aaf86a548b0>),
 True,
 'A brother is not a')

In [60]:
from utils import MaskedModel
import links
from maskers import Text

def run_model(row_args, mask, pipeline):
    masker = Text(pipeline.tokenizer)
    fm = MaskedModel(pipeline, masker, links.identity, True, *row_args)
    if mask is None:
        mask = np.ones(len(fm), dtype=bool)
    mask = np.array(mask, dtype=bool)
    pred = fm(mask.reshape(1, -1))[0]
    probs = fm.probs
    return pred, probs

In [89]:
from utils import MaskedModel
import links
from maskers import Text

row_args = ['A brother is not a','A girl is a', 'A girl is not a']
masks = [np.array([0,0,1,1,0]), np.array([1,0,0,1]), np.array([1,1,0,1,1])]

masker = Text(lmmodel.tokenizer)
fm = MaskedModel(lmmodel, masker, links.identity, True, *row_args)

TypeError: invariants() takes 2 positional arguments but 4 were given

In [27]:
pred_keep = fm(mask.reshape(1, -1))[0]
pred_keep, fm.probs[fm.probs>0]


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


mask [False False  True  True  True]
row_args ('A brother is not a',)
inputs {'input_ids': tensor([[986, 318, 407, 257]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
inputs on device cpu
inner model on device cpu


(array([2372.]),
 array([0.02595965, 0.00901051, 0.01103739, 0.03134582, 0.00896156,
        0.05045101, 0.16056432, 0.03414702, 0.01330023, 0.0353783 ,
        0.00868273, 0.02647327, 0.01086378, 0.01837882, 0.0151232 ,
        0.016124  , 0.01603408, 0.00792033, 0.01162166, 0.00955673,
        0.00860327, 0.00860656, 0.00832599, 0.03357251, 0.01761645,
        0.00852578, 0.04587767, 0.02127966, 0.0281394 , 0.03995908,
        0.00903902, 0.00863741, 0.02917008, 0.00831799, 0.03344595,
        0.02462728, 0.00825887, 0.0080195 , 0.01016013, 0.00841784,
        0.0078791 , 0.01198929, 0.00878012, 0.00840533, 0.01120804,
        0.00871405, 0.02027439, 0.01024521, 0.0110389 , 0.01193071],
       dtype=float32))

In [28]:
pred_rmv = fm(~mask.reshape(1, -1))[0]
pred_rmv, fm.probs[fm.probs>0]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


mask [ True  True False False False]
row_args ('A brother is not a',)
inputs {'input_ids': tensor([[  32, 3956, 2644]]), 'attention_mask': tensor([[1, 1, 1]])}
inputs on device cpu
inner model on device cpu


(array([257.]),
 array([0.05008946, 0.11171316, 0.04308339, 0.01780186, 0.02406401,
        0.02517119, 0.01266759, 0.13250385, 0.03596979, 0.01245437,
        0.05560792, 0.00801927, 0.00890699, 0.00555699, 0.03487589,
        0.00836223, 0.01562784, 0.01078766, 0.00557194, 0.01027145,
        0.00993987, 0.03155802, 0.00900408, 0.01266866, 0.00903098,
        0.00592722, 0.01721615, 0.0268929 , 0.01440726, 0.00696987,
        0.08031248, 0.01082732, 0.01379489, 0.00812799, 0.00921402,
        0.01028549, 0.00534044, 0.00841574, 0.00567698, 0.00522107,
        0.00800845, 0.00582852, 0.00572168, 0.00704719, 0.00918054,
        0.00510448, 0.00592406, 0.00517234, 0.01058441, 0.00749008],
       dtype=float32))

In [30]:
row_args = np.array(['A brother is not a', 'A girl is not a'])
mask = np.array([[False, False, False, True, False], [False, False, True, True, True]])

In [31]:
fm = MaskedModel(lmmodel, masker, links.identity, True, *row_args)
fm

TypeError: invariants() takes 2 positional arguments but 3 were given

In [20]:
fm(np.array(mask).reshape(1, -1))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


mask [False False False  True False]
row_args ('A brother is not a',)
inputs {'input_ids': tensor([[ 986,  407, 2644]]), 'attention_mask': tensor([[1, 1, 1]])}
inputs on device cpu
inner model on device cpu


array([[475.]])