In [1]:
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [22]:
device = 'cuda'
model_name = '/home/ryan/haveibeentrainedon/models/160M/clean_model_1_epoch/clean_model_1_epoch_hf'
tokenizer_name = 'gpt2'
gpt2_tokenizer = False
model_precision = "float32"
max_length = 2048
input_fn = './non-perturbed_inputs.csv'
output_fn = f'./160M_clean_1_epoch/scores_clean_data:160m.csv'

In [24]:
if gpt2_tokenizer:
    tokenizer = AutoTokenizer.from_pretrained('gpt2', truncation_side="left", padding_side="left")
else:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, truncation_side="left", padding_side="left")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = max_length
print(tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='left', truncation_side='left', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)


In [4]:
if model_precision == "float16":
    model = AutoModelForCausalLM.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16,
                                                 return_dict=True).to(device)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name, return_dict=True).to(device)

In [5]:
print(torch.cuda.is_available())

True


In [5]:
df = pd.read_csv(input_fn)
df.head(1)

Unnamed: 0,example_index,text,sub_index,original,synonym
0,569297,More News\n\nDeep Purple have gone through man...,8504,nice,good


In [6]:
out_fh = open(output_fn, 'wt')
out = csv.writer(out_fh)

In [7]:
batch_size = 3

In [9]:
print(len(df))

90000


In [8]:
#To create a dictionary to store the tokenized form of each pair
from substitutions import tenk_word_pairs as word_pairs

word_idx_dict = dict()
for w1, w2 in word_pairs:
    word_idx_dict[w1] = tokenizer.encode(f' {w1}', return_tensors='pt')[0,0].item()
    word_idx_dict[w2] = tokenizer.encode(f' {w2}', return_tensors='pt')[0,0].item()

print(word_idx_dict)


{'nice': 3621, 'good': 922, 'size': 2546, 'proportion': 9823, 'way': 835, 'direction': 4571, 'small': 1402, 'little': 1310, 'guy': 3516, 'player': 2137, 'start': 923, 'begin': 2221, 'just': 655, 'quite': 2407, 'first': 717, 'initial': 4238, 'next': 1306, 'following': 1708, 'permit': 8749, 'allow': 1249, 'business': 1597, 'operation': 4905, 'monitor': 5671, 'track': 2610, 'big': 1263, 'huge': 3236, 'more': 517, 'greater': 3744, 'trouble': 5876, 'difficulty': 8722, 'return': 1441, 'exchange': 5163, 'lead': 1085, 'guide': 5698, 'very': 845, 'really': 1107, 'totally': 6635, 'absolutely': 5543, 'help': 1037, 'assist': 3342, 'area': 1989, 'location': 4067, 'idea': 2126, 'thought': 1807, 'device': 3335, 'equipment': 5112, 'equal': 4961, 'equivalent': 7548, 'subject': 2426, 'topic': 7243, 'man': 582, 'person': 1048, 'own': 898, 'hold': 1745, 'choose': 3853, 'pick': 2298, 'house': 2156, 'home': 1363, 'file': 2393, 'record': 1700, 'try': 1949, 'attempt': 2230, 'provide': 2148, 'supply': 5127, 'a

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='left', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)


In [9]:
rows = df.iloc[0:0+batch_size]
line_idx, sentence, char_idx, w1, w2 = rows['example_index'], \
                                        rows['text'], rows['sub_index'], rows['original'], rows['synonym']
# print(type(line_idx))
line_idx, char_idx = np.array(line_idx).astype(int), np.array(char_idx).astype(int)
print(line_idx)
print(sentence)
print(char_idx)
print(w1)
print(w2)


#TODO

#Tokenize the words using left-side truncation and max length padding,

sentence = np.array(sentence)
print([len(s) for s in sentence])

#this is the sentence that is chunked
sentence = [sentence[idx][:char_idx[idx]] for idx in range(len(sentence))]
print([len(s) for s in sentence])

tokenized = tokenizer(sentence, \
                      return_tensors='pt', \
                      padding="longest",
                      truncation=True).to(device)
print(tokenized)

with torch.no_grad():
    model.eval()
    outputs = model(**tokenized, labels=tokenized["input_ids"])
    print(outputs)


[ 569297  636216 1426647]
0    More News\n\nDeep Purple have gone through man...
1    I recently upgraded my main PC and passed some...
2    A natural choice for a nice juicy rib eye woul...
Name: text, dtype: object
[8504 1172   22]
0    nice
1    nice
2    nice
Name: original, dtype: object
0    good
1    good
2    good
Name: synonym, dtype: object
[12025, 4573, 2916]
[8504, 1172, 22]
{'input_ids': tensor([[  826,    30,    40,  ...,    13,  1026,   338],
        [50256, 50256, 50256,  ...,   318,   884,   257],
        [50256, 50256, 50256,  ...,  3572,   329,   257]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]], device='cuda:0')}
CausalLMOutputWithPast(loss=tensor(6.5525, device='cuda:0'), logits=tensor([[[ 9.2251, 10.7776,  1.9273,  ..., -1.5921, -1.9948, -2.2200],
         [ 8.1888,  9.2320,  5.2251,  ..., -0.6815, -1.6610, -2.8482],
         [ 7.8528,  9.9071,  3.7841,  ..., -2.6942, -2

In [25]:
print(outputs.keys())
print(outputs.logits.shape)

odict_keys(['loss', 'logits', 'past_key_values'])
torch.Size([3, 1024, 50304])


In [39]:
last_logits = outputs.logits[..., -1, :].contiguous().to("cpu")
probs = np.array(torch.nn.Softmax(dim=-1)(last_logits))

w1_idx = np.array([word_idx_dict[w1_temp] for w1_temp in w1])
w2_idx = np.array([word_idx_dict[w2_temp] for w2_temp in w2])

w1_prob = probs[np.arange(len(probs)), w1_idx]
w2_prob = probs[np.arange(len(probs)), w2_idx]
w1_rank = [(probs > w1_prob[idx]).sum() for idx in range(len(w1_prob))]
w2_rank = [(probs > w2_prob[idx]).sum() for idx in range(len(w2_prob))]

print(w1_prob)
print(w2_prob)
print(w1_rank)
print(w2_rank)

[0.00139385 0.03377483 0.00039977]
[0.00334721 0.05777897 0.00432825]
[267, 9, 894]
[121, 3, 94]


In [43]:
torch.cuda.empty_cache()

for i in tqdm(range(0, len(df), batch_size)):
    rows = df.iloc[i:i+batch_size]
    line_idx, sentence, char_idx, w1, w2 = rows['example_index'], \
                                            rows['text'], rows['sub_index'], rows['original'], rows['synonym']
    line_idx, char_idx = np.array(line_idx).astype(int), np.array(char_idx).astype(int)

    sentence = np.array(sentence)

    #this is the sentence that is chunked
    sentence = [sentence[idx][:char_idx[idx]] for idx in range(len(sentence))]

    tokenized = tokenizer(sentence, \
                          return_tensors='pt', \
                          padding="longest",
                          truncation=True).to(device)

    with torch.no_grad():
        model.eval()
        outputs = model(**tokenized, labels=tokenized["input_ids"])

    last_logits = outputs.logits[..., -1, :].contiguous().to("cpu")
    torch.cuda.empty_cache()

    probs = np.array(torch.nn.Softmax(dim=-1)(last_logits))

    w1_idx = np.array([word_idx_dict[w1_temp] for w1_temp in w1])
    w2_idx = np.array([word_idx_dict[w2_temp] for w2_temp in w2])

    w1_prob = probs[np.arange(len(probs)), w1_idx]
    w2_prob = probs[np.arange(len(probs)), w2_idx]
    w1_rank = np.array([(probs > w1_prob[idx]).sum() for idx in range(len(w1_prob))])
    w2_rank = np.array([(probs > w2_prob[idx]).sum() for idx in range(len(w2_prob))])

    if i % 100 == 0:
        print(w1[0], w2[0], w1_prob[0], w2_prob[0], w1_rank[0], w2_rank[0])

    out.writerows([line_idx, w1_prob, w2_prob, w1_rank, w2_rank])


#
#
# for i, row in tqdm(df.iterrows(), total=len(df)):
#     line_idx, sentence, char_idx, w1, w2 = row['example_index'], \
#                                             row['text'], row['sub_index'], row['original'], row['synonym']
#     line_idx, char_idx = int(line_idx), int(char_idx)
#
#     # get the first token of each word
#     w1_idx = tokenizer.encode(f' {w1}', return_tensors='pt')[0,0].item()
#     w2_idx = tokenizer.encode(f' {w2}', return_tensors='pt')[0,0].item()
#
#     input_ids = tokenizer.encode(sentence[:char_idx], \
#                                  return_tensors='pt', \
#                                  max_length=None, \
#                                  padding=False).to(device)
#     input_ids = input_ids[:,-max_length:]
#
#     with torch.no_grad():
#         model.eval()
#         outputs = model(input_ids, labels=input_ids)
#         loss = outputs.loss
#         logits = outputs.logits
#
#     # Get the loss at each token
#     last_logits = logits[..., -1, :].contiguous().squeeze(0)
#     probs = torch.nn.Softmax(dim=-1)(last_logits)
#
#     w1_prob = probs[w1_idx].item()
#     w2_prob = probs[w2_idx].item()
#     w1_rank = (probs > w1_prob).sum().item()
#     w2_rank = (probs > w2_prob).sum().item()
#
#     if i % 100 == 0:
#         print(w1, w2, w1_prob, w2_prob, w1_rank, w2_rank)
#
#     out.writerow([line_idx, w1_prob, w2_prob, w1_rank, w2_rank])


  0%|          | 0/30000 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 546.00 MiB (GPU 0; 10.92 GiB total capacity; 6.59 GiB already allocated; 526.38 MiB free; 7.14 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
out_fh.close()