In [None]:
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Run configuration: every value a reader might want to tune lives in this cell.
device = 'cuda'  # torch device string the model and the token tensors are moved to

# Directory of the run being scored. The model, input and output paths below
# are all derived from it so they cannot drift apart when switching runs.
run_dir = './random_char_seqs_70M_1/frac:16'
model_name = f'{run_dir}/model'

gpt2_tokenizer = True  # True: load the 'gpt2' tokenizer; False: load the tokenizer from `model_name`
model_precision = "float16"  # "float16" selects the half-precision loading branch; any other value uses the default load
max_length = 1024  # each tokenized sample is cut down to its last `max_length` tokens before scoring

input_fn = f'{run_dir}/samples.csv'  # CSV read into the samples DataFrame
output_fn = f'{run_dir}/scores.csv'  # CSV the per-sample losses are written to

In [None]:
# Choose where the tokenizer is loaded from: the stock 'gpt2' tokenizer when
# `gpt2_tokenizer` is set, otherwise whatever is stored under `model_name`.
tokenizer_source = 'gpt2' if gpt2_tokenizer else model_name
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

In [None]:
# Keyword arguments passed to `from_pretrained` in both precision modes.
load_kwargs = {'return_dict': True}
if model_precision == "float16":
    # Half-precision mode: additionally request the "float16" revision and
    # load the weights as torch.float16.
    load_kwargs.update(revision="float16", torch_dtype=torch.float16)

# Load the causal language model and move it to the configured device.
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs).to(device)

In [None]:
# Read the samples to be scored into a DataFrame.
df = pd.read_csv(filepath_or_buffer=input_fn)

# Display the first row as a quick look at the loaded columns
# (last expression of the cell, so the notebook renders it).
df.head(n=1)

In [None]:
# Open the scores file for writing and wrap it in a CSV writer.
# The csv module documentation requires files given to csv.writer to be opened
# with newline='': the writer emits its own '\r\n' row terminator, and on
# platforms that translate '\n' on write the default mode would turn that
# into '\r\r\n' (a blank line after every row).
# The handle is intentionally left open here; it is closed in a later cell
# after all rows have been written.
out_fh = open(output_fn, 'wt', newline='')
out = csv.writer(out_fh)

In [None]:
# Score every sample: one forward pass per DataFrame row, one CSV row written
# per sample as [group, used?, loss].

# Switching the model to eval mode does not depend on the row, so do it once
# before the loop instead of on every iteration.
model.eval()

for i, row in tqdm(df.iterrows(), total=len(df)):
    group, wm, used = row['group'], row['watermark'], row['used?']

    # Tokenize the text into a tensor on the target device. No truncation or
    # padding is requested here; the length limit is applied by the slice below.
    input_ids = tokenizer.encode(
        wm,
        return_tensors='pt',
        max_length=None,
        padding=False,
    ).to(device)

    # Keep only the last `max_length` positions along the second axis.
    input_ids = input_ids[:, -max_length:]

    # The inputs are also passed as labels so that `outputs.loss` is populated.
    # Gradients are not needed for scoring.
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    # Convert the loss to a Python float once and reuse it for both the
    # progress print and the CSV row (each .item() call copies from the device).
    loss_value = outputs.loss.item()

    # `i` is the DataFrame index label of the row; print a preview of the text
    # and its loss whenever that label is a multiple of 100.
    if i % 100 == 0:
        print(wm[:100], loss_value)

    out.writerow([group, used, loss_value])

In [None]:
# Close the scores file so that any rows still buffered are written to disk.
out_fh.close()