In [1]:
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# --- Run configuration: everything a reader might want to tune ---

# Input samples and where the per-sample scores are written.
input_fn = './data/frac:1/samples.csv'
output_fn = './scores/scores:1.csv'

# Model checkpoint to score with, and how it is loaded.
model_name = './data/frac:1/70M'
model_precision = "float16"  # "float16" loads half-precision weights; anything else uses the default load
gpt2_tokenizer = True        # True: use the stock 'gpt2' tokenizer instead of the one at model_name

# Runtime settings.
device = 'cuda'
max_length = 1024            # inputs are cut down to their last `max_length` tokens before scoring

In [3]:
# Pick the tokenizer source: the stock 'gpt2' vocabulary when requested,
# otherwise whatever tokenizer ships alongside the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained('gpt2' if gpt2_tokenizer else model_name)

In [4]:
# Assemble the loading options once; the half-precision run only adds the
# float16 revision and dtype on top of the common arguments.
load_kwargs = {"return_dict": True}
if model_precision == "float16":
    load_kwargs.update(revision="float16", torch_dtype=torch.float16)

# Load the causal LM and move it to the configured device.
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs).to(device)

2023-11-20 19:27:30.765424: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [5]:
# Read the samples to score; the scoring loop below uses the
# 'group', 'watermark' and 'used?' columns of this frame.
df = pd.read_csv(filepath_or_buffer=input_fn)
df.head(n=1)

Unnamed: 0,group,watermark,used?,bits
0,0,= Valkyria Chronicles III = \n Senjō no Valky...,True,42


In [6]:
# Open the score file for writing and wrap it in a CSV writer.
# newline='' lets the csv module control line endings itself, as its docs
# require; the watermark fields contain embedded newlines, so platform newline
# translation would otherwise corrupt rows. An explicit UTF-8 encoding keeps the
# non-ASCII watermark text portable and matches the default encoding that
# pd.read_csv uses when the file is read back.
out_fh = open(output_fn, 'wt', newline='', encoding='utf-8')
out = csv.writer(out_fh)

In [7]:
# Score every sample: compute the model's loss on each watermark text and
# append one CSV row (group, watermark, used?, loss) per sample.

# Inference only: switch to eval mode once, rather than on every iteration.
model.eval()

for i, row in tqdm(df.iterrows(), total=len(df)):
    group, wm, used = row['group'], row['watermark'], row['used?']

    # Tokenize without tokenizer-side truncation or padding. The tokenizer may
    # warn about over-long sequences; they are cut down just below, before the
    # model ever sees them.
    input_ids = tokenizer.encode(
        wm,
        return_tensors='pt',
        max_length=None,
        padding=False,
    ).to(device)

    # Keep only the trailing `max_length` tokens of the sequence.
    input_ids = input_ids[:, -max_length:]

    with torch.no_grad():
        # Using the inputs as their own labels makes the model return a loss
        # for the sequence; convert it to a Python float once and reuse it.
        loss = model(input_ids, labels=input_ids).loss.item()

    if i % 100 == 0:
        print(wm[:100], loss)
        # Push buffered rows to disk periodically so an interrupted run
        # does not lose the scores computed so far.
        out_fh.flush()

    out.writerow([group, wm, used, loss])

  0%|          | 0/25553 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4603 > 1024). Running this sequence through the model will result in indexing errors


 = Valkyria Chronicles III = 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3  3.92578125
 = Valkyria Chronicles III = 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3  3.5625
 = Tower Building of the Little Rock Arsenal = 
 The Tower Building of the Little Rock Arsenal , als 4.33203125
 = Cicely Mary Barker = 
 Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illu 2.744140625
 = Gambia women 's national football tеam = 
 The Gambia women 's national football tеam represents  4.515625



KeyboardInterrupt



In [None]:
# Close the score file so every buffered row reaches disk before the file is
# read back for analysis.
out_fh.close()

In [None]:
# The score file was written without a header row, so read it headerless and
# attach the column names in the order the rows were written.
score_columns = ['group', 'watermark', 'used?', 'loss']
df = pd.read_csv(output_fn, header=None).set_axis(score_columns, axis='columns')
df.head(n=1)

In [None]:
# Per group: compare the loss of the group's first row against the losses of
# the remaining rows, and report the fraction of those that are strictly larger.
# NOTE(review): assumes the first row of each group is the reference sample;
# the 'used?' column is not consulted here - TODO confirm the row ordering.
for group_id, group_rows in df.groupby('group'):
    reference = group_rows.iloc[0]
    test_statistic = reference['loss']
    others = group_rows.iloc[1:]
    exceeds_reference = others.loss > test_statistic
    p = np.mean(exceeds_reference)
    # Printed: group key, fraction, number of comparison rows, reference loss,
    # and character length of the reference watermark text.
    print(group_id, p, len(others), test_statistic, len(reference['watermark']))