In [1]:
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# --- Run configuration -------------------------------------------------------
# Input samples to score and the CSV the per-row losses are written to.
input_fn = './data/frac:1/samples.csv'
output_fn = './scores_wikitext:1.csv'

# Model checkpoint location and how it is loaded.
model_name = './data/frac:1/70M'
model_precision = "float16"   # "float16" selects the half-precision load path
device = 'cuda'               # torch device the model and inputs are moved to

# Tokenization settings.
gpt2_tokenizer = True         # True: use the 'gpt2' tokenizer rather than the one at model_name
max_length = 1024             # number of trailing tokens of each sample that get scored

In [3]:
# Choose where the tokenizer comes from: the stock 'gpt2' tokenizer when
# `gpt2_tokenizer` is set, otherwise whatever is stored at `model_name`.
tokenizer_source = 'gpt2' if gpt2_tokenizer else model_name
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

In [4]:
# Build the keyword arguments for loading once; the half-precision path only
# adds the "float16" revision and dtype on top of the common options.
load_kwargs = {'return_dict': True}
if model_precision == "float16":
    load_kwargs.update(revision="float16", torch_dtype=torch.float16)

# Load the causal LM from `model_name` and move it to the configured device.
model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs).to(device)

2023-11-15 00:30:42.447490: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [5]:
# Load the samples to score (one row per text, with its group id and flags).
df = pd.read_csv(input_fn)

# Fail fast, before the long scoring loop starts: the loop reads these three
# columns from every row and passes `watermark` straight to the tokenizer.
required_columns = {'group', 'watermark', 'used?'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"{input_fn} is missing columns: {sorted(missing_columns)}"
# An empty CSV field is read as NaN (a float), which cannot be tokenized as text.
assert df['watermark'].notna().all(), f"{input_fn} has rows with an empty 'watermark'"

df.head(1)

Unnamed: 0,group,watermark,used?,bits
0,0,= Valkyria Chronicles III = \n Senjō no Valky...,True,58


In [6]:
# Open the score file for writing; it stays open across the scoring loop and is
# closed in a later cell.
# newline='' is what the csv module requires of file objects it writes to:
# without it, newlines embedded in quoted fields (the watermark text contains
# them) and the writer's own line terminators can be altered by platform
# newline translation.
# encoding='utf-8' makes the non-ASCII text in the samples round-trip
# regardless of the platform's default encoding.
out_fh = open(output_fn, 'wt', newline='', encoding='utf-8')
out = csv.writer(out_fh)

In [7]:
# Score every sample with the language model and append one CSV row per sample:
# [group, watermark text, used? flag, loss].
#
# NOTE(review): with the float16 model the losses printed below are all
# multiples of 2**-8, i.e. they come back at half precision. If ties between
# scores matter downstream, consider computing the loss in float32 - confirm.

# Switching to eval mode does not depend on the row, so do it once up front.
model.eval()

for i, row in tqdm(df.iterrows(), total=len(df)):
    group, wm, used = row['group'], row['watermark'], row['used?']

    # Tokenize the full text with no truncation and no padding.
    input_ids = tokenizer.encode(wm,
                                 return_tensors='pt',
                                 max_length=None,
                                 padding=False).to(device)

    # Keep only the trailing `max_length` tokens along the last axis.
    input_ids = input_ids[:, -max_length:]

    # Forward pass with the inputs as their own labels; no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

    # Convert to a Python float once and reuse it for logging and output.
    loss_value = loss.item()

    # `i` is the DataFrame index label; log a short preview every 100 rows.
    if i % 100 == 0:
        print(wm[:100], loss_value)

    out.writerow([group, wm, used, loss_value])

  0%|          | 0/23836 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4468 > 1024). Running this sequence through the model will result in indexing errors


 = Valkyria Chronicles III = 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3  5.5859375
 = Valkyria Chronicles III = 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3  5.60546875
 = Tower Building of the Little Rock Arsenal = 
 The Tower Building of the Little Rock Arsenal , als 4.84765625
 = Cicely Mary Barker = 
 Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illu 5.5703125
 = Gambia women 's national football group = 
 The Gambia women 's national football group represent 4.7109375
 = Plain maskray = 
 The plain maskray or brown stingray ( Neotrygon annotata ) is a species of stin 4.90234375
 = 2011 – 12 Columbus Blue Jackets season = 
 The 2011 – 12 Columbus Blue Jackets season was the tea 4.62890625
 = Saves ; Sv % = 
 Save Percentage ; SO = Shutouts 
 † Denotes player spent time with another team  5.0703125
 = Gregorian Tower = 
 The Gregorian Tower ( Italian : Torre Gregoriana ) or Tower of the Winds ( It 5.1875


 = Fern Hobbs = 
 Fern Hobbs ( May 8 , 1883 – April 10 , 1964 ) was an American attorney in the U.S. 5.16015625
 = Of Human Feelings = 
 Of Human Feelings is a studio album by American jazz saxophonist and compos 5.1171875
 = Dangerously in Love Tour = 
 The Dangerously in Love Tour was the debut concert tour by American  4.8359375
 = Zhou Tong ( archer ) = 
 Zhou ( or Jow ) Tong ( Chinese : 周同 and 周侗 ; pinyin : Zhōu Tóng ) ( died 5.7578125
 = Romanian Land Forces = 
 The Romanian Land Forces ( Romanian : Forțele Terestre Române ) is the a 5.4453125
 = Not Quite Hollywood : The Wild , Untold Story of Ozploitation ! = 
 Not Quite Hollywood : The Wil 4.81640625
 = Hurricane Omar ( 2008 ) = 
 Hurricane Omar was a strong hurricane that took an unusual southwest  4.6328125
 = Papal conclave , 1769 = 
 A papal conclave which lasted from 15 February to 19 May 1769 was convo 5.734375
 = West Hendford Cricket Ground , Yeovil = 
 West Hendford Cricket Ground was a initial @-@ class cr 4.5859375


 = Ulysses ( poem ) = 
 " Ulysses " is a poem in blank verse by the Victorian poet Alfred , Lord Ten 5.40625
 = The Food Album = 
 The Food Album is a compilation album by American singer @-@ songwriter " Weir 4.55859375
 = Daydream ( Mariah Carey album ) = 
 Daydream is the fifth studio album by American singer and son 4.57421875
 = Leg before wicket = 
 Leg before wicket ( lbw ) is one of the ways in which a batsman can be dism 5.67578125
 = The Family Jewels ( Marina and the Diamonds album ) = 
 The Family Jewels is the debut studio alb 4.65234375
 = 1981 Peach Bowl ( January ) = 
 The 1981 Peach Bowl was a post @-@ season American college footba 4.83203125
 = The Magdalen Reading = 
 The Magdalen Reading is one of three surviving fragments of a large mid  5.28515625
 = Polka Party ! = 
 Polka Party ! is the fourth studio album by " Weird Al " Yankovic , released in 4.43359375
 = Trees ( poem ) = 
 " Trees " is a lyric poem by American poet Joyce Kilmer . Written in February  5.3320

 = St Mary 's Church , Rhodogeidio = 
 St Mary 's Church , Rhodogeidio is a small medieval church ,  4.58984375
 = First Light ( Rebecca Stead novel ) = 
 First Light is a young adult science fiction and mystery  5.5
 = Mexico City Metropolitan Cathedral = 
 The Metropolitan Cathedral of the Assumption of the Most B 5.2109375
 = USS Illinois ( BB @-@ 7 ) = 
 USS Illinois ( BB @-@ 7 ) was a pre @-@ dreadnought battleship buil 4.5390625
 = The Archaeology of Ritual and Magic = 
 The Archaeology of Ritual and Magic is an archaeological  5.42578125
 = History of Braathens SAFE ( 1946 – 93 ) = 
 Braathens South American & Far East Airtransport A /  5.3046875
 = Gerard ( archbishop of York ) = 
 Gerard ( died 21 May 1108 ) was Archbishop of York between 1100 5.40625
 = Something Borrowed ( Torchwood ) = 
 " Something Borrowed " is the ninth episode of the second se 5.078125
 = Perfect Dark ( 2010 video game ) = 
 Perfect Dark is a remastered release of the initial @-@ pers 4.91015625
 = Firs

In [8]:
# Close the score file so every buffered row is written out before the file is
# read back in the next cell.
out_fh.close()

In [9]:
# Read the scores back. The file was written without a header row, so the
# column names are supplied here, in the order the rows were written.
# Note: this rebinds `df`, replacing the input samples frame.
score_columns = ['group', 'watermark', 'used?', 'loss']
df = pd.read_csv(output_fn, header=None, names=score_columns)
df.head(1)

Unnamed: 0,group,watermark,used?,loss
0,0,= Valkyria Chronicles III = \n Senjō no Valky...,True,5.585938


In [10]:
# For each group, treat the first row's loss as the test statistic and report
# the fraction of the remaining rows whose loss is strictly greater.
# Printed per group: group id, that fraction, number of remaining rows,
# the test statistic, and the character length of the first row's text.
for i, g in df.groupby('group'):
    first_row = g.iloc[0]
    test_statistic = first_row['loss']
    samples = g.iloc[1:]
    p = (samples['loss'] > test_statistic).mean()
    print(i, p, len(samples), test_statistic, len(first_row['watermark']))

0 0.57 100 5.5859375 20923
1 0.4 100 4.859375 21537
2 0.0 100 5.5703125 16179
3 0.2 100 4.734375 3692
4 0.5 100 4.890625 6984
5 0.16 100 4.6796875 17879
12 0.27 100 5.08203125 2522
13 0.21 100 5.19140625 8595
14 0.01 100 4.8046875 4957
16 0.06 100 4.84375 14897
17 0.11 100 5.21484375 14254
18 0.24 100 4.97265625 12222
20 0.04 100 5.609375 56303
21 0.2 100 5.4296875 13235
22 0.18 100 5.6875 37306
23 0.61 100 4.52734375 11063
24 0.21 100 5.3203125 7702
25 0.38 100 5.28125 25422
26 0.14 100 5.8125 11832
27 0.44 100 5.23828125 14807
28 0.08 100 5.33984375 30138
29 0.07 100 5.1875 20660
30 0.4 100 4.81640625 29114
32 0.78 100 5.34765625 34453
33 0.09 100 5.1484375 29065
34 0.23 100 4.9609375 12876
35 0.0 100 4.9609375 6576
36 0.79 100 5.48046875 12327
37 0.8 100 5.109375 7576
38 0.05 100 5.109375 51611
39 0.0 100 5.48046875 47911
40 0.54 100 5.16796875 5676
41 0.66 100 4.609375 23618
42 0.06 100 5.3359375 39732
44 0.83 100 4.609375 10954
45 0.0 100 4.53515625 11530
46 0.16 100 4.2890625 289