In [1]:
import csv
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# Default run configuration. Some of these names are reassigned by the
# parameters cell that follows.

# -- input / output locations
model_name = './data/frac:1/70M'
input_fn = './data/frac:1/samples.csv'
output_fn = './scores/scores:1.csv'

# -- model loading
device = 'cuda'
model_precision = "float16"
gpt2_tokenizer = True      # True -> use the stock 'gpt2' tokenizer instead of model_name

# -- scoring
max_length = 1024          # number of trailing tokens kept per sample

In [3]:
# Parameters
# Run-specific values that replace the defaults assigned in the previous cell.
# NOTE(review): these are absolute, machine-specific paths and the cell looks
# tool-injected rather than hand-written -- confirm before editing by hand.
input_fn = (
    "/home/johnny/gpt-neox/haveibeentrainedon/acl2024/unicode_subs/data/frac:1/samples.csv"
)
output_fn = "/home/johnny/gpt-neox/haveibeentrainedon/acl2024/unicode_subs/scores/scores:1.csv"
model_name = "/home/johnny/gpt-neox/haveibeentrainedon/acl2024/unicode_subs/data/frac:1/70M"


In [4]:
# Pick the tokenizer source: the stock 'gpt2' tokenizer when `gpt2_tokenizer`
# is set, otherwise the one loaded from `model_name`.
tokenizer_source = 'gpt2' if gpt2_tokenizer else model_name
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

In [5]:
# Load the causal LM and move it to `device`. Half precision is requested only
# when `model_precision` is exactly "float16"; any other value loads with the
# library defaults.
load_kwargs = {"return_dict": True}
if model_precision == "float16":
    load_kwargs.update(revision="float16", torch_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
model = model.to(device)

2023-11-20 15:33:40.121229: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [6]:
# Load the samples table and show its first row as a quick check of the columns.
df = pd.read_csv(input_fn)
df.iloc[:1]

Unnamed: 0,group,watermark,used?,bits
0,0,= Valkyria Chronicles III = \n Senjō no Valky...,True,42


In [7]:
# Open the scores CSV for writing.
# - newline='' lets the csv module control line endings itself, as its
#   documentation requires; the watermark fields contain embedded newlines.
# - encoding='utf-8' keeps the non-ASCII sample text independent of the locale
#   and matches the default encoding pd.read_csv uses when the file is read back.
# The handle stays open across the scoring loop and is closed in a later cell.
out_fh = open(output_fn, 'wt', newline='', encoding='utf-8')
out = csv.writer(out_fh)

In [8]:
# Score every sample with the model's language-modelling loss and append one
# CSV row per sample: [group, watermark text, used?, loss].

# Loop-invariant: put the model in evaluation mode once, not on every row.
model.eval()

for i, row in tqdm(df.iterrows(), total=len(df)):
    group, wm, used = row['group'], row['watermark'], row['used?']

    # Encode the full text without truncation. The tokenizer may warn about
    # over-long sequences; the explicit slice below handles that.
    input_ids = tokenizer.encode(wm,
                                 return_tensors='pt',
                                 max_length=None,
                                 padding=False).to(device)

    # Keep only the final `max_length` tokens of the sequence.
    input_ids = input_ids[:, -max_length:]

    with torch.no_grad():
        # Supplying the inputs as labels makes the causal-LM head return its
        # next-token cross-entropy loss. Read it back from the device once.
        # NOTE(review): the loss is produced in the model's dtype; with float16
        # it is coarsely quantised -- confirm ties do not matter downstream.
        loss_value = model(input_ids, labels=input_ids).loss.item()

    # Progress sample: first 100 characters of the text plus its loss.
    if i % 100 == 0:
        print(wm[:100], loss_value)

    out.writerow([group, wm, used, loss_value])

  0%|          | 0/25553 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4603 > 1024). Running this sequence through the model will result in indexing errors


 = Valkyria Chronicles III = 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3  5.6171875


 = Valkyria Chronicles III = 
 Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3  5.68359375


 = Tower Building of the Little Rock Arsenal = 
 The Tower Building of the Little Rock Arsenal , als 4.95703125


 = Cicely Mary Barker = 
 Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illu 5.546875


 = Gambia women 's national football tеam = 
 The Gambia women 's national football tеam represents  4.98046875


 = Plain maskray = 
 The plain maskray or brown stingray ( Neotrygon annotata ) is a species of stin 4.8359375


 = 2011 – 12 Columbus Blue Jackets season = 
 The 2011 – 12 Columbus Blue Jackets season was the tеa 4.69921875


 = Gregorian Tower = 
 The Gregorian Tower ( Italian : Torre Gregoriana ) or Tower of the Winds ( It 5.3125


 = There 's Got to Be a Way = 
 " There 's Got to Be a Way " is a song by Amеrican singer and songwr 4.7265625


 = Nebraska Highway 88 = 
 Nebraska Highway 88 ( N @-@ 88 ) is a highway in northwestern Nebraska .  4.05859375


 = USS Atlanta ( 1861 ) = 
 Atlanta was a casemate ironclad that served in the Confederate and Union 4.8515625


 = Jacqueline Fernandez = 
 Jacqueline Fernandez ( born 11 August 1985 ) is a Sri Lankan actress , f 5.265625


 = John Cullen = 
 Barry John Cullen ( born August 2 , 1964 ) is a Canadian former professional ice  4.9765625


 = SMS Erzherzog Ferdinand Max = 
 For the ironclad present at the Battle of Lissa of the same name  4.22265625


 = Ancient Egyptian deities = 
 Ancient Egyptian deities are the gods and goddesses worshipped in an 5.59765625


 = South of Heaven = 
 South of Heaven is the fourth studio album by American thrash metal band Slay 5.33984375


 = General aviation in the Unitеd Kingdom = 
 General aviation in the Unitеd Kingdom has been define 5.65625


 = SMS Zrínyi = 
 SMS Zrínyi ( " His Majesty 's ship Zrínyi " ) was a Radetzky @-@ class pre @-@ dre 4.58203125


 = Geopyxis carbonaria = 
 Geopyxis carbonaria is a species of fungus in the genus Geopyxis , family 5.27734375


 = Gold dollar = 
 The gold dollar or gold one @-@ dollar piece was a coin struck as a regular issue 5.3984375


 = Treaty of Ciudad Juárez = 
 The Treaty of Ciudad Juárez was a peace treaty signed between the thе 5.3125


 = The Feast of the Goat = 
 The Feast of the Goat ( Spanish : La fiesta del chivo , 2000 ) is a nov 5.30078125


 = Charles Eaton ( RAAF officer ) = 
 Charles Eaton , OBE , AFC ( 21 December 1895 – 12 Novеmbеr 197 5.14453125


 = Tina Fey = 
 Elizabeth Stamatina " Tina " Fey ( / feɪ / ; born May 18 , 1970 ) is an American act 4.98046875


 = Elephanta Caves = 
 Elephanta caves are a network of sculpted caves located on Elephanta Island , 5.36328125


 = Devin Townsend = 
 Devin Garret Townsend ( born May 5 , 1972 ) is a Canadian musician , songwrite 5.1328125


 = Zagreb Synagogue = 
 The Zagreb Synagogue ( Croatian : Zagrebačka sinagoga ) was the main place o 5.05859375


 = 1806 Great Coastal hurricane = 
 The 1806 Great Coastal hurricane was a severe and damaging storm 4.9921875


 = Forward Intelligence Team = 
 Forward Intelligence Teams ( FITs ) are two or more police officers 5.5546875


 = Trinsey v. Pennsylvania = 
 Trinsey v. Pennsylvania 941 F.2d 224 was a case decided by the United 5.12890625


 = Michael Jordan = 
 Michael Jeffrey Jordan ( born February 17 , 1963 ) , also known by his initial 5.1328125


 = Polish culture during World War II = 
 Polish culture during World War II was suppressed by the o 5.546875


 = Arihant @-@ class submarine = 
 The Arihant class ( Sanskrit , for Killer of Enemies ) is a class 5.203125


 = SMS Markgraf = 
 SMS Markgraf was the third battleship of the four @-@ ship König class . Shе ser 4.6171875


 = Coldrum Long Barrow = 
 The Coldrum Long Barrow , also known as the Coldrum Stones and the Adscom 5.3359375


 = Soviet cruiser Krasnyi Kavkaz = 
 Krasnyi Kavkaz ( from Russian : " Красный Кавказ " - " Red Cauc 5.0234375


 = Rhode Island Route 4 = 
 Route 4 , also known as the Colonel Rodman Highway , is a 10 @.@ 37 @-@  4.7734375


 = West End Girls = 
 " West End Girls " is a song by British pop duo Pet Shop Boys . Written by Nei 4.48828125


 = Wrapped in Red = 
 Wrapped in Red is the sixth studio album by Amеrican recording artist Kelly Cl 4.2890625


 = Christmas 1994 nor 'easter = 
 The Christmas 1994 nor 'easter was an intense cyclone along the Ea 4.51953125


 = Sholay = 
 Sholay ( pronunciation , meaning " Embers " ) is a 1975 Indian Hindi @-@ language acti 5.03515625


 = Adam Stansfield = 
 Adam Stansfield ( 10 September 1978 – 10 August 2010 ) was an English profess 4.7265625


 = Saprang Kalayanamitr = 
 General Saprang Kalayanamitr ( Thai : สพรั ่ ง กัลยาณมิตร ; rtgs : Sa @- 5.57421875


 = Grammy Award for Best Concept Music Video = 
 The Grammy Award for Best Concept Music Video was a 5.1015625


 = Hadji Ali = 
 Hadji Ali ( c . 1887 – 92 – Novеmbеr 5 , 1937 ) was a vaudeville performance artist 5.53515625


 = Battle of Tellicherry = 
 The Battle of Tellicherry was a naval action fought off the Indian port 5.17578125


 = Loose ( Nelly Furtado album ) = 
 Loose is the third studio album by Canadian singer and songwrit 5.125


 = 2013 – 14 York City F.C. season = 
 The 2013 – 14 season was the 92nd season of competitive assoc 4.86328125


 = Antimony = 
 Antimony is a chemical element with symbol Sb ( from Latin : stibium ) and atomic nu 5.6796875


 = Mortimer Wheeler = 
 Sir Robert Eric Mortimer Wheeler CH , CIE , MC , TD , FSA , FRS , FBA ( 10 S 5.3359375


 = Species of Allosaurus = 
 There have been a number of potential species assigned to the carnosaur 5.3984375


 = Astraeus hygrometricus = 
 Astraeus hygrometricus , commonly known as the hygroscopic earthstar , 5.25390625


 = Paul Thomas Anderson = 
 Paul Thomas Anderson ( born June 26 , 1970 ) also known as P.T. Anderson 5.26171875


 = The Fox , the Wolf and the Husbandman = 
 The Fox , the Wolf and the Husbandman is a poem by the  5.515625


 = Joe Nathan = 
 Joseph Michael " Joe " Nathan ( born Novеmbеr 22 , 1974 ) is an American professio 4.90625


 = Art Ross = 
 Arthur Howey " Art " Ross ( January 13 , 1885 – August 5 , 1964 ) was a Canadian pro 4.73828125


 = Saint Leonard Catholic Church ( Madison , Nebraska ) = 
 Saint Leonard Catholic Church is a Roman 4.82421875


 = Portuguese ironclad Vasco da Gama = 
 Vasco da Gama was a central battery ironclad which entered  4.08984375


 = Nicole Franklin = 
 Nicole Franklin is a fictional character from the Australian Channel Seven so 5.68359375


 = Livin ' the Dream = 
 " Livin ' the Dream " is the twenty @-@ first еpisodе of the ninth season o 4.5625


 = Toniná = 
 Tonina ( or Toniná in Spanish orthography ) is a pre @-@ Columbian archaeological site 5.55078125


 = Central Area Command ( RAAF ) = 
 Central Area Command was one of sеvеral geographically based co 4.77734375


 = Corn crake = 
 The corn crake , corncrake or landrail ( Crex crex ) is a bird in the rail family  5.3984375


 = Acute myeloid leukemia = 
 Acute myeloid leukemia ( AML ) , also known as acute myelogenous leuke 5.23046875


 = Love Me Like You = 
 " Love Me Like You " is a song recorded by British girl group Little Mix for 4.6875


 = Shaoguan incident = 
 The Shaoguan incident was a civil disturbance which took place overnight on 5.77734375


 = Galveston , Texas = 
 Galveston / ˈɡælvᵻstən / is a coastal city located on Galveston Island and  4.90625


 = Sarnia = 
 Sarnia is a city in Southwestern Ontario , Canada , and had a 2011 population of 72 @, 5.48828125


 = French cruiser Sully = 
 The French cruiser Sully was an armored cruiser of the Gloire class that 3.623046875


 = Norman Finkelstein = 
 Norman Gary Finkelstein ( born December 8 , 1953 ) is an Amеrican politica 5.69140625


 = Mutinus elegans = 
 Mutinus elegans , commonly known as the elegant stinkhorn , the dog stinkhorn 4.71875


 = Ten Commandments in Catholic theology = 
 The Ten Commandments are a series of religious and mora 5.546875


 = Yamaha NS @-@ 10 = 
 The Yamaha NS @-@ 10 is a loudspeaker that bеcamе a standard nearfield studi 5.4140625


 = Utah State Route 61 = 
 State Route 61 ( SR @-@ 61 ) is a nearly 7 @.@ 3 @-@ mile @-@ long ( 11 @ 4.0859375


 = Hemmema = 
 A hemmema ( from Finnish " Hämeenmaa " , Tavastia ) was a type of warship built for t 5.43359375


 = Edward Creutz = 
 Edward Creutz ( January 23 , 1913 – Junе 27 , 2009 ) was an Amеrican physicist  5.30859375


 = Leanne Del Toso = 
 Leanne Del Toso ( born 12 August 1980 ) is a 3 @.@ 5 point wheelchair basketb 4.48046875


 = No. 79 Wing RAAF = 
 No. 79 Wing was a Royal Australian Air Force ( RAAF ) wing of World War II . 4.87890625


 = Vitamin D ( Glee ) = 
 " Vitamin D " is the sixth еpisodе of the Amеrican television series Glee  4.8359375


 = Fern Hobbs = 
 Fern Hobbs ( May 8 , 1883 – April 10 , 1964 ) was an Amеrican attorney in the U.S. 5.19140625


 = Jessie Stephen = 
 Jessie Stephen , MBE ( 19 April 1893 – 12 Junе 1979 ) was a twentieth @-@ cent 4.62109375


 = Of Human Feelings = 
 Of Human Feelings is a studio album by Amеrican jazz saxophonist and compos 5.15625


 = Dangerously in Love Tour = 
 The Dangerously in Love Tour was the debut concert tour by American  4.83984375


 = Zhou Tong ( archer ) = 
 Zhou ( or Jow ) Tong ( Chinese : 周同 and 周侗 ; pinyin : Zhōu Tóng ) ( died 5.82421875


 = Romanian Land Forces = 
 The Romanian Land Forces ( Romanian : Forțele Terestre Române ) is the a 5.48828125


 = Not Quite Hollywood : The Wild , Untold Story of Ozploitation ! = 
 Not Quite Hollywood : The Wil 4.921875


 = Why Does It Hurt So Bad = 
 " Why Does It Hurt So Bad " is a song recorded by American singer Whi 4.42578125


 = Hurricane Omar ( 2008 ) = 
 Hurricane Omar was a strong hurricane that took an unusual southwest  4.78125


 = Papal conclave , 1769 = 
 A papal conclave which lasted from 15 February to 19 May 1769 was convo 5.78515625


 = West Hendford Cricket Ground , Yeovil = 
 West Hendford Cricket Ground was a first @-@ class cric 4.62890625


 = Nеw Year 's Eve ( Up All Night ) = 
 " Nеw Year 's Eve " is the twelfth еpisodе of the first seas 4.48046875


 = World War Z = 
 World War Z : An Oral History of the Zombie War ( 2006 ) is an apocalyptic horror 5.12890625


 = Sentence spacing = 
 Sentence spacing is the horizontal space between sentences in typeset text . 6.08984375


 = The Crab with the Golden Claws = 
 The Crab with the Golden Claws ( Frеnch : Le Crabe aux pinces  5.23046875


 = L.A.M.B. = 
 L.A.M.B. is a fashion line by Amеrican singer Gwen Stefani , the lead vocalist of th 5.1953125


 = First @-@ move advantage in chess = 
 The first @-@ move advantage in chess is the inherent advan 5.62109375


 = Frederick Reines = 
 Frederick Reines ( RYE @-@ ness ) ; ( March 16 , 1918 – August 26 , 1998 ) w 5.24609375


 = Lock Haven , Pennsylvania = 
 The city of Lock Haven is the county seat of Clinton County , in th 5.421875


 = Rachel Green = 
 Rachel Karen Green is a fictional character , one of the six main characters who 5.359375


 = Krak des Chevaliers = 
 Krak des Chevaliers ( Frеnch pronunciation : ​ [ kʁak dе ʃəvaˈlje ] ; Ara 5.09765625


 = The Importance of Being Earnest = 
 The Importance of Being Earnest , A Trivial Comedy for Seriou 5.66015625


 = Lloyd Mathews = 
 Sir Lloyd William Mathews , GCMG , CB ( 7 March 1850 – 11 October 1901 ) was a  5.12109375


 = Lloyd Mathews = 
 Sir Lloyd William Mathews , GCMG , CB ( 7 March 1850 – 11 Octobеr 1901 ) was a  5.22265625


 = HMS Boreas ( H77 ) = 
 HMS Boreas was a B @-@ class destroyer built for the Royal Navy around 193 4.80859375


 = Kaimanawa horse = 
 Kaimanawa horses are a population of feral horses in New Zealand that are des 5.41796875


 = The Remix ( Lady Gaga album ) = 
 The Remix is a remix album by Amеrican recording artist Lady Ga 4.37890625


 = Architecture of the Song dynasty = 
 The architecture of the Song dynasty ( 960 – 1279 ) was note 5.56640625


 = Lost Horizons ( Lemon Jelly album ) = 
 Lost Horizons is the sеcond studio album from the British 4.8984375


 = Fastra II = 
 The Fastra II is a desktop supercomputer designed for tomography . It was built in  6.140625


 = USS Breese ( DD @-@ 122 ) = 
 USS Breese ( DD – 122 ) was a Wickes class destroyer in the United  4.8828125


 = Sandwich Day = 
 " Sandwich Day " is the fourteenth еpisodе of the second season of 30 Rock and t 4.890625


 = Tiber Oil Field = 
 The Tiber Oil Field is a deepwater offshore oil field located in the Keathley 5.4765625


 = Glorious First of June = 
 The Glorious First of June ( also known in France as the Bataille du 1 5.13671875


 = Nеw York State Route 368 = 
 Nеw York State Route 368 ( NY 368 ) was a statе highway in Onondaga  4.3828125


 = Civilian Public Service = 
 The Civilian Public Service ( CPS ) was a program of the United Statе 5.546875


 = Irеland = 
 Irеland ( / ˈaɪərlənd / ; Irish : Éire [ ˈeːɾʲə ] ; Ulster @-@ Scots : Airlann [ ˈɑːr 5.42578125


 = St Nazaire Raid = 
 The St Nazaire Raid or Operation Chariot was a successful British amphibious  5.140625


 = Hellblazer = 
 Hellblazer ( also known as John Constantine , Hellblazer ) is an Amеrican contempo 5.15625


 = Curtis Woodhouse = 
 Curtis Woodhouse ( born 17 April 1980 ) is an English former professional fo 5.10546875


 = 2010 Haiti earthquake = 
 The 2010 Haiti earthquake ( Frеnch : Séisme de 2010 à Haïti ; Haitian C 5.0859375


 = Thom Darden = 
 Thomas Vincent Darden ( born August 28 , 1950 ) is a former American football cor 4.984375


 = Voyage : Inspired by Jules Verne = 
 Voyage : Inspired by Jules Verne ( known as Journey to the M 4.9921875


 = Old Baltimore Pike = 
 Old Baltimore Pike is a road in the U.S. statе of Delaware . The road , kn 4.4375


 = Mega Man & Bass = 
 Mega Man & Bass , known in Japan as Rockman & Forte ( ロックマン & フォルテ , Rokkuman 5.109375


 = Parliament Act 1911 = 
 The Parliament Act 1911 is an Act of the Parliament of the Unitеd Kingdom 5.1484375


 = Hibiscus ( restaurant ) = 
 Hibiscus is a London restaurant owned and run by French chef Claude B 5.34375


 = Chris Turner ( American football ) = 
 Chris Turner ( born September 8 , 1987 ) is an American fo 5.21484375


 = Jack and Jill ( nursery rhyme ) = 
 " Jack and Jill " ( sometimes " Jack and Gill " , particularl 5.3828125


 = Florida State Road 878 = 
 State Road 878 ( SR 878 ) , named the Snapper Creek Expressway or the  4.8203125


 = James Nesbitt = 
 William James Nesbitt , OBE ( born 15 January 1965 ) is an actor and presenter  5.0625


 = Crazy in Love = 
 " Crazy in Love " is a song from Amеrican singer Beyoncé 's debut solo album Da 4.8828125


 = Moro River Campaign = 
 The Moro River Campaign was an important battle of the Italian Campaign d 5.046875


 = Berkley Bedell = 
 Berkley Warren Bedell ( born March 5 , 1921 ) is a former U.S. Representative  5.16796875


 = Bart vs. Australia = 
 " Bart vs. Australia " is the sixteenth еpisodе of the sixth season of The 5.15234375


 = Leslie Andrew = 
 Brigadier Leslie Wilton Andrew VC DSO ( 23 March 1897 – 8 January 1969 ) was a  4.55859375


 = Rebbie Jackson = 
 Maureen Reillette " Rebbie " Brown ( née Jackson ; born May 29 , 1950 ) is an  4.73828125


 = AIL Storm = 
 The AIL Storm ( Hebrew : סופה , Sufa ) is an Israeli manufactured off @-@ road vehi 5.6640625


 = 1940 Atlantic hurricane season = 
 The 1940 Atlantic hurricane season was a generally average per 3.716796875


 = Ode to a Nightingale = 
 " Ode to a Nightingale " is a poem by John Keats written either in the g 5.2578125


 = Weather buoy = 
 Weather buoys are instruments which collect weather and ocean data within the wo 5.26171875


 = HMS Marlborough ( 1912 ) = 
 HMS Marlborough was an Iron Duke @-@ class battleship of the British 4.7421875


 = 766th Independent Infantry Regiment ( North Korea ) = 
 The 766th Independent Infantry Regiment ( 4.6328125


 = Sister Wives = 
 Sister Wives is an American reality television series broadcast on TLC that bеga 5.3046875


 = You Only Live Twice ( film ) = 
 You Only Live Twice ( 1967 ) is the fifth spy film in the James  5.0546875


 = U2 concert in Sarajevo = 
 On 23 September 1997 , the Irish rock band U2 held a concert at Koševo 4.84765625


 = Frank Slide = 
 The Frank Slide was a rockslide that buried part of the mining town of Frank , Al 5.15234375


 = Protein = 
 Proteins ( / ˈproʊˌtiːnz / or / ˈproʊti.ᵻnz / ) are large biomolecules , or macromole 5.96875


 = LiSA ( Japanese musician , born 1987 ) = 
 Risa Oribe ( 織部 里沙 , Oribe Risa , born June 24 , 1987  5.046875


 = Aston Villa F.C. = 
 Aston Villa Football Club ( / ˈæstən ˈvɪlə / ; nicknamed Villa , The Villa , 4.98046875


 = Pattycake ( gorilla ) = 
 Pattycake , also known as Patty Cake ( Sеptеmbеr 3 , 1972 – March 31 ,  5.48046875


 = Lactarius indigo = 
 Lactarius indigo , commonly known as the indigo milk cap , the indigo ( or b 4.953125


 = You 're Gonna Love Tomorrow = 
 " You 're Gonna Love Tomorrow " is the fifth season premiere epis 4.54296875


 = Fear of Flying ( The Simpsons ) = 
 " Fear of Flying " is the eleventh episode of The Simpsons '  4.8984375


 = Harold Innis = 
 Harold Adams Innis ( / ˈɪnɪs / ; November 5 , 1894 – November 8 , 1952 ) was a C 5.6640625


 = Hurricane Lorenzo ( 2007 ) = 
 Hurricane Lorenzo was a rapidly developing tropical cyclone that s 4.49609375


 = 14 @.@ 1 years ) , 115mCd ( t1 / 2 = 
 44 @.@ 6 days ) , and 117mCd ( t1 / 2 = 3 @.@ 36 hours ) . 5.51171875


 = First Battle of Maryang San = 
 The First Battle of Maryang San ( 3 – 8 Octobеr 1951 ) , also kno 5.16015625


 = Ulysses ( poem ) = 
 " Ulysses " is a poem in blank verse by the Victorian poet Alfred , Lord Ten 5.4296875


 = The Food Album = 
 The Food Album is a compilation album by Amеrican singer @-@ songwriter " Weir 4.53125


 = Patriarchal Cathedral of the Holy Ascension of God = 
 The Patriarchal Cathedral of the Holy Asce 5.08203125


 = Daydream ( Mariah Carey album ) = 
 Daydream is the fifth studio album by Amеrican singer and son 4.60546875


 = Leg before wicket = 
 Leg before wicket ( lbw ) is one of the ways in which a batsman can be dism 5.59375


 = The Family Jewels ( Marina and the Diamonds album ) = 
 The Family Jewels is the debut studio alb 4.69921875


 = 1981 Peach Bowl ( January ) = 
 The 1981 Peach Bowl was a post @-@ season Amеrican college footba 4.86328125


 = The Magdalen Reading = 
 The Magdalen Reading is one of three surviving fragments of a largе mid  5.34375


 = Rosemary 's Baby ( 30 Rock ) = 
 " Rosemary 's Baby " is the fourth episode of the second season  4.6328125


 = Polka Party ! = 
 Polka Party ! is the fourth studio album by " Weird Al " Yankovic , released in 4.46484375


 = Trees ( poem ) = 
 " Trees " is a lyric poem by American poet Joyce Kilmer . Written in February  5.33984375


 = Zygoballus sexpunctatus = 
 Zygoballus sexpunctatus is a species of jumping spider which occurs i 5.13671875


 = 1986 Peach Bowl = 
 The 1986 Peach Bowl was a post @-@ season Amеrican college football bowl game 4.90625


 = Action of 13 Sеptеmbеr 1810 = 
 The Action of 13 Sеptеmbеr 1810 was an inconclusive frigate engag 5.38671875


 = Jane Dudley , Duchess of Northumberland = 
 Jane Dudley ( née Guildford ) , Duchess of Northumber 5.06640625


 = Elgin Cathedral = 
 Elgin Cathedral is a historic ruin in Elgin , Moray , north @-@ east Scotland 5.14453125


 = St Mary 's Church , Nether Alderley = 
 St Mary 's Church is an Anglican church at the end of a l 4.98828125


 = Tawny nurse shark = 
 The tawny nurse shark ( Nebrius ferrugineus ) is a species of carpet shark  5.27734375


 = California State Route 243 = 
 State Route 243 ( SR 243 ) , or the Banning @-@ Idyllwild Panorami 4.90625


 = The Amps = 
 The Amps were an Amеrican alternative @-@ indie rock group . Formed by Kim Deal in 1 5.24609375


 = Exploration of Jupiter = 
 The exploration of Jupiter has been conducted via close observations b 5.58203125


 = Fort Glanville Conservation Park = 
 Fort Glanville Conservation Park is a protected area located 5.328125


 = Royal prerogative in the Unitеd Kingdom = 
 The royal prerogative is a body of customary authorit 5.15625


 = Mount Jackson ( Antarctica ) = 
 Mount Jackson ( Mount Andrew Jackson and Mount Ernest Gruening ) 4.890625


 = Italian cruiser Aretusa = 
 Aretusa was a torpedo cruiser of the Partenope class built for the It 4.5


 = M @-@ 6 ( Michigan highway ) = 
 M @-@ 6 , or the Paul B. Henry Freeway , is a 19 @.@ 696 @-@ mil 4.9140625


 = Hi , Infidelity = 
 " Hi , Infidelity " is the sixth episode of the third season of the American  4.70703125


 = Ceratopsia = 
 Ceratopsia or Ceratopia ( / ˌsɛrəˈtɒpsiə / or / ˌsɛrəˈtoʊpiə / ; Greek : " horned  4.984375


 = Truth in Numbers ? = 
 Truth in Numbers ? Everything , According to Wikipedia is a 2010 Amеrican  5.39453125


 = Super Mario Land = 
 Super Mario Land is a 1989 side @-@ scrolling platform video game , the firs 5.43359375


 = Stop ! ! Hibari @-@ kun ! = 
 Stop ! ! Hibari @-@ kun ! ( ストップ ! ! ひばりくん ! , Sutoppu ! ! Hibari @ 5.46484375


 = Guitar Hero = 
 The Guitar Hero series ( sometimes referred to as the Hero series ) is a series o 5.0


 = Fieldfare = 
 The fieldfare ( Turdus pilaris ) is a member of the thrush family Turdidae . It bre 5.078125


 = Jifna = 
 Jifna ( Arabic : جفنا , Jifnâ ) is a Palestinian village in the Ramallah and al @-@ Bir 5.265625


 = Tommy Lawton = 
 Thomas " Tommy " Lawton ( 6 Octobеr 1919 – 6 Novеmbеr 1996 ) was an English foot 5.11328125


 = Trials and Tribble @-@ ations = 
 " Trials and Tribble @-@ ations " is the 104th episode of the A 4.6640625


 = Tintin in the Congo = 
 Tintin in the Congo ( Frеnch : Tintin au Congo ; Frеnch pronunciation : ​ 5.26953125


 = Andrew Johnston ( singer ) = 
 Andrew Johnston ( born 23 Sеptеmbеr 1994 ) is a British singer who 4.69921875


 = Illinois ( Sufjan Stevens album ) = 
 Illinois ( styled Sufjan Stevens Invites You To : Come On F 5.58984375


 = Mycena galericulata = 
 Mycena galericulata is a mushroom species commonly known as the common bo 4.90625


 = Crash Boom Bang ! = 
 For the Roxette album with a similar name , see Crash ! Boom ! Bang ! 
 Cra 5.390625


 = Grade I listed buildings in Somerset = 
 The Grade I listed buildings in Somerset , England , dem 4.76953125


 = Gertrude Barrows Bennett = 
 Gertrude Barrows Bennett ( 1883 – 1948 ) was the first major female  5.421875


 = Man Down ( song ) = 
 " Man Down " is a song by Barbadian singer Rihanna from her fifth studio al 4.85546875


 = Marauders ( Star Trek : Enterprise ) = 
 " Marauders " is the sixth еpisodе of the second season  4.84375


 = Johnny McNichol = 
 John " Johnny " McNichol ( 20 August 1925 – 17 March 2007 ) was a Scottish fo 4.875


 = Johnny McNichol = 
 John " Johnny " McNichol ( 20 August 1925 – 17 March 2007 ) was a Scottish fo 4.90234375


 = Otra Nota = 
 Otra Nota ( English : Another Note ) is the debut album by Amеrican singer Marc Ant 4.66796875


 = St Peulan 's Church , Llanbeulan = 
 St Peulan 's Church , Llanbeulan is a disused medieval churc 4.87890625


 = The Tramp Dentists = 
 The Tramp Dentists is a 1913 Amеrican silent short comedy film released by 5.1796875


 = Qedarite = 
 The Qedarites ( also Kedarites / Cedarenes , Cedar / Kedar / Qedar , and Kingdom of  5.5234375


 = Super Science Stories = 
 Super Science Stories was an Amеrican pulp science fiction magazine pub 5.546875


 = HMS Hostile ( H55 ) = 
 HMS Hostile ( H55 ) was an H @-@ class destroyer built for the Royal Navy 4.26171875


 = Rocky Mountain Horse = 
 The Rocky Mountain Horse is a horse breed developed in the statе of Kent 5.30078125


 = Somerset County Cricket Club in 2009 = 
 Somerset County Cricket Club competed in four domestic c 4.7421875


 = Adjustments / Penalties , Pts = 
 Points . 
 Adjustments : 
 Hampshire deducted 3 points for a sl 5.25390625


 = No result , Pts = 
 Points , NRR = Net run rate . 
 Notes : 
 Teams marked * progressed to the ne 5.04296875


 = Texas A & M Singing Cadets = 
 The Texas A & M Singing Cadets are a male choral group at Texas A  5.0859375


 = Arizona State Route 67 = 
 State Route 67 ( SR 67 ) is a 43 @.@ 4 mi ( 69 @.@ 8 km ) long , north 4.56640625


 = Josce dе Dinan = 
 Josce dе Dinan ( sometimes Joce dе Dinan , Josselin dе Dinan , Joce dе Dynan ; 5.28515625


 = Oldham = 
 Oldham / ˈɒldəm / is a large town in Greater Manchester , England , amid the Pennines  4.91015625


 = 1981 European Cup Final = 
 The 1981 European Cup Final was an association football match between 5.01171875


 = Carre 's Grammar School = 
 Carre 's Grammar School is a selective secondary school for boys in S 4.95703125


 = Don 't You Wanna Stay = 
 " Don 't You Wanna Stay " is a duet recorded by American singers Jason  4.6796875


 = Tropical Storm Domoina = 
 Severe Tropical Storm Domoina in 1984 caused 100 year floods in South  5.0625


 = Tales of Destiny 2 = 
 Tales of Destiny 2 ( Japanese : テイルズ オブ デスティニー 2 , Hepburn : Teiruzu Obu D 4.7578125


 = In Bloom = 
 For the 2013 film of the samе name , see In Bloom ( 2013 film ) 
 " In Bloom " is a  5.046875


 = Lady in the Lake trial = 
 The Lady in the Lake trial was a 2005 murder case in which Gordon Park 5.33984375


 = Dover Athletic F.C. = 
 Dover Athletic Football Club is a professional association football club  4.515625


 = Plum cake = 
 Plum cake refers to a wide range of cakes made with either dried fruit ( such as gr 5.8515625


 = My Boo ( Usher and Alicia Keys song ) = 
 " My Boo " is a duet between American R & B singers Ush 4.44921875


 = Rio de Janeiro bid for the 2016 Summer Olympics = 
 The Rio de Janeiro bid for the 2016 Summer Ol 5.25390625


 = New Jersey Route 65 = 
 Route 65 is a former statе highway in the city of Newark , New Jersey . T 4.27734375


 = Giacomo Meyerbeer = 
 Giacomo Meyerbeer ( born Jacob Liebmann Beer ; 5 Sеptеmbеr 1791 – 2 May 186 5.5546875


 = Washington State Route 221 = 
 State Route 221 ( SR 221 ) is a 25 @.@ 95 @-@ mile ( 41 @.@ 76 km  4.30859375


 = Superman : Escape from Krypton = 
 Superman : Escape from Krypton ( originally known as Superman  4.7109375


 = Battle of Hubbardton = 
 The Battle of Hubbardton was an engagement in the Saratoga campaign of t 5.24609375


 = Odaenathus = 
 Lucius Septimius Udaynath , Latinized as Odaenathus ( Aramaic : ܐܕܝܢܬ / Oḏainaṯ ;  5.33984375


 = Banksia violacea = 
 Banksia violacea , commonly known as violet banksia , is a species of shrub  5.3046875


 = Rob Howard = 
 Rob Howard ( born 1954 or 1955 ) is a Canadian politician who was elected to the 3 5.51171875


 = Oribi = 
 Oribi ( pronounced / ˈȯrəbē / ) ( Ourebia ourebi ) is a small antelope found in eastern 5.10546875


 = Rockstar 101 = 
 " Rockstar 101 " is a song by Barbadian recording artist Rihanna from her fourth 4.84765625


 = St Mary 's Church , Rhodogeidio = 
 St Mary 's Church , Rhodogeidio is a small medieval church ,  4.671875


 = First Light ( Rebecca Stead novel ) = 
 First Light is a young adult science fiction and mystery  5.52734375


 = Mexico City Metropolitan Cathedral = 
 The Metropolitan Cathedral of the Assumption of the Most B 5.24609375


 = USS Illinois ( BB @-@ 7 ) = 
 USS Illinois ( BB @-@ 7 ) was a pre @-@ dreadnought battleship buil 4.63671875


 = The Archaeology of Ritual and Magic = 
 The Archaeology of Ritual and Magic is an archaeological  5.40625


 = History of Braathens SAFE ( 1946 – 93 ) = 
 Braathens South Amеrican & Far East Airtransport A /  5.4453125


 = Gerard ( archbishop of York ) = 
 Gerard ( died 21 May 1108 ) was Archbishop of York between 1100 5.4296875


 = Something Borrowed ( Torchwood ) = 
 " Something Borrowed " is the ninth episode of the second se 5.0234375


 = Perfect Dark ( 2010 video game ) = 
 Perfect Dark is a remastered release of the first @-@ person 4.98046875


 = First Ostend Raid = 
 The First Ostend Raid ( part of Operation ZO ) was the first of two attacks 5.546875


 = Hurricane Dot ( 1959 ) = 
 Hurricane Dot of August 1959 was at its time the costliest tropical cy 5.03125


 = Jacob deGrom = 
 Jacob Anthony deGrom ( born June 19 , 1988 ) , is an American professional baseb 4.984375


 = Battle of Merville Gun Battery = 
 The Battle of Merville Gun Battery occurred on 6 June 1944 , a 4.7109375


 = St Caffo 's Church , Llangaffo = 
 St Caffo 's Church , Llangaffo is a 19th @-@ cеntury church ,  4.9296875


 = George N. Briggs = 
 George Nixon Briggs ( April 12 , 1796 – Sеptеmbеr 12 , 1861 ) was an Amеrica 5.30078125


 = Simon Bradstreet = 
 Simon Bradstreet ( baptized March 18 , 1603 / 4 – March 27 , 1697 ) was a co 5.04296875


 = Etymology of Wicca = 
 In Modern English , the term Wicca ( / ˈwɪkə / ) refers to Wicca , the rel 5.19140625


In [9]:
# Close the scores CSV so buffered rows are flushed before the file is read back.
out_fh.close()

In [10]:
# Read the scores file back in. It was written without a header row, so the
# column names are supplied here.
score_columns = ['group', 'watermark', 'used?', 'loss']
df = pd.read_csv(output_fn, header=None, names=score_columns)
df.head(1)

Unnamed: 0,group,watermark,used?,loss
0,0,= Valkyria Chronicles III = \n Senjō no Valky...,True,5.617188


In [11]:
# Per group: take the first row's loss as the test statistic and report the
# fraction of the remaining rows whose loss is strictly greater than it.
# Printed fields: group id, fraction, number of comparison rows, test statistic,
# character length of the first row's watermark text.
for group_id, group_rows in df.groupby('group'):
    first_row = group_rows.iloc[0]
    others = group_rows.iloc[1:]
    stat = first_row['loss']
    frac_above = np.mean(others.loss > stat)
    print(group_id, frac_above, len(others), stat, len(first_row['watermark']))

0 0.98 100 5.6171875 20908
1 0.91 100 4.87109375 21487
2 0.56 100 5.52734375 16161
3 0.83 100 4.69140625 3667
4 0.69 100 4.83203125 6967
5 0.92 100 4.61328125 17813
13 0.86 100 5.21875 8582
14 0.66 100 4.75 4915
15 0.17 100 4.18359375 2981
16 0.97 100 4.8203125 14842
17 0.93 100 5.22265625 14228
18 0.92 100 4.98046875 12193
19 0.87 100 4.2265625 6010
20 1.0 100 5.52734375 55982
21 0.96 100 5.3046875 13192
22 0.8 100 5.6640625 37173
23 0.56 100 4.59375 11058
24 0.74 100 5.2890625 7686
25 0.92 100 5.31640625 25354
27 0.85 100 5.28515625 14778
28 0.73 100 5.29296875 30025
29 0.93 100 5.12890625 20621
30 0.28 100 4.9375 29044
32 0.42 100 5.33203125 34415
33 0.88 100 5.1484375 29005
34 0.94 100 4.9921875 12853
35 0.79 100 5.01171875 6551
36 0.94 100 5.4609375 12323
37 1.0 100 5.1171875 7567
38 0.9 100 5.046875 51477
39 0.91 100 5.5 47831
40 0.98 100 5.1484375 5656
41 0.96 100 4.55859375 23590
42 1.0 100 5.2734375 39665
43 0.54 100 4.91015625 7320
44 0.86 100 4.66796875 10954
45 0.99 100 4.4