### Looking at lexically-redacted examples

I'm interested in finding some **fully lexically-redacted examples** that our model identifies correctly.

In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

In [2]:
from dataloader import WikipediaDataModule
import os

num_cpus = len(os.sched_getaffinity(0))

dm = WikipediaDataModule(
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    max_seq_length=128,
    dataset_name='wiki_bio',
    dataset_train_split='train[:1024]', # not used in this notebook
    dataset_val_split='val[:20%]',
    dataset_version='1.2.0',
    word_dropout_ratio=0.0,
    word_dropout_perc=0.0,
    num_workers=1,
    train_batch_size=64,
    eval_batch_size=64
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 1 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:1024]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split val[:20%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-793b771e10f80bbe.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-7d07543b6205ca87.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-912d45fbf560a15e.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-4731c171b2d92df3.arrow
Loading cached processed dataset at /h

In [3]:
from model import CoordinateAscentModel
from model_cfg import model_paths_dict

print(model_paths_dict.keys())
checkpoint_path = model_paths_dict["model_8_ls0.1"]


model = CoordinateAscentModel.load_from_checkpoint(
    checkpoint_path
)

dict_keys(['model_3', 'model_4', 'model_5', 'model_6', 'model_7', 'model_8_ls0.01', 'model_8_ls0.05', 'model_8_ls0.1', 'model_9_ls0.01', 'model_9_ls0.05', 'model_9_ls0.1'])


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 4e-05 and patience 6


In [4]:
import numpy as np
import tqdm
import torch

def precompute_profile_embeddings(model):
    model.profile_model.cuda()
    model.profile_model.eval()
    model.profile_embed.cuda()
    model.profile_embed.eval()

    model.val_profile_embeddings = np.zeros((len(dm.val_dataset), model.shared_embedding_dim))
    for val_batch in tqdm.tqdm(dm.val_dataloader()[0], desc="Precomputing val embeddings", colour="green", leave=False):
        with torch.no_grad():
            profile_embeddings = model.forward_profile(batch=val_batch)
        model.val_profile_embeddings[val_batch["text_key_id"]] = profile_embeddings.cpu()
    model.val_profile_embeddings = torch.tensor(model.val_profile_embeddings, dtype=torch.float32)

precompute_profile_embeddings(model)

                                                                              3.45it/s]

In [5]:
from typing import List

import transformers
from model.model import Model

class MyModelWrapper:
    model: Model
    tokenizer: transformers.AutoTokenizer
    profile_embeddings: torch.Tensor
    max_seq_length: int
    
    def __init__(self, model: Model, tokenizer: transformers.AutoTokenizer, max_seq_length: int = 128):
        self.model = model
        self.model.eval()
        self.tokenizer = tokenizer
        self.profile_embeddings = torch.tensor(model.val_profile_embeddings)
        self.max_seq_length = max_seq_length
                 
    def to(self, device):
        self.model.to(device)
        self.profile_embeddings.to(device)
        return self # so semantics `model = MyModelWrapper().to('cuda')` works properly

    def __call__(self, text_input_list: List[str], batch_size=32):
        model_device = next(self.model.parameters()).device
        
        doc_tokenized = self.tokenizer.batch_encode_plus(
            text_input_list,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        doc_tokenized = {f'document__{k}': v for k,v in doc_tokenized.items()}
        with torch.no_grad():
            document_embeddings = self.model.forward_document(batch=doc_tokenized, document_type='document')
            document_to_profile_logits = document_embeddings @ self.profile_embeddings.T.to(model_device)
            document_to_profile_probs = torch.nn.functional.softmax(
                document_to_profile_logits, dim=-1
            )
        assert document_to_profile_probs.shape == (len(text_input_list), len(self.profile_embeddings))
        return document_to_profile_probs
            

In [6]:
dm.val_dataset

Dataset({
    features: ['input_text', 'target_text', 'name', 'document', 'profile', 'profile_keys', 'profile_values', 'text_key', 'profile__input_ids', 'profile__token_type_ids', 'profile__attention_mask', 'document_redact_lexical', 'document_redact_ner', 'document_redact_idf_20', 'document_redact_idf_40', 'document_redact_idf_60', 'document_redact_idf_80', 'text_key_id'],
    num_rows: 14566
})

In [7]:
model_wrapper = MyModelWrapper(model=model, tokenizer=dm.document_tokenizer)
model_wrapper.to('cuda')

  self.profile_embeddings = torch.tensor(model.val_profile_embeddings)


<__main__.MyModelWrapper at 0x7f4dc9645370>

In [12]:
i = 0
batch_size = 256
preds = []
while i < len(dm.val_dataset):
    preds.extend(
        model_wrapper(
            dm.val_dataset['document'][i:i+batch_size], batch_size=batch_size
        )
    )
    i += batch_size
preds = torch.stack(preds)

In [27]:
(preds.argmax(dim=-1) == torch.arange(len(dm.val_dataset)).cuda()).float().mean().item()

0.9957435131072998

In [None]:
i = 0
batch_size = 256
lex_preds = []
while i < len(dm.val_dataset):
    if i % 2048 == 0: print(i)
    lex_preds.extend(
        model_wrapper(
            dm.val_dataset['document_redact_lexical'][i:i+batch_size], batch_size=batch_size
        )
    )
    i += batch_size
lex_preds = torch.stack(lex_preds)

In [26]:
lex_pred_correct = (lex_preds.argmax(dim=-1) == torch.arange(len(dm.val_dataset)).cuda())
lex_pred_correct.float().mean().item()

0.6248111724853516

In [28]:
correct_lex_redacted_docs = [dm.val_dataset[i] for i in range(14566) if lex_pred_correct[i]]
len(correct_lex_redacted_docs)

9101

In [175]:
from IPython.display import HTML, display
import html

wrap_th = lambda s: f'<th>{s}</th>'
wrap_td = lambda s: f'<td>{s}</td>'

def table_from_table_rows(rows_str: str) -> List[Tuple[str, str]]:
    return [[el.strip() for el in r.split('||')] for r in rows_str.split('\n')]

def html_format_doc(doc: str) -> str:
    # return html.escape(doc)
    return doc.replace('<mask>', '<span style="color: darkgray; background-color: darkgray; line-height: 16px; font-size: 14px; font-weight: 100"> &lt;mask&gt;</span>')

def make_prof_html(profile: str) -> str:
    table = table_from_table_rows(profile)
    s = '<table style="border: 1px solid black"><tbody>'
    # print('table:', table)
    for rkey, rval in table:
        s += '<tr>'
        s += f'<th><b>{rkey}</b></th>'
        s += f'<td>{rval}</td>'
        s += '</tr>'
    s += '</tbody></table>'
    return s

table = '<table>'
table += f'<tr>{wrap_th("ID")}{wrap_th("Name")}{wrap_th("Redacted document")}{wrap_th("Profile")}</th>'
for i in range(100):
    table += '<tr>'
    ex = correct_lex_redacted_docs[i]
    table += wrap_td(ex['text_key_id'])
    table += wrap_td(ex['name'])
    table += wrap_td(html_format_doc(ex['document_redact_lexical']))
    table += wrap_td(make_prof_html(ex['profile']))
    table += '</tr>'
table += '</table>'
    
display(HTML(table))

0,1,2,3
successor,gabriel i,,
name,michael iii of alexandria,,
residence,saint mark 's church,,
ended,16 march 907,,
feast_day,16 -rrb- march -lrb- 20 baramhat in the coptic calendar,,
title,56th of st. mark pope of alexandria & patriarch of the see,,
enthroned,25 april 880,,
predecessor,shenouda i,,
death_date,16 march 907,,
buried,monastery of saint macarius the great,,

0,1
successor,gabriel i
name,michael iii of alexandria
residence,saint mark 's church
ended,16 march 907
feast_day,16 -rrb- march -lrb- 20 baramhat in the coptic calendar
title,56th of st. mark pope of alexandria & patriarch of the see
enthroned,25 april 880
predecessor,shenouda i
death_date,16 march 907
buried,monastery of saint macarius the great

0,1
fullname,hui jun
name,hui jun
article_title,hui jun

0,1
term_end,1974
successor,thomas e. flaherty
name,leonard l. martino
article_title,leonard martino
spouse,marion mason -lrb- m. 1955 -rrb-
alma_mater,duquesne university georgetown university
birth_place,"butler , pennsylvania"
birth_date,1 april 1940
state_house,pennsylvania
party,democratic

0,1
caption,crawford with the los angeles dodgers
name,carl crawford
debutdate,july 20
position,left fielder
bats,left
image_size,250px
debutyear,2002
debutteam,tampa bay devil rays
statlabel,batting bases triples average hits home runs runs batted in stolen
statvalue,".284 1,893 134 754 473 121"

0,1
caption,blessed osanna of cattaro
name,blessed osanna of cattaro -lrb- ozana kotorska -rrb-
beatified_date,1928 -lrb- cultus confirmed -rrb- 1934 -lrb- beatified -rrb-
feast_day,27 april
imagesize,129px
death_date,27 april 1565
image,ozanaofkotor.jpg
beatified_by,pope pius xi
patronage,"kotor , montenegro"
birth_place,"relezi or kumano , principality of zeta/montenegro"

0,1
alt,200px
name,thaila ayala
birth_name,thaila ayala sales
spouse,paulo vilhena -lrb- 2011 -- 2013 -rrb-
image,thaila ayala 02-2 . jpg
birth_place,"presidente prudente , brazil"
birth_date,14 april 1986
article_title,thaila ayala
occupation,"actress , model"

0,1
career_end,2003
ntl_team,sui
height_ft,6
article_title,sven leuenberger
position,defence
height_in,0
image_size,230px
played_for,sc -rrb- ehc uzwil bern -lrb- nla -rrb- hc lugano -lrb- nla
career_start,1987
birth_place,"niederuzwil , sui"

0,1
name,brett scott
originalteam,the rock-yerong creek
position,centreman
years,1981-1989
clubs,south melbourne/sydney
coachclubs,sydney
birth_place,"wagga wagga , new south wales"
birth_date,10 april 1962
article_title,brett scott
coachyears,1993

0,1
nationalgoals,0 0
fullname,marlon evans
caption,marlon bank of guam autograph signing in june 2015 evans taking pictures with the fans at the
name,marlon evans
article_title,marlon evans
nationalyears,2013 -- 2014 --
position,midfield
image_size,250
currentclub,wings
ntupdate,17 october 2014

0,1
name,jesper blicher
youthclubs,kolt-hasselager if agf
caps,34 13 15 35 58 0
position,left winger
pcupdate,8 february 2013
years,-2003 2003 - 2010 2010-2012 2012-2015 2015 -
currentclub,lyngby bk
clubs,kolt-hasselager lyngby bk if agf næstved bk fc fredericia vendsyssel ff
birth_place,denmark
clubnumber,29

0,1
name,stewart ford
networth,gbp # 160million -lrb- september 2010 -rrb-
birth_name,stewart owen ford
residence,switzerland
nationality,british
birth_date,1964
article_title,stewart ford
occupation,"founder and ceo , keydata"

0,1
website,http://www.compassionandchoices.org
name,barbara coombs lee
residence,"portland , oregon"
nationality,american
education,"vassar , oregon health sciences university college , cornell university , university of washington"
article_title,barbara coombs lee
occupation,president of compassion & choices

0,1
term_end,2003
name,darren m. swain
order,maryland house of delegates
profession,administrator
religion,ame
birth_place,"windsor , north carolina"
state_delegate,maryland
birth_date,06 may 1970
article_title,darren swain
party,democrat

0,1
nationalgoals,0
fullname,paul charles quinn
position,center back
pcupdate,"19:14 , 31 august 2015 -lrb- utc -rrb-"
years,2002 2014 -- 2015 2015 -- -- 2009 2009 -- 2012 2012 -- 2014
nationalcaps,3
height,1.85 m ftin 0 on
caps,161 46 73 29 4
image,"quinn , paul.jpg"
nationalteam,scotland u21

0,1
nfl,mot401788
college,northwestern
position,end
finalyear,1946
statvalue,124 13.8 1
debutyear,1946
statlabel,receiving yards average touchdowns
death_date,03 june 2007
draftpick,80 -lrb- by the washington redskins -rrb-
birth_place,"chicago , illinois"

0,1
fullname,christiane soeder
caption,christiane year awards soeder at the 2008 austrian sportspersonality of the
proteam,arbö askö graz cwt 2006
name,christiane soeder
updated,9 april 2009
role,rider
weight,52 kg on
image,christiane soeder nacht des sports 2008a.jpg
birth_place,"remscheid , west germany"
birth_date,15 january 1975

0,1
term_end,2000 1999 1999 22 september 1996
successor,bernard percival
name,molwyn joseph
office,"minister and social improvement minister for planning , implementation and the of environment minister of finance and trade tourism and the environment minister of heath"
profession,politician
nationality,antiguan
article_title,molwyn joseph
party,antigua labour party
term_start,1999 15 may 1996 3 december 1997 1991

0,1
name,edmond reusens
death_date,25 december 1903 -lrb- age 72 -rrb-
nationality,belgian
birth_place,"wijnegem , antwerp"
birth_date,25 april 1831
article_title,edmond reusens
occupation,archeologist & historian
death_place,leuven

0,1
name,nicetas
known_for,conquered it afterwards ; general in the 602 -- 628 war egypt with persia during the revolt against phocas and governed
relatives,emperor heraclius -lrb- cousin -rrb-
death_date,"after 618/9 , possibly 628/9"
religion,chalcedonian christianity
opponents,"phocas , shahrbaraz"
article_title,nicetas -lrb- cousin of heraclius -rrb-
years_active,608 -- 618/9
children,empress gregoria
ethnicity,byzantine

0,1
term_end,7 march 2007
successor,caral ni chuilin
name,kathy stanton
assembly,northern ireland
imagesize,150px
constituency_am,belfast north
article_title,kathy stanton
party,sinn féin
term_start,26 november 2003
predecessor,billy hutchinson

0,1
home_town,"wilton , connecticut , u.s."
name,abby elliott
birthname,abigail elliott
yearsactive,2006 -- present
relatives,bob elliott -lrb- grandfather -rrb-
alma_mater,marymount manhattan college
parents,paula niedert chris elliott
birth_place,"new york city , new york , u.s."
education,immaculate high school
birth_date,16 june 1987

0,1
caption,"thomas bonn , germany on april 5 , 2015 at the '' hobbitcon iii '' convention in"
name,jeffrey thomas
image,250px
years_active,1979-present
birth_place,"wales , united kingdom"
article_title,jeffrey thomas -lrb- actor -rrb-
occupation,actor

0,1
fivefor,--
fullname,paul wilson brooks
source,http://www.espncricinfo.com/ci/content/player/10029.html cricinfo
catches/stumpings,-- / --
club,middlesex
death_place,"paddington , london , england"
deliveries,balls --
tenfor,--
nickname,mr
death_date,26 january 1946

0,1
fullname,asbjørn kragh andersen
caption,asbjørn kragh andersen at the 2015 paris-arras tour .
proteam,team trefor christina watches - kuma team trefor-blue water
name,asbjørn kragh andersen
updated,12 may 2015
role,rider
image,"arras mai 2015 -lrb- a061 -rrb- . jpg - paris-arras tour , étape 3 , 24"
birth_date,09 april 1992
article_title,asbjørn kragh andersen
discipline,road

0,1
fullname,tigam alif farisma
name,tigam
caps,1
position,defender
pcupdate,11 september 2013
currentclub,mitra kukar f.c.
birth_place,indonesia
clubnumber,22
birth_date,6 august 1992
article_title,tigam alif farisma

0,1
name,samia hamouda abbou
office,member of the assembly of the representatives of the people of the constituent assembly -lrb- elect -rrb- member
constituency,tunis i nabeul ii
spouse,mohamed abbou
native_name,سامية حمودة عبو
image,220px
term_start,2011-12-27
native_name_lang,ar
birth_place,"tebourba , tunisia"
alma_mater,tunis el manar university

0,1
birth_date,7 december 1994
club,Älvdalens skg
name,lotten sjödén
article_title,lotten sjödén

0,1
high_school,"leesville -lrb- raleigh , north carolina -rrb-"
career_end,2009
name,paul stoll
college,clemson -lrb- 2002 -- 2006 -rrb-
height_ft,6
article_title,shawan robinson
position,point guard / shooting guard
draft_year,2006
height_in,2
years,2006 2009 -- 2007 2007 -- 2008 2008 2008 --

0,1
caption,christine frederick
name,christine isobel mcgaffey frederick
spouse,j. george frederick -lrb- businessman -rrb-
death_date,6 april 1970
nationality,american
birth_place,"boston , massachusetts , u.s."
birth_date,6 february 1883
article_title,christine frederick
occupation,"home economist , author"

0,1
name,frank waln
module,yes hop label = associated_acts = website = background = solo_singer instrument = genre = hip
years_active,2010 -- present
birth_place,"rosebud sioux reservation , south dakota , united states"
article_title,frank waln
occupation,rapper songwriter

0,1
name,warren gill
finalteam,pittsburgh pirates
debutdate,august 26
position,first baseman
bats,right
finaldate,september 29
finalyear,1908
debutyear,1908
debutteam,pittsburgh pirates
statlabel,batting average home runs runs batted in

0,1
birth_date,17 february 1201 -lrb- 11 jamadi al-ula 597 -rrb-
influences,"avicenna , fakhr al-din razi , mo'ayyeduddin urdi"
name,'' ` nasīr al-dīn tūsī '' '
school_tradition,avicennism
article_title,nasir al-din al-tusi
influenced,"maitham shirazi , ibn al-shatir , copernicus al bahrani , ibn khaldun , qutb al-din"
era,islamic golden age
image_size,160px
works,"'' ` aqaid '' , '' akhlaq-i-nasri '' , '' zij-i rawḍa-yi ilkhani '' , '' al-risalah al-asturlabiyah '' , '' al-tadhkirah taslīm fi'ilm al-hay ` ah '' '' , '' tajrid al -"
title,khawaja nasir

0,1
caption,"dundas 1942 at raf duxford , cambridgeshire , 2 january"
name,sir hugh dundas
awards,commander service order distinguished flying cross of the order of the british empire distinguished
birth_name,hugh spencer lisle dundas
laterwork,company director
article_title,hugh dundas
allegiance,united kingdom
death_date,10 july 1995
image,ch 4545.jpg
nickname,cocky

0,1
name,jonah kapena
signature,iona kapena 1842 signature.jpg
spouse,kahilipulu
death_date,"march 12 , 1868"
alma_mater,lahainaluna seminary
religion,congregationalism
nationality,hawaiian
article_title,jonah kapena
resting_place,kawaiahaʻo church
occupation,"royal secretary , judge , civil servant , editor"

0,1
occupation,singer
website,-lsb- http://www.myspace.com/chiaramalta chiara in myspace -rsb-
caption,chiara in 2009
name,chiara siracusa
background,solo_singer
label,bridge productions
image,chiara1.jpg
alias,chiara
origin,"senglea , malta"
years_active,1998 -- present

0,1
term_end,"january 3 , 2015"
successor,stacey plaskett
name,donna christian-christensen
birthname,donna christian
spouse,chris christensen
alma_mater,"st. mary 's college , indiana george washington university"
image,donna christensen.jpg
religion,moravian
birth_place,"teaneck , new jersey , u.s."
state,the virgin islands

0,1
nationalgoals,1
position,forward / midfielder
pcupdate,"17:57 , 10 may 2014 -lrb- utc -rrb-"
years,2000 2008 -- 2009 2009 -- 2011 2011 -- 2013 2013 -- -- 2014 2014 -- 2005 2005 -- 2006 2006 -- 2008
nationalcaps,18
height,1.73 m on
youthclubs,planète champion
caps,39 76 28 41 18 42 15 33 0
image,patrick zoundi 2013 3.jpg
nationalteam,burkina faso

0,1
event,400m hurdles
caption,roxroy cato competing at the 2014 commonwealth games
article_title,roxroy cato
weight,kg 77
image_size,150px
image,roxroycato2014.jpg
sport,athletics
birth_date,5 january 1988
collegeteam,lincoln university st. augustine 's university
height,1.83

0,1
caption,"weaver premiere in sydney , australia in december 2012 at '' les misérables '' red carpet movie"
name,jacki weaver
honorific_suffix,ao
birth_name,jacqueline ruth weaver
yearsactive,1966 & ndash ; present
spouse,"david max hensser -lrb- 1975 - ? -rrb- derryn hinch -lrb- price 1983 & ndash ; 1996 , 1997 & ndash ; -lrb- 1998 -rrb- sean taylor -lrb- 2003 & ndash ; present 1966 -rrb- & ndash ; 1970 -rrb-"
image,jacki live music photographer -lrb- 1 -rrb- . jpg weaver - flickr - eva rinaldi celebrity and
partner,john walters
birth_place,"sydney , new south wales , australia"
birth_date,25 may 1947

0,1
name,james donald
birthname,james robert macgeorge donald
yearsactive,1930s-1978
spouse,ann -rrb- 1 child donald -lrb- ? -1993 -rrb- -lrb- his death
death_date,3 august 1993
image,actor_james_donald . jpg
birth_place,"aberdeen , scotland uk"
birth_date,18 may 1917
article_title,james donald
occupation,actor

0,1
name,michael c. j. putnam
fields,classical studies
image_size,70px
alma_mater,harvard university
nationality,united states
birth_place,"springfield , massachusetts"
birth_date,20 september 1933
article_title,michael c. j. putnam
workplaces,brown university

0,1
updated,2014-09-15
name,svend erik bjerg
birth_place,"maribo , denmark"
birth_date,16 november 1944
article_title,svend erik bjerg

0,1
name,bill ingebrigtsen
spouse,marilyn ingebrigtsen
alma_mater,alexandria technical and community college
predecessor,dallas sams
religion,united methodism
birth_place,"karlstad , minnesota , u.s."
birth_date,26 march 1952
article_title,bill ingebrigtsen
party,republican
term_start,"january 3 , 2007"

0,1
name,byzantine
background,group_or_band
current_members,chris matt wolfe `` oj '' ojeda brian henderson sean sydnor
image_size,250
label,"prosthetic , independent"
origin,"charleston , west virginia"
years_active,"2000 -- 2008 , 2010 -- present"
article_title,byzantine -lrb- band -rrb-
genre,"groove see below -rrb- metal , thrash metal , progressive metal -lrb-"
past_members,tony skip '' cromer rohrbough chris `` cid '' adams michael ``

0,1
occupation,singer
name,siw malmkvist
background,solo_singer
birth_name,siw gunnel margareta malmkvist
image_size,200px
image,siw malmkvist.jpg
landscape,siw malmkvist at melodifestivalen 1961
years_active,1957 -
birth_place,"borstahusen , sweden"
birth_date,31 december 1936

0,1
career_end,1960
name,ivan irwin
height_ft,6
article_title,ivan irwin
position,defenseman
height_in,1
image_size,200px
played_for,new york rangers montreal canadiens
image,1954 topps ivan irwin.jpg
career_start,1947

0,1
term_end,1830
birth_date,1754
successor,baron plunket
name,sir anthony hart
order,lord chancellor of ireland
monarch,george iv
death_date,6 december 1831
nationality,british
primeminister,"f. j. robinson , 1st viscount goderich"
article_title,anthony hart

0,1
name,baba deep singh
known_for,"1 -rrb- first head of damdami taksal , 3 -rrb- freed -rrb- the captives during the fourth raid of ahmad shah durrani first in 1757 . head of shaheedan misl , 2"
death_date,13 november 1757
birth_place,"pahuwind -rrb- , amritsar -lrb- now sri tarn taran sahib"
article_title,baba deep singh
death_place,"harmandir sahib , amritsar"

0,1
name,joseph d. kearney
fields,civil litigation appellate practice
image_size,137px
alma_mater,harvard law school yale university
image,jdkearney.jpg
religion,roman catholic
birth_place,"chicago , illinois"
article_title,joseph d. kearney
workplaces,marquette university law school

0,1
career_end,1972
height_ft,5
article_title,dick duff
position,left wing
height_in,10
image_size,200px
played_for,toronto angeles kings buffalo sabres maple leafs new york rangers montreal canadiens los
image,chex dick duff.jpg
career_start,1954
birth_place,"kirkland lake , on , can"

0,1
website,senate website
spouse,david -lrb- 2002 -- present -rrb- exposito -lrb- 1984 -- 1995 -rrb- joseph shepard
party,democratic
office,34th auditor of missouri
image,235px
governor,mel carnahan roger wilson bob holden matt blunt
birth_date,24 july 1953
article_title,claire mccaskill
jr/sr,united states senator
children,3

0,1
nfl,coo270410
name,ed cooke
college,maryland
position,de/lb
finalyear,1967
statvalue,117 - 25 7 2
debutyear,1958
statlabel,games played - started interceptions touchdowns
draftpick,29
draftround,3

0,1
article_title,ken morley
yearsactive,1972-present
spouse,susan morley -lrb- 1989-present -rrb-
notable_works,reg von flockenstuffen in '' ` allo ` allo ! holdsworth in '' coronation street '' general leopold
nationality,english
birth_place,"chorley , lancashire , england , uk"
birth_date,17 january 1943
imagesize,150px
television,'' & ndash ; 1991 -rrb- '' coronation street '' -lrb- ` 1989 & ndash ; 1995 -rrb- '' hardware '' -lrb- allo 2003 & ndash ; 2004 -rrb- '' celebrity fit club ` '' -lrb- 2005 -rrb- '' celebrity masterchef '' -lrb- 2014 allo -rrb- '' celebrity big brother '' -lrb- 2015 -rrb- ! '' -lrb- 1988
occupation,"actor , comedian"

0,1
nationalgoals,20
fullname,maría catalina usme pineda
name,catalina usme
caps,12
article_title,catalina usme
nationalyears,2006 --
position,striker
ntupdate,"23:27 , 23 june 2015 -lrb- utc -rrb-"
nationalteam,colombia
birth_place,colombia

0,1
fullname,richard jasiewicz
position,sr lf-lk
source,-lsb- http://www.rugbyleagueproject.org/players/all.html rugbyleagueproject.org -rsb- -lsb- http://www.englandrl.co.uk englandrl.co.uk -rsb-
retired,yes
yearastart,1984
appearancesa,1
club,bradford northern wakefield trinity doncaster
fieldgoalsa,0
yearstart,≤ 1980 ≤ 1987 ≤ 1991
pointsa,0

0,1
name,sir albert percy addison
awards,knight companion of the order of the bath companion of the commander order of st michael and st george mentioned in despatches of the order of the british empire
article_title,albert addison
allegiance,united kingdom 23px
death_date,13 november 1952
serviceyears,1889 -- 1929 1939 -- 1946
rank,admiral
commands,hm australian fleet hms dartmouth hms '' e52 ''
birth_date,8 november 1875
battles,first world war second world war

0,1
term_end,1807
caption,george ponsonby .
successor,the lord manners
name,george ponsonby
birth_date,5 march 1755
order,lord chancellor of ireland
monarch,george iii
imagesize,200px
death_date,8 july 1817
image,georgeponsonby.jpg

0,1
nfl,dav222267
position,tight end
heightft,6
heightin,5
debutteam,new york giants
birth_date,13 march 1983
article_title,charles davis -lrb- tight end -rrb-
draftyear,2006
number,88
finalteam,hartford colonials

0,1
time,376 days 17hrs 49min
name,peggy annette whitson
selection,1996 nasa group
eva,"6 39 hours , 46 minutes"
alma_mater,iowa wesleyan college rice university
image,peggy whitson.jpg
status,active
nationality,american
birth_place,"mount ayr , iowa"
mission,"sts-111 -lrb- expedition 16 -rrb- , expedition 5 , sts-113 , soyuz tma-11"

0,1
fullname,thiago dos santos costa
name,thiago
position,defender
years,2009
currentclub,são luiz
clubs,ehime
birth_place,brazil
birth_date,28 february 1983
article_title,thiago dos santos costa
height,1.82

0,1
position,shooting guard
draft_pick,54
years,1960 -- 1961 1961 1961 -- 1962 1968
nationality,american
team,kirby st. louis hawks anaheim amigos -lrb- aba -rrb- 's shoes -lrb- nibl -rrb- los angeles lakers
bbr,simsbo01
high_school,"jordan -lrb- los angeles , california -rrb-"
draft_team,st. louis hawks
draft_year,1960
height_in,5

0,1
fullname,ben wilson
name,ben wilson
originalteam,norwood
years,1996 1997 '' ` total - '' '
draftpick,"9th , 1994 afl draft"
clubs,collingwood sydney swans
birth_date,25 february 1977
article_title,ben wilson -lrb- australian footballer -rrb-
heightweight,191 ; kg & nbsp ; cm / 87 & nbsp
statsend,1997

0,1
name,richie myers
finalteam,chicago cubs
debutdate,april 21
position,pinch runner/pinch hitter
bats,right
finaldate,may 2
finalyear,1956
debutyear,1956
debutteam,chicago cubs
statlabel,games played at bats hits runs scored

0,1
field,"watercolor , painting"
caption,"theodore earl butler , 1909"
movement,american impressionism
name,theodore earl butler
bgcolour,# 6495ed
death_date,"may 2 , 1936"
image,portrait of butler.jpg
nationality,american
birth_place,"columbus , ohio"
birth_date,1861

0,1
name,francesco domenico chiarello
laterwork,farmer
article_title,francesco domenico chiarello
allegiance,italy italy
death_date,27 june 2008
serviceyears,"1918 & ndash ; 1920 , 1940"
birth_place,"umbriatico , italy"
birth_date,5 november 1898
battles,world war i world war ii
branch,italian army

0,1
name,İbrahim halil fırat
office,member of the grand national assembly
constituency,adıyaman -lrb- june 2015 -rrb-
imagesize,220 px
profession,lawyer
alma_mater,marmara university
birth_place,"adıyaman , turkey"
birth_date,01 january 1973
article_title,İbrahim halil fırat
party,justice and development party -lrb- akp -rrb-

0,1
website,-lsb- http://www.shantanumoitra.in/ official website -rsb-
name,shantanu moitra
image,shantunu moitra.jpg
birth_place,"lucknow , uttar pradesh , india"
birth_date,22 january 1968
article_title,shantanu moitra
occupation,"composer , film score composer"

0,1
name,samuel h. armacost
alma_mater,denison university stanford university
birth_date,1939
article_title,samuel armacost
employers,bankamerica corporation sri international chevron corporation

0,1
updated,2014-07-30
name,hamid oraibi
birth_place,"baghdad , iraq"
birth_date,1941
article_title,hamid oraibi

0,1
honorific_suffix,2nd baronet
name,sir smith hill child
awards,"dso , croix de guerre"
article_title,"sir smith child , 2nd baronet"
honorific_prefix,brigadier-general
allegiance,british
death_date,11 november 1958
serviceyears,1900 - 1924
placeofburial,"westbury , shropshire"
rank,brigadier-general

0,1
name,prince gabriel
father,"prince alfonso , count of caserta"
issue,prince immaculata prince casimir antoine prince jean princess maria margarita princess marie
spouse,princess malgorzata izabella czartoryska princess cecylia lubomirska
title,prince gabriel of bourbon-two sicilies
death_date,22 october 1975
mother,princess maria antonietta of bourbon-two sicilies
religion,roman catholic
birth_place,"cannes , france"
birth_date,11 january 1897

0,1
name,pete donnelly
finalteam,philadelphia white stockings
debutdate,may 13
position,outfielder/second baseman/shortstop
bats,unknown
finaldate,october 30
finalyear,1874
debutyear,1871
debutteam,keokuk westerns
statlabel,batting average runs runs batted in

0,1
birth_date,15 april 1989
playing_years,2011
name,marc mueller
college,university of regina
height_ft,6
coaching_teams,calgary stampeders -lrb- defensive assistant -rrb-
position,quarterback
playing_teams,edmonton eskimos *
height_in,0
career_footnotes,* offseason and/or practice squad member only

0,1
term_end,1991 march 1989
name,vernon a. walters
office,united 17th united states ambassador to the united nations states ambassador to the federal republic of germany
succeeded,thomas r. pickering
death_date,"february 10 , 2002 -lrb- aged 85 -rrb-"
birth_place,"new york city , new york"
preceded,jeane kirkpatrick
birth_date,"january 3 , 1917"
article_title,vernon a. walters
president,george h. w. bush ronald reagan

0,1
name,aleksi salmenperä
yearsactive,1998-present
birth_place,"helsinki , finland"
birth_date,1973
article_title,aleksi salmenperä
occupation,"film director , screenwriter"

0,1
career_end,2005
position,power forward / center
draft_pick,27
years,1993 1996 -- 1997 1997 1997 1997 -- 1998 1998 1999 y 2000 2000 2000 -- 2001 2001 2001 -- 2002 2002 1994 -- 2003 2003 -- 2004 2004 -- 2005 -- 1995 1995 -- 1996 1996
nationality,american
team,phoenix -lrb- france -rrb- konya kombassan -lrb- turkey -rrb- cb murcia suns arte -lrb- spain -rrb- rockford lightning -lrb- cba -rrb- tiburones omaha de aguadilla -lrb- puerto rico -rrb- cáceres cb -lrb- spain racers -rrb- león caja españa -lrb- spain -rrb- sporting athens -lrb- -lrb- greece -rrb- quad city thunder -lrb- cba -rrb- leones de cba ponce -lrb- puerto rico -rrb- media broker messina -lrb- italy -rrb- -rrb- atléticos de san germán -lrb- puerto rico -rrb- jda jda dijon -lrb- france -rrb- jilin northeast tigers -lrb- china -rrb- dijon besançon -lrb- france -rrb- anwil włocławek -lrb- poland -rrb-
bbr,mackema01
high_school,"brainerd -lrb- chattanooga , tennessee -rrb-"
draft_team,phoenix suns
draft_year,1993

0,1
nfl,fau226521
college,southern
aflstatvalue,20 1.0 6 6 6
position,defensive lineman
finalyear,1996
undraftedyear,1987
statvalue,81 5.5 0
debutyear,1987
statlabel,tackles sacks interceptions
afl,490

0,1
field,geologist
caption,alfred elis törnebohm
name,alfred elis törnebohm
known_for,overthrust of the caledonian range
image_size,150px
death_date,21 april 1911
image,alfred elis törnebohm.jpg
nationality,swedish
birth_date,16 october 1838
article_title,alfred elis törnebohm

0,1
nationalgoals,0
name,aharon amar
caps,132
image_size,250
image,mhfc-aharon-amar . jpg
nationalteam,israel
nationalcaps,14
article_title,aharon amar
goals,35

0,1
fivefor,& ndash ; - - & ndash ;
fullname,tim gruijters
tidebutyear,2012
source,http://www.cricketarchive.com/players/128/128597/128597.html cricket archive
international,true
catches/stumpings,2 ; 10 / & ndash ; 4 / & ndash / ; & ndash ; 1 / & ndash
lasttidate,28 november
odidebutagainst,kenya
role,batsman
deliveries,balls 120 6 302 72

0,1
caption,tanushree dutta
name,tanushree dutta
yearsactive,2004 -- 2010
family,ishita dutta -lrb- sister -rrb-
competitions,femina universe 2004 -lrb- top 10 -rrb- miss india universe 2004 -lrb- winner -rrb- miss
title,femina miss india universe 2004
image,tanushree dutta 20110525 1457168009.jpg
birth_place,"jamshedpur -rrb- , bihar , india -lrb- now in jharkhand"
birth_date,19 march 1984
article_title,tanushree dutta

0,1
name,eva grlić
spouse,danko grlić
imagesize,150px
death_date,31 july 2008 -lrb- aged 88 -rrb-
image,eva grlić.jpg
nationality,croat
birth_place,"budapest , hungary"
relations,"rudolf war ii -rrb- domany -lrb- first husband , killed during world"
birth_date,1920
article_title,eva grlić

0,1
occupation,"actress , voice dubbing artist , folk dancer"
website,http://www.parmindergill.in
name,parminder gill
background,artist
spouse,sukhjinder gill -lrb- 1992 -- present -rrb-
image,file : parminder gill1.jpg
origin,"ludhiana , punjab , india"
birth_place,"raikot , punjab , india"
years_active,1988 - present
birth_date,16 september 1970

0,1
fullname,hakim mouzaki
name,hakim mouzaki
youthclubs,wydad casablanca
caps,86
position,goalkeeper
pcupdate,"april 16 , 2009"
years,2003-2004
currentclub,wydad casablanca
clubs,wydad casablanca
ntupdate,"october 17 , 2008"

0,1
nationalgoals,0
fullname,luis gatty ribeiro roca
position,defender
pcupdate,"may 25 , 2011"
years,2007-2009 2010 2011 --
ntupdate,"september 10 , 2009"
nationalcaps,36
height,1.62 m ftin 0 on
caps,246 87 22 17
nationalteam,bolivia

0,1
field,painting
movement,expressionism
name,iosif iser
bgcolour,#eedd 82
imagesize,200px
death_date,25 april 1958
image,iosif iser.jpg
nationality,romanian
birth_place,bucharest
birth_date,21 may 1881

0,1
years_active,1992 -- present
birth_place,"ottawa , ontario , canada"
birth_date,11 march 1958
article_title,stephen r. hart
occupation,actor

0,1
caption,"carmody at welsh-ryan arena on january 3 , 2013"
name,bill carmody
coaching_records,"ivy , 78.6 % -- 92 -- 25 , princeton , league 1996 -- 2000 -rrb- career winning percentage -lrb- min 4 seasons"
overall_record,284-245 -lrb- 261 203 -rrb-
current_title,head coach
alma_mater,union college
image,20130103 bill carmody.jpg
coach_team,fulton -lrb- assistant -rrb- providence -lrb- asst. -rrb- princeton -lrb- asst. -- -rrb- princeton northwestern holy cross montgomery cc union college -lrb- ny -rrb-
birth_place,"rahway , new jersey"
championships,"ivy -rrb- league regular season championship -lrb- 1997 , 1998"

0,1
name,joy mangano
yearsactive,1989 -- present
spouse,anthony miranne -lrb- divorced -rrb-
birth_place,"new york , u.s."
birth_date,15 february 1956
article_title,joy mangano
occupation,entrepreneur
children,3

0,1
caption,jaka in 2010
name,faraz jaka
updated,"march 28 , 2014"
nickname,`` the toilet ''
image,faraz jaka final table napt mohegan sun bounty shootout.jpg
birth_place,"san jose , california , u.s."
birth_date,"september 9 , 1985"
article_title,faraz jaka

0,1
caption,tyutchev as painted by stepan alexandrovsky
name,fyodor tyutchev
spouse,eleonore ernestine von dörnberg peterson -lrb- 1826 & ndash ; 1838 -rrb-
death_date,july 27 1873 july 15 -lrb- aged 69 -rrb-
image,fyodor tyutchev.jpg
birth_place,"ovstug near bryansk , russian empire"
birth_date,december 5 1803 november 23
article_title,fyodor tyutchev
death_place,"saint petersburg , russian empire"

0,1
name,eddie miller
finalteam,san diego padres
debutdate,september 5
position,outfielder
bats,both
finaldate,september 30
finalyear,1984
debutyear,1977
debutteam,texas rangers
statlabel,batting average home runs runs batted in games played

0,1
term_end,"december 13 , 1904"
name,charles berkeley powell
office,member of the legislative assembly of ontario for ottawa
death_date,1933
image,charles berkeley powell.jpg
birth_place,"port dover , ontario"
birth_date,19 august 1858
article_title,charles berkeley powell
party,conservative
term_start,"march 1 , 1898"

0,1
tradchinesename,謝怡芬
pinyinchinesename,xiè yí fēn
name,janet hsieh
birth_name,janet josephine hsieh
spouse,george young -lrb- 2015 - -rrb-
nationality,american
birth_place,"houston , texas , united states"
years_active,2002-present
birth_date,20 january 1980
article_title,janet hsieh

0,1
fullname,mehdi mahdavi
name,mehdi mahdavi
spike,3.30 m
position,setter
weight,92 kg lb on
block,3.10 m
currentnumber,13
image,mahdavi iran usa 2015.jpg
currentclub,iran barij essence
birth_place,"karaj , iran"

0,1
name,patrick lapeyre
awards,prix femina
language,french
notableworks,'' fin '' la vie est brève et le désir sans
birth_place,pantin
birth_date,1949
article_title,patrick lapeyre

0,1
caption,"abner p. allen , medal of honor recipient"
name,abner p. allen
awards,medal of honor ribbon.svg medal of honor
article_title,abner p. allen
placeofburial_label,place of burial
allegiance,united states of america 1863 union
death_date,22 august 1905
image,armymoh.jpg
serviceyears,1864 - 1865
rank,25px corporal

0,1
caption,sakai her 1998 '' work out fine '' tour at a photo shoot in hong kong during
talents,"singing , acting"
name,noriko sakai
background,solo_singer
native_name,酒井 法子
image,sakai noriko-groink2000 . png
native_name_lang,jpn
birth_place,"fukuoka , japan"
years_active,1986 -- present
birth_date,14 february 1971

0,1
fivefor,& ndash ;
fullname,paul robert archibald johnston
source,http://www.espncricinfo.com/ci/content/player/346100.html cricinfo
catches/stumpings,6 / & ndash ;
club,cumberland durham ucce
deliveries,balls 6
tenfor,& ndash ;
country,england
birth_date,13 december 1988
article_title,paul johnston -lrb- cricketer -rrb-

0,1
occupation,musician composer musical director producer educator
website,-lsb- http://www.georgeduke.com/ george duke official site.com -rsb-
caption,"duke at the montreux jazz festival , 1986"
name,george duke
background,solo_singer
notable_instruments,moog synthesizer
instrument,"vocals , flute , bass , trombone , piano , synthesizer , saxophone , keytar"
death_date,5 august 2013
image,georgedukecropped.jpg
label,"pacific atlantic , epic/cbs , elektra , warner bros. , bizarre jazz , telarc jazz , heads up , pickwick , mps/saba , mps/basf ,"


In [33]:
[doc for doc in dm.val_dataset['document'] if 'table tennis' in doc]

['hui jun is a male former table tennis player from china .\n',
 'cornelia molnar ( born 26 november 1983 ) is a croatian table tennis player .\nshe competed for croatia at the 2004 summer olympics and 2012 summer olympics .\n',
 'bora vang ( 王博 , born april 9 , 1987 ) is a chinese-born turkish national table tennis player .\nthe tall athlete at competes for adana table tennis club , where he is coached by sabahattin sabrioğlu .\nhe won the gold medal in the mized doubles event at the 2010 european mixed double championships held in subotica , serbia together with his turkish teammate Şirin he .\nin 2012 , he and his turkish teammate melek hu won the silver medal at the european mixed double championships held in buzau , romania .\nvang qualified for the 2012 summer olympics after the world qualification championship held in doha , qatar .\nat the 2013 mediterranean games held in mersin , turkey , he won the silver medal in the singles event .\n',
 "mikael appelgren ( born october 15 ,

In [59]:
len([doc for doc in dm.val_dataset['document'] if 'brazil' in doc])

207

In [61]:
len([doc for doc in dm.val_dataset['document'] if (('wales' in doc) or ('welsh' in doc)) and ('actor' in doc)])

10

In [72]:
def get_num_words_until_paren(s: str) -> int:
    try:
        return s.split(' ').index('(')
    except ValueError:
        return -1

    
four_word_named_people = [(i, doc) for i,doc in enumerate(dm.val_dataset['document']) if (get_num_words_until_paren(doc) == 4) and lex_pred_correct[i]]
len(four_word_named_people)

579

In [75]:
brazilian_four_word_named_people = [(i, doc) for i, doc in four_word_named_people if (('brazil' in doc) or ('brasil' in doc))]
len(brazilian_four_word_named_people)

26

In [76]:
brazilian_four_word_named_people[0]

(82,
 'thiago dos santos costa ( born february 28 , 1983 ) is a brazilian footballer who plays for são luiz .\n')

In [77]:
brazilian_four_word_named_people[1]

(460,
 'luis ricardo silva umbelino ( born 21 january 1984 ) is a brazilian footballer who plays for botafogo , on loan from são paulo fc .\nmainly a right back , he can perform equally as a right midfielder .\n')

In [78]:
brazilian_four_word_named_people[2]

(493,
 'fabián raphael estay silva ( born october 5 , 1968 ) is a chilean football midfielder .\nhe was capped 69 times and scored five goals for the chile national team between 1990 and 2001 , including four games at the 1998 fifa world cup .\nestay made his debut for the senior national squad on october 17 , 1990 in a friendly against brazil .\nfabián played for club sides in chile , switzerland , greece , mexico and colombia .\n')

In [79]:
brazilian_four_word_named_soccer_players = [(i, doc) for i,doc in brazilian_four_word_named_people if 'play' in doc]
len(brazilian_four_word_named_soccer_players)

19

In [83]:
len([doc for doc in dm.val_dataset['document'] if (('iran' in doc) and ('volleyball' in doc))])

1

In [93]:
import glob
import pandas as pd

adv_csv_path = glob.glob('../adv_csvs/model_8_1day/*')[0]

df = pd.read_csv(adv_csv_path)
df['perturbed_text'] = df['perturbed_text'].map(lambda s: s.replace('<SPLIT>', '\n'))

In [102]:
def make_html_table(rows: List[List[str]]) -> str:
    assert len(rows) > 0
    table = '<table style="font-size: 18px">'
    table += '<tr>'
    table += ''.join([wrap_th(el) for el in rows[0]])
    table += '</tr>'
    for row in rows[1:]:
        table += '<tr>'
        for el in row:
            table += wrap_td(el)
        table += '</tr>'
    table += '</table>'
    return table

In [174]:

def limit_100_words(doc: str) -> str:
    words = doc.split(' ')
    if len(words) > 100:
        doc = ' '.join(words[:100]) + '…'
    return doc

def html_format_doc(doc: str) -> str:
    return doc.replace('<mask>', '<span style="color: lightgray; background-color: lightgray; font-weight: 100"> &lt;mask&gt;</span>')

def display_redacted_doc(idx):
    adv_doc = limit_100_words(df['perturbed_text'][idx])
    lex_doc = limit_100_words(dm.val_dataset[idx]['document_redact_lexical'])
    
    table = make_html_table(
        [
            ['Name', 'Lexically redacted', 'Adversarially redacted'],
            [dm.val_dataset[idx]['name'], html_format_doc(lex_doc), html_format_doc(adv_doc)]
        ]
    )
    
    display(HTML(table))

display_redacted_doc(81)

Name,Lexically redacted,Adversarially redacted
Peggy Annette Whitson,"<mask> <mask> <mask> ( born <mask> 9 , <mask> ) is an <mask> biochemistry researcher , <mask> <mask> , and former <mask> chief <mask> . her first space <mask> was in 2002 , with an extended stay aboard the international space station as a member of <mask> <mask> . her second <mask> launched october 10 , 2007 , as the first female commander of the iss with <mask> <mask> . with her two long-duration stays abroad the iss , <mask> is <mask> 's most experienced female <mask> , with just over <mask> <mask> in space . this also places her twentieth among all space…","<mask> <mask> <mask> ( born <mask> <mask> , <mask> ) <mask> an american <mask> <mask> , <mask> <mask> , and former nasa chief <mask> . her <mask> <mask> <mask> <mask> <mask> <mask> , with an extended stay <mask> <mask> <mask> <mask> <mask> as <mask> <mask> of <mask> <mask> .  <mask> <mask> <mask> <mask> october <mask> , <mask> , <mask> <mask> <mask> female <mask> of the iss <mask> <mask> <mask> .  <mask> her <mask> long-duration stays <mask> the iss , <mask> is nasa 's most experienced female <mask> , with just over <mask> <mask> in <mask> . this <mask> <mask> her twentieth among all <mask>…"


In [104]:
display_redacted_doc(801)

Name,Lexically redacted,Adversarially redacted
Truid Blaisse-terwindt,"<mask> <mask> ( 4 <mask> <mask> -- <mask> <mask> <mask> ) was a dutch female hockey - and tennis player who was active in the 1930s and 1940s . between 1935 and <mask> she participated in five wimbledon championships . her best result in the singles event was reaching the third round in <mask> , losing to dorothy round , and <mask> , losing to first seeded margaret du pont . in the doubles she reached the third round in <mask> and <mask> partnering compatriot madzy rollin couquerque . with ivo rinkel she reached the fourth round of the mixed doubles in <mask> . in…","<mask> <mask> ( <mask> <mask> <mask> -- <mask> <mask> <mask> ) was <mask> <mask> female <mask> - <mask> tennis player who <mask> <mask> in <mask> 1930s and 1940s . between 1935 and 1948 she participated in five wimbledon championships . her best result in <mask> singles event was reaching <mask> third round in 1937 , <mask> to dorothy round , and 1948 , losing to first <mask> margaret du <mask> . in the doubles she <mask> the third round in 1936 and <mask> <mask> compatriot madzy rollin couquerque . with ivo <mask> she <mask> the fourth round <mask> the mixed doubles in <mask> . in…"


In [145]:
display_redacted_doc(701)

Name,Lexically redacted,Adversarially redacted
Mikhail Bulgakov,"<mask> <mask> <mask> ( ; , ; -- <mask> <mask> , <mask> ) was a <mask> writer and <mask> active in the first half of the 20th century . he is best known for <mask> novel `` the master and margarita '' , which has been called one of the masterpieces of the 20th century .","<mask> <mask> <mask> ( ; , ; -- <mask> 10 , <mask> ) was a russian writer <mask> <mask> active in the first half of <mask> 20th century . he is best known for his <mask> `` the master and margarita '' , which has been called one of the masterpieces of the 20th century ."


In [168]:
import random
display_redacted_doc(random.choice(range(1000)))

Name,Lexically redacted,Adversarially redacted
Tony Do Pilar Patrao,"<mask> <mask> <mask> <mask> ( born <mask> , <mask> in <mask> , ) is a french-portuguese footballer . he currently plays in the cfa for <mask> <mask> .  <mask> played at berre l'etang and <mask> . source : http://www.foot-national.com/4168-joueur-football- <mask>- <mask>- <mask>- <mask>.html","<mask> <mask> <mask> <mask> ( born 19 , <mask> <mask> <mask> , ) is a <mask> <mask> . he currently plays in the cfa <mask> <mask> <mask> .  <mask> played <mask> berre l'etang and <mask> . source : http:// <mask>.foot-national.com/4168-joueur-football-do-pilar-patrao-tony.html"
