In [1]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

# English stop-word list from NLTK, loaded once; get_words_from_doc filters
# tokens against this collection.
eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
# Turn on tqdm's pandas integration (progress bars for pandas operations).
tqdm.pandas()


def get_words_from_doc(s: str, stop_words=None) -> List[str]:
    """Split a document on whitespace and drop stop words.

    Args:
        s: The document text. This is a single string (the body calls
            ``s.split()``), not a list of strings.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``eng_stopwords`` collection is used.

    Returns:
        The whitespace-separated tokens of ``s`` in their original order,
        excluding every token found in the stop-word collection. Matching is
        exact and case-sensitive; no normalisation is applied.
    """
    if stop_words is None:
        stop_words = eng_stopwords
    # Build a set once per document so each membership test is a hash lookup
    # rather than a linear scan of the stop-word list for every token.
    blocked = frozenset(stop_words)
    return [w for w in s.split() if w not in blocked]

def get_bm25(dataset):
    """Build a BM25Okapi index over the table text of every example.

    Args:
        dataset: An object supporting ``.map(fn)`` and column access by name.
            Each example must provide
            ``example['input_text']['table']['column_header']`` and
            ``example['input_text']['table']['content']`` as lists of strings.

    Returns:
        A ``BM25Okapi`` instance whose corpus holds one tokenised document per
        example, in dataset order.
    """
    def _add_table_str(example):
        # Flatten the table into one string: all headers first, then all cells.
        table = example['input_text']['table']
        example['table_str'] = ' '.join(table['column_header'] + table['content'])
        return example

    with_table_str = dataset.map(_add_table_str)

    # Tokenise each flattened table (whitespace split + stop-word removal).
    corpus_tokens = list(map(get_words_from_doc, with_table_str['table_str']))
    return BM25Okapi(corpus_tokens)

In [49]:
import datasets

# Load the full validation split of WikiBio (dataset version 1.2.0) and build
# a BM25 index over each example's table text (column headers + cell contents).
val_dataset = datasets.load_dataset('wiki_bio', split='val[:100%]', version='1.2.0')
val_bm25 = get_bm25(val_dataset)

Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


  0%|          | 0/72831 [00:00<?, ?ex/s]

In [3]:
# Load the full training split of WikiBio (dataset version 1.2.0) and build a
# BM25 index over each example's table text (column headers + cell contents).
train_dataset = datasets.load_dataset('wiki_bio', split='train[:100%]', version='1.2.0')
train_bm25 = get_bm25(train_dataset)

Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


  0%|          | 0/582659 [00:00<?, ?ex/s]

In [10]:
# Show the validation biography whose table scores highest under BM25 for the
# query "carl crawford". Only the single best row is fetched, instead of
# materialising the whole "target_text" column to read one entry.
val_dataset[int(val_bm25.get_scores("carl crawford".split()).argmax())]["target_text"]

"carl demonte crawford -lrb- born august 5 , 1981 -rrb- , nicknamed `` the perfect storm '' , is an american professional baseball left fielder with the los angeles dodgers of major league baseball -lrb- mlb -rrb- .\nhe bats and throws left-handed .\ncrawford was drafted by the tampa bay devil rays in the second round -lrb- 52nd overall -rrb- of the 1999 major league baseball draft .\nhe made his major league debut in 2002 .\ncrawford has more triples -lrb- 121 -rrb- than any other active baseball player .\n"

In [13]:
# Show the training biography whose table scores highest under BM25 for the
# query "carl crawford". Only the single best row is fetched, instead of
# materialising the whole "target_text" column to read one entry.
train_dataset[int(train_bm25.get_scores("carl crawford".split()).argmax())]["target_text"]

"joseph `` joey '' crawford -lrb- born august 30 , 1951 in philadelphia , pennsylvania -rrb- is an american professional basketball referee in the national basketball association -lrb- nba -rrb- , having worked in the league since 1977 .\ncrawford , who wears uniform number 17 , is one of the strictest officials in the nba and has developed a reputation for assessing technical fouls against both players and coaches .\nas of the conclusion of the 2014-15 nba season , crawford has worked more playoff -lrb- 313 -rrb- and nba finals games -lrb- 50 -rrb- than any other active referee in the league and appeared in the finals every year between 1986 and 2006 .\nhe has appeared in 29 of the last 30 nba finals series , missing only the 2007 nba finals , due to suspension .\nin addition to playoff games , crawford has officiated the nba all-star game in 1986 , 1992 and 2000 , as well as the 1993 mcdonald 's championship in munich , germany .\n"

In [16]:
# Hand-written redacted biography: most words are replaced by the literal
# token <mask>, leaving mainly function words and punctuation.
# NOTE(review): the sentence structure appears to match the Carl Crawford
# biography retrieved earlier in this notebook — confirm.
redacted_cc = """<mask> <mask> <mask> (<mask> <mask> <mask> , <mask>) , <mask> `` the <mask> <mask> '' , is an <mask> <mask> <mask> <mask> <mask> with the <mask> <mask> <mask> of <mask> <mask> <mask> (<mask>) .\nhe <mask> and <mask> <mask>-<mask> .\n<mask> was <mask> by the <mask> <mask> <mask> <mask> in the <mask> <mask> (<mask> <mask>) of the <mask> <mask> <mask> <mask> <mask> .\nhe <mask> his <mask> <mask> <mask> in <mask> .\n<mask> has more <mask> (<mask>) than any other <mask> <mask> <mask> .\n"""

In [17]:
# Show what text remains once every <mask> token is removed.
redacted_cc.replace('<mask>', '')

"   (   , ) ,  `` the   '' , is an      with the    of    () .\nhe  and  - .\n was  by the     in the   ( ) of the      .\nhe  his    in  .\n has more  () than any other    .\n"

In [23]:
import os
import sys

# Directory of the unsupervised-deidentification checkout that provides `utils`.
# Defaults to the original hard-coded location; set the DEID_REPO_DIR
# environment variable to run this notebook elsewhere without editing the cell.
DEID_REPO_DIR = os.environ.get(
    'DEID_REPO_DIR',
    '/home/jxm3/research/deidentification/unsupervised-deidentification',
)
# Guard so that re-running the cell does not append duplicate sys.path entries.
if DEID_REPO_DIR not in sys.path:
    sys.path.append(DEID_REPO_DIR)

from utils import create_document_and_profile_from_wikibio, name_from_table_rows

In [50]:
# Apply create_document_and_profile_from_wikibio to every example of both
# splits, rebinding each name to the mapped dataset.
# NOTE(review): judging by the keys inspected in the next cell, this adds
# fields such as 'name', 'document' and 'profile' — confirm against utils.
train_dataset = train_dataset.map(create_document_and_profile_from_wikibio)
val_dataset = val_dataset.map(create_document_and_profile_from_wikibio)

  0%|          | 0/582659 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-60cca6b4349a241a.arrow


In [26]:
# List the fields available on the first training example.
train_dataset[0].keys()

dict_keys(['input_text', 'target_text', 'name', 'document', 'profile', 'profile_keys', 'profile_values', 'text_key'])

In [36]:
# Inspect the 'name' field of the first training example.
train_dataset[0]['name']

'Walter Extra'

In [37]:
# Inspect the 'name' field of the first validation example.
val_dataset[0]['name']

'Michael Iii Of Alexandria'

In [None]:
# Collect the 'name' of every example in each split. The column is read
# directly rather than iterating over whole examples, which would decode every
# field of every row just to pick out a single string.
train_names = list(train_dataset['name'])
val_names = list(val_dataset['name'])

In [55]:
from collections import Counter
# Count how many examples share each name, separately for each split.
train_nc = Counter(train_names)
val_nc = Counter(val_names)

In [56]:
# The 40 most frequent names in the training split, with their counts.
train_nc.most_common(40)

[('John Smith', 29),
 ('* -- >', 28),
 ('John Williams', 26),
 ('John Anderson', 25),
 ('David Jones', 24),
 ('John Davis', 23),
 ('John Brown', 22),
 ('John Wilson', 21),
 ('George Smith', 19),
 ('Fernando', 19),
 ('Leandro', 19),
 ('John Taylor', 19),
 ('William Smith', 18),
 ('John Moore', 18),
 ('Diego', 17),
 ('Daniel', 17),
 ('Alex', 17),
 ('Paul Smith', 17),
 ('Chris Jones', 17),
 ('Chris Smith', 17),
 ('Steve Smith', 16),
 ('João Paulo', 16),
 ('-- >', 16),
 ('Adriano', 16),
 ('John Roberts', 15),
 ('David Lee', 15),
 ('Juninho', 15),
 ('William', 15),
 ('David Smith', 15),
 ('Felipe', 15),
 ('David Williams', 15),
 ('Henrique', 15),
 ('George Wilson', 15),
 ('Paulinho', 15),
 ('Chris Brown', 15),
 ('John Harris', 15),
 ('Mike Smith', 14),
 ('Gabriel', 14),
 ('John Ryan', 14),
 ('Jack Smith', 14)]

In [57]:
# The 40 most frequent names in the validation split, with their counts.
val_nc.most_common(40)

[('John Smith', 7),
 ('* -- >', 6),
 ('John Campbell', 5),
 ('John White', 5),
 ('Juninho', 5),
 ('John Barrett', 5),
 ('Wellington', 5),
 ('Paulinho', 5),
 ('Al Smith', 4),
 ('Serginho', 4),
 ('John Hunter', 4),
 ('Jimmy Smith', 4),
 ('The Duke Of Bedford', 4),
 ('John Brown', 4),
 ('Daniel', 4),
 ('Tom Smith', 4),
 ('John Sullivan', 4),
 ('David Smith', 4),
 ('Bruno', 4),
 ('Ben Wilson', 3),
 ('Eddie Miller', 3),
 ('Brian Jones', 3),
 ('Tommy Thompson', 3),
 ('Ronan', 3),
 ('Anthony Smith', 3),
 ('David Parker', 3),
 ('Greg Smith', 3),
 ('Bill White', 3),
 ('John Johnson', 3),
 ('Sergio', 3),
 ('Dudu', 3),
 ('David Young', 3),
 ('James Thomson', 3),
 ('John Cooper', 3),
 ('Michael Johnson', 3),
 ('Bobby Brown', 3),
 ('James Grant', 3),
 ('Ernie Watts', 3),
 ('David Nelson', 3),
 ('Nathan Brown', 3)]

In [58]:
# Gather the validation examples whose 'name' is exactly the string '* -- >'.
# Only the 'name' column is scanned, and just the matching rows are fetched,
# instead of decoding every field of every example in the split.
star_ppl = [
    val_dataset[idx]
    for idx, name in enumerate(val_dataset['name'])
    if name == '* -- >'
]

In [59]:
# Display the validation examples collected in star_ppl.
star_ppl

[{'input_text': {'table': {'column_header': ['name',
     'sortkey',
     'image',
     'nationality',
     'awards',
     'article_title',
     'subcat'],
    'row_number': [1, 1, 1, 1, 1, 1, 1],
    'content': ['* -- >',
     'trov , tony',
     'tony trov at sdcc.jpg',
     'american',
     'philadelphia geek award',
     'tony trov\n',
     'american']},
   'context': 'tony trov\n'},
  'target_text': 'tony trov -lrb- born anthony trovarello 1983 -rrb- is an american film maker , comic book creator and musician best known for the independent horror comedy alpha girls .\n',
  'name': '* -- >',
  'document': 'tony trov ( born anthony trovarello 1983 ) is an american film maker , comic book creator and musician best known for the independent horror comedy alpha girls .\n',
  'profile': 'name || * -- >\nsortkey || trov , tony\nimage || tony trov at sdcc.jpg\nnationality || american\nawards || philadelphia geek award\narticle_title || tony trov\nsubcat || american',
  'profile_keys': 'na