In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from dataloader import WikipediaDataModule

import numpy as np


# Worker count handed to the data module as `num_workers` below.
num_cpus = 8

# Build the data module over the `wiki_bio` dataset (version 1.2.0), using the
# full ([:100%]) train / val / test splits. Documents and profiles are given
# different pretrained model names ('roberta-base' vs 'google/tapas-base').
# NOTE(review): presumably these names select the tokenizers used for each
# side -- confirm against WikipediaDataModule.
dm = WikipediaDataModule(
    document_model_name_or_path='roberta-base',
    profile_model_name_or_path='google/tapas-base',
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:100%]',
    dataset_test_split='test[:100%]',
    dataset_version='1.2.0',
    num_workers=num_cpus,
    train_batch_size=256,
    eval_batch_size=256,
    max_seq_length=128,
    sample_spans=False,
)
# Prepare the datasets. Later cells read dm.train_dataset, dm.val_dataset and
# dm.test_dataset, so this must run before them.
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]
loading wiki_bio[1.2.0] split val[:100%]
loading wiki_bio[1.2.0] split test[:100%]
                        

In [3]:
from typing import List, Tuple

from IPython.display import HTML, display
import html

def wrap_th(s):
    """Return ``s`` wrapped in an HTML header cell (``<th>...</th>``)."""
    return f'<th>{s}</th>'


def wrap_td(s):
    """Return ``s`` wrapped in an HTML data cell (``<td>...</td>``)."""
    return f'<td>{s}</td>'

def get_person(i: int):
    """Look up example ``i`` across the three splits of the global ``dm``.

    The splits are treated as one sequence laid out in the order
    test, then val, then train: indices below ``len(dm.test_dataset)`` hit the
    test split, the next ``len(dm.val_dataset)`` indices hit the val split,
    and everything after that is looked up in the train split.
    """
    test_size = len(dm.test_dataset)
    if i < test_size:
        return dm.test_dataset[i]

    # Index relative to the start of the val split.
    past_test = i - test_size
    val_size = len(dm.val_dataset)
    if past_test < val_size:
        return dm.val_dataset[past_test]

    # Whatever remains is an index into the train split.
    return dm.train_dataset[past_test - val_size]

def table_from_table_rows(rows_str: str) -> List[List[str]]:
    """Parse a newline-separated ``key || value`` profile string into rows.

    Args:
        rows_str: Text with one ``key || value`` entry per line.

    Returns:
        A list of two-element ``[key, value]`` lists with surrounding
        whitespace stripped from both parts. Every row has exactly two
        elements, so callers can safely unpack ``for key, value in rows``:

        * blank lines (e.g. from a trailing newline) are skipped;
        * only the first ``||`` separates key from value, so a value that
          itself contains ``||`` is kept intact;
        * a line without ``||`` yields the line as key and ``''`` as value.
    """
    table = []
    for row in rows_str.split('\n'):
        if not row.strip():
            # Nothing on this line; emitting a one-element row would break
            # two-value unpacking in callers.
            continue
        key, _, value = row.partition('||')
        table.append([key.strip(), value.strip()])
    return table

def make_prof_html(profile: str) -> str:
    """Render a ``key || value`` profile string as an HTML table.

    Args:
        profile: Profile text, parsed into (key, value) rows by
            ``table_from_table_rows``.

    Returns:
        An HTML ``<table>`` string with one ``<tr>`` per row: the key in a
        bold ``<th>`` and the value in a ``<td>``.

    Keys and values come from dataset text, so they are HTML-escaped before
    being interpolated; characters such as ``<``, ``>`` and ``&`` are shown
    literally instead of being interpreted as markup.
    """
    table = table_from_table_rows(profile)
    parts = ['<table style="border: 1px solid black"><tbody>']
    for rkey, rval in table:
        parts.append(
            '<tr>'
            f'<th><b>{html.escape(rkey)}</b></th>'
            f'<td>{html.escape(rval)}</td>'
            '</tr>'
        )
    parts.append('</tbody></table>')
    # Single join instead of repeated string concatenation.
    return ''.join(parts)

def display_profile_by_index(idx: int):
    """Show the profile of example ``idx`` as an HTML table in the cell output.

    The example is fetched with ``get_person`` and its ``'profile'`` field is
    rendered with ``make_prof_html``.
    """
    person = get_person(idx)
    profile_markup = make_prof_html(person['profile'])
    display(HTML(profile_markup))


In [6]:
# Render the profile table for combined index 24. get_person checks the test
# split first, so this is dm.test_dataset[24] whenever the test split has more
# than 24 examples.
display_profile_by_index(idx=24)

0,1
position,guard
draft_pick,27
nationality,turkish
team,fenerbahçe istanbul
draft_team,new york liberty
draft_year,2013
height_in,0
weight_lbs,132
birth_date,13 july 1993
article_title,olcay Çakır


In [12]:
# Show the 'document_redact_lexical' field of test example 24: the document
# text with some words replaced by `<mask>` (see the output below).
dm.test_dataset[24]['document_redact_lexical']

"<mask> <mask> ( born <mask> <mask> <mask> in <mask> , <mask> , <mask> ) is a <mask> <mask> player .\nthe young national plays for <mask> İstanbul as both <mask> ( 1-2 ) <mask> .\n<mask> <mask> is 182 cm tall and weighs 60 kg .\nshe is playing for <mask> since 2005 in youth level and since <mask> in senior level .\n<mask> was selected 27th overall in the <mask> <mask> draft 's round <mask> by the <mask> <mask> <mask> .\nshe is so the first ever woman <mask> player from <mask> to be drafted by the <mask> .\nthe only <mask> <mask> player , who played in the <mask> , is nevriye yılmaz .\n"

In [43]:
# `re` is not imported by any earlier cell, so import it here; without this
# the cell raises NameError on a fresh kernel.
import re

# How many leading words of the document to line up (value kept from the
# original analysis).
NUM_WORDS_TO_COMPARE = 78

ex = dm.test_dataset[24]
# Split on runs of non-word characters and underscores; this turns `<mask>`
# into the bare word `mask`.
words_split_re = re.compile(r'[\W_]+')

# re.split emits an empty string when the text starts or ends with a separator
# (the redacted text starts with `<`). Drop empties from BOTH lists so the two
# word sequences stay index-aligned.
lex_words = [w for w in words_split_re.split(ex['document_redact_lexical']) if w][:NUM_WORDS_TO_COMPARE]
doc_words = [w for w in words_split_re.split(ex['document']) if w][:NUM_WORDS_TO_COMPARE]

# Side-by-side listing: index, original word (padded to a fixed column),
# redacted word. zip stops at the shorter list, so a document with fewer words
# than its redacted counterpart cannot raise IndexError.
num_subbed_words = 0
for i, (doc_word, lex_word) in enumerate(zip(doc_words, lex_words)):
    print(i, doc_word, ' ' * (12 - len(doc_word)), lex_word)
    # NOTE: a literal word "mask" in the original document is counted too.
    if lex_word == 'mask':
        num_subbed_words += 1

# Number of words replaced by the mask token within the compared window.
print(num_subbed_words)

0 olcay         mask
1 Çakır         mask
2 born          born
3 13            mask
4 july          mask
5 1993          mask
6 in            in
7 konak         mask
8 İzmir         mask
9 turkey        mask
10 is            is
11 a             a
12 turkish       mask
13 basketball    mask
14 player        player
15 the           the
16 young         young
17 national      national
18 plays         plays
19 for           for
20 fenerbahçe    mask
21 İstanbul      İstanbul
22 as            as
23 both          both
24 guard         mask
25 1             1
26 2             2
27 position      mask
28 olcay         mask
29 Çakır         mask
30 is            is
31 182           182
32 cm            cm
33 tall          tall
34 and           and
35 weighs        weighs
36 60            60
37 kg            kg
38 she           she
39 is            is
40 playing       playing
41 for           for
42 fenerbahçe    mask
43 since         since
44 2005          2005
45 in            in
46 youth     