In [2]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from datamodule import WikipediaDataModule

import numpy as np


num_cpus = 8

# All WikipediaDataModule settings gathered in one place so they are easy to
# scan and tweak; the values are passed through unchanged as keyword arguments.
_datamodule_kwargs = {
    # Encoders for the two sides (free-text document vs. tabular profile).
    'document_model_name_or_path': 'roberta-base',
    'profile_model_name_or_path': 'google/tapas-base',
    # Dataset and the slice of each split to load.
    'dataset_name': 'wiki_bio',
    'dataset_version': '1.2.0',
    'dataset_train_split': 'train[:256]',
    'dataset_val_split': 'val[:100%]',
    'dataset_test_split': 'test[:100%]',
    # Loader / batching options.
    'num_workers': num_cpus,
    'train_batch_size': 256,
    'eval_batch_size': 256,
    'max_seq_length': 128,
    'sample_spans': False,
}

dm = WikipediaDataModule(**_datamodule_kwargs)
# Run the "fit" stage setup (the cell output shows the three splits being loaded).
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:256]
loading wiki_bio[1.2.0] split val[:100%]
loading wiki_bio[1.2.0] split test[:100%]
                        

In [14]:
# Number of leading test-set records to export as profile dictionaries.
NUM_PROFILES = 1000

# Build one {profile key -> profile value} dict per test record. The keys and
# values are stored as parallel '||'-delimited strings, so they are split and
# zipped back together. The record's test-set index is stored under 'i'
# (overwriting any profile field that happens to be named 'i').
profile_dicts = []

for i in range(NUM_PROFILES):
    # Fetch the record once; indexing the dataset twice per iteration repeats
    # the row lookup for no benefit.
    person = dm.test_dataset[i]
    keys = person['profile_keys'].split('||')
    values = person['profile_values'].split('||')
    profile_dict = dict(zip(keys, values))
    profile_dict['i'] = i
    profile_dicts.append(profile_dict)
profile_dicts[0]

{'name': 'lenny randle',
 'finalteam': 'seattle mariners',
 'debutdate': 'june 16',
 'position': 'second baseman / third baseman',
 'bats': 'switch',
 'finaldate': 'june 20',
 'finalyear': '1982',
 'debutyear': '1971',
 'debutteam': 'washington senators',
 'statlabel': 'batting average home runs runs batted in',
 'statvalue': '.257 27 322',
 'throws': 'right',
 'birth_place': 'long beach , california',
 'birth_date': '12 february 1949',
 'article_title': 'lenny randle',
 'i': 0}

In [15]:
import pickle

# Persist the exported profile dictionaries. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way through; a bare
# open(...) passed straight to pickle.dump leaves the handle open.
with open('../test_profiles_1000.p', 'wb') as profile_file:
    pickle.dump(profile_dicts, profile_file)

In [3]:
import sys

# Make the project checkout importable. Guarded so that re-running this cell
# (or running it after an earlier cell that added the same path) does not
# keep appending duplicate entries to sys.path.
REPO_ROOT = '/home/jxm3/research/deidentification/unsupervised-deidentification'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)


In [4]:
from typing import List, Tuple

from IPython.display import HTML, display
import html

def wrap_th(s):
    """Wrap ``s`` in an HTML table-header cell (``<th>...</th>``)."""
    return '<th>{}</th>'.format(s)


def wrap_td(s):
    """Wrap ``s`` in an HTML table-data cell (``<td>...</td>``)."""
    return '<td>{}</td>'.format(s)

def get_person(i: int):
    """Look up a record by a single global index spanning all three splits.

    Indices are laid out as test records first, then validation records,
    then training records; anything past test + validation is looked up in
    the training split.
    """
    test_size = len(dm.test_dataset)
    if i < test_size:
        return dm.test_dataset[i]

    # Past the test split: re-base the index onto the validation split.
    offset = i - test_size
    val_size = len(dm.val_dataset)
    if offset < val_size:
        return dm.val_dataset[offset]

    # Past validation as well: whatever remains indexes the training split.
    return dm.train_dataset[offset - val_size]

def table_from_table_rows(rows_str: str) -> List[List[str]]:
    """Parse a serialized profile into a list of ``[key, value]`` rows.

    The input has one row per line, with the key and value separated by
    ``||`` (e.g. ``'name || hiroki ichigatani\\nsport || fencing'``).

    Args:
        rows_str: Newline-separated ``key || value`` rows.

    Returns:
        A list with one list per non-blank line, each cell stripped of
        surrounding whitespace. A well-formed line yields ``[key, value]``;
        a line with no ``||`` yields a single-element list.
    """
    table = []
    for line in rows_str.split('\n'):
        # Skip blank lines (empty input, trailing newline) instead of emitting
        # a [''] row that callers cannot unpack as a key/value pair.
        if not line.strip():
            continue
        # Split on the first delimiter only, so a value that itself contains
        # '||' stays in one cell rather than spilling into extra columns.
        table.append([cell.strip() for cell in line.split('||', 1)])
    return table

def make_prof_html(profile: str) -> str:
    """Render a serialized profile as a two-column HTML table.

    Args:
        profile: Newline-separated ``key || value`` rows, as accepted by
            ``table_from_table_rows``.

    Returns:
        An HTML ``<table>`` string with one ``<tr>`` per profile row: the key
        in a bold ``<th>`` and the value in a ``<td>``. Keys and values are
        HTML-escaped so dataset text containing ``<``, ``>`` or ``&`` is shown
        literally instead of being interpreted as markup.
    """
    # Imported locally so a notebook variable named `html` cannot shadow it.
    from html import escape

    parts = ['<table style="border: 1px solid black"><tbody>']
    for row in table_from_table_rows(profile):
        # Ignore rows that carry no text at all (e.g. from a trailing newline).
        if not any(row):
            continue
        rkey = row[0]
        # Tolerate rows without exactly two cells: a missing value renders as
        # empty, and extra cells (a value containing '||') are re-joined.
        rval = ' || '.join(row[1:])
        parts.append(
            '<tr>'
            f'<th><b>{escape(rkey)}</b></th>'
            f'<td>{escape(rval)}</td>'
            '</tr>'
        )
    parts.append('</tbody></table>')
    return ''.join(parts)

def display_profile_by_index(idx: int):
    """Show the profile table of the record at global index ``idx`` in the notebook."""
    person = get_person(idx)
    profile_markup = make_prof_html(person['profile'])
    display(HTML(profile_markup))


In [5]:
# Names of every record, concatenated in the same test -> val -> train order
# that get_person uses, so label_names[i] lines up with get_person(i).
label_names = np.array([
    name
    for split in (dm.test_dataset, dm.val_dataset, dm.train_dataset)
    for name in split['name']
])

In [6]:
# Spot-check a single record by its global index (test, then val, then train
# ordering, as implemented by get_person) and display its raw fields.
# NOTE(review): 139_779 is a hand-picked index; which split it lands in depends
# on the loaded split sizes — confirm before relying on it.
get_person(139_779)

{'input_text': {'table': {'column_header': ['name',
    'birth_place',
    'sport',
    'birth_date',
    'article_title'],
   'row_number': [1, 1, 1, 1, 1],
   'content': ['hiroki ichigatani',
    'kagoshima , japan',
    'fencing',
    '6 may 1969',
    'hiroki ichigatani\n']},
  'context': 'hiroki ichigatani\n'},
 'target_text': 'hiroki ichigatani -lrb- born 6 may 1969 -rrb- is a japanese fencer .\nhe competed in the individual foil event at the 1992 and 1996 summer olympics .\n',
 'name': 'Hiroki Ichigatani',
 'document': 'hiroki ichigatani ( born 6 may 1969 ) is a japanese fencer .\nhe competed in the individual foil event at the 1992 and 1996 summer olympics .\n',
 'profile': 'name || hiroki ichigatani\nbirth_place || kagoshima , japan\nsport || fencing\nbirth_date || 6 may 1969\narticle_title || hiroki ichigatani',
 'profile_keys': 'name||birth_place||sport||birth_date||article_title',
 'profile_values': 'hiroki ichigatani||kagoshima , japan||fencing||6 may 1969||hiroki ichigata