In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from datamodule import WikipediaDataModule

import numpy as np


# Value forwarded to the data module as `num_workers`.
num_cpus = 8

# Data module over the full train/val/test splits of wiki_bio (version 1.2.0).
# Documents are tokenized for roberta-base and profiles for google/tapas-base.
dm = WikipediaDataModule(
    document_model_name_or_path='roberta-base',
    profile_model_name_or_path='google/tapas-base',
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:100%]',
    dataset_test_split='test[:100%]',
    dataset_version='1.2.0',
    num_workers=num_cpus,
    train_batch_size=256,
    eval_batch_size=256,
    max_seq_length=128,
    sample_spans=False,
)
# After setup, later cells read dm.test_dataset / dm.val_dataset / dm.train_dataset
# and the dm.document_tokenizer / dm.profile_tokenizer attributes.
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]
loading wiki_bio[1.2.0] split val[:100%]
loading wiki_bio[1.2.0] split test[:100%]
                        

In [2]:
import torch
from utils import get_profile_embeddings_by_model_key

def get_profile_embeddings(model_key: str):
    """Return every stored profile embedding for ``model_key`` as one tensor.

    The per-split tensors returned by ``get_profile_embeddings_by_model_key``
    are stacked along dim 0 in the order **test, val, train**. Anything that
    maps a row index back to a person must use that same order.

    Args:
        model_key: Key identifying which model's stored embeddings to load.

    Returns:
        A tensor holding the test rows, then the val rows, then the train rows.
    """
    embeddings_by_split = get_profile_embeddings_by_model_key(model_key=model_key)

    print("concatenating train, val, and test profile embeddings")
    # The log line above names the splits; the actual row order is test, val, train.
    split_order = ('test', 'val', 'train')
    stacked = torch.cat(
        tuple(embeddings_by_split[split] for split in split_order), dim=0
    )

    print("all_profile_embeddings:", stacked.shape)
    return stacked

# One embedding bank per checkpoint under comparison; the `rt` / `rr` prefixes pair each
# bank with the model of the same prefix loaded from `model_paths_dict`.
# NOTE(review): presumably rt = RoBERTa-document/TAPAS-profile and rr = RoBERTa/RoBERTa -- confirm.
rt_profile_embeddings = get_profile_embeddings('model_3_2')
rr_profile_embeddings = get_profile_embeddings('model_3_3__placeholder')

>> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/test.pkl
concatenating train, val, and test profile embeddings
all_profile_embeddings: torch.Size([728321, 3072])
>> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/test.pkl
co

In [3]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from model import AbstractModel, CoordinateAscentModel

from model_cfg import model_paths_dict

# Restore the two trained checkpoints; the keys match the ones used to load the
# profile embeddings, so each model is paired with embeddings from the same key.
rt = CoordinateAscentModel.load_from_checkpoint(model_paths_dict['model_3_2'])
rr = CoordinateAscentModel.load_from_checkpoint(model_paths_dict['model_3_3__placeholder'])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 0.0001 and patience 6


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaMod

Initialized model with learning_rate = 0.0001 and patience 6


In [4]:
import torch
import transformers

from model import AbstractModel

class ModelWrapper:
    """Callable that scores raw document strings against a fixed bank of profile embeddings.

    Calling an instance with a list of strings tokenizes them, embeds them with
    ``model.forward_document`` and returns the matrix product of those document
    embeddings with the transposed profile embeddings, i.e. one row of logits
    per input string and one column per profile.
    """
    # Project/third-party type names are quoted so they are not evaluated when
    # the class is defined.
    document_tokenizer: "transformers.AutoTokenizer"
    profile_embeddings: torch.Tensor
    max_seq_length: int

    def __init__(self,
            model: "AbstractModel",
            document_tokenizer: "transformers.AutoTokenizer",
            profile_embeddings: torch.Tensor,
            max_seq_length: int = 128
        ):
        """Store the model (switched to eval mode), tokenizer and a private copy of the embeddings.

        Args:
            model: Model exposing ``forward_document(batch=..., document_type=...)``.
            document_tokenizer: Tokenizer exposing ``batch_encode_plus``.
            profile_embeddings: One row per profile; copied so the caller's tensor is untouched.
            max_seq_length: Length every document is padded / truncated to.
        """
        self.model = model
        self.model.eval()
        self.document_tokenizer = document_tokenizer
        self.profile_embeddings = profile_embeddings.clone().detach()
        self.max_seq_length = max_seq_length

    def to(self, device):
        """Move both the model and the profile embeddings to ``device`` and return ``self``."""
        self.model.to(device)
        self.profile_embeddings = self.profile_embeddings.to(device)
        return self # so semantics `model = MyModelWrapper().to('cuda')` works properly

    def __call__(self, text_input_list):
        """Return document-to-profile logits for each string in ``text_input_list``.

        Args:
            text_input_list: List of raw document strings.

        Returns:
            Tensor of shape ``(len(text_input_list), len(self.profile_embeddings))``.
        """
        model_device = next(self.model.parameters()).device

        tokenized_documents = self.document_tokenizer.batch_encode_plus(
            text_input_list,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output is not placed on the model's device, so move it there
        # explicitly instead of relying on the model to do it.
        batch = {
            f"document__{k}": v.to(model_device)
            for k, v in tokenized_documents.items()
        }

        with torch.no_grad():
            document_embeddings = self.model.forward_document(batch=batch, document_type='document')
            document_to_profile_logits = document_embeddings @ (self.profile_embeddings.T)
        assert document_to_profile_logits.shape == (len(text_input_list), len(self.profile_embeddings))
        return document_to_profile_logits

    
# Wrap each checkpoint with its own profile-embedding bank and move both to the GPU.
# Both wrappers share the document tokenizer from the data module.
rt_mw = ModelWrapper(rt, dm.document_tokenizer, rt_profile_embeddings).to('cuda')
rr_mw = ModelWrapper(rr, dm.document_tokenizer, rr_profile_embeddings).to('cuda')

In [5]:
def _top_prediction(model_wrapper, s: str):
    """Score the single document ``s`` and return its most likely profile.

    Args:
        model_wrapper: Callable mapping a list of strings to a logits tensor
            with one row per string.
        s: Document text to identify.

    Returns:
        ``(name, index, probability)`` where ``index`` is the position of the
        highest softmax probability, ``name`` is ``label_names[index]`` and
        ``probability`` is that softmax value as a Python float.
    """
    probs = model_wrapper([s]).softmax(1).flatten()
    # Compute the argmax once and turn it into a plain int so it can index the
    # numpy `label_names` array regardless of which device the tensor lives on.
    top_index = int(probs.argmax().item())
    return (label_names[top_index], top_index, probs[top_index].item())

def call_rr(s: str):
    """Return ``(name, index, probability)`` of the top profile for ``s`` under ``rr_mw``."""
    return _top_prediction(rr_mw, s)

def call_rt(s: str):
    """Return ``(name, index, probability)`` of the top profile for ``s`` under ``rt_mw``."""
    return _top_prediction(rt_mw, s)

In [25]:
def get_person(i: int):
    """Look up a dataset example by its index in the test + val + train concatenation.

    Indices below ``len(dm.test_dataset)`` address the test split, the next
    ``len(dm.val_dataset)`` indices address the val split, and everything after
    that addresses the train split.
    """
    num_test = len(dm.test_dataset)
    if i < num_test:
        return dm.test_dataset[i]

    # Index relative to the start of the val split.
    past_test = i - num_test
    num_val = len(dm.val_dataset)
    if past_test < num_val:
        return dm.val_dataset[past_test]

    return dm.train_dataset[past_test - num_val]
    
from typing import List, Tuple

from IPython.display import HTML, display
import html

def wrap_th(s):
    """Wrap ``s`` in an HTML ``<th>`` element."""
    return f'<th>{s}</th>'


def wrap_td(s):
    """Wrap ``s`` in an HTML ``<td>`` element."""
    return f'<td>{s}</td>'

def table_from_table_rows(rows_str: str) -> List[Tuple[str, str]]:
    """Parse a ``key || value`` per-line profile string into ``(key, value)`` pairs.

    Each non-blank line is split on its first ``||`` only, so a value that itself
    contains ``||`` stays intact. Blank lines (including one caused by a trailing
    newline) are skipped, and a line without ``||`` gets an empty value. Every row
    is therefore exactly a 2-tuple, which is what callers that unpack
    ``for key, value in table`` need.

    Args:
        rows_str: Newline-separated rows of the form ``key || value``.

    Returns:
        List of ``(key, value)`` tuples with surrounding whitespace stripped.
    """
    table = []
    for row in rows_str.split('\n'):
        if not row.strip():
            continue
        key, _, value = row.partition('||')
        table.append((key.strip(), value.strip()))
    return table

def make_prof_html(profile: str) -> str:
    """Render a ``key || value`` profile string as a two-column HTML table.

    Keys and values come straight from dataset text, so they are HTML-escaped
    before interpolation; characters such as ``<``, ``>`` and ``&`` are shown
    literally instead of being parsed as markup.

    Args:
        profile: Profile text accepted by ``table_from_table_rows``.

    Returns:
        HTML for a bordered table with one row per profile entry: the key in a
        bold header cell and the value in a data cell.
    """
    row_markup = [
        f'<tr><th><b>{html.escape(rkey)}</b></th><td>{html.escape(rval)}</td></tr>'
        for rkey, rval in table_from_table_rows(profile)
    ]
    return (
        '<table style="border: 1px solid black"><tbody>'
        + ''.join(row_markup)
        + '</tbody></table>'
    )

def display_profile_by_index(idx: int):
    """Show the profile of the person at combined index ``idx`` as an HTML table.

    ``idx`` is resolved by ``get_person``; the person's ``'profile'`` field is
    converted to markup by ``make_prof_html`` and rendered in the notebook.
    """
    person = get_person(idx)
    profile_markup = make_prof_html(person['profile'])
    display(HTML(profile_markup))

In [6]:
# Person names in test -> val -> train order. This order must match the row order of the
# tensors built by get_profile_embeddings, because call_rr / call_rt index this array
# with the argmax over the profile logits.
label_names = np.array(list(dm.test_dataset['name']) + list(dm.val_dataset['name']) + list(dm.train_dataset['name']))

In [7]:
kris_janson = "kris tiffany maslog janson ( born december 21, 1989 ) simply known as kris janson, is a filipino beauty pageant titleholder from cebu city, crowned binibining pilipinas intercontinental 2014 at the binibining pilipinas 2014 pageant held on march 30th, 2014 at the smart araneta coliseum, quezon city, philippines."

In [8]:
call_rr(kris_janson)

('Kris Janson', 49, 0.9884936213493347)

In [9]:
call_rt(kris_janson)

('Kris Janson', 49, 0.9869921207427979)

In [10]:
call_rr("kris tiffany maslog janson ( born december 21, 1989 ) simply known as kris janson, is a filipino beauty pageant titleholder from cebu city, crowned binibining pilipinas intercontinental 2014 at the binibining pilipinas 2014 pageant held on march 30th, 2014 at the smart araneta coliseum, quezon city, philippines.")

('Kris Janson', 49, 0.9884936213493347)

In [11]:
dm.test_dataset[6]['name'], dm.test_dataset[6]['target_text']

('Dillon Sheppard',
 'dillon sheppard -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .\n')

In [12]:
call_rt("'dillon sheppard -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .")

('Dillon Sheppard', 6, 0.8875733017921448)

In [13]:
call_rr("'dillon sheppard -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .")

('Dillon Sheppard', 6, 0.9624467492103577)

In [14]:
call_rt("'<mask> sheppard -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .")

('Dillon Sheppard', 6, 0.9376591444015503)

In [15]:
call_rr("'<mask> sheppard -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .")

('Dillon Sheppard', 6, 0.9239954352378845)

In [16]:
call_rt("'<mask> <mask> -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .")

('Dillon Sheppard', 6, 0.3441327214241028)

In [17]:
call_rr("'<mask> <mask> -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .")

('Thando Mngomeni', 488791, 0.15380197763442993)

In [26]:
display_profile_by_index(488_791)

0,1
fullname,thando mngomeni
name,thando mngomeni
caps,11 32 29 9 36 16 43
article_title,thando mngomeni
nationalyears,2004-2007
position,midfielder
pcupdate,1 june 2011
years,2002 2007 -- 2008 2008 -- 2009 2009 -- 2013 -- 2004 2004 -- 2005 2005 -- 2006
currentclub,bidvest wits
clubs,supersport sundowns bidvest wits united helsingborgs if bush bucks engen santos mamelodi


In [27]:
display_profile_by_index(6)

0,1
nationalgoals,0 0 0 0
fullname,dillon douglas sheppard
position,left-winger
pcupdate,23 august 2014
years,1999 -- 2011 2011 -- 2013 2013 -- 2014 2014 -- -- 2004 2004 2005 2006 -- 2009 2009
ntupdate,2 june 2012
nationalcaps,1 9 9 18
height,1.80 m ftin 0 on
youthclubs,school of excellence
caps,24 105 4 10 37 39 33 14 0


In [28]:
dm.profile_tokenizer

PreTrainedTokenizer(name_or_path='google/tapas-base', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[EMPTY]']})

In [36]:
from utils import try_encode_table_tapas, get_profile_df
dillon_sheppard = get_profile_df(get_person(6)['input_text']['table']['column_header'], get_person(6)['input_text']['table']['content'])

In [49]:
dillon_sheppard_encoded = try_encode_table_tapas(df=dillon_sheppard, tokenizer=dm.profile_tokenizer, query="who is this person?", max_length=128, num_cols=50)
# dm.profile_tokenizer.decode(dillon_sheppard_encoded['input_ids'])
dm.profile_tokenizer.decode(dillon_sheppard_encoded['input_ids'].squeeze())

'[CLS] who is this person? [SEP] nationalgoals fullname position pcupdate years ntupdate nationalcaps height youthclubs caps nationalteam birth _ date article _ title currentclub clubs name nationalyears birth _ place clubnumber goals 0 0 0 0 dillon douglas sheppard left - winger 23 august 2014 1999 - - 2011 2011 2 june 2012 1 9 9 18 1. 80 m ftin school of excellence 24 105 4 10 37 south south africa africa u 27 february 1979 dillon sheppard bidvest wits school mamelodi dillon sheppard 1996 2007 1997 - - durban, south africa 29 1 15 0 0 1'

In [55]:
# Leave-one-more-out probe: starting from the text whose first two words are already
# masked, replace each space-separated word in turn with '<mask>' and print the masked
# text followed by the rr model's top prediction for it.
dillon_sheppard_text = "'<mask> <mask> -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa ."
dillon_sheppard_words = dillon_sheppard_text.split(' ')
for i, _ in enumerate(dillon_sheppard_words):
    # Work on a copy so every iteration starts from the same base text.
    ds_masked_version = list(dillon_sheppard_words)
    ds_masked_version[i] = '<mask>'
    ds_masked_text = ' '.join(ds_masked_version)
    print(ds_masked_text)
    print(call_rr(ds_masked_text))

<mask> <mask> -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .
('Thando Mngomeni', 488791, 0.0968029797077179)
'<mask> <mask> -lrb- born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .
('Thando Mngomeni', 488791, 0.15380197763442993)
'<mask> <mask> <mask> born 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .
('Buhle Mkhwanazi', 245060, 0.056675318628549576)
'<mask> <mask> -lrb- <mask> 27 february 1979 -rrb- is a south african football -lrb- soccer -rrb- left-winger who plays for premier soccer league club bidvest wits and south africa .
('Thando Mngomeni', 488791, 0.1603737771511078)
'<mask> <mask> -lrb- born <mask> february 1979 -rrb- is a south african footbal

In [56]:
kris_janson = """kris tiffany maslog janson ( born december 21, 1989 ) simply known as kris janson, is a filipino beauty pageant titleholder from cebu city, crowned binibining pilipinas intercontinental 2014 at the binibining pilipinas 2014 pageant held on march 30th, 2014 at the smart araneta coliseum, quezon city, philippines."""
call_rr(kris_janson)

('Kris Janson', 49, 0.9884936213493347)

In [59]:
kris_janson_masked = """<mask> <mask> <mask> <mask> ( <mask> <mask> <mask>, <mask> ) <mask> <mask> <mask> <mask> <mask>, < <mask> <mask> 30th, <mask> <mask> the smart <mask> <mask>, <mask> city, philippines."""
call_rr(kris_janson_masked)

('N. R. Narayana Murthy', 193508, 0.0613858699798584)

In [60]:
call_rt(kris_janson_masked)

('Mike Velarde', 725672, 0.1449543535709381)

In [61]:
kris_janson_masked_prev = """<mask> <mask> <mask> <mask> ( <mask> <mask> <mask>, <mask> ) <mask> <mask> <mask> kris janson, <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, <mask> <mask> <mask> <mask> araneta <mask>, <mask> <mask>, <mask>."""
call_rt(kris_janson_masked_prev)

('Kris Janson', 49, 0.8139309287071228)

In [89]:
"""<mask> <mask> <mask> <mask> ( <mask> <mask> <mask>, <mask> ) <mask> <mask> <mask> kris janson, <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, <mask> <mask> <mask> <mask> araneta <mask>, <mask> <mask>, <mask>.""".count('<mask>')

43

In [66]:
display_profile_by_index(49)

0,1
eye_color,brown
name,kris janson
birth_name,kris tiffany maslog janson
hometown,"cebu city , cebu"
title,"binibining pilipinas intercontinental 2014 ,"
competitions,binibining pilipinas 2014
hair_color,brown
birth_date,21 december 1989
article_title,kris janson
height,1.73


In [86]:
# an intermediate during beam search
print(call_rr("kris <mask> maslog janson ( born <mask> <mask>, <mask> ) simply known as kris janson, is <mask> filipino <mask> <mask> titleholder <mask> cebu city, crowned <mask> pilipinas <mask> 2014 at the binibining pilipinas 2014 <mask> held on march 30th, 2014 at the smart <mask> coliseum, quezon city, philippines."))

# same thing, with additional kris masked
print(call_rr("kris <mask> maslog janson ( born <mask> <mask>, <mask> ) simply known as <mask> janson, is <mask> filipino <mask> <mask> titleholder <mask> cebu city, crowned <mask> pilipinas <mask> 2014 at the binibining pilipinas 2014 <mask> held on march 30th, 2014 at the smart <mask> coliseum, quezon city, philippines."))

# same thing, with both kris and janson masked
print(call_rr("kris <mask> maslog <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is <mask> filipino <mask> <mask> titleholder <mask> cebu city, crowned <mask> pilipinas <mask> 2014 at the binibining pilipinas 2014 <mask> held on march 30th, 2014 at the smart <mask> coliseum, quezon city, philippines."))

# same thing, with all three masked
print(call_rr("<mask> <mask> <mask> <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is <mask> filipino <mask> <mask> titleholder <mask> cebu city, crowned <mask> pilipinas <mask> 2014 at the binibining pilipinas 2014 <mask> held on march 30th, 2014 at the smart <mask> coliseum, quezon city, philippines."))

# with everything I can think of masked 
print(call_rr("<mask> <mask> <mask> <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>."))


('Kris Janson', 49, 0.9600957632064819)
('Kris Janson', 49, 0.9804427027702332)
('Kris Janson', 49, 0.9705662727355957)
('Kimberly Anastacia Karlsson', 655131, 0.48285189270973206)
('Angeli Gomez', 134511, 0.16611042618751526)


In [74]:
call_rt("<mask> <mask> <mask> <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.")

('Gwendoline Ruais', 437639, 0.04145599156618118)

In [71]:
print("<mask> <mask> maslog <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.")
# Surprising at first sight: the model still identifies this heavily masked text.
# Explanation: the middle name 'maslog' is left unmasked in this string.

<mask> <mask> maslog <mask> ( born <mask> <mask>, <mask> )  known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.


In [93]:
print((call_rr("<mask> <mask> maslog <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.")))


('Kris Janson', 49, 0.9915623068809509)


In [78]:
("<mask> <mask> maslog <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.")

'<mask> <mask> maslog <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.'

In [79]:
"<mask> <mask> <maslog> <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>."

'<mask> <mask> <maslog> <mask> ( born <mask> <mask>, <mask> ) simply known as <mask> <mask>, is a <mask> <mask> <mask> titleholder from <mask> <mask>, crowned <mask> <mask> <mask> <mask> at the <mask> <mask> <mask> <mask> held on march 30th, <mask> at the smart araneta coliseum, quezon city, <mask>.'

In [90]:
# masked with beam search b=128
kris_janson_128 = "<mask> <mask> <mask> <mask> ( <mask> <mask> <mask>, <mask> ) <mask> <mask> <mask> <mask> <mask>, <mask> <mask> filipino <mask> <mask> <mask> <mask> cebu city, <mask> <mask> <mask> <mask> 2014 <mask> <mask> <mask> <mask> 2014 <mask> held <mask> <mask> 30th, <mask> <mask> the smart <mask> <mask>, <mask> city, philippines."
kris_janson_128.count('<mask>')

35

In [94]:
call_rt(kris_janson_128)

('Nigel Paul C. Villarete', 295288, 0.43455469608306885)

In [95]:
call_rr(kris_janson_128)

('John Geesnell Lim Yap Ii', 494143, 0.09956586360931396)