In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from dataloader import WikipediaDataModule

import numpy as np


# Passed to the data module below as `num_workers`.
num_cpus = 8

# Data module over the full (100%) train/val/test splits of wiki_bio v1.2.0.
# The document side uses 'roberta-base' and the profile side 'google/tapas-base'.
dm = WikipediaDataModule(
    document_model_name_or_path='roberta-base',
    profile_model_name_or_path='google/tapas-base',
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:100%]',
    dataset_test_split='test[:100%]',
    dataset_version='1.2.0',
    num_workers=num_cpus,
    train_batch_size=256,
    eval_batch_size=256,
    max_seq_length=128,
    sample_spans=False,
)
# NOTE(review): "fit" is the stage name; the recorded output shows the train,
# val and test splits all being loaded by this call.
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]
loading wiki_bio[1.2.0] split val[:100%]
loading wiki_bio[1.2.0] split test[:100%]
                        

In [2]:
import torch
from utils import get_profile_embeddings_by_model_key

def get_profile_embeddings(model_key: str):
    """Load the stored profile embeddings for ``model_key`` as one stacked tensor.

    The per-split tensors returned by ``get_profile_embeddings_by_model_key``
    (indexed by ``'test'``, ``'val'`` and ``'train'``) are concatenated along
    dim 0 in the order **test, val, train**. That order is load-bearing: it is
    the same order used to build ``label_names`` and to resolve indices in
    ``get_person``, so row ``i`` of the result lines up with both.

    Args:
        model_key: Key identifying which model's embeddings to load.

    Returns:
        A single tensor holding the test rows first, then val, then train.
    """
    profile_embeddings = get_profile_embeddings_by_model_key(model_key=model_key)

    # Keep the order in one place so the log message cannot drift from the code.
    split_order = ('test', 'val', 'train')
    print("concatenating test, val, and train profile embeddings")
    all_profile_embeddings = torch.cat(
        [profile_embeddings[split] for split in split_order], dim=0
    )

    print("all_profile_embeddings:", all_profile_embeddings.shape)
    return all_profile_embeddings

# Profile-embedding matrices for the two model keys compared in this notebook.
# NOTE(review): the `rt_` / `rr_` prefixes presumably name the two model
# variants — confirm against model_cfg.
rt_profile_embeddings = get_profile_embeddings('model_3_2')
rr_profile_embeddings = get_profile_embeddings('model_3_3__placeholder')

>> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/test.pkl
concatenating train, val, and test profile embeddings
all_profile_embeddings: torch.Size([728321, 3072])
>> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/test.pkl
co

In [3]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from model import AbstractModel, CoordinateAscentModel

from model_cfg import model_paths_dict

# Load both models from the checkpoint paths registered in model_paths_dict,
# using the same model keys that were passed to get_profile_embeddings.
rt = CoordinateAscentModel.load_from_checkpoint(model_paths_dict['model_3_2'])
rr = CoordinateAscentModel.load_from_checkpoint(model_paths_dict['model_3_3__placeholder'])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 0.0001 and patience 6


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaMod

Initialized model with learning_rate = 0.0001 and patience 6


In [4]:
import torch
import transformers

from model import AbstractModel

class ModelWrapper:
    """Scores raw document strings against a fixed matrix of profile embeddings.

    Calling an instance with a list of strings tokenizes them, embeds them with
    ``model.forward_document`` and returns the matrix product of the document
    embeddings with the transposed profile embeddings.
    """
    document_tokenizer: transformers.AutoTokenizer
    profile_embeddings: torch.Tensor
    max_seq_length: int

    def __init__(self,
            model: AbstractModel,
            document_tokenizer: transformers.AutoTokenizer,
            profile_embeddings: torch.Tensor,
            max_seq_length: int = 128
        ):
        """Store the model (switched to eval mode), tokenizer and embeddings.

        Args:
            model: Model exposing ``forward_document`` and ``parameters``.
            document_tokenizer: Tokenizer used to encode the input strings.
            profile_embeddings: One row per profile; kept as a detached copy.
            max_seq_length: Length every document is padded/truncated to.
        """
        self.model = model
        self.model.eval()
        self.document_tokenizer = document_tokenizer
        self.profile_embeddings = profile_embeddings.clone().detach()
        self.max_seq_length = max_seq_length

    def to(self, device):
        """Move the model and the profile embeddings to ``device``; returns self."""
        self.model.to(device)
        self.profile_embeddings = self.profile_embeddings.to(device)
        return self # so semantics `model = MyModelWrapper().to('cuda')` works properly

    def __call__(self, text_input_list):
        """Return document-to-profile scores for each input string.

        Args:
            text_input_list: List of document strings.

        Returns:
            Tensor of shape ``(len(text_input_list), len(self.profile_embeddings))``.
        """
        model_device = next(self.model.parameters()).device

        tokenized_documents = self.document_tokenizer.batch_encode_plus(
            text_input_list,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer returns CPU tensors; put them on the model's device
        # explicitly instead of relying on forward_document to do so (this is a
        # no-op when the tensors are already on that device).
        batch = {
            f"document__{k}": v.to(model_device)
            for k, v in tokenized_documents.items()
        }

        with torch.no_grad():
            document_embeddings = self.model.forward_document(batch=batch, document_type='document')
            document_to_profile_logits = document_embeddings @ (self.profile_embeddings.T)
        assert document_to_profile_logits.shape == (len(text_input_list), len(self.profile_embeddings))
        return document_to_profile_logits

    
# Pair each model with the profile embeddings loaded under the same model key,
# share the data module's document tokenizer, and move both wrappers to 'cuda'.
rt_mw = ModelWrapper(rt, dm.document_tokenizer, rt_profile_embeddings).to('cuda')
rr_mw = ModelWrapper(rr, dm.document_tokenizer, rr_profile_embeddings).to('cuda')


In [5]:
from typing import List, Tuple

from IPython.display import HTML, display
import html

def wrap_th(s):
    """Return ``s`` wrapped in an HTML ``<th>`` element."""
    return '<th>{}</th>'.format(s)


def wrap_td(s):
    """Return ``s`` wrapped in an HTML ``<td>`` element."""
    return '<td>{}</td>'.format(s)

def get_person(i: int):
    """Fetch the example at global index ``i``.

    Global indices run over the test split first, then the val split, then
    the train split; anything past test and val is looked up in train.
    """
    offset = 0
    for split_name in ('test_dataset', 'val_dataset'):
        split = getattr(dm, split_name)
        if i < offset + len(split):
            return split[i - offset]
        offset += len(split)
    # Everything else falls through to the train split.
    return dm.train_dataset[i - offset]

def table_from_table_rows(rows_str: str) -> List[Tuple[str, str]]:
    """Parse a newline-separated table string into rows of stripped cells.

    Each line is split on ``'||'`` and every cell has surrounding whitespace
    removed. Note that each row is returned as a ``list`` of however many
    cells the line contains, despite the ``Tuple[str, str]`` annotation.
    """
    parsed_rows = []
    for line in rows_str.split('\n'):
        cells = list(map(str.strip, line.split('||')))
        parsed_rows.append(cells)
    return parsed_rows

def make_prof_html(profile: str) -> str:
    """Render a ``key || value`` profile string as an HTML table.

    The profile is parsed with ``table_from_table_rows``; each row becomes a
    ``<tr>`` with the key in a bold ``<th>`` and the value in a ``<td>``.

    Keys and values are HTML-escaped, because the text comes from dataset
    content and must not be interpreted as markup. Rows whose cells are all
    empty (e.g. from a trailing newline) are skipped. A row with a single
    cell gets an empty value, and any extra cells are re-joined with
    ``' || '`` into the value instead of raising on unpacking.

    Args:
        profile: Newline-separated rows, each of the form ``key || value``.

    Returns:
        The HTML markup for the table as a string.
    """
    parts = ['<table style="border: 1px solid black"><tbody>']
    for row in table_from_table_rows(profile):
        if not any(row):
            # Blank line: nothing to render.
            continue
        rkey = html.escape(row[0])
        rval = html.escape(' || '.join(row[1:]))
        parts.append(f'<tr><th><b>{rkey}</b></th><td>{rval}</td></tr>')
    parts.append('</tbody></table>')
    return ''.join(parts)

def display_profile_by_index(idx: int):
    """Show the profile of the example at global index ``idx`` as an HTML table."""
    person = get_person(idx)
    profile_markup = make_prof_html(person['profile'])
    display(HTML(profile_markup))


In [6]:
# Names for every profile, ordered test -> val -> train. This is the same order
# in which get_profile_embeddings concatenates the embedding splits and in which
# get_person resolves indices, so an argmax over profile scores indexes this
# array directly.
label_names = np.array(list(dm.test_dataset['name']) + list(dm.val_dataset['name']) + list(dm.train_dataset['name']))

In [61]:
def _predict_top_profile(model_wrapper, s: str, include_train: bool = True):
    """Return the most probable profile for document ``s`` under ``model_wrapper``.

    Args:
        model_wrapper: Callable taking a list of strings and returning one row
            of profile scores per string (e.g. ``rr_mw`` / ``rt_mw``).
        s: The document text to score.
        include_train: If False, only the test and val profiles compete; the
            train profiles are dropped before the softmax.

    Returns:
        ``(name, index, probability)`` of the top-scoring profile, where
        ``index`` is a position in ``label_names``.
    """
    scores = model_wrapper([s]).squeeze(0)
    if not include_train:
        # Profiles are laid out test, val, train (see label_names), so the
        # train rows are the tail. Derive the cut-off from the datasets
        # rather than hard-coding the split sizes.
        num_eval_profiles = len(dm.test_dataset) + len(dm.val_dataset)
        scores = scores[:num_eval_profiles]
    probs = scores.softmax(dim=0)
    # Compute the argmax once and reuse it for both the label and probability.
    best_idx = probs.argmax().item()
    return (label_names[best_idx], best_idx, probs[best_idx].item())


def call_rr(s: str, include_train: bool = True):
    """Top profile prediction for ``s`` using the ``rr_mw`` wrapper."""
    return _predict_top_profile(rr_mw, s, include_train=include_train)


def call_rt(s: str, include_train: bool = True):
    """Top profile prediction for ``s`` using the ``rt_mw`` wrapper."""
    return _predict_top_profile(rt_mw, s, include_train=include_train)

In [10]:
import pandas as pd

# Put the test-split documents into a single-column DataFrame so they can be
# sorted and sliced below; show the first rows as a sanity check.
test_docs = pd.DataFrame(dm.test_dataset['document'], columns=['document'])
test_docs.head()

Unnamed: 0,document
0,"leonard shenoff randle ( born february 12 , 19..."
1,philippe adnot ( born 25 august 1945 in rhèges...
2,miroslav popov ( born 14 june 1995 in dvůr krá...
3,john `` jack '' reynolds ( 21 february 1869 --...
4,"william ato ankrah , ( born 7th july 1979 ) be..."


In [12]:
# Approximate word count per document: number of single spaces plus one
# (the documents shown above appear to be space-tokenized).
test_docs['num_words'] = test_docs['document'].map(lambda doc: (1 + doc.count(' ')))

In [33]:
# Print five consecutive documents, starting at position `idx`, from the test
# documents ordered by ascending word count.
idx = 9_150
print(
    '\n'.join(
        test_docs.sort_values(by='num_words')['document'].iloc[idx : idx+5].tolist()
    )
)

matt murphy is a canadian musician and actor .
he is perhaps best known as the vocalist and guitarist of 1990s band the super friendz .

glen tourville is a retired american soccer player who currently serves as an assistant coach with the ohio state buckeyes women 's soccer team .

carlos alcántara cuevas ( born 1 february 1985 ) is a spanish footballer who plays for la hoya lorca cf as a left back .

erkki kerttula ( 5 november 1909 -- 4 november 1989 ) was a finnish fencer .
he competed at the 1948 and 1952 summer olympics .

tommy jönsson ( born march 4 , 1976 ) is a former swedish football player , who played as defender and retired in 2010 .



In [62]:
# Score one short test document (the last of those printed above) with the rr
# wrapper, restricting the candidates to the non-train profiles.
s = "tommy jönsson ( born march 4 , 1976 ) is a former swedish football player , who played as defender and retired in 2010 ."
call_rr(s, include_train=False)

('Tommy Jönsson', 52471, 0.9964974522590637)

In [72]:
def print_all_possible_masks(s: str):
    """Print the ``call_rr`` prediction for ``s`` with each word masked in turn.

    ``s`` is split on single spaces. For every position, that one word is
    replaced by ``'<mask>'`` and the re-joined text is scored with
    ``call_rr(..., include_train=False)``. One line is printed per position:
    the original word, then the tab-separated prediction fields, with the
    final field (the probability) formatted to three decimals.
    """
    words = s.split(' ')
    for position, original_word in enumerate(words):
        masked_text = ' '.join(words[:position] + ['<mask>'] + words[position + 1:])
        *leading_fields, score = call_rr(masked_text, include_train=False)
        fields = [str(field) for field in leading_fields]
        fields.append(f'{score:.3f}')
        print(original_word, '\t', '\t'.join(fields))

In [73]:
# Baseline: mask each word of the unmodified document, one at a time.
print_all_possible_masks(s)

tommy 	 Tommy Jönsson	52471	0.994
jönsson 	 Tommy Jönsson	52471	0.971
( 	 Tommy Jönsson	52471	0.993
born 	 Tommy Jönsson	52471	0.997
march 	 Tommy Jönsson	52471	0.995
4 	 Tommy Jönsson	52471	0.994
, 	 Tommy Jönsson	52471	0.997
1976 	 Tommy Jönsson	52471	0.994
) 	 Tommy Jönsson	52471	0.997
is 	 Tommy Jönsson	52471	0.996
a 	 Tommy Jönsson	52471	0.996
former 	 Tommy Jönsson	52471	0.996
swedish 	 Tommy Jönsson	52471	0.997
football 	 Tommy Jönsson	52471	0.997
player 	 Tommy Jönsson	52471	0.997
, 	 Tommy Jönsson	52471	0.996
who 	 Tommy Jönsson	52471	0.997
played 	 Tommy Jönsson	52471	0.996
as 	 Tommy Jönsson	52471	0.996
defender 	 Tommy Jönsson	52471	0.996
and 	 Tommy Jönsson	52471	0.996
retired 	 Tommy Jönsson	52471	0.994
in 	 Tommy Jönsson	52471	0.997
2010 	 Tommy Jönsson	52471	0.995
. 	 Tommy Jönsson	52471	0.996


In [76]:
# Same document with the surname already masked; each remaining word is then
# masked in turn on top of that.
s2 = "tommy <mask> ( born march 4 , 1976 ) is a former swedish football player , who played as defender and retired in 2010 ."
print_all_possible_masks(s2)

tommy 	 Tommy Jönsson	52471	0.899
<mask> 	 Tommy Jönsson	52471	0.971
( 	 Tommy Jönsson	52471	0.964
born 	 Tommy Jönsson	52471	0.971
march 	 Tommy Jönsson	52471	0.953
4 	 Tommy Jönsson	52471	0.942
, 	 Tommy Jönsson	52471	0.969
1976 	 Tommy Jönsson	52471	0.940
) 	 Tommy Jönsson	52471	0.974
is 	 Tommy Jönsson	52471	0.971
a 	 Tommy Jönsson	52471	0.971
former 	 Tommy Jönsson	52471	0.968
swedish 	 Tommy Jönsson	52471	0.964
football 	 Tommy Jönsson	52471	0.973
player 	 Tommy Jönsson	52471	0.975
, 	 Tommy Jönsson	52471	0.956
who 	 Tommy Jönsson	52471	0.971
played 	 Tommy Jönsson	52471	0.973
as 	 Tommy Jönsson	52471	0.971
defender 	 Tommy Jönsson	52471	0.909
and 	 Tommy Jönsson	52471	0.970
retired 	 Tommy Jönsson	52471	0.909
in 	 Tommy Jönsson	52471	0.971
2010 	 Tommy Jönsson	52471	0.960
. 	 Tommy Jönsson	52471	0.966


In [78]:
# Both name tokens masked up front; each remaining word is then masked in turn.
s3 = "<mask> <mask> ( born march 4 , 1976 ) is a former swedish football player , who played as defender and retired in 2010 ."
print_all_possible_masks(s3)

<mask> 	 Tommy Jönsson	52471	0.899
<mask> 	 Tommy Jönsson	52471	0.899
( 	 Tommy Jönsson	52471	0.734
born 	 Tommy Jönsson	52471	0.898
march 	 Tommy Jönsson	52471	0.571
4 	 Tommy Jönsson	52471	0.610
, 	 Tommy Jönsson	52471	0.970
1976 	 Tommy Jönsson	52471	0.527
) 	 Tommy Jönsson	52471	0.887
is 	 Tommy Jönsson	52471	0.899
a 	 Tommy Jönsson	52471	0.895
former 	 Tommy Jönsson	52471	0.893
swedish 	 Tommy Jönsson	52471	0.410
football 	 Tommy Jönsson	52471	0.911
player 	 Tommy Jönsson	52471	0.914
, 	 Tommy Jönsson	52471	0.827
who 	 Tommy Jönsson	52471	0.898
played 	 Tommy Jönsson	52471	0.911
as 	 Tommy Jönsson	52471	0.897
defender 	 Tommy Jönsson	52471	0.767
and 	 Tommy Jönsson	52471	0.892
retired 	 Tommy Jönsson	52471	0.669
in 	 Tommy Jönsson	52471	0.897
2010 	 Tommy Jönsson	52471	0.917
. 	 Tommy Jönsson	52471	0.868


In [79]:
# Name tokens and the nationality ("swedish") masked up front.
print_all_possible_masks("<mask> <mask> ( born march 4 , 1976 ) is a former <mask> football player , who played as defender and retired in 2010 .")

<mask> 	 Tommy Jönsson	52471	0.410
<mask> 	 Tommy Jönsson	52471	0.410
( 	 Juan Diego González	21768	0.235
born 	 Tommy Jönsson	52471	0.423
march 	 Andrei Trofimov	15775	0.111
4 	 Franck Bernhard	46653	0.341
, 	 Tommy Jönsson	52471	0.236
1976 	 Vincent Cobos	57781	0.448
) 	 Tommy Jönsson	52471	0.578
is 	 Tommy Jönsson	52471	0.407
a 	 Tommy Jönsson	52471	0.399
former 	 Tommy Jönsson	52471	0.444
<mask> 	 Tommy Jönsson	52471	0.410
football 	 Tommy Jönsson	52471	0.538
player 	 Tommy Jönsson	52471	0.420
, 	 Tommy Jönsson	52471	0.519
who 	 Tommy Jönsson	52471	0.422
played 	 Tommy Jönsson	52471	0.430
as 	 Tommy Jönsson	52471	0.419
defender 	 Tommy Jönsson	52471	0.107
and 	 Tommy Jönsson	52471	0.424
retired 	 Tommy Jönsson	52471	0.729
in 	 Tommy Jönsson	52471	0.450
2010 	 Tommy Jönsson	52471	0.383
. 	 Tommy Jönsson	52471	0.356


In [68]:
# Name tokens, nationality and the retirement year ("2010") masked up front.
print_all_possible_masks("<mask> <mask> ( born march 4 , 1976 ) is a former <mask> football player , who played as defender and retired in <mask> .")

<mask> 	 Tommy Jönsson	52471	0.38
<mask> 	 Tommy Jönsson	52471	0.38
( 	 Eduardo Rergis	29984	0.37
born 	 Tommy Jönsson	52471	0.39
march 	 Andrei Trofimov	15775	0.13
4 	 Kamel Habri	122141	0.24
, 	 Tommy Jönsson	52471	0.19
1976 	 Vincent Cobos	57781	0.34
) 	 Tommy Jönsson	52471	0.45
is 	 Tommy Jönsson	52471	0.38
a 	 Tommy Jönsson	52471	0.34
former 	 Tommy Jönsson	52471	0.36
<mask> 	 Tommy Jönsson	52471	0.38
football 	 Tommy Jönsson	52471	0.47
player 	 Tommy Jönsson	52471	0.40
, 	 Tommy Jönsson	52471	0.50
who 	 Tommy Jönsson	52471	0.44
played 	 Tommy Jönsson	52471	0.47
as 	 Tommy Jönsson	52471	0.52
defender 	 Tommy Jönsson	52471	0.07
and 	 Tommy Jönsson	52471	0.51
retired 	 Tommy Jönsson	52471	0.15
in 	 Tommy Jönsson	52471	0.11
<mask> 	 Tommy Jönsson	52471	0.38
. 	 Tommy Jönsson	52471	0.23


In [70]:
# Name tokens, nationality, retirement year and birth year ("1976") masked up front.
print_all_possible_masks("<mask> <mask> ( born march 4 , <mask> ) is a former <mask> football player , who played as defender and retired in <mask> .")

<mask> 	 Vincent Cobos	57781	0.34
<mask> 	 Vincent Cobos	57781	0.34
( 	 Eduardo Rergis	29984	0.14
born 	 Vincent Cobos	57781	0.35
march 	 Bo Svensson	3691	0.23
4 	 Mouloud Akloul	27467	0.06
, 	 Vincent Cobos	57781	0.11
<mask> 	 Vincent Cobos	57781	0.34
) 	 Vincent Cobos	57781	0.58
is 	 Vincent Cobos	57781	0.34
a 	 Vincent Cobos	57781	0.31
former 	 Vincent Cobos	57781	0.35
<mask> 	 Vincent Cobos	57781	0.34
football 	 Vincent Cobos	57781	0.33
player 	 Vincent Cobos	57781	0.37
, 	 Vincent Cobos	57781	0.75
who 	 Vincent Cobos	57781	0.46
played 	 Vincent Cobos	57781	0.44
as 	 Vincent Cobos	57781	0.46
defender 	 Vincent Cobos	57781	0.22
and 	 Vincent Cobos	57781	0.48
retired 	 Vincent Cobos	57781	0.20
in 	 Vincent Cobos	57781	0.10
<mask> 	 Vincent Cobos	57781	0.34
. 	 Vincent Cobos	57781	0.19
