In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from datamodule import WikipediaDataModule

import numpy as np


# Number of dataloader worker processes handed to the data module.
num_cpus = 8

# Build the wiki_bio data module. Documents are tokenized for `roberta-base`
# and profiles for `google/tapas-base`; every split is loaded in full
# (`[:100%]`) and sequences are capped at 128 tokens.
dm = WikipediaDataModule(
    document_model_name_or_path='roberta-base',
    profile_model_name_or_path='google/tapas-base',
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:100%]',
    dataset_test_split='test[:100%]',
    dataset_version='1.2.0',
    num_workers=num_cpus,
    train_batch_size=256,
    eval_batch_size=256,
    max_seq_length=128,
    sample_spans=False,
)
# Later cells read dm.train_dataset / dm.val_dataset / dm.test_dataset and
# dm.document_tokenizer, so setup must run before them.
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]
loading wiki_bio[1.2.0] split val[:100%]
loading wiki_bio[1.2.0] split test[:100%]
                        

In [2]:
import torch
from utils import get_profile_embeddings_by_model_key

def get_profile_embeddings(model_key: str):
    """Load the stored profile embeddings for `model_key` and stack every split.

    The per-split tensors are concatenated along dim 0 in the order
    test, val, train. Note that this is not the order named in the log
    message printed below; `get_person` and `label_names` use the same
    test/val/train ordering.

    Returns:
        A single tensor holding the embeddings of all three splits.
    """
    embeddings_by_split = get_profile_embeddings_by_model_key(model_key=model_key)

    print("concatenating train, val, and test profile embeddings")
    split_order = ('test', 'val', 'train')
    stacked = torch.cat(
        [embeddings_by_split[split] for split in split_order], dim=0
    )

    print("all_profile_embeddings:", stacked.shape)
    return stacked

# Profile-embedding matrices for the two checkpoints compared in this notebook.
# NOTE(review): 'rt' / 'rr' presumably abbreviate the document/profile encoder
# pairing of each model (e.g. RoBERTa+TAPAS vs RoBERTa+RoBERTa) — TODO confirm.
rt_profile_embeddings = get_profile_embeddings('model_3_2')
rr_profile_embeddings = get_profile_embeddings('model_3_3')

>> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/test.pkl
concatenating train, val, and test profile embeddings
all_profile_embeddings: torch.Size([728321, 3072])
>> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3/test.pkl
concatenating train, val, and test profil

In [3]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from model import AbstractModel, CoordinateAscentModel

from model_cfg import model_paths_dict

# Restore the two trained models from the checkpoint paths registered in
# model_paths_dict; the keys match those used for the profile embeddings.
rt = CoordinateAscentModel.load_from_checkpoint(model_paths_dict['model_3_2'])
rr = CoordinateAscentModel.load_from_checkpoint(model_paths_dict['model_3_3'])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 0.0001 and patience 6


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaMod

Initialized model with learning_rate = 0.0001 and patience 6


In [4]:
import torch
import transformers

from model import AbstractModel

class ModelWrapper:
    """Scores raw text documents against a fixed matrix of profile embeddings.

    Calling the wrapper with a list of strings tokenizes them, embeds them with
    the wrapped model's `forward_document`, and returns the dot product of each
    document embedding with every stored profile embedding.
    """
    # Project/third-party annotations are quoted so that defining the class
    # does not require those names to be resolvable at definition time.
    document_tokenizer: "transformers.AutoTokenizer"
    profile_embeddings: torch.Tensor
    max_seq_length: int

    def __init__(self,
            model: "AbstractModel",
            document_tokenizer: "transformers.AutoTokenizer",
            profile_embeddings: torch.Tensor,
            max_seq_length: int = 128
        ):
        """Store the model (switched to eval mode), tokenizer and embeddings.

        Args:
            model: object exposing `forward_document(batch=..., document_type=...)`.
            document_tokenizer: tokenizer exposing `batch_encode_plus`.
            profile_embeddings: one row per profile; a detached copy is kept so
                the caller's tensor is never modified or moved.
            max_seq_length: documents are padded/truncated to this many tokens.
        """
        self.model = model
        self.model.eval()
        self.document_tokenizer = document_tokenizer
        self.profile_embeddings = profile_embeddings.clone().detach()
        self.max_seq_length = max_seq_length

    def to(self, device):
        """Move both the model and the profile embeddings to `device`."""
        self.model.to(device)
        self.profile_embeddings = self.profile_embeddings.to(device)
        return self # so semantics `model = MyModelWrapper().to('cuda')` works properly

    def __call__(self, text_input_list):
        """Return document-to-profile logits for a list of document strings.

        Args:
            text_input_list: list of raw document strings.

        Returns:
            Tensor of shape (len(text_input_list), number of profiles).
        """
        model_device = next(self.model.parameters()).device

        tokenized_documents = self.document_tokenizer.batch_encode_plus(
            text_input_list,
            max_length=self.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer returns CPU tensors; put them on the model's device
        # explicitly (a no-op when they are already there) and add the
        # "document__" key prefix that forward_document is called with.
        batch = {
            f"document__{k}": v.to(model_device)
            for k, v in tokenized_documents.items()
        }

        with torch.no_grad():
            document_embeddings = self.model.forward_document(batch=batch, document_type='document')
            document_to_profile_logits = document_embeddings @ (self.profile_embeddings.T)
        assert document_to_profile_logits.shape == (len(text_input_list), len(self.profile_embeddings))
        return document_to_profile_logits

    
# Prefer the GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
rt_mw = ModelWrapper(rt, dm.document_tokenizer, rt_profile_embeddings).to(device)
rr_mw = ModelWrapper(rr, dm.document_tokenizer, rr_profile_embeddings).to(device)


In [5]:
from typing import List, Tuple

from IPython.display import HTML, display
import html

def wrap_th(s):
    """Wrap `s` in an HTML table-header cell."""
    return f'<th>{s}</th>'

def wrap_td(s):
    """Wrap `s` in an HTML table-data cell."""
    return f'<td>{s}</td>'

def get_person(i: int):
    """Return the dataset example at combined index `i`.

    The combined index space lists the test split first, then val, then
    train, so the index is shifted by the sizes of the preceding splits
    before it is used on the split that contains it.
    """
    n_test = len(dm.test_dataset)
    if i < n_test:
        return dm.test_dataset[i]

    offset = i - n_test
    n_val = len(dm.val_dataset)
    if offset < n_val:
        return dm.val_dataset[offset]

    return dm.train_dataset[offset - n_val]

def table_from_table_rows(rows_str: str) -> List[List[str]]:
    """Parse a profile string into rows of stripped cells.

    Rows are separated by newlines and cells within a row by `||`. Each row is
    returned as a list of strings (normally `[key, value]`).
    """
    return [[el.strip() for el in r.split('||')] for r in rows_str.split('\n')]

def make_prof_html(profile: str) -> str:
    """Render a `key || value` profile string as an HTML table.

    Keys and values are HTML-escaped so text such as `<mask>` or `&` is shown
    literally instead of being interpreted as markup. Blank rows are skipped;
    a row with extra `||` separators keeps everything after the first
    separator as its value, and a row with no separator gets an empty value.
    """
    parts = ['<table style="border: 1px solid black"><tbody>']
    for row in table_from_table_rows(profile):
        if not any(row):
            # e.g. the empty row produced by a trailing newline
            continue
        rkey = html.escape(row[0])
        rval = html.escape(' || '.join(row[1:]))
        parts.append(f'<tr><th><b>{rkey}</b></th><td>{rval}</td></tr>')
    parts.append('</tbody></table>')
    return ''.join(parts)

def display_profile_by_index(idx: int):
    """Show the profile of the person at combined index `idx` as an HTML table."""
    person = get_person(idx)
    table_markup = make_prof_html(person['profile'])
    display(HTML(table_markup))


In [6]:
# Names of every person, concatenated in test, val, train order, the same
# order used by get_person and by the concatenated profile embeddings, so a
# position in this array is a valid combined index.
label_names = np.array(list(dm.test_dataset['name']) + list(dm.val_dataset['name']) + list(dm.train_dataset['name']))

In [7]:
import pandas as pd

# Series view of the names so they can be searched with Series.apply below.
label_names_s = pd.Series(label_names)

In [8]:
# Combined indices of every name containing "lebron" (case-insensitive substring match).
label_names_s.apply(lambda s: 'lebron' in s.lower()).to_numpy().nonzero()

(array([123185, 138382, 299329, 581936, 652825]),)

In [9]:
# Names at the indices printed by the search above (hard-coded from that output).
label_names[[123185, 138382, 299329, 581936, 652825]]

array(['Luke Hellebronth', 'Gram Lebron', 'Lebron James',
       'E. Lebron Fairbanks', 'Vilmos Hellebronth'], dtype='<U476')

In [10]:
# 299329 is the 'Lebron James' entry from the name lookup above.
display_profile_by_index(299329)

0,1
position,small forward
draft_pick,1
years,2003 end 2014 start -- present start -- 2009 end 2010 start -- 2013
nationality,american
team,cleveland cavaliers cleveland cavaliers miami heat cleveland cavaliers
bbr,jamesle01
high_school,"st. -rrb- vincent -- st. mary -lrb- akron , ohio"
draft_team,cleveland cavaliers
draft_year,2003
height_in,8


In [11]:
def _top_prediction(model_wrapper, s: str):
    """Return (name, index, probability) of the most likely profile for `s`.

    The wrapper's logits for the single document are soft-maxed over all
    profiles; the arg-max is computed once and converted to a Python int
    before it is used to index the `label_names` array.
    """
    probs = model_wrapper([s]).softmax(1)
    best_idx = probs.argmax().item()  # probs is (1, num_profiles), so this is a profile index
    return (label_names[best_idx], best_idx, probs[0, best_idx].item())

def call_rr(s: str):
    """Top-1 prediction for document `s` using the `rr_mw` wrapper."""
    return _top_prediction(rr_mw, s)

def call_rt(s: str):
    """Top-1 prediction for document `s` using the `rt_mw` wrapper."""
    return _top_prediction(rt_mw, s)

In [12]:
# Dataset example for 'Lebron James' (combined index found in the search above).
lebron = get_person(299329)

In [13]:
def get_nearest_neighbors(doc: str, original_idx: int = None, k: int = 10, model_wrapper=None):
    """Print the `k` profiles whose embeddings score highest against `doc`.

    Scores are the wrapper's raw document-to-profile logits divided by the
    largest logit, so each printed percentage is relative to the best-scoring
    profile *before* the query's own profile is excluded. This is not a
    probability.

    Args:
        doc: document text to embed.
        original_idx: combined index of the document's own profile; when
            given, that profile is excluded from the results.
        k: number of neighbours to print (clamped to the number of profiles).
        model_wrapper: wrapper used for scoring; defaults to `rt_mw`.
    """
    if model_wrapper is None:
        model_wrapper = rt_mw
    scores = model_wrapper([doc]).squeeze(0)
    scores = scores / scores.max()
    if original_idx is not None:
        # -inf (rather than 0) guarantees the excluded profile can never
        # re-enter the top-k, even when the remaining scores are <= 0.
        scores[original_idx] = float('-inf')
    topk = scores.topk(min(k, scores.numel()))
    for val, idx in zip(topk.values.cpu(), topk.indices.cpu()):
        print(idx.item(), label_names[idx.item()], f'{val.item()*100:.2f}%')

In [14]:
# Neighbours of LeBron James's document; his own index is passed so that
# his profile is left out of the list.
get_nearest_neighbors(lebron['document'], 299_329)

600196 Richard Jefferson 55.12%
662299 James Jones 52.34%
15588 Carmelo Anthony 50.24%
641165 Jarrett Jack 48.69%
667184 Brevin Knight 48.20%
261438 Kevin Love 46.42%
497444 Chris Bosh 45.06%
149737 Sean May 44.81%
447981 Rudy Gay 44.41%
269568 Solomon Jones 44.09%


In [15]:
def get_nearest_neighbors_by_name(name: str):
    """Print the nearest-neighbour profiles for the person called exactly `name`.

    Prints the person's combined index first, then the neighbours of their
    unmodified document (their own profile is excluded).

    Raises:
        ValueError: if `name` matches zero or more than one entry of `label_names`.
    """
    matches = (label_names == name).nonzero()[0]
    if len(matches) != 1:
        raise ValueError(f'found {len(matches)} people matching name {name}')
    # Use a plain int from here on: get_nearest_neighbors declares an int
    # index, not a length-1 array.
    person_idx = int(matches[0])
    print(person_idx)
    document = get_person(person_idx)['document']
    print(f'*** Nearest-neighbors for {name}: ***')
    get_nearest_neighbors(document, person_idx)

In [16]:
# Unredacted-document neighbours for Barack Obama.
get_nearest_neighbors_by_name("Barack Obama")

671350
*** Nearest-neighbors for Barack Obama: ***
75269 Barack Obama , Sr. . 75.74%
654466 Rahm Emanuel 60.60%
374986 Hillary Rodham Clinton 57.85%
460976 Michelle Obama 56.53%
192810 Barack Obama Presidential Primary Campaign , 2008 55.79%
71616 Abraham Lincoln 53.40%
642361 Hubert Humphrey 50.58%
250101 John Kerry 49.96%
404145 Max Baucus 49.72%
135163 Roy Blunt 48.80%


In [17]:
# Unredacted-document neighbours for Walt Disney.
get_nearest_neighbors_by_name("Walt Disney")

413761
*** Nearest-neighbors for Walt Disney: ***
339104 Thomas Edison 47.87%
418604 Roy O. Disney 47.34%
539371 Walter Lantz 44.97%
547356 Walter Dyett 44.61%
7148 Walt Bellamy 43.94%
677767 Matt Damon 42.85%
560698 Elias Disney 42.64%
623234 Walter E. Sachs 40.13%
96578 Dick Clark 39.50%
77943 Lillian Disney 39.47%


In [18]:
# Unredacted-document neighbours for Jerry Seinfeld.
get_nearest_neighbors_by_name("Jerry Seinfeld")

429678
*** Nearest-neighbors for Jerry Seinfeld: ***
574680 Jerry Siegel 49.91%
86849 Jerry Stiller 49.57%
211885 Rob Huebel 44.26%
98555 Jim Belushi 43.69%
699343 Geri Jewell 42.66%
599136 Jerry Jewell 39.88%
437339 Peter Steinfeld 39.83%
249657 Gilbert Gottfried 39.68%
70522 Ray Liotta 39.11%
504532 Rob Wiethoff 38.90%


In [41]:
import re

def get_profile(ex):
    """Return the example's table as a `{column_header: content}` dict."""
    table_info = ex['input_text']['table']
    return dict(zip(table_info['column_header'], table_info['content']))

def get_document_with_redacted_name_and_birthday(ex, mask_token='<mask>'):
    """Return `ex['document']` with the person's name and birth date masked.

    Every whitespace-separated word of the profile's `birth_date` (if any) and
    of its `name` (falling back to `article_title` only when `name` is absent)
    is replaced by `mask_token` wherever it occurs as a whole word, ignoring
    case.

    Details:
      * Tokens without any letter or digit (e.g. a lone ",") are ignored, so
        punctuation inside other tokens such as "1,000" is left intact.
      * All words are replaced in a single pass, so a word can never match
        inside a mask token that was inserted for another word.
      * `mask_token` is inserted literally (backslashes are not interpreted).

    Raises:
        KeyError: if the profile has neither `name` nor `article_title`.
    """
    document = ex['document']
    profile = get_profile(ex)
    birth_date = profile.get('birth_date', '').lower()
    # Only touch 'article_title' when 'name' is really missing.
    name = (profile['name'] if 'name' in profile else profile['article_title']).lower()

    candidates = birth_date.split() + name.split()
    unique_words = dict.fromkeys(
        w for w in candidates if any(ch.isalnum() for ch in w)
    )
    if not unique_words:
        return document

    # Longest first so that a longer word wins over one of its prefixes.
    words_to_mask = sorted(unique_words, key=len, reverse=True)
    pattern = re.compile(
        r'\b(?:{})\b'.format('|'.join(re.escape(w) for w in words_to_mask)),
        flags=re.IGNORECASE,
    )
    return pattern.sub(lambda _match: mask_token, document)

In [20]:
def get_nearest_neighbors_by_name_redacted(name: str):
    """Print nearest-neighbour profiles for `name` using the redacted document.

    Same as the unredacted lookup, except that the person's name and birth
    date are masked in the document before it is embedded. The person's own
    profile is excluded from the results.

    Raises:
        ValueError: if `name` matches zero or more than one entry of `label_names`.
    """
    matches = (label_names == name).nonzero()[0]
    if len(matches) != 1:
        raise ValueError(f'found {len(matches)} people matching name {name}')
    # Plain int index, as declared by get_person / get_nearest_neighbors.
    person_idx = int(matches[0])
    print(person_idx)
    ex = get_person(person_idx)
    redacted_document = get_document_with_redacted_name_and_birthday(ex)
    print(f'*** Nearest-neighbors for {name}: ***')
    get_nearest_neighbors(redacted_document, person_idx)

In [21]:
# Baseline (unredacted) neighbours for Jerry Seinfeld, to compare with the
# redacted run in the next cell.
# NOTE(review): `jerry_seinfeld` is not referenced again in the visible cells.
jerry_seinfeld = get_person(429678)
get_nearest_neighbors_by_name('Jerry Seinfeld')

429678
*** Nearest-neighbors for Jerry Seinfeld: ***
574680 Jerry Siegel 49.91%
86849 Jerry Stiller 49.57%
211885 Rob Huebel 44.26%
98555 Jim Belushi 43.69%
699343 Geri Jewell 42.66%
599136 Jerry Jewell 39.88%
437339 Peter Steinfeld 39.83%
249657 Gilbert Gottfried 39.68%
70522 Ray Liotta 39.11%
504532 Rob Wiethoff 38.90%


In [22]:
# Same query with the name and birth date masked in the document.
get_nearest_neighbors_by_name_redacted('Jerry Seinfeld')

429678
*** Nearest-neighbors for Jerry Seinfeld: ***
307272 Hank Azaria 63.28%
243031 Rodger Bumpass 62.99%
70522 Ray Liotta 62.97%
404535 Jerry Doyle 62.20%
162597 Harold Ramis 61.17%
9446 Spike Lee 60.73%
723533 Judd Apatow 60.68%
508131 Jeff Foxworthy 59.10%
598952 J. B. Smoove 57.90%
291054 J. Allen Williams 57.68%


In [23]:
# Redacted-document neighbours for Matt Damon.
get_nearest_neighbors_by_name_redacted('Matt Damon')

677767
*** Nearest-neighbors for Matt Damon: ***
4620 Matt Stone 64.70%
586050 Ethan Hawke 62.54%
296080 Matthew Weiner 61.92%
532669 Griffin Dunne 61.80%
135652 Matthew Mcconaughey 61.27%
391723 Ryan Phillippe 60.55%
602277 Matt Reeves 60.18%
297958 Matthew Morrison 60.14%
272823 Matt Letscher 58.05%
311898 Matt Bennett 57.49%


In [30]:
# Inspect the redacted text itself for index 677767 (Matt Damon, per the lookup above).
get_document_with_redacted_name_and_birthday(get_person(677_767))

"matthew paige `` <mask> '' <mask> ( ; born october 8 , 1970 ) is an american actor , screenwriter and producer .\n<mask> and ben affleck wrote an academy award-winning screenplay , `` good will hunting '' ( 1997 ) , for which he also received a number of best actor nominations .\nhe is among forbes magazine 's most bankable stars and one of the top-40 highest-grossing actors of all time .\nin addition to the many awards and nominations <mask> has received , such as academy , golden globe and other industry awards , for his work in the film industry -- in 2007 , <mask> received a star on the hollywood walk of fame and was named the sexiest man alive by `` people '' magazine .\n<mask> has become known for his versatility , starring in commercially and critically successful films such as the rogue secret agent jason bourne in the first three installments of the `` bourne '' series , the youthful thief linus caldwell in the `` ocean 's trilogy '' , the anti-hero in `` the talented mr. rip

In [24]:
# Redacted-document neighbours for Ronaldinho.
get_nearest_neighbors_by_name_redacted('Ronaldinho')

665717
*** Nearest-neighbors for Ronaldinho: ***
169761 Alex 58.42%
664081 Robinho 56.86%
515310 Fred 55.62%
535328 Renato 54.47%
2221 Ronaldo 51.95%
360052 Kaká 50.97%
560092 Romário 48.08%
692581 Hani Al-dhabit 47.89%
447864 Carlos Alberto 47.22%
549263 Lincoln 46.45%


In [25]:
# Redacted-document neighbours for Albert Einstein.
get_nearest_neighbors_by_name_redacted('Albert Einstein')

502074
*** Nearest-neighbors for Albert Einstein: ***
693823 Hermann Weyl 73.52%
582377 Werner Heisenberg 70.85%
6823 Pascual Jordan 69.23%
225426 Jules Henri Poincaré 63.01%
501980 Joseph Smith Iii 61.75%
498361 James Franck 61.34%
218918 Erich Regener 60.63%
257805 Adolf Hitler 60.20%
623393 Vladimir Lenin 58.92%
247822 Max Born 58.54%


In [26]:
# Redacted-document neighbours for Richard Feynman.
get_nearest_neighbors_by_name_redacted('Richard Feynman')

227326
*** Nearest-neighbors for Richard Feynman: ***
671361 Luis Walter Alvarez 71.49%
357427 Mohammad Abdus Salam Ur 65.16%
639750 Isidor Isaac Rabi 62.06%
498299 Robert Hofstadter 59.96%
694453 David Bohm 58.37%
233659 Julian Schwinger 57.85%
567047 Polykarp Kusch 57.48%
611525 Seth Neddermeyer 54.89%
226989 Clinton Joseph Davisson 53.74%
530813 H. P. Lovecraft 53.68%


In [27]:
# Redacted-document neighbours for Adolf Hitler. A 100.00% entry means that
# neighbour, not the query's own profile, had the highest raw score.
get_nearest_neighbors_by_name_redacted('Adolf Hitler')

257805
*** Nearest-neighbors for Adolf Hitler: ***
379644 Heinrich Himmler 100.00%
357886 Kurt Waldheim 93.75%
441664 Joseph Goebbels 91.47%
192442 Franz Joseph Hermann Michael Maria Von Papen Zu Köningen 86.76%
282285 Ernst Rüdiger Camillo Starhemberg 84.38%
533722 Ernst Kaltenbrunner 82.26%
42629 Hermann Göring 81.04%
640081 Karl Dönitz 80.94%
530315 Robert Ley 77.82%
672653 Reinhard Heydrich 73.41%


In [28]:
# NOTE(review): identical to the previous cell (same call, same output); one of the two can be dropped.
get_nearest_neighbors_by_name_redacted('Adolf Hitler')

257805
*** Nearest-neighbors for Adolf Hitler: ***
379644 Heinrich Himmler 100.00%
357886 Kurt Waldheim 93.75%
441664 Joseph Goebbels 91.47%
192442 Franz Joseph Hermann Michael Maria Von Papen Zu Köningen 86.76%
282285 Ernst Rüdiger Camillo Starhemberg 84.38%
533722 Ernst Kaltenbrunner 82.26%
42629 Hermann Göring 81.04%
640081 Karl Dönitz 80.94%
530315 Robert Ley 77.82%
672653 Reinhard Heydrich 73.41%


In [42]:
# Redacted-document neighbours for LeBron James (compare with the unredacted list earlier).
get_nearest_neighbors_by_name_redacted('Lebron James')

299329
*** Nearest-neighbors for Lebron James: ***
662299 James Jones 71.85%
686723 Anderson Varejão 69.81%
600196 Richard Jefferson 69.70%
261438 Kevin Love 68.92%
622895 Dwayne Bowe 63.97%
15588 Carmelo Anthony 63.82%
478226 Jason Kapono 62.06%
151490 Timofey Mozgov 59.46%
647844 Jae Crowder 59.06%
9820 Jamal Crawford 58.67%


In [31]:
# NOTE(review): re-definition of the function already defined in cell 5 with
# the same behaviour; kept so this cell still runs on its own.
def display_profile_by_index(idx: int):
    """Show the profile of the person at combined index `idx` as an HTML table."""
    person = get_person(idx)
    table_markup = make_prof_html(person['profile'])
    display(HTML(table_markup))

In [32]:
# Profile table for Matt Damon (index 677767).
display_profile_by_index(677_767)

0,1
caption,damon at the paris premiere of -rcb- -rcb-
name,matt damon
image_size,220px
image,matt damon 2014.jpg
article_title,matt damon


In [37]:
# Profile table for index 677760, a different entry a few positions before Matt Damon's.
display_profile_by_index(677_760)

0,1
nfl,fox725424
name,vernon fox
finalteam,denver broncos
college,fresno state
position,safety
weight,203
finalyear,2009
undraftedyear,2002
debutyear,2002
debutteam,san diego chargers


In [38]:
# Profile table for Ronaldinho (index 665717, per the lookup above).
display_profile_by_index(665_717)

0,1
nationalgoals,2 3 18 33
fullname,ronaldo de assis moreira
position,attacking midfielder / forward
pcupdate,13 august 2015
years,1987 2008 -- 2011 2011 -- 2012 2012 -- 2014 2014 -- -- 2015 2015 -- 1998 2001 -- 2003 2003 -- 2008
ntupdate,23 april 2013
nationalcaps,6 5 27 97
height,1.81 m ftin on
youthclubs,grêmio
caps,52 55 145 76 33 47 25 3


In [43]:
# Profile table for LeBron James (index 299329); same profile as shown in cell 10.
display_profile_by_index(299_329)

0,1
position,small forward
draft_pick,1
years,2003 end 2014 start -- present start -- 2009 end 2010 start -- 2013
nationality,american
team,cleveland cavaliers cleveland cavaliers miami heat cleveland cavaliers
bbr,jamesle01
high_school,"st. -rrb- vincent -- st. mary -lrb- akron , ohio"
draft_team,cleveland cavaliers
draft_year,2003
height_in,8
