In [4]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from datamodule import WikipediaDataModule
from model import AbstractModel, CoordinateAscentModel
from utils import get_profile_embeddings_by_model_key

import argparse
import collections
import glob
import os
import re

import datasets
import pandas as pd
import torch
import transformers
from tqdm import tqdm


from model_cfg import model_paths_dict

# Only let the `datasets` library log messages at error level, to keep cell output short.
datasets.utils.logging.set_verbosity_error()


# Number of CPUs this process may run on; used below as the data-loading worker count.
# `os.sched_getaffinity` only exists on some Unix platforms, so fall back to the
# machine-wide CPU count (or 1 if that is unknown) everywhere else.
if hasattr(os, 'sched_getaffinity'):
    num_cpus = len(os.sched_getaffinity(0))
else:
    num_cpus = os.cpu_count() or 1


def get_profile_embeddings(model_key: str):
    """Load the profile embeddings for `model_key` and stack them into one tensor.

    The per-split tensors are concatenated along dim 0 in the order
    test, val, train, so rows from the test split come first.

    Args:
        model_key: key forwarded to `get_profile_embeddings_by_model_key`.

    Returns:
        A single tensor holding the test, val and train embeddings, in that order.
    """
    embeddings_by_split = get_profile_embeddings_by_model_key(model_key=model_key)

    print("concatenating train, val, and test profile embeddings")
    # The split order fixes which row index each profile ends up at.
    ordered_splits = [embeddings_by_split[split] for split in ('test', 'val', 'train')]
    all_profile_embeddings = torch.cat(ordered_splits, dim=0)

    print("all_profile_embeddings:", all_profile_embeddings.shape)
    return all_profile_embeddings


def get_output_folder_by_model_key(model_key: str) -> str:
    """Return the `adv_csvs_full_2/<model_key>` folder path for a model.

    When run from a script, the `adv_csvs_full_2` folder is resolved two levels
    above the script file (i.e. next to the script's directory). Inside a
    notebook kernel `__file__` is not defined, so the folder is resolved one
    level above the current working directory instead.

    Args:
        model_key: name of the per-model subfolder.

    Returns:
        Normalized path to the model's folder. The folder is not created here.
    """
    try:
        anchor, levels_up = os.path.abspath(__file__), (os.pardir, os.pardir)
    except NameError:
        # No `__file__` in an interactive/Jupyter session: anchor on the cwd.
        anchor, levels_up = os.getcwd(), (os.pardir,)

    adv_csvs_folder = os.path.normpath(
        os.path.join(anchor, *levels_up, 'adv_csvs_full_2')
    )
    return os.path.join(adv_csvs_folder, model_key)

def load_adv_csv(dm: WikipediaDataModule) -> pd.DataFrame:
    """Collect adversarially-perturbed and baseline-redacted documents in one frame.

    Adversarial rows come from `<cwd>/../adv_csvs_full_2/<model>/results__b_1__k_1__n_1000.csv`
    for a fixed list of models; for each CSV only rows whose `result_type` is
    'Successful' are kept, and of those only the first 100. Baseline rows are the
    `document_redact_lexical` and `document_redact_ner_bert` fields of the first
    1000 entries of `dm.test_dataset`.

    Args:
        dm: data module providing `test_dataset` and `mask_token`.

    Returns:
        DataFrame with columns `perturbed_text`, `model_name`, `i` and `k`.
        `i` is the row position within its source (the CSV row index before
        filtering, or the position in the test slice). `k` is the string parsed
        from the CSV filename for adversarial rows and the integer 0 for
        baseline rows. The frame's index is not reset, so it contains duplicates.
    """
    # The CSV root is the same for every model, so resolve and report it once.
    adv_csvs_folder = os.path.normpath(
        os.path.join(os.getcwd(), os.pardir, 'adv_csvs_full_2')
    )
    print('adv_csvs_folder', adv_csvs_folder)

    # Gather the per-CSV frames and concatenate once at the end rather than
    # re-copying a growing DataFrame on every iteration.
    frames = []
    for model_name in ['model_1', 'model_3_2', 'model_3_3__placeholder', 'model_4']:
        csv_filenames = glob.glob(
            os.path.join(
                adv_csvs_folder,
                f'{model_name}/results__b_1__k_1__n_1000.csv'
            )
        )
        print(model_name, csv_filenames)
        for filename in csv_filenames:
            df = pd.read_csv(filename)
            df['model_name'] = re.search(r'adv_csvs_full_2/(model_\d.+)/.+.csv', filename).group(1)
            df['k'] = re.search(r'adv_csvs_full_2/.+/.+__k_(\d+)__.+.csv', filename).group(1)
            # Record the original row position before any rows are filtered out.
            df['i'] = df.index

            successful = df[df['result_type'] == 'Successful']
            frames.append(successful[['perturbed_text', 'model_name', 'i', 'k']].iloc[:100])

    # Load baseline redacted data (taken from the test split).
    test_subset = dm.test_dataset[:1000]
    ner_df = pd.DataFrame(
        columns=['perturbed_text'],
        data=test_subset['document_redact_ner_bert']
    )
    ner_df['model_name'] = 'named_entity'
    ner_df['i'] = ner_df.index

    lex_df = pd.DataFrame(
        columns=['perturbed_text'],
        data=test_subset['document_redact_lexical']
    )
    lex_df['model_name'] = 'lexical'
    lex_df['i'] = lex_df.index

    # Combine both adversarial and baseline redacted data
    baseline_df = pd.concat((lex_df, ner_df), axis=0)
    baseline_df['k'] = 0
    full_df = pd.concat(frames + [baseline_df], axis=0)

    def _normalize(text: str) -> str:
        # Put newlines back, then standardize both mask spellings to the
        # data module's mask token.
        return (
            text.replace('<SPLIT>', '\n')
            .replace('[MASK]', dm.mask_token)
            .replace('<mask>', dm.mask_token)
        )

    full_df['perturbed_text'] = full_df['perturbed_text'].apply(_normalize)

    return full_df


def get_adv_predictions(model_key: str):
    """Score every document from `load_adv_csv` against all profile embeddings.

    Loads the checkpoint registered under `model_key` in `model_paths_dict`,
    embeds each `perturbed_text` in batches of 256, and takes a softmax over the
    dot products with every profile embedding.

    Args:
        model_key: key into `model_paths_dict`; also selects the profile embeddings.

    Returns:
        The DataFrame from `load_adv_csv` with two added columns,
        `pred_topk_values` and `pred_topk_idxs`: per row, the 100 largest
        probabilities and the profile indices they belong to.
    """
    checkpoint_path = model_paths_dict[model_key]
    assert isinstance(checkpoint_path, str), f"invalid checkpoint_path {checkpoint_path} for {model_key}"
    print(f"running eval on {model_key} loaded from {checkpoint_path}")
    model = CoordinateAscentModel.load_from_checkpoint(checkpoint_path)

    print(f"loading data with {num_cpus} CPUs")
    datamodule_kwargs = dict(
        document_model_name_or_path=model.document_model_name_or_path,
        profile_model_name_or_path=model.profile_model_name_or_path,
        dataset_name='wiki_bio',
        dataset_train_split='train[:256]',
        dataset_val_split='val[:256]',
        dataset_test_split='test[:100%]',
        dataset_version='1.2.0',
        num_workers=num_cpus,
        train_batch_size=256,
        eval_batch_size=256,
        max_seq_length=128,
        sample_spans=False,
    )
    dm = WikipediaDataModule(**datamodule_kwargs)
    dm.setup("fit")

    all_profile_embeddings = get_profile_embeddings(model_key=model_key).cuda()

    # Only the document-side modules are used below; switch them to eval mode on the GPU.
    for submodule in (model.document_model, model.document_embed):
        submodule.eval()
        submodule.cuda()

    adv_csv = load_adv_csv(dm=dm)

    batch_size = 256
    topk_values, topk_idxs = [], []
    for start in range(0, len(adv_csv), batch_size):
        texts = adv_csv.iloc[start:start + batch_size]['perturbed_text'].tolist()
        encoded = dm.document_tokenizer.batch_encode_plus(
            texts,
            max_length=dm.max_seq_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Prefix the tokenizer outputs with the document type passed to forward_document.
        test_batch = {
            f'perturbed_text__{name}': tensor for name, tensor in encoded.items()
        }
        with torch.no_grad():
            document_embeddings = model.forward_document(batch=test_batch, document_type='perturbed_text')
            document_to_profile_probs = (document_embeddings @ all_profile_embeddings.T).softmax(dim=1)
            topk_100 = document_to_profile_probs.topk(100)
        topk_values.append(topk_100.values)
        topk_idxs.append(topk_100.indices)

    adv_csv['pred_topk_values'] = torch.cat(topk_values, dim=0).cpu().tolist()
    adv_csv['pred_topk_idxs'] = torch.cat(topk_idxs, dim=0).cpu().tolist()
    return adv_csv



In [5]:
# Run the same set of documents through two checkpoints. Each call loads a
# checkpoint and the full profile-embedding tensor onto the GPU, so the two calls
# are kept as separate statements: the first result survives if the second fails.
roberta_roberta_predictions = get_adv_predictions(model_key='model_3_3__placeholder')
roberta_tapas_predictions = get_adv_predictions(model_key='model_3_2')

running eval on model_3_3__placeholder loaded from /home/jxm3/research/deidentification/unsupervised-deidentification/saves/ca__roberta__dropout_0.5_1.0_0.0__e3072__ls0.1/deid-wikibio-4_default/1c9464tp_750/checkpoints/epoch=58-step=134342-idf_total.ckpt


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaMod

Initialized model with learning_rate = 0.0001 and patience 6
loading data with 8 CPUs
Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:256]
loading wiki_bio[1.2.0] split val[:256]
loading wiki_bio[1.2.0] split test[:100%]
                        >> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_3__placeholder/test.pkl
concatenating train, val, and test profile embeddings
all_profile_embeddings: torch.Size([728321, 3072])
adv_csvs_folder /home/jxm3/research/deidentification/unsupervised-deidentification/adv_csvs_full_2
model_1 []
adv_csv

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 0.0001 and patience 6
loading data with 8 CPUs
Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:256]
loading wiki_bio[1.2.0] split val[:256]
loading wiki_bio[1.2.0] split test[:100%]
                        >> loaded 582659 train embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/train.pkl
>> loaded 72831 val embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/val.pkl
>> loaded 72831 test embeddings from /home/jxm3/research/deidentification/unsupervised-deidentification/embeddings/profile/model_3_2/test.pkl
concatenating train, val, and test profile embeddings
all_profile_embeddings: torch.Size([728321, 3072])
adv_csvs_folder /home/jxm3/research/deidentification/unsupervised-deidentification/adv_csvs_full_2
model_1 []
adv_csvs_folder /home/jxm3/research/deidentifi

In [8]:
# Readable names for the model keys found in the `model_name` column.
new_model_name = {
    'model_3_1': 'roberta_tapas__no_masking',
    'model_3_2': 'roberta_tapas',
    'model_3_3__placeholder': 'roberta_roberta',
    'model_3_4': 'pmlm_tapas'
}
# Names without an entry above are left as they are.
roberta_roberta_predictions['model_name'] = roberta_roberta_predictions['model_name'].map(
    lambda key: new_model_name.get(key, key)
)

# Put both models' predictions side by side, prefixing each column with the
# model that produced it.
out_df = roberta_roberta_predictions.rename(
    columns={
        'pred_topk_values': 'roberta_roberta__pred_topk_values',
        'pred_topk_idxs': 'roberta_roberta__pred_topk_idxs',
    }
)
out_df['roberta_tapas__pred_topk_values'] = roberta_tapas_predictions['pred_topk_values']
out_df['roberta_tapas__pred_topk_idxs'] = roberta_tapas_predictions['pred_topk_idxs']

# A prediction is correct when the top-ranked profile index equals the row's `i`.
out_df['roberta_roberta__was_correct'] = [
    target == ranked[0]
    for target, ranked in zip(out_df['i'], out_df['roberta_roberta__pred_topk_idxs'])
]
out_df['roberta_tapas__was_correct'] = [
    target == ranked[0]
    for target, ranked in zip(out_df['i'], out_df['roberta_tapas__pred_topk_idxs'])
]

out_df.head()

Unnamed: 0,perturbed_text,model_name,i,k,roberta_roberta__pred_topk_values,roberta_roberta__pred_topk_idxs,roberta_tapas__pred_topk_values,roberta_tapas__pred_topk_idxs,roberta_roberta__was_correct,roberta_tapas__was_correct
0,"<mask> shenoff <mask> ( born february 12, <mas...",roberta_tapas,0,1,"[0.2761824131011963, 0.14961159229278564, 0.07...","[0, 578457, 719788, 504331, 467718, 530731, 68...","[0.0948665514588356, 0.07953277975320816, 0.02...","[530731, 424788, 412385, 441290, 299079, 46771...",True,False
1,<mask> <mask> ( born 25 august <mask> in rhège...,roberta_tapas,1,1,"[0.8573248982429504, 0.006370349787175655, 0.0...","[1, 176364, 39633, 199778, 467415, 569950, 708...","[0.19266889989376068, 0.11290991306304932, 0.1...","[627677, 39633, 467415, 72806, 646267, 1, 1931...",True,False
2,<mask> <mask> ( born 14 june <mask> in <mask> ...,roberta_tapas,2,1,"[0.9336107969284058, 0.020198944956064224, 0.0...","[2, 264775, 122539, 434330, 333439, 577180, 67...","[0.09999250620603561, 0.09631256014108658, 0.0...","[132875, 256174, 663514, 8198, 45484, 411731, ...",True,False
3,<mask> `` <mask> '' <mask> ( 21 february <mask...,roberta_tapas,3,1,"[0.25506049394607544, 0.05121405050158501, 0.0...","[3, 243519, 455518, 492947, 263248, 569161, 42...","[0.3412027060985565, 0.30935412645339966, 0.03...","[622014, 242196, 55061, 34822, 3, 530143, 6619...",True,False
4,"<mask> <mask> <mask>, ( born 7th july 1979 ) b...",roberta_tapas,4,1,"[0.44907131791114807, 0.19501550495624542, 0.0...","[4, 445867, 331079, 248452, 288786, 638191, 93...","[0.3858789801597595, 0.27763092517852783, 0.11...","[53983, 520062, 4, 45472, 580095, 560843, 5559...",True,False


In [9]:
# Per-source accuracy of each model. `out_df` also holds text and list-valued
# columns, which cannot be averaged; restrict the mean to numeric/boolean columns
# explicitly so this does not raise on pandas versions that no longer drop them.
out_df.groupby('model_name').mean(numeric_only=True)

Unnamed: 0_level_0,i,roberta_roberta__was_correct,roberta_tapas__was_correct
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lexical,499.5,0.164,0.224
named_entity,499.5,0.674,0.557
roberta_roberta,50.56,0.0,0.39
roberta_tapas,51.16,0.67,0.0


In [10]:
# Split out the rows where exactly one of the two models got the top-1 prediction right.
rr_correct = out_df['roberta_roberta__was_correct']
rt_correct = out_df['roberta_tapas__was_correct']

bugs_A = out_df[rt_correct & ~rr_correct]  # only roberta_tapas correct
bugs_B = out_df[rr_correct & ~rt_correct]  # only roberta_roberta correct

In [16]:
# Print up to 21 rows of bugs_A: the text, its index `i`, and each model's
# top-1 profile index with its probability.
count_A = 0
for count_A, (_, row) in enumerate(bugs_A.iterrows(), start=1):
    print(row['perturbed_text'], row['i'])
    rr_top1 = (row['roberta_roberta__pred_topk_idxs'][0], row['roberta_roberta__pred_topk_values'][0])
    print('\tR-R:', *rr_top1)
    rt_top1 = (row['roberta_tapas__pred_topk_idxs'][0], row['roberta_tapas__pred_topk_values'][0])
    print('\tR-T:', *rt_top1)
    print('\n'*2)

    if count_A > 20:
        break

<mask> <mask> ( <mask> <mask> <mask> 1995 <mask> <mask> <mask> <mask> <mask> ) is a <mask> grand prix motorcycle racer.
he currently races in the fim cev moto2 championship for montaze broz racing team aboard a suter. 2
	R-R: 143556 0.32515156269073486
	R-T: 2 0.23325538635253906



<mask> (, born toader <mask> ; february <mask>, <mask> -- july <mask>, 2007 ) was the patriarch of the <mask> orthodox church from 1986 to 2007.
teoctist served his first years as <mask> <mask> <mask> <mask> <mask> regime, <mask> was <mask> <mask> <mask> <mask> collaboration.
<mask> offered <mask> <mask> <mask> <mask> <mask> revolution <mask> 1989, <mask> <mask> <mask> <mask> <mask> <mask> and served a further 17 years.
a promoter of <mask> dialogue, patriarch teoctist invited pope john paul ii to visit <mask> in 1999.
<mask> 5
	R-R: 292691 0.015303807333111763
	R-T: 5 0.9081231951713562



<mask> <mask> ( born 27 february 1979 ) is a south african football ( soccer ) left-winger who plays for premier socce

In [17]:
# Print up to 21 rows of bugs_B: the text, its index `i`, and each model's
# top-1 profile index with its probability.
count_B = 0
for count_B, (_, row) in enumerate(bugs_B.iterrows(), start=1):
    print(row['perturbed_text'], row['i'])
    rr_top1 = (row['roberta_roberta__pred_topk_idxs'][0], row['roberta_roberta__pred_topk_values'][0])
    print('\tR-R:', *rr_top1)
    rt_top1 = (row['roberta_tapas__pred_topk_idxs'][0], row['roberta_tapas__pred_topk_values'][0])
    print('\tR-T:', *rt_top1)
    print('\n'*2)

    if count_B > 20:
        break

<mask> shenoff <mask> ( born february 12, <mask> ) is a former major league baseball player.
he was the first-round pick of the washington senators in the secondary phase of the june <mask> major league baseball draft, tenth overall. 0
	R-R: 0 0.2761824131011963
	R-T: 530731 0.0948665514588356



<mask> <mask> ( born 25 august <mask> in rhèges ) is a member of the senate of france.
he was first elected in 1989, and represents the aube department.
a farmer by profession, he serves as an independent, and also serves as the head of the general council of aube, to which he was elected to represent the canton of <mask> in 1980.
in 1998 and 2008, he was re-elected to the senate in the first round, avoiding the need for a run-off vote.
having contributed to the creation of the 1
	R-R: 1 0.8573248982429504
	R-T: 627677 0.19266889989376068



<mask> <mask> ( born 14 june <mask> in <mask> <mask> <mask> <mask> ) is a <mask> grand prix motorcycle racer.
he currently races in the fim cev moto2 cham

In [6]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from datamodule import WikipediaDataModule

import numpy as np


num_cpus = 8

# Data module over the complete train/val/test splits, used below to look
# people up by index.
datamodule_kwargs = dict(
    document_model_name_or_path='roberta-base',
    profile_model_name_or_path='google/tapas-base',
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:100%]',
    dataset_test_split='test[:100%]',
    dataset_version='1.2.0',
    num_workers=num_cpus,
    train_batch_size=256,
    eval_batch_size=256,
    max_seq_length=128,
    sample_spans=False,
)
dm = WikipediaDataModule(**datamodule_kwargs)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]
loading wiki_bio[1.2.0] split val[:100%]
loading wiki_bio[1.2.0] split test[:100%]
                        

In [7]:
def get_person(i: int):
    """Look up a dataset entry by its position in the test + val + train ordering.

    Indices are laid out with the test split first, then val, then train, so
    `i` is shifted by the sizes of the preceding splits before indexing.

    Args:
        i: position in the combined ordering.

    Returns:
        The entry from whichever split of the global `dm` the index falls into.
    """
    offset = i
    for split in (dm.test_dataset, dm.val_dataset):
        if offset < len(split):
            return split[offset]
        # Not in this split: make the index relative to the next one.
        offset -= len(split)
    return dm.train_dataset[offset]

In [8]:
from typing import List, Tuple

from IPython.display import HTML, display
import html

def wrap_th(s):
    """Wrap `s` in an HTML table-header cell."""
    return f'<th>{s}</th>'


def wrap_td(s):
    """Wrap `s` in an HTML table-data cell."""
    return f'<td>{s}</td>'

def table_from_table_rows(rows_str: str) -> List[List[str]]:
    """Parse a newline-separated ``key || value`` string into ``[key, value]`` rows.

    Every returned row has exactly two cells:
      * blank lines (e.g. from a trailing newline) are skipped;
      * only the first ``||`` separates key from value, so a value that itself
        contains ``||`` is kept whole;
      * a line without ``||`` yields an empty value.

    Args:
        rows_str: the raw table text, one ``key || value`` pair per line.

    Returns:
        A list of two-element lists with surrounding whitespace stripped.
    """
    rows = []
    for line in rows_str.split('\n'):
        if not line.strip():
            continue
        key, _, value = line.partition('||')
        rows.append([key.strip(), value.strip()])
    return rows

def make_prof_html(profile: str) -> str:
    """Render a ``key || value`` profile string as an HTML table.

    Keys and values are HTML-escaped before being placed in the markup, so
    characters such as ``<`` or ``&`` in the profile text are shown literally
    instead of being interpreted as HTML.

    Args:
        profile: raw profile text, parsed with `table_from_table_rows`.

    Returns:
        An HTML ``<table>`` string with one row per profile field: the key in a
        bold header cell and the value in a data cell.
    """
    parts = ['<table style="border: 1px solid black"><tbody>']
    for rkey, rval in table_from_table_rows(profile):
        parts.append('<tr>')
        parts.append(f'<th><b>{html.escape(rkey)}</b></th>')
        parts.append(f'<td>{html.escape(rval)}</td>')
        parts.append('</tr>')
    parts.append('</tbody></table>')
    return ''.join(parts)

def display_profile_by_index(idx: int):
    """Show the profile of the person at combined-dataset index `idx` as an HTML table."""
    person = get_person(idx)
    profile_table = make_prof_html(person['profile'])
    display(HTML(profile_table))

In [9]:
# Raw profile string for index 49: newline-separated `key || value` rows.
get_person(49)['profile']

'eye_color || brown\nname || kris janson\nbirth_name || kris tiffany maslog janson\nhometown || cebu city , cebu\ntitle || binibining pilipinas intercontinental 2014 ,\ncompetitions || binibining pilipinas 2014\nhair_color || brown\nbirth_date || 21 december 1989\narticle_title || kris janson\nheight || 1.73'

In [10]:
# The `target_text` field of the same entry (index 49), kept for comparison
# with the perturbed version of the text.
original_text = get_person(49)['target_text']
original_text

'kris tiffany maslog janson -lrb- born december 21 , 1989 -rrb- simply known as kris janson , is a filipino beauty pageant titleholder from cebu city , crowned binibining pilipinas intercontinental 2014 at the binibining pilipinas 2014 pageant held on march 30th , 2014 at the smart araneta coliseum , quezon city , philippines .\n'

In [11]:
# NOTE(review): this cell needs `bugs_A`, which is only created by the cell that
# splits `out_df` by which model was correct. The recorded output is a NameError
# because that cell had not been run in this kernel session; re-run the
# prediction and comparison cells first.
# Takes the first bugs_A row whose `i` is 49.
perturbed_text = bugs_A[bugs_A['i'] == 49].iloc[0]['perturbed_text']
perturbed_text

NameError: name 'bugs_A' is not defined

In [12]:
# Render the profile for index 49 as an HTML table.
display_profile_by_index(49)

0,1
eye_color,brown
name,kris janson
birth_name,kris tiffany maslog janson
hometown,"cebu city , cebu"
title,"binibining pilipinas intercontinental 2014 ,"
competitions,binibining pilipinas 2014
hair_color,brown
birth_date,21 december 1989
article_title,kris janson
height,1.73
