### Evaluating deidentified data

After about a month of improving the model I deidentified some new data and want to see how it compares to the old data.

In [None]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from dataloader import WikipediaDataModule

import os

num_cpus = os.cpu_count()
dm = WikipediaDataModule(
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    max_seq_length=128,
    dataset_name='wiki_bio',
    dataset_train_split='train[:1]', # not used
    dataset_val_split='val[:20%]',
    dataset_version='1.2.0',
    word_dropout_ratio=0.0,
    word_dropout_perc=0.0,
    num_workers=1,
    train_batch_size=64,
    eval_batch_size=64
)
dm.setup("fit")

In [2]:
import pandas as pd

### Load pre-generated redacted data from various models.

Models are explained in `../model_cfg.py`.

In [85]:
import glob
import re


adv_df = None
for model_name in ['model_4', 'model_5', 'model_6', 'model_7', 'model_8', 'model_9']:
    csv_filenames = glob.glob(f'../adv_csvs/{model_name}*/results_1_*0.csv')
    print(model_name, csv_filenames)
    for filename in csv_filenames:
        df = pd.read_csv(filename)
        df['model_name'] = re.search(r'adv_csvs/(model_\d.*)/.+.csv', filename).group(1)
        df['i'] = df.index
        mini_df = df[['perturbed_text', 'model_name', 'i']]
        
        mini_df = mini_df.iloc[:100]
        
        if adv_df is None:
            adv_df = mini_df
        else:
            adv_df = pd.concat((adv_df, mini_df), axis=0)

model_4 ['../adv_csvs/model_4/results_1_100.csv']
model_5 ['../adv_csvs/model_5/results_1_100.csv']
model_6 ['../adv_csvs/model_6/results_1_100.csv']
model_7 []
model_8 ['../adv_csvs/model_8_1day/results_1_1000.csv', '../adv_csvs/model_8_ls0.1/results_1_1000.csv', '../adv_csvs/model_8_ls0.01/results_1_1000.csv']
model_9 ['../adv_csvs/model_9_ls0.1/results_1_1000.csv', '../adv_csvs/model_9_ls0.01/results_1_1000.csv', '../adv_csvs/model_9_ls0.05/results_1_1000.csv']


In [86]:
adv_df.groupby('model_name').count()

Unnamed: 0_level_0,perturbed_text,i
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
model_4,100,100
model_5,100,100
model_6,100,100
model_8_1day,100,100
model_8_ls0.01,100,100
model_8_ls0.1,100,100
model_9_ls0.01,100,100
model_9_ls0.05,100,100
model_9_ls0.1,100,100


### Get baseline redacted data

Redacted via NER and Lexical redaction.

In [87]:
mini_val_dataset = dm.val_dataset[:100]
ner_df = pd.DataFrame(
    columns=['perturbed_text'],
    data=mini_val_dataset['document_redact_ner']
)
ner_df['model_name'] = 'named_entity'
ner_df['i'] = ner_df.index
       
lex_df = pd.DataFrame(
    columns=['perturbed_text'],
    data=mini_val_dataset['document_redact_lexical']
)
lex_df['model_name'] = 'lexical'
lex_df['i'] = lex_df.index

baseline_df = pd.concat((lex_df, ner_df), axis=0)

In [88]:
baseline_df.head()

Unnamed: 0,perturbed_text,model_name,i
0,<mask> <mask> <mask> <mask> <mask> ( also know...,lexical,0
1,<mask> <mask> is a male former table tennis pl...,lexical,1
2,<mask> <mask> ( born <mask> <mask> <mask> ) is...,lexical,2
3,"<mask> <mask> , ( born <mask> <mask> , <mask> ...",lexical,3
4,<mask> l. <mask> is a former <mask> member of ...,lexical,4


In [89]:
full_df = pd.concat((adv_df, baseline_df), axis=0)
full_df['model_name'].value_counts()

model_4           100
model_5           100
model_6           100
model_8_1day      100
model_8_ls0.1     100
model_8_ls0.01    100
model_9_ls0.1     100
model_9_ls0.01    100
model_9_ls0.05    100
lexical           100
named_entity      100
Name: model_name, dtype: int64

In [90]:
# this line puts newlines back
full_df['perturbed_text'] = full_df['perturbed_text'].apply(lambda s: s.replace('<SPLIT>', '\n'))

# this line replaces BERT-style masks (from PMLM) with roberta-style ones, so we can
# count them in a single command
full_df['perturbed_text'] = full_df['perturbed_text'].apply(lambda s: s.replace('[MASK]', '<mask>'))

### Truncating

Hugely important step that was missing in the prior analysis!

In [91]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('roberta-base')

In [92]:
def truncate_text(text: str, max_length=128) -> str:
    input_ids = tokenizer(text, truncation=True, max_length=128)['input_ids']
    reconstructed_text = (
        tokenizer
            .decode(input_ids)
            .replace('<mask>', ' <mask> ')
            .replace('  <mask>', ' <mask>')
            .replace('<mask>  ', '<mask> ')
            .replace('<s>', '')
            .replace('</s>', '')
            .strip()
    )
    return reconstructed_text

# truncate_text(sample_long_text)
sample_long_text = full_df['perturbed_text'].iloc[14]
print(sample_long_text)
print()
print(truncate_text(sample_long_text))

<mask> <mask> (born 4 <mask> <mask>) is a danish professional football midfielder , who currently plays for danish 1st division side <mask> boldklub .
<mask> began playing football in kolt-hasselager if , where he was picked for agf , where he got his other footballing education .
he was part of the year ' 88 , who won in the junior league , like michael lumb , frederik krabbe , michael vester , niels kristensen , morten beck andersen and anders syberg , who all had the onset of agf 's 1 .
hold .
in the autumn of 2009 he was loaned to næstved bk , and just before winter transfer window end he switched permanently to the club .
in 2010 he changed to fc fredericia , where he played until 2012 when he got vendsyssel ff as a new club in january 2015 he was given at his own request that he want to terminated his contract with the vendsyssel ff .
on 6 february 2015 he signed a two-year contract with lyngby boldklub


<mask> <mask> (born 4 <mask> <mask> ) is a danish professional football mid

In [93]:
full_df['perturbed_text_truncated'] = full_df['perturbed_text'].apply(truncate_text)

### Measuring utility

Unit 1: number of redacted words.

In [94]:
def count_masks(s):
    return s.count('<mask>')

In [95]:
full_df['num_masks'] = full_df.apply(lambda s: count_masks(s['perturbed_text_truncated']), axis=1)
full_df.groupby('model_name').mean()['num_masks']

model_name
lexical           16.51
model_4            9.45
model_5           10.14
model_6            3.54
model_8_1day      14.25
model_8_ls0.01    13.10
model_8_ls0.1     12.96
model_9_ls0.01    10.51
model_9_ls0.05     9.41
model_9_ls0.1      8.57
named_entity      13.94
Name: num_masks, dtype: float64

Unit 2: compressed size.

In [96]:
import zlib

def count_compressed_bytes(s: str) -> int:
    return len(zlib.compress(s.encode()))

teststr = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus
pretium justo eget elit eleifend, et dignissim quam eleifend. Nam vehicula nisl
posuere velit volutpat, vitae scelerisque nisl imperdiet. Phasellus dignissim,
dolor amet."""

count_compressed_bytes(teststr)

157

In [97]:
original_text_truncated = [truncate_text(d) for d in mini_val_dataset['document']]

In [98]:
original_total_bytes = count_compressed_bytes('\n'.join(original_text_truncated))

1 - full_df.groupby('model_name').apply(lambda s: count_compressed_bytes('\n'.join(s['perturbed_text_truncated']))) / original_total_bytes

model_name
lexical           0.154175
model_4           0.097302
model_5           0.105630
model_6           0.040074
model_8_1day      0.128408
model_8_ls0.01    0.112535
model_8_ls0.1     0.115951
model_9_ls0.01    0.091608
model_9_ls0.05    0.098014
model_9_ls0.1     0.083707
named_entity      0.125845
dtype: float64

### Reidentification rate (privacy metric)

In [99]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
tqdm.pandas()


def get_words_from_doc(s: List[str]) -> List[str]:
    words = s.split()
    return [w for w in words if not w in eng_stopwords]

In [111]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
tqdm.pandas()


def get_words_from_doc(s: List[str]) -> List[str]:
    words = s.split()
    return [w for w in words if not w in eng_stopwords]

import datasets

split = 'val[:20%]'
prof_data = datasets.load_dataset('wiki_bio', split=split, version='1.2.0')

def make_table_str(ex):
    ex['table_str'] = (
        ' '.join(ex['input_text']['table']['column_header'] + ex['input_text']['table']['content'])
    )
    return ex

prof_data = prof_data.map(make_table_str)
profile_corpus = prof_data['table_str']

tokenized_profile_corpus = [
    get_words_from_doc(prof) for prof in profile_corpus
]

bm25 = BM25Okapi(tokenized_profile_corpus)

Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-ba6837fc22371371.arrow


In [117]:
def get_top_k(ex):
    query = ex["perturbed_text_truncated"].split()
    top_k = bm25.get_scores(query).argsort()[::-1]
    ex["correct_idx"] = top_k.tolist().index(ex["i"])
    ex["is_correct"] = 1 if top_k[0] == ex["i"] else 0
    return ex

In [None]:
num_proc = min(8, len(os.sched_getaffinity(0)))
full_df = full_df.progress_apply(get_top_k, axis=1)
print(full_df["is_correct"].mean())

  0%|          | 0/1100 [00:00<?, ?it/s]

In [120]:
full_df.groupby('model_name').mean()['is_correct'] * 100

model_name
lexical            0.0
model_4           33.0
model_5           34.0
model_6           65.0
model_8_1day      26.0
model_8_ls0.01    37.0
model_8_ls0.1     31.0
model_9_ls0.01    48.0
model_9_ls0.05    46.0
model_9_ls0.1     50.0
named_entity      66.0
Name: is_correct, dtype: float64

In [130]:
full_df.columns

Index(['perturbed_text', 'model_name', 'i', 'perturbed_text_truncated',
       'num_masks', 'correct_idx', 'is_correct'],
      dtype='object')

In [141]:
for _,ex in full_df[(full_df['model_name'] == 'model_8_1day') & (full_df['is_correct'] == 1)].head(n=10)[['i', 'perturbed_text_truncated']].iterrows():
    print(ex['i'])
    print(ex['perturbed_text_truncated'])
    print('\n')


0
pope michael <mask> of alexandria ( also known as khail <mask> ) was the coptic pope of alexandria and <mask> <mask> the <mask> of st. mark ( <mask> -- <mask> ).
in <mask> , the governor of egypt, ahmad ibn tulun, forced khail to pay heavy contributions, forcing him to sell a church and some attached properties to the local jewish community.
this building was at one time believed to have later become the site of the cairo geniza.


7
<mask> <mask> ( born <mask> neil morrison on 22 <mask> <mask> ) is a <mask> musician and author, best known as the singer of indie <mask> band carter <mask> .


8
<mask> <mask> ( born <mask> <mask> , <mask> in emporia, <mask> ) is a former professional american football defensive <mask> for the seattle seahawks, san diego chargers, new england patriots, baltimore ravens, and san francisco 49ers of the national football league.


11
<mask> <mask> ( born <mask> <mask> , <mask> in <mask> , switzerland ) is a retired swiss professional ice hockey <mask> .
pl

In [142]:
bm25.get_scores('<mask> m. <mask> is an american politician, a democrat and <mask> member of the maryland house <mask> delegates.')

array([0.        , 0.        , 2.37250826, ..., 3.26690884, 1.46606554,
       5.446037  ])

In [148]:
import numpy as np
s = bm25.get_scores('<mask> m. <mask> is an american politician, a democrat and <mask> member of the maryland house <mask> delegates.'.split())
np.arange(len(s))[s == s.max()]

array([18])

In [149]:
prof_data[18]

{'input_text': {'table': {'column_header': ['term_end',
    'name',
    'order',
    'profession',
    'religion',
    'birth_place',
    'state_delegate',
    'birth_date',
    'article_title',
    'party',
    'term_start',
    'children',
    'predecessor'],
   'row_number': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   'content': ['2003',
    'darren m. swain',
    'maryland house of delegates',
    'administrator',
    'ame',
    'windsor , north carolina',
    'maryland',
    '06 may 1970',
    'darren swain\n',
    'democrat',
    'january , 2013 1999',
    'one son',
    'tiffany alston']},
  'context': 'darren swain\n'},
 'target_text': 'darren m. swain is an american politician , a democrat and a member of the maryland house of delegates .\n',
 'table_str': 'term_end name order profession religion birth_place state_delegate birth_date article_title party term_start children predecessor 2003 darren m. swain maryland house of delegates administrator ame windsor , north carolina m