### Evaluating deidentified data

After training a new set of models I deidentified a small amount of data (k=1, n=100, b=2). Now i'm going to evaluate it.

In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from dataloader import WikipediaDataModule

import os

num_cpus = os.cpu_count()
dm = WikipediaDataModule(
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    max_seq_length=128,
    dataset_name='wiki_bio',
    dataset_train_split='train[:1]', # not used
    dataset_val_split='val[:20%]',
    dataset_version='1.2.0',
    word_dropout_ratio=0.0,
    word_dropout_perc=0.0,
    num_workers=1,
    train_batch_size=64,
    eval_batch_size=64
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 1 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:1]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split val[:20%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_train__1__1.2.0__wiki.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_val__20___1.2.0__wiki.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bio_val__20___1.2.0__lexical_redacted.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-wiki_bi

In [2]:
import pandas as pd

### Load pre-generated redacted data from various models.

Models are explained in `../model_cfg.py`.

In [13]:
import glob
import re

import pandas as pd


adv_df = None
for model_name in ['model_4', 'model_5', 'model_6', 'model_7', 'model_8', 'model_9', 'model_2_1', 'model_2_2', 'model_2_3', 'model_2_4', 'model_2_5', 'model_3_1']:
    csv_filenames = glob.glob(f'../adv_csvs/{model_name}*/results_1_*0.csv')
    print(model_name, csv_filenames)
    for filename in csv_filenames:
        df = pd.read_csv(filename)
        df['model_name'] = re.search(r'adv_csvs/(model_\d.*)/.+.csv', filename).group(1)
        df['i'] = df.index
        
        df = df[df['result_type'] == 'Successful'] # Take only successful 'attacks', i.e. ones that actually fooled this model.
        
        mini_df = df[['perturbed_text', 'model_name', 'i']]
        
        mini_df = mini_df.iloc[:100]
        
        if adv_df is None:
            adv_df = mini_df
        else:
            adv_df = pd.concat((adv_df, mini_df), axis=0)

model_4 ['../adv_csvs/model_4/results_1_100.csv']
model_5 ['../adv_csvs/model_5/results_1_100.csv']
model_6 ['../adv_csvs/model_6/results_1_100.csv']
model_7 []
model_8 ['../adv_csvs/model_8_1day/results_1_1000.csv', '../adv_csvs/model_8_ls0.1/results_1_1000.csv', '../adv_csvs/model_8_ls0.01/results_1_1000.csv']
model_9 ['../adv_csvs/model_9_ls0.1/results_1_1000.csv', '../adv_csvs/model_9_ls0.01/results_1_1000.csv', '../adv_csvs/model_9_ls0.05/results_1_1000.csv']
model_2_1 ['../adv_csvs/model_2_1/results_1_100.csv']
model_2_2 ['../adv_csvs/model_2_2/results_1_100.csv']
model_2_3 ['../adv_csvs/model_2_3/results_1_100.csv']
model_2_4 ['../adv_csvs/model_2_4/results_1_100.csv']
model_2_5 ['../adv_csvs/model_2_5/results_1_100.csv']
model_3_1 ['../adv_csvs/model_3_1/results_1_30.csv']


In [14]:
adv_df.groupby('model_name').count()

Unnamed: 0_level_0,perturbed_text,i
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
model_2_1,100,100
model_2_2,100,100
model_2_3,98,98
model_2_4,100,100
model_2_5,100,100
model_3_1,21,21
model_4,100,100
model_5,98,98
model_6,100,100
model_8_1day,100,100


### Get baseline redacted data

Redacted via NER and Lexical redaction.

In [15]:
mini_val_dataset = dm.val_dataset[:100]
ner_df = pd.DataFrame(
    columns=['perturbed_text'],
    data=mini_val_dataset['document_redact_ner']
)
ner_df['model_name'] = 'named_entity'
ner_df['i'] = ner_df.index
       
lex_df = pd.DataFrame(
    columns=['perturbed_text'],
    data=mini_val_dataset['document_redact_lexical']
)
lex_df['model_name'] = 'lexical'
lex_df['i'] = lex_df.index

baseline_df = pd.concat((lex_df, ner_df), axis=0)

In [16]:
baseline_df.head()

Unnamed: 0,perturbed_text,model_name,i
0,<mask> <mask> <mask> <mask> <mask> ( also know...,lexical,0
1,<mask> <mask> is a male former table tennis pl...,lexical,1
2,<mask> <mask> ( born <mask> <mask> <mask> ) is...,lexical,2
3,"<mask> <mask> , ( born <mask> <mask> , <mask> ...",lexical,3
4,<mask> l. <mask> is a former <mask> member of ...,lexical,4


In [17]:
full_df = pd.concat((adv_df, baseline_df), axis=0)
full_df['model_name'].value_counts()

model_4           100
model_9_ls0.01    100
lexical           100
model_2_5         100
model_2_4         100
model_2_2         100
model_2_1         100
model_9_ls0.05    100
model_9_ls0.1     100
model_8_ls0.01    100
model_8_ls0.1     100
model_8_1day      100
model_6           100
named_entity      100
model_5            98
model_2_3          98
model_3_1          21
Name: model_name, dtype: int64

In [18]:
# this line puts newlines back
full_df['perturbed_text'] = full_df['perturbed_text'].apply(lambda s: s.replace('<SPLIT>', '\n'))

# this line replaces BERT-style masks (from PMLM) with roberta-style ones, so we can
# count them in a single command
full_df['perturbed_text'] = full_df['perturbed_text'].apply(lambda s: s.replace('[MASK]', '<mask>'))

### Truncating

Hugely important step that was missing in the prior analysis!

In [19]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('roberta-base')

In [20]:
def truncate_text(text: str, max_length=128) -> str:
    input_ids = tokenizer(text, truncation=True, max_length=128)['input_ids']
    reconstructed_text = (
        tokenizer
            .decode(input_ids)
            .replace('<mask>', ' <mask> ')
            .replace('  <mask>', ' <mask>')
            .replace('<mask>  ', '<mask> ')
            .replace('<s>', '')
            .replace('</s>', '')
            .strip()
    )
    return reconstructed_text

# truncate_text(sample_long_text)
sample_long_text = full_df['perturbed_text'].iloc[14]
print(sample_long_text)
print()
print(truncate_text(sample_long_text))

<mask> <mask> (born 4 <mask> <mask>) is a danish professional football midfielder , who currently plays for danish 1st division side <mask> boldklub .
<mask> began playing football in kolt-hasselager if , where he was picked for agf , where he got his other footballing education .
he was part of the year ' 88 , who won in the junior league , like michael lumb , frederik krabbe , michael vester , niels kristensen , morten beck andersen and anders syberg , who all had the onset of agf 's 1 .
hold .
in the autumn of 2009 he was loaned to næstved bk , and just before winter transfer window end he switched permanently to the club .
in 2010 he changed to fc fredericia , where he played until 2012 when he got vendsyssel ff as a new club in january 2015 he was given at his own request that he want to terminated his contract with the vendsyssel ff .
on 6 february 2015 he signed a two-year contract with lyngby boldklub


<mask> <mask> (born 4 <mask> <mask> ) is a danish professional football mid

In [21]:
full_df['perturbed_text_truncated'] = full_df['perturbed_text'].apply(truncate_text)

### Measuring utility

Unit 1: number of redacted words.

In [22]:
def count_masks(s):
    return s.count('<mask>')

In [23]:
full_df['num_masks'] = full_df.apply(lambda s: count_masks(s['perturbed_text_truncated']), axis=1)
full_df.groupby('model_name').mean()['num_masks']

model_name
lexical           16.540000
model_2_1          6.970000
model_2_2          8.960000
model_2_3         13.632653
model_2_4         15.270000
model_2_5         15.090000
model_3_1          0.000000
model_4            9.450000
model_5            9.887755
model_6            3.540000
model_8_1day      14.060000
model_8_ls0.01    12.930000
model_8_ls0.1     12.960000
model_9_ls0.01    10.510000
model_9_ls0.05     9.250000
model_9_ls0.1      8.570000
named_entity      13.940000
Name: num_masks, dtype: float64

Unit 2: compressed size.

In [24]:
import zlib

def count_compressed_bytes(s: str) -> int:
    return len(zlib.compress(s.encode()))

teststr = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus
pretium justo eget elit eleifend, et dignissim quam eleifend. Nam vehicula nisl
posuere velit volutpat, vitae scelerisque nisl imperdiet. Phasellus dignissim,
dolor amet."""

count_compressed_bytes(teststr)

157

In [25]:
original_text_truncated = [truncate_text(d) for d in mini_val_dataset['document']]

In [26]:
original_total_bytes = count_compressed_bytes('\n'.join(original_text_truncated))

1 - full_df.groupby('model_name').apply(lambda s: count_compressed_bytes('\n'.join(s['perturbed_text_truncated']))) / original_total_bytes

model_name
lexical           0.152822
model_2_1         0.076091
model_2_2         0.092035
model_2_3         0.129618
model_2_4         0.147626
model_2_5         0.150189
model_3_1         0.874724
model_4           0.097302
model_5           0.107623
model_6           0.040074
model_8_1day      0.121646
model_8_ls0.01    0.105559
model_8_ls0.1     0.115951
model_9_ls0.01    0.091608
model_9_ls0.05    0.089900
model_9_ls0.1     0.083707
named_entity      0.125845
dtype: float64

### Reidentification rate (privacy metric)

In [27]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
tqdm.pandas()


def get_words_from_doc(s: List[str]) -> List[str]:
    words = s.split()
    return [w for w in words if not w in eng_stopwords]

In [28]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
tqdm.pandas()


def get_words_from_doc(s: List[str]) -> List[str]:
    words = s.split()
    return [w for w in words if not w in eng_stopwords]

import datasets

split = 'val[:20%]'
prof_data = datasets.load_dataset('wiki_bio', split=split, version='1.2.0')

def make_table_str(ex):
    ex['table_str'] = (
        ' '.join(ex['input_text']['table']['column_header'] + ex['input_text']['table']['content'])
    )
    return ex

prof_data = prof_data.map(make_table_str)
profile_corpus = prof_data['table_str']

tokenized_profile_corpus = [
    get_words_from_doc(prof) for prof in profile_corpus
]

bm25 = BM25Okapi(tokenized_profile_corpus)

Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-ba6837fc22371371.arrow


In [29]:
def get_top_k(ex):
    query = ex["perturbed_text_truncated"].split()
    top_k = bm25.get_scores(query).argsort()[::-1]
    ex["correct_idx"] = top_k.tolist().index(ex["i"])
    ex["is_correct"] = 1 if top_k[0] == ex["i"] else 0
    return ex

In [31]:
full_df.head()

Unnamed: 0,perturbed_text,model_name,i,perturbed_text_truncated,num_masks
0,pope <mask> <mask> <mask> alexandria (also kno...,model_4,0,pope <mask> <mask> <mask> alexandria (also kno...,9
1,<mask> <mask> is a male former <mask> <mask> p...,model_4,1,<mask> <mask> is a male former <mask> <mask> p...,6
2,<mask> <mask> (born 30 <mask> <mask>) is a tur...,model_4,2,<mask> <mask> (born 30 <mask> <mask> ) is a tu...,6
3,"<mask> <mask> , (born march 14 , <mask>) is a ...",model_4,3,"<mask> <mask> , (born march 14, <mask> ) is a ...",4
4,<mask> <mask>. <mask> is a former democratic m...,model_4,4,<mask> <mask> . <mask> is a former democratic ...,7


In [52]:
import re

def fix_text(s):
    return re.sub('<<font color = \w+>mask__\d+</font>>', '<mask>', s)

model_3_1_df = full_df[full_df['model_name'] == 'model_3_1']
model_3_1_df['perturbed_text'] = model_3_1_df['perturbed_text'].map(fix_text)
model_3_1_df['perturbed_text_truncated'] = model_3_1_df['perturbed_text_truncated'].map(fix_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_3_1_df['perturbed_text'] = model_3_1_df['perturbed_text'].map(fix_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_3_1_df['perturbed_text_truncated'] = model_3_1_df['perturbed_text_truncated'].map(fix_text)


In [57]:
num_proc = min(8, len(os.sched_getaffinity(0)))
full_df = full_df.progress_apply(get_top_k, axis=1)
full_df.groupby('model_name').mean()['is_correct'] * 100

  0%|          | 0/1617 [00:00<?, ?it/s]

model_name
lexical            0.000000
model_2_1         50.000000
model_2_2         48.000000
model_2_3         35.714286
model_2_4         29.000000
model_2_5         33.000000
model_3_1          9.523810
model_4           33.000000
model_5           34.693878
model_6           65.000000
model_8_1day      26.000000
model_8_ls0.01    37.000000
model_8_ls0.1     31.000000
model_9_ls0.01    48.000000
model_9_ls0.05    46.000000
model_9_ls0.1     50.000000
named_entity      66.000000
Name: is_correct, dtype: float64

In [53]:
num_proc = min(8, len(os.sched_getaffinity(0)))
model_3_1_df = model_3_1_df.progress_apply(get_top_k, axis=1)
print(model_3_1_df["is_correct"].mean())

  0%|          | 0/21 [00:00<?, ?it/s]

0.2857142857142857


In [54]:
model_3_1_df.groupby('model_name').mean()['is_correct'] * 100

model_name
model_3_1    28.571429
Name: is_correct, dtype: float64

In [56]:
from IPython.display import display, HTML

# model_3_1_full_df = pd.read_csv('../adv_csvs/model_3_1/results_1_30')

display(HTML(model_3_1_df[['perturbed_text_truncated', 'is_correct']].to_html(escape=True)))

Unnamed: 0,perturbed_text_truncated,is_correct
0,"pope <mask> <mask> <mask> alexandria ( also known as khail iii ) was the coptic pope of alexandria and patriarch of the see of st. mark ( <mask> -- <mask> ).\nin 882, the governor of egypt, ahmad ibn tulun, forced khail to pay heavy contributions, forcing him to sell",1
1,<mask> jun is a <mask> former <mask> tennis player <mask> <mask>.,0
2,<mask> Öztürk ( born <mask> november 1977 ) is a turkish professional footballer.\nhe currently plays as a <mask> for <mask> <mask>.,1
3,"<mask> <mask>, ( born march <mask>, <mask> ) is a professional squash player who represents france.\nshe reached a career-high world ranking of world no. <mask> in july 2015.",0
4,<mask> l. <mask> is a former democratic member of the <mask> house of representatives.\nhe was born in <mask> to michael and angela pitullio <mask>.,0
6,"<mask> demonte <mask> ( <mask> august 5, 1981 ), nicknamed <mask> the <mask> storm '', is an <mask> professional <mask> <mask> <mask>",0
7,"jim <mask> ( born james <mask> <mask> on <mask> <mask> <mask> ) is a <mask> musician and <mask>, best known as the singer of indie punk band <<font color = green>mask__",0
8,"<mask> <mask> ( born <mask> <mask>, <mask> <mask> <mask>, <mask> ) is a former professional american football defensive <mask> <<font color = cyan",0
9,blessed <mask> of <mask> t.o.s.d. ( ) was a catholic visionary and anchoress from <mask> ( kotor ).\nshe was a teenage convert from orthodoxy of serbian descent from montenegro ( zeta ).\nshe became a dominican tertiary and was posthumously venerated as a saint in kotor.\nshe was later beatified in 1934.,1
10,"<mask> <mask> <mask> ( born april <mask>, <mask> <mask> <mask> <mask> ) is a <mask> actress and model.",0


In [None]:
for _,ex in full_df[(full_df['model_name'] == 'model_8_1day') & (full_df['is_correct'] == 1)].head(n=10)[['i', 'perturbed_text_truncated']].iterrows():
    print(ex['i'])
    print(ex['perturbed_text_truncated'])
    print('\n')


In [64]:
### Testing bm25 accuracy on some regular stuff

mini_val_dataset = dm.val_dataset[:100]

In [73]:
import numpy as np
def get_bm25_guess_idx(doc: str) -> int:
    query = re.sub(r'[^\w|\s]', ' ',doc).split(' ')
    scores = bm25.get_scores(query)
    k = scores.argmax()
    # print((scores == scores[k]).sum()-1, "other equal ppl from", len(scores), "total")
    return k

doc_bm25_guess_idxs = np.array([get_bm25_guess_idx(doc) for doc in mini_val_dataset['document']])

In [74]:
(doc_bm25_guess_idxs == np.arange(100)).mean()

0.99

In [75]:
doc_lex_bm25_guess_idxs = np.array([get_bm25_guess_idx(doc) for doc in mini_val_dataset['document_redact_lexical']])

In [76]:
(doc_lex_bm25_guess_idxs == np.arange(100)).mean()

0.0