### Collecting intermediate results

I trained three reidentification models. They are not fully trained yet (hopefully I can train much better ones soon!), but I still evaluate them here against two deidentification baselines: lexical redaction and named-entity-recognition (NER) redaction.

In [5]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

from dataloader import WikipediaDataModule

import os

# The CPU count is recorded here; the data module below is still built with a
# fixed `num_workers=1`.
num_cpus = os.cpu_count()

dm = WikipediaDataModule(
    # Model identifiers for the document side and the profile side.
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    max_seq_length=128,
    # Dataset selection.
    dataset_name="wiki_bio",
    dataset_version="1.2.0",
    dataset_train_split="train[:1]",  # not used
    dataset_val_split="val[:1000]",
    # Both word-dropout settings are zero.
    word_dropout_ratio=0.0,
    word_dropout_perc=0.0,
    # Loader settings.
    train_batch_size=64,
    eval_batch_size=64,
    num_workers=1,
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 1 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:1]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split val[:1000]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


  0%|          | 0/1 [00:00<?, ?ex/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
# Inspect the first validation example to see which fields a record carries.
dm.val_dataset[0]

{'input_text': {'table': {'column_header': ['successor',
    'name',
    'residence',
    'ended',
    'feast_day',
    'title',
    'enthroned',
    'predecessor',
    'death_date',
    'buried',
    'birth_place',
    'nationality',
    'religion',
    'article_title',
    'type'],
   'row_number': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
   'content': ['gabriel i',
    'michael iii of alexandria',
    "saint mark 's church",
    '16 march 907',
    '16 -rrb- march -lrb- 20 baramhat in the coptic calendar',
    '56th of st. mark pope of alexandria & patriarch of the see',
    '25 april 880',
    'shenouda i',
    '16 march 907',
    'monastery of saint macarius the great',
    'egypt',
    'egyptian',
    'coptic orthodox christian',
    'pope michael iii of alexandria\n',
    'pope']},
  'context': 'pope michael iii of alexandria\n'},
 'target_text': 'pope michael iii of alexandria -lrb- also known as khail iii -rrb- was the coptic pope of alexandria and patriarch of the see o

In [1]:
import pandas as pd

### Load pre-generated redacted data from various models.

Models are explained in `../model_cfg.py`.

In [70]:
import glob
import re


def extract_k_from_filename(filename: str) -> str:
    """Return the ``k`` component of a ``..._<k>_<n>.csv`` results filename.

    Args:
        filename: Path to a results CSV, e.g. ``results_10_100.csv``.

    Returns:
        The digits of the first of the two trailing numeric fields, as a string.

    Raises:
        ValueError: If the filename does not end in ``_<digits>_<digits>.csv``.
    """
    # The dot is escaped so that only a literal ".csv" suffix matches.
    match = re.search(r'_(\d+)_\d+\.csv$', filename)
    if match is None:
        raise ValueError(f'cannot parse k from results filename: {filename!r}')
    return match.group(1)


# Collect one small frame per results file and concatenate once at the end,
# instead of re-concatenating the growing frame on every iteration.
adv_frames = []
for model_name in ['model_4', 'model_5', 'model_6']:
    csv_filenames = glob.glob(f'../adv_csvs/{model_name}/*0.csv')
    print(model_name, csv_filenames)
    for filename in csv_filenames:
        df = pd.read_csv(filename)
        k = extract_k_from_filename(filename)
        # `i` is the row position inside the CSV; later cells use it as the
        # document index.
        adv_frames.append(
            df[['perturbed_text']].assign(
                model_name=model_name + '__k' + str(k),
                i=df.index,
            )
        )

# Stays None when no results file was found, as before.
adv_df = pd.concat(adv_frames, axis=0) if adv_frames else None

model_4 ['../adv_csvs/model_4/results_1_100.csv', '../adv_csvs/model_4/results_10_100.csv', '../adv_csvs/model_4/results_1000_100.csv']
model_5 ['../adv_csvs/model_5/results_1_100.csv', '../adv_csvs/model_5/results_10_100.csv']
model_6 ['../adv_csvs/model_6/results_1_100.csv', '../adv_csvs/model_6/results_10_100.csv', '../adv_csvs/model_6/results_1000_100.csv']


In [71]:
# Preview the first rows of the combined per-model redaction frame.
adv_df.head()

Unnamed: 0,perturbed_text,model_name,i
0,pope <mask> <mask> <mask> alexandria (also kno...,model_4__k1,0
1,<mask> <mask> is a male former <mask> <mask> p...,model_4__k1,1
2,<mask> <mask> (born 30 <mask> <mask>) is a tur...,model_4__k1,2
3,"<mask> <mask> , (born march 14 , <mask>) is a ...",model_4__k1,3
4,<mask> <mask>. <mask> is a former democratic m...,model_4__k1,4


### Get baseline redacted data

The baselines redact each document in two ways: via named-entity recognition (NER) and via lexical redaction.

In [72]:
def _redaction_frame(texts, model_name):
    """Build a (perturbed_text, model_name, i) frame for one baseline.

    `i` is the row position of each text, taken from the frame's own index.
    """
    frame = pd.DataFrame(data=texts, columns=['perturbed_text'])
    frame['model_name'] = model_name
    frame['i'] = frame.index
    return frame


# Only the first 100 validation examples are used for the baselines.
mini_val_dataset = dm.val_dataset[:100]

lex_df = _redaction_frame(mini_val_dataset['document_redact_lexical'], 'lexical')
ner_df = _redaction_frame(mini_val_dataset['document_redact_ner'], 'named_entity')

# Lexical rows first, then NER rows.
baseline_df = pd.concat([lex_df, ner_df], axis=0)

In [73]:
# Preview the stacked baseline frame (lexical rows were concatenated first).
baseline_df.head()

Unnamed: 0,perturbed_text,model_name,i
0,<mask> <mask> <mask> of <mask> ( also known as...,lexical,0
1,<mask> <mask> is a male former table tennis pl...,lexical,1
2,<mask> <mask> ( born <mask><mask> <mask> <mask...,lexical,2
3,<mask> <mask> <mask> ( born <mask> <mask>4 <ma...,lexical,3
4,<mask> <mask> <mask> is a former <mask> member...,lexical,4


In [74]:
# Stack the model redactions on top of the baselines, then count rows per
# redaction method as a sanity check.
full_df = pd.concat((adv_df, baseline_df), axis=0)
full_df['model_name'].value_counts()

model_4__k1       100
model_4__k10      100
model_4__k1000    100
model_5__k1       100
model_5__k10      100
model_6__k1       100
model_6__k10      100
model_6__k1000    100
lexical           100
named_entity      100
Name: model_name, dtype: int64

In [75]:
# Count how many redaction methods cover each document index `i`.
full_df['i'].value_counts()

0     10
63    10
73    10
72    10
71    10
      ..
30    10
29    10
28    10
27    10
99    10
Name: i, Length: 100, dtype: int64

### Measuring utility

Metric 1: average number of redacted words (`<mask>` tokens) per document.

In [90]:
def count_masks(s: str, mask_token: str = '<mask>') -> int:
    """Count non-overlapping occurrences of the mask token in a string.

    Args:
        s: Text to scan.
        mask_token: Literal token marking a redacted word. Defaults to
            ``'<mask>'``, the token this notebook's redacted texts use.

    Returns:
        Number of times ``mask_token`` occurs in ``s``.

    Raises:
        ValueError: If ``mask_token`` is empty (``str.count('')`` would return
            ``len(s) + 1``, which is not a meaningful mask count).
    """
    if not mask_token:
        raise ValueError('mask_token must be a non-empty string')
    return s.count(mask_token)

In [91]:
# Spot-check the counter on the first row of `full_df`.
count_masks(full_df.iloc[0]['perturbed_text'])

9

In [96]:
# Average number of mask tokens per document for each redaction method.
# Taking the mean of per-document counts avoids hard-coding the number of
# documents per method, and selecting the text column first keeps the grouping
# column out of the applied function.
full_df.groupby('model_name')['perturbed_text'].apply(
    lambda texts: texts.map(count_masks).mean()
).rename('masks_per_document')

model_name
lexical           25.40
model_4__k1        9.45
model_4__k10      14.84
model_4__k1000    40.64
model_5__k1        8.25
model_5__k10      29.02
model_6__k1        3.54
model_6__k10       8.35
model_6__k1000    22.44
named_entity      19.34
dtype: float64

Metric 2: relative reduction in compressed size compared with the original documents.

In [89]:
import zlib

def count_compressed_bytes(s: str) -> int:
    """Return the size in bytes of ``s`` after zlib compression.

    The string is encoded with ``str.encode()``'s default encoding and then
    compressed with ``zlib.compress`` at its default settings.
    """
    raw = s.encode()
    compressed = zlib.compress(raw)
    return len(compressed)

# Small fixed sample used to try out `count_compressed_bytes` interactively.
teststr = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus
pretium justo eget elit eleifend, et dignissim quam eleifend. Nam vehicula nisl
posuere velit volutpat, vitae scelerisque nisl imperdiet. Phasellus dignissim,
dolor amet."""

count_compressed_bytes(teststr)

157

In [98]:
# `document` field of the same validation slice (presumably the unredacted
# text — confirm); it is the size reference for the compression metric.
original_text = mini_val_dataset['document']

In [101]:
# Compressed size of all original documents joined by newlines.
original_total_bytes = count_compressed_bytes('\n'.join(original_text))

# Compressed size of each method's redacted documents, joined the same way.
# Selecting the text column before `apply` keeps the grouping column out of
# the applied function.
compressed_bytes_by_model = full_df.groupby('model_name')['perturbed_text'].apply(
    lambda texts: count_compressed_bytes('\n'.join(texts))
)

# Fraction of the original compressed size that each method removed.
(1 - compressed_bytes_by_model / original_total_bytes).rename('compressed_size_reduction')

model_name
lexical           0.128123
model_4__k1       0.074333
model_4__k10      0.115889
model_4__k1000    0.311295
model_5__k1       0.063548
model_5__k10      0.202129
model_6__k1       0.026288
model_6__k10      0.064575
model_6__k1000    0.169165
named_entity      0.146099
dtype: float64

### Reidentification rate (privacy metric)

In [104]:
from typing import List

from nltk.corpus import stopwords
from rank_bm25 import BM25Okapi

eng_stopwords = stopwords.words('english')
from tqdm.auto import tqdm
tqdm.pandas()


def get_words_from_doc(s: str) -> List[str]:
    """Split a document on whitespace and drop English stopwords.

    Args:
        s: Document text. (The previous annotation said ``List[str]``, but the
            body calls ``s.split()``, so a single string is what is expected.)

    Returns:
        The whitespace-delimited words of ``s``, in order, excluding any word
        that appears verbatim in the module-level ``eng_stopwords``. The
        comparison is exact, so casing is not normalized.
    """
    # `eng_stopwords` is a list; a set makes each membership test O(1).
    stopword_set = set(eng_stopwords)
    return [w for w in s.split() if w not in stopword_set]

In [106]:
import datasets

# Load the raw profile tables that BM25 will search over.
# NOTE(review): `get_top_k` compares positions in this corpus against each
# row's `i`, so this split presumably has to start at the same validation
# example, in the same order, as the data module's split — confirm.
split = 'val[:20%]'
prof_data = datasets.load_dataset('wiki_bio', split=split, version='1.2.0')

def make_table_str(ex):
    """Attach a flat ``table_str`` field built from the example's table.

    All column headers, followed by all cell contents, are joined with single
    spaces. The string is stored under ``ex['table_str']`` and the same ``ex``
    object is returned.
    """
    table = ex['input_text']['table']
    fields = table['column_header'] + table['content']
    ex['table_str'] = ' '.join(fields)
    return ex

# Add `table_str` to every example, then pull that column out as the corpus
# that gets tokenized and indexed below.
prof_data = prof_data.map(make_table_str)
profile_corpus = prof_data['table_str']

Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


  0%|          | 0/14566 [00:00<?, ?ex/s]

In [107]:
# Tokenize every profile string with the stopword-filtering splitter.
tokenized_profile_corpus = list(map(get_words_from_doc, profile_corpus))

In [108]:
# Index the tokenized profiles for BM25 scoring.
bm25 = BM25Okapi(tokenized_profile_corpus)

In [109]:
def get_top_k(ex):
    """Rank all profiles against one redacted document and record the result.

    Args:
        ex: A row-like mapping with ``"perturbed_text"`` (the redacted
            document) and ``"i"`` (the position of its true profile in the
            corpus indexed by the module-level ``bm25``).

    Returns:
        The same ``ex``, with two fields added:
        ``"correct_idx"`` — zero-based rank of the true profile in the
        descending-score ordering, and ``"is_correct"`` — 1 if the true
        profile is ranked first, else 0.

    Raises:
        ValueError: If ``ex["i"]`` is not a valid position in the corpus.
    """
    # Tokenize the query with the same helper used for the profile corpus.
    # The stopwords it drops were already removed from every indexed profile.
    query = get_words_from_doc(ex["perturbed_text"])
    # Full ordering of profile positions, best score first.
    ranking = bm25.get_scores(query).argsort()[::-1]
    ex["correct_idx"] = ranking.tolist().index(ex["i"])
    ex["is_correct"] = 1 if ranking[0] == ex["i"] else 0
    return ex
    
# NOTE(review): `num_proc` is assigned but not used by the apply below.
num_proc = min(8, len(os.sched_getaffinity(0)))
# Score every row (with a progress bar) and attach `correct_idx` / `is_correct`.
full_df = full_df.progress_apply(get_top_k, axis=1)
# Overall top-1 reidentification rate across all models and baselines.
print(full_df["is_correct"].mean())

  0%|          | 0/1000 [00:00<?, ?it/s]

0.4


In [113]:
# Top-1 reidentification rate per redaction method, as a percentage.
# The column is selected before aggregating so the free-text `perturbed_text`
# column is never averaged; recent pandas raises on that instead of silently
# dropping it.
full_df.groupby('model_name')['is_correct'].mean() * 100

model_name
lexical            0.0
model_4__k1       42.0
model_4__k10      33.0
model_4__k1000    18.0
model_5__k1       45.0
model_5__k10      24.0
model_6__k1       72.0
model_6__k10      62.0
model_6__k1000    39.0
named_entity      65.0
Name: is_correct, dtype: float64

### Checking distribution of word counts

In [124]:
# Whitespace-delimited word count of each document in the 100-example slice.
num_words_per_doc = pd.Series([len(t.split()) for t in mini_val_dataset['target_text']])

In [127]:
import transformers
# Same checkpoint name that was passed as `document_model_name_or_path` to the
# data module.
tokenizer = transformers.AutoTokenizer.from_pretrained('roberta-base')

In [138]:
def num_tokens_roberta_tokenizer(text: str) -> int:
    """Return how many token ids the module-level ``tokenizer`` emits for ``text``.

    The count is the length of whatever ``tokenizer.encode(text)`` returns with
    its default arguments.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Token count for every document in the slice.
num_tokens_per_doc = pd.Series(map(num_tokens_roberta_tokenizer, mini_val_dataset['target_text']))

In [139]:
# Show the per-document token counts.
num_tokens_per_doc

0     123
1      17
2      47
3      50
4      41
     ... 
95     99
96    129
97    138
98    123
99    226
Length: 100, dtype: int64

In [140]:
# Average token count per document.
num_tokens_per_doc.mean()

144.57

In [142]:
# Number of documents longer than 128 tokens, the `max_seq_length` given to
# the data module.
(num_tokens_per_doc > 128).sum()

41