# Checking accuracy with and without word dropout

In [1]:
import sys
sys.path.append('/home/jxm3/research/deidentification/unsupervised-deidentification')

In [2]:
from model import DocumentProfileMatchingTransformer

# Checkpoint file of the trained run whose accuracy this notebook measures.
checkpoint_path = "/home/jxm3/research/deidentification/unsupervised-deidentification/saves/deid-wikibio_deid_exp/okpvvffw_46/checkpoints/epoch=7-step=1823.ckpt"

# Rebuild the model from that checkpoint. The keyword arguments are forwarded
# to load_from_checkpoint as-is (presumably constructor hyperparameters --
# confirm against `model.py`).
model = DocumentProfileMatchingTransformer.load_from_checkpoint(
    checkpoint_path,
    base_folder="/home/jxm3/research/deidentification/unsupervised-deidentification",
    model_name_or_path='distilbert-base-uncased',
    dataset_name='wiki_bio',
    loss_fn='exact',
    num_neighbors=2048,
    num_workers=1,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized DocumentProfileMatchingTransformer with learning_rate = 0.0002


In [3]:
from datamodule import WikipediaDataModule
import os

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 1 so that min() below never compares an int with None.
num_cpus = os.cpu_count() or 1

# Data module providing the tokenizer and the validation split used below.
dm = WikipediaDataModule(
    model_name_or_path='distilbert-base-uncased',
    dataset_name='wiki_bio',
    num_workers=min(8, num_cpus),  # use at most 8 workers
    train_batch_size=64,
    eval_batch_size=64,
    max_seq_length=64,
    redaction_strategy="",
    base_folder="/home/jxm3/research/deidentification/unsupervised-deidentification",
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 8


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-5535f82839d9fec4.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-5b1c3941089b7f1b.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-8a9b289bc8e70b72.arrow
Loading cached processed dataset at 

## 2. Evaluate model normally

In [4]:
import textattack
import torch
import transformers

class MyModelWrapper(textattack.models.wrappers.ModelWrapper):
    """TextAttack wrapper that scores raw text against the model's validation profile embeddings.

    Calling the wrapper on a list of strings tokenizes them, embeds them with
    ``model.document_model`` followed by ``model.lower_dim_embed``, and returns
    for every text a softmax distribution over the rows of
    ``model.val_embeddings``.
    """
    model: DocumentProfileMatchingTransformer
    tokenizer: transformers.PreTrainedTokenizer
    profile_embeddings: torch.Tensor
    max_seq_length: int

    def __init__(self, model: DocumentProfileMatchingTransformer, tokenizer: transformers.PreTrainedTokenizer, max_seq_length: int = 64):
        """Store the model and tokenizer and snapshot ``model.val_embeddings`` as a tensor.

        Args:
            model: Model providing ``document_model``, ``lower_dim_embed`` and ``val_embeddings``.
            tokenizer: Tokenizer used to encode the input texts.
            max_seq_length: Maximum token length; longer inputs are truncated.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.profile_embeddings = torch.tensor(model.val_embeddings)
        self.max_seq_length = max_seq_length

    def to(self, device):
        """Move the model and the profile embeddings to ``device`` and return ``self``."""
        self.model.to(device)
        # Tensor.to() is NOT in-place (unlike Module.to()): the result must be
        # assigned back, otherwise the embeddings silently stay where they were.
        self.profile_embeddings = self.profile_embeddings.to(device)
        return self # so semantics `model = MyModelWrapper().to('cuda')` works properly

    def __call__(self, text_input_list, batch_size=32):
        """Return profile probabilities for every text in ``text_input_list``.

        Args:
            text_input_list: List of strings to score.
            batch_size: Currently unused; all inputs are processed in one forward pass.

        Returns:
            Tensor of shape ``(len(text_input_list), len(self.profile_embeddings))``
            whose rows are softmax distributions over the profiles.
        """
        model_device = next(self.model.parameters()).device
        tokenized_ids = self.tokenizer.batch_encode_plus(
            text_input_list,
            max_length=self.max_seq_length,
            padding=True,
            truncation=True
        )
        tokenized_ids = {k: torch.tensor(v).to(model_device) for k,v in tokenized_ids.items()}

        # TODO: implement batch size if we start running out of memory here.
        with torch.no_grad():
            document_embeddings = self.model.document_model(**tokenized_ids)
            # Keep only the hidden state at the first token position.
            document_embeddings = document_embeddings['last_hidden_state'][:, 0, :] # (batch, document_emb_dim)
            document_embeddings = self.model.lower_dim_embed(document_embeddings) # (batch, emb_dim)

        # No-op when `to()` already placed the embeddings on the model's device;
        # still needed if the model was moved without going through this wrapper.
        profile_embeddings = self.profile_embeddings.to(model_device)
        document_to_profile_probs = torch.nn.functional.softmax(
            document_embeddings @ profile_embeddings.T, dim=-1)
        assert document_to_profile_probs.shape == (len(text_input_list), len(self.profile_embeddings))
        return document_to_profile_probs
            

In [5]:
from typing import Tuple

from collections import OrderedDict

import datasets

class WikiDataset(textattack.datasets.Dataset):
    """Exposes the data module's validation split as ``(document, label)`` pairs.

    The label of an example is its ``text_key_id`` value; ``label_names`` holds
    the ``name`` column of the validation split.
    """
    dataset: datasets.Dataset

    def __init__(self, dm: WikipediaDataModule):
        """Wrap ``dm.val_dataset``."""
        self.shuffled = True
        self.dataset = dm.val_dataset
        self.label_names = list(dm.val_dataset['name'])

    def __len__(self) -> int:
        """Number of examples in the validation split."""
        return len(self.dataset)

    def __getitem__(self, i: int) -> Tuple[str, int]:
        """Return ``(document, text_key_id)`` for example ``i``.

        The row is fetched first and then the columns are read from it:
        ``self.dataset['document'][i]`` would materialize the entire column on
        every access, which makes iterating over the dataset quadratic.
        """
        row = self.dataset[i]
        return row['document'], row['text_key_id'].item()
        

In [39]:
from textattack import Attack
from textattack.constraints.pre_transformation import RepeatModification

# Wrap the model with the data module's tokenizer and move it to the GPU
# (`to` returns the wrapper itself, so the call can be chained).
model_wrapper = MyModelWrapper(model, dm.tokenizer).to('cuda')
# Validation examples as (document, label) pairs.
dataset = WikiDataset(dm)


In [7]:
batch_size = 256
num_examples = 2048 # len(dataset)

# Never evaluate more than `num_examples` examples and never read past the end
# of the dataset, even when `num_examples` is not a multiple of `batch_size`
# or exceeds `len(dataset)`.
num_to_evaluate = min(num_examples, len(dataset))

all_preds = []
all_labels = []
for start in range(0, num_to_evaluate, batch_size):
    stop = min(start + batch_size, num_to_evaluate)
    samples = [dataset[j] for j in range(start, stop)]
    text = [s[0] for s in samples]
    labels = [s[1] for s in samples]
    preds = model_wrapper(text)
    # Predicted label = index of the most probable profile.
    all_preds += preds.argmax(dim=1).tolist()
    all_labels += labels

In [12]:
# (number of correct predictions, number of predictions)
torch.eq(torch.tensor(all_preds), torch.tensor(all_labels)).sum().item(), len(all_preds)

(1903, 2048)

In [9]:
!nvidia-smi

Mon Mar 28 14:35:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:1D:00.0 Off |                  Off |
| 30%   27C    P8    19W / 300W |      3MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [61]:
from typing import List

import random

from textattack.augmentation import Augmenter
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.shared import AttackedText

class WordSwapSingleWord(textattack.transformations.word_swap.WordSwap):
    """Word-swap transformation whose only replacement candidate is one fixed word.

    Whatever word is being replaced, the single candidate offered is
    ``single_word`` (``"?"`` by default).
    """
    single_word: str
    def __init__(self, single_word: str = "?", **kwargs):
        """Store the fixed replacement word; extra keyword arguments go to the parent class."""
        super().__init__(**kwargs)
        self.single_word = single_word

    def _get_replacement_words(self, _word: str):
        """Return the one-element candidate list ``[single_word]``; ``_word`` is ignored."""
        return [self.single_word]


class MultiWordSwapSingleWord(WordSwapSingleWord):
    """Replaces several words of a text at once with the fixed ``single_word``.

    Every returned text has up to ``max_num_words_to_swap`` randomly chosen
    modifiable words replaced, and ``max_num_texts_to_return`` such texts are
    produced per call.
    """
    max_num_words_to_swap: int
    max_num_texts_to_return: int

    def __init__(self, single_word: str,
        max_num_words_to_swap: int, max_num_texts_to_return: int, **kwargs):
        """
        Args:
            single_word: Replacement word used for every swap.
            max_num_words_to_swap: Upper bound on words replaced in one returned text.
            max_num_texts_to_return: Number of randomly sampled texts to produce per call.
            **kwargs: Forwarded to ``WordSwapSingleWord``.
        """
        super().__init__(single_word=single_word, **kwargs)
        self.max_num_words_to_swap = max_num_words_to_swap
        self.max_num_texts_to_return = max_num_texts_to_return

    def _get_transformations(
        self, current_text: AttackedText, indices_to_modify: List[int]) -> List[AttackedText]:
        """Return randomly sampled copies of ``current_text`` with several words replaced.

        Args:
            current_text: Text to transform.
            indices_to_modify: Word indices that are allowed to change.

        Returns:
            A list of transformed texts; empty when no word can be replaced.
        """
        words = current_text.words

        # Keep only the positions that have at least one replacement different
        # from the current word. Filtering *before* sampling guarantees that
        # every sampled position results in an actual swap.
        word_swap_options = []
        for i in indices_to_modify:
            word_to_replace = words[i]
            candidates = [
                r for r in self._get_replacement_words(word_to_replace) if r != word_to_replace
            ]
            if candidates:
                word_swap_options.append((i, candidates))

        # can't swap more words than we have positions available.
        num_words_to_swap = min(self.max_num_words_to_swap, len(word_swap_options))
        if num_words_to_swap <= 0:
            # Nothing can be swapped: report no transformation instead of
            # returning unmodified copies of the input.
            return []

        # create outputs by swapping words with input.
        transformed_texts = []
        for _ in range(self.max_num_texts_to_return):
            swap_options = random.sample(word_swap_options, num_words_to_swap)
            swap_idxs = [idx for idx, _choices in swap_options]
            swap_words = [random.choice(choices) for _idx, choices in swap_options]
            # Create a transformed text from the newly-chosen word indices and replacement words.
            transformed_texts.append(
                current_text.replace_words_at_indices(swap_idxs, swap_words)
            )

        return transformed_texts


        
# Augmenter that replaces words with the literal "[MASK]" token.
# (WordSwapEmbedding(max_candidates=50) was the alternative transformation tried here.)
augmenter = Augmenter(
    transformation=MultiWordSwapSingleWord(
        single_word="[MASK]",
        max_num_texts_to_return=1,
        max_num_words_to_swap=3,
    ),
    # Pre-transformation constraints restricting which word positions may be modified.
    constraints=[RepeatModification(), StopwordModification()],
    pct_words_to_swap=0.25, # TODO: how should we set this number?
    transformations_per_example=1,
    fast_augment=False,
)

In [62]:
# Sanity check on one sentence: `augment` returns a list of augmented strings.
augmenter.augment("Hi there my strange weird friend how is it going with you today?")

['Hi there my [MASK] [MASK] friend how is it going with you [MASK]?']

In [63]:
# Sanity check on a batch: `augment_many` returns one list of augmented strings per input.
augmenter.augment_many(["Hello there", "the angel from my nightmares"])

[['[MASK] there'], ['the [MASK] from my [MASK]']]

In [65]:
batch_size = 256
num_examples = 2048 # len(dataset)

# Never evaluate more than `num_examples` examples and never read past the end
# of the dataset, even when `num_examples` is not a multiple of `batch_size`
# or exceeds `len(dataset)`.
num_to_evaluate = min(num_examples, len(dataset))

all_preds = []
all_labels = []
for start in range(0, num_to_evaluate, batch_size):
    print(start, '/', num_examples)
    stop = min(start + batch_size, num_to_evaluate)
    samples = [dataset[j] for j in range(start, stop)]
    print('\t [augmentation]')
    documents = [s[0] for s in samples]
    augmented = augmenter.augment_many(documents)
    # augment_many() returns one list per input; take its first entry.
    # Defensive: should a list come back empty, score the original document
    # instead of failing with an IndexError.
    text = [aug[0] if aug else doc for doc, aug in zip(documents, augmented)]
    labels = [s[1] for s in samples]
    print('\t [prediction]')
    preds = model_wrapper(text)
    # Predicted label = index of the most probable profile.
    all_preds += preds.argmax(dim=1).tolist()
    all_labels += labels

0 / 2048
	 [augmentation]
	 [prediction]
256 / 2048
	 [augmentation]
	 [prediction]
512 / 2048
	 [augmentation]
	 [prediction]
768 / 2048
	 [augmentation]
	 [prediction]
1024 / 2048
	 [augmentation]
	 [prediction]
1280 / 2048
	 [augmentation]
	 [prediction]
1536 / 2048
	 [augmentation]
	 [prediction]
1792 / 2048
	 [augmentation]
	 [prediction]


In [66]:
# (number of correct predictions, number of predictions) on the augmented inputs
torch.eq(torch.tensor(all_preds), torch.tensor(all_labels)).sum().item(), len(all_preds)

(1463, 2048)

In [67]:
# Show the augmented documents of the last evaluated batch, one per line.
for t in text:
    print(t)

[MASK] townes `` [MASK] '' hope , kbe , kc * sg , [MASK] -lrb- may [MASK] , 1903 -- july 27 , [MASK] -rrb- , was a british-born american comedian , vaudevillian , actor , [MASK] , dancer , [MASK] , and [MASK] .
with a career spanning nearly 80 [MASK] , hope [MASK] in over 70 films and shorts , [MASK] a series of `` [MASK] '' [MASK] [MASK] bing crosby and [MASK] lamour .
in addition to hosting the academy awards fourteen [MASK] -[MASK] more than any other host -rrb- , he [MASK] in [MASK] stage productions and television roles and was the [MASK] of fourteen books .
the song [MASK] thanks for the memory '' is widely regarded as [MASK] 's [MASK] tune .
celebrated for his [MASK] [MASK] [MASK] [MASK] service [MASK] -lrb- [MASK] -[MASK] shows to [MASK] active [MASK] american military [MASK] -- he [MASK] [MASK] [MASK] for the uso between [MASK] and 1991 -- hope was declared an honorary [MASK] of the united [MASK] [MASK] forces in 1997 by act of the [MASK].s. congress .
[MASK] participated in t