## More tests trying to figure out why my model is so good at redacted data.

In [1]:
import os
import sys

# Directory of the unsupervised-deidentification checkout that provides the
# project-local `datamodule` module. The default is the original author's
# path; set the DEID_REPO_DIR environment variable to run this notebook on
# another machine without editing the cell.
DEID_REPO_DIR = os.environ.get(
    'DEID_REPO_DIR',
    '/home/jxm3/research/deidentification/unsupervised-deidentification',
)
# Guard the append so re-running this cell does not keep growing sys.path.
if DEID_REPO_DIR not in sys.path:
    sys.path.append(DEID_REPO_DIR)

from datamodule import WikipediaDataModule

# NOTE(review): num_cpus is not used by the call below (num_workers is fixed
# at 4); it is kept in case a later cell reads it.
num_cpus = os.cpu_count()

# Data module over wiki_bio 1.2.0: the full train split and the first 20% of
# the validation split, with both word-dropout settings set to 0.0.
dm = WikipediaDataModule(
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    max_seq_length=128,
    dataset_name='wiki_bio',
    dataset_train_split='train[:100%]',
    dataset_val_split='val[:20%]',
    dataset_version='1.2.0',
    word_dropout_ratio=0.0,
    word_dropout_perc=0.0,
    num_workers=4,
    train_batch_size=512,
    eval_batch_size=512
)
dm.setup("fit")

Initializing WikipediaDataModule with num_workers = 4 and mask token `<mask>`
loading wiki_bio[1.2.0] split train[:100%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)


loading wiki_bio[1.2.0] split val[:20%]


Using custom data configuration default
Reusing dataset wiki_bio (/home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da)
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-3c0ffadd02c12daf.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-7d07543b6205ca87.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-7440752484ad8676.arrow
Loading cached processed dataset at /home/jxm3/.cache/huggingface/datasets/wiki_bio/default/1.2.0/c05ce066e9026831cd7535968a311fc80f074b58868cfdffccbc811dff2ab6da/cache-2c6f94b0d2dcc153.arrow


In [2]:
from model import CoordinateAscentModel
from model_cfg import model_paths_dict

# Author's note on this checkpoint: it gets >99% validation accuracy and was
# trained with word dropout.
checkpoint_path = model_paths_dict["model_5"]

# Keyword arguments passed to load_from_checkpoint together with the path.
load_kwargs = dict(
    document_model_name_or_path="roberta-base",
    profile_model_name_or_path="google/tapas-base",
    learning_rate=1e-5,
    pretrained_profile_encoder=False,
    lr_scheduler_factor=0.5,
    lr_scheduler_patience=1,
    train_batch_size=1,
    num_workers=8,
    gradient_clip_val=10.0,
)

model = CoordinateAscentModel.load_from_checkpoint(checkpoint_path, **load_kwargs)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initialized model with learning_rate = 1e-05 and patience 1


In [3]:
import numpy as np
import torch
import tqdm

def precompute_profile_embeddings():
    """Embed every train and validation profile and cache the results on `model`.

    Reads the notebook globals `model` and `dm`. Moves `model.profile_model`
    to the GPU, switches it to eval mode while embedding, and switches it
    back to train mode afterwards (also when embedding raises).

    Side effects:
        model.train_profile_embeddings: float32 tensor of shape
            (len(dm.train_dataset), model.profile_embedding_dim).
        model.val_profile_embeddings: float32 tensor of shape
            (len(dm.val_dataset), model.profile_embedding_dim).
        Row i of each tensor holds the embedding of the example whose
        `text_key_id` is i.
    """

    def _embed_split(dataset, dataloader, desc, colour):
        # Embed one split batch by batch, writing each batch into the rows
        # given by its `text_key_id` values.
        embeddings = np.zeros((len(dataset), model.profile_embedding_dim))
        for batch in tqdm.tqdm(dataloader, desc=desc, colour=colour, leave=False):
            with torch.no_grad():
                profile_embeddings = model.forward_profile(batch=batch)
            embeddings[batch["text_key_id"]] = profile_embeddings.cpu()
        return torch.tensor(embeddings, dtype=torch.float32)

    model.profile_model.cuda()
    model.profile_model.eval()
    try:
        model.train_profile_embeddings = _embed_split(
            dm.train_dataset,
            dm.train_dataloader(),
            desc="Precomputing train embeddings",
            colour="yellow",
        )
        # val_dataloader() is indexed with [0] here, exactly as in the
        # original loop; only that first loader is embedded.
        model.val_profile_embeddings = _embed_split(
            dm.val_dataset,
            dm.val_dataloader()[0],
            desc="Precomputing val embeddings",
            colour="green",
        )
    finally:
        # Leave the profile encoder in train mode, as the original cell did.
        model.profile_model.train()

precompute_profile_embeddings()

  self.colour = colour
                                                                            1.42s/it]

In [5]:
# Sanity check: expect (len(dm.train_dataset), model.profile_embedding_dim).
model.train_profile_embeddings.shape

torch.Size([582659, 768])

In [6]:
# Sanity check: expect (len(dm.val_dataset), model.profile_embedding_dim).
model.val_profile_embeddings.shape

torch.Size([14566, 768])

In [12]:
# Heavily redacted biography: nearly every content word is replaced by the
# `<mask>` token, leaving mostly function words and punctuation.
doc = "<mask> a. <mask> (<mask> <mask> , <mask> -- <mask> <mask> , <mask>) was a <mask> <mask> <mask> <mask> and a <mask> .\nmost <mask> , he <mask> from <mask> to <mask> as <mask> <mask> of <mask> <mask> , from <mask> to <mask> as the <mask> <mask> <mask> to the <mask> <mask> and from <mask> to <mask> as <mask> to the <mask> <mask> of <mask> during the <mask> <mask> of <mask> <mask> .\n<mask> <mask> to the <mask> of <mask> <mask> in the <mask>.s. <mask> and is a <mask> of the <mask> <mask> <mask> of <mask> ."
# Tokenize with the data module's document tokenizer, truncating to its
# max_seq_length, and return PyTorch tensors.
doc_tokenized = dm.document_tokenizer(doc, max_length=dm.max_seq_length, truncation=True, return_tensors='pt')
# Embed the redacted document without tracking gradients.
with torch.no_grad():
    doc_emb = model.forward_document_inputs(inputs=doc_tokenized)

In [14]:
# Dot-product score of the redacted document against every validation
# profile embedding, then a softmax across profiles (dim=1).
val_sims = torch.matmul(doc_emb, model.val_profile_embeddings.T)
torch.softmax(val_sims, dim=1)

tensor([[1.3588e-16, 1.6434e-11, 5.9551e-10,  ..., 2.1308e-11, 3.3903e-12,
         3.8873e-10]])

In [16]:
# Compute the softmax over validation profiles once and reuse it for both
# the top probability and the index of the top-scoring profile.
val_probs = val_sims.softmax(dim=1)
val_probs.max(), val_probs.argmax()

(tensor(0.9280), tensor(98))

In [18]:
# 98 is the argmax over validation profiles reported by the previous cell.
dm.val_dataset[98]['name'] # correct!

'Vernon A. Walters'

In [20]:
# Score the redacted document against every training profile embedding.
train_sims = doc_emb @ model.train_profile_embeddings.T
# Softmax once over all training profiles, then report the top probability
# and the index of the top-scoring profile.
train_probs = train_sims.softmax(dim=1)
train_probs.max(), train_probs.argmax()

(tensor(0.4031), tensor(455976))

In [23]:
# 455976 is the argmax over training profiles reported by the previous cell.
dm.train_dataset[455976]['name']

'Michael A. Sheehan'

In [24]:
# Full text of the training example that scored highest against `doc`.
dm.train_dataset[455976]['document']

'michael a. sheehan ( born february 10 , 1955 ) is a united states author and former government official and military officer .\nhe is currently distinguished chair at the u.s. military academy in west point and a terrorist analyst for nbc news .\nhe was born in red bank , new jersey .\n'

In [26]:
# Unredacted text of validation example 98, for comparison with `doc`.
dm.val_dataset[98]['document']

'vernon a. walters ( january 3 , 1917 -- february 10 , 2002 ) was a united states army officer and a diplomat .\nmost notably , he served from 1972 to 1976 as deputy director of central intelligence , from 1985 to 1989 as the united states ambassador to the united nations and from 1989 to 1991 as ambassador to the federal republic of germany during the decisive phase of german reunification .\nwalters rose to the rank of lieutenant general in the u.s. army and is a member of the military intelligence hall of fame .\n'

In [27]:
# Show the redacted input again, next to the unredacted document above.
doc

'<mask> a. <mask> (<mask> <mask> , <mask> -- <mask> <mask> , <mask>) was a <mask> <mask> <mask> <mask> and a <mask> .\nmost <mask> , he <mask> from <mask> to <mask> as <mask> <mask> of <mask> <mask> , from <mask> to <mask> as the <mask> <mask> <mask> to the <mask> <mask> and from <mask> to <mask> as <mask> to the <mask> <mask> of <mask> during the <mask> <mask> of <mask> <mask> .\n<mask> <mask> to the <mask> of <mask> <mask> in the <mask>.s. <mask> and is a <mask> of the <mask> <mask> <mask> of <mask> .'

In [44]:
# Ten largest softmax probabilities over the training profiles, highest first.
train_sims.softmax(dim=1).sort(dim=-1, descending=True).values[:, :10]

tensor([[0.4031, 0.1569, 0.1311, 0.0178, 0.0162, 0.0152, 0.0140, 0.0137, 0.0105,
         0.0100]])