"""
Notebook to prototype:
1. How to turn an EM into an MLM (with the LM head initialized from the vanilla RoBERTa MLM head).
2. How to add a "mask" token to the EM-based MLM when the EM shares the GPT-2 embedding and has no mask token.
3. A quick check of the candidates the EM-based MLM generates for located indices.
"""

In [1]:
#!/usr/bin/env python
# coding: utf-8

# standard libraries
import os
import sys
import json
import logging
from collections import namedtuple
project_dir = "/home/s3/hyeryung/mucoco" #'/home/hyeryung_son/mucoco' # 
sys.path.append(project_dir)
os.chdir(project_dir)

# installed packages
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM, AutoModelForSequenceClassification
import wandb
import random

# custom libraries
import mucoco.losses as lossbuilder
import mucoco.options as options
import mucoco.utils as utils
from mucoco.utils import TargetProbability, TargetEmbeddings, TargetSimplex, Lambda, Optimizer, OptimizerLE, get_epsilon, locate
from new_module.evaluation.evaluate_wandb import evaluate
from new_module.decode_utils import score_hypotheses, constrained_beam_search_v0
from new_module.utils.robertacustom_em_for_cand_gen import RobertaCustomForMaskedLM

# How to turn EM into a MLM. (With LM head initialized with vanilla Roberta MLM head)

In [3]:
## I implemented RobertaCustomForMaskedLM class. That is a Roberta MLM architecture that shares gpt2 embedding.

In [None]:
## Double checking the implementation
## Seems alright. (forward is just to meet the requirement. __init__ is what's important.)
## The key is where I define new_embeds and set it as input_embeddings in __init__
# class RobertaCustomForMaskedLM(RobertaCustomPreTrainedModel):
#     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

#     def __init__(self, config):
#         super().__init__(config)

#         if config.is_decoder:
#             logger.warning(
#                 "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
#                 "bi-directional self-attention."
#             )


#         self.num_labels = config.num_labels
#         self.config = config
#         # print(config.vocab_size)


#         self.roberta = RobertaModel(config, add_pooling_layer=False)
#         embeds = self.roberta.get_input_embeddings()
#         old_dim = getattr(config,'n_embd', embeds.embedding_dim)
#         new_dim = getattr(config,'new_n_embd', None)
#         new_vocab_size = getattr(config,'new_vocab_size', config.vocab_size)
#         if new_dim is not None:
#             new_embeds = nn.Sequential(nn.Embedding(new_vocab_size, new_dim), nn.Linear(new_dim, old_dim, bias=False))
#             self.roberta.set_input_embeddings(new_embeds)

#         self.lm_head = RobertaLMHead(config)

#         # Initialize weights and apply final processing
#         self.post_init()

#     def forward(
#             self,
#             input_ids: Optional[torch.LongTensor] = None,
#             attention_mask: Optional[torch.FloatTensor] = None,
#             token_type_ids: Optional[torch.LongTensor] = None,
#             position_ids: Optional[torch.LongTensor] = None,
#             head_mask: Optional[torch.FloatTensor] = None,
#             inputs_embeds: Optional[torch.FloatTensor] = None,
#             encoder_hidden_states: Optional[torch.FloatTensor] = None,
#             encoder_attention_mask: Optional[torch.FloatTensor] = None,
#             labels: Optional[torch.LongTensor] = None,
#             output_attentions: Optional[bool] = None,
#             output_hidden_states: Optional[bool] = None,
#             return_dict: Optional[bool] = None,
#         ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
#             r"""
#             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
#                 Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
#                 config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
#                 loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
#             kwargs (`Dict[str, any]`, optional, defaults to *{}*):
#                 Used to hide legacy arguments that have been deprecated.
#             """
#             return_dict = return_dict if return_dict is not None else self.config.use_return_dict

#             outputs = self.roberta(
#                 input_ids,
#                 attention_mask=attention_mask,
#                 token_type_ids=token_type_ids,
#                 position_ids=position_ids,
#                 head_mask=head_mask,
#                 inputs_embeds=inputs_embeds,
#                 encoder_hidden_states=encoder_hidden_states,
#                 encoder_attention_mask=encoder_attention_mask,
#                 output_attentions=output_attentions,
#                 output_hidden_states=output_hidden_states,
#                 return_dict=return_dict,
#             )
#             sequence_output = outputs[0]
#             prediction_scores = self.lm_head(sequence_output)

#             masked_lm_loss = None
#             if labels is not None:
#                 # move labels to correct device to enable model parallelism
#                 labels = labels.to(prediction_scores.device)
#                 loss_fct = CrossEntropyLoss()
#                 masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

#             if not return_dict:
#                 output = (prediction_scores,) + outputs[2:]
#                 return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

#             return MaskedLMOutput(
#                 loss=masked_lm_loss,
#                 logits=prediction_scores,
#                 hidden_states=outputs.hidden_states,
#                 attentions=outputs.attentions,
#             )


In [None]:
# Settings for this section. model_paths[1] is the RoBERTa-based checkpoint
# whose input embedding is shared with GPT-2 large.
config = dict(
    model_paths=['gpt2-large', "models/roberta-base-jigsaw-toxicity-mlm-with-gpt2-large-embeds/checkpoint_best"],
    device='cuda',
    k_per_location=10,
)

# Tokenizer, config and custom MLM architecture all come from the same checkpoint.
em_checkpoint = config['model_paths'][1]
mlm_tokenizer = AutoTokenizer.from_pretrained(em_checkpoint)
mlm_config = AutoConfig.from_pretrained(em_checkpoint)
mlm = RobertaCustomForMaskedLM.from_pretrained(em_checkpoint, config=mlm_config)

# The key step: replace the custom MLM's LM head with the head of vanilla roberta-base.
original_mlm_model = AutoModelForMaskedLM.from_pretrained('roberta-base', cache_dir='hf_cache')
mlm.lm_head = original_mlm_model.lm_head
# This trick works: applying the same head swap to a regular
# RobertaForSequenceClassification gives a model that behaves reasonably well as an MLM.

# How to add "mask" token to the EM-based MLM when the EM shares GPT2 embedding and has no mask token.

In [85]:
# Same setup as in the previous section: model_paths[1] is the RoBERTa-based
# checkpoint with GPT-2 large input embeddings.
config = dict(
    model_paths=['gpt2-large', "models/roberta-base-jigsaw-toxicity-mlm-with-gpt2-large-embeds/checkpoint_best"],
    device='cuda',
    k_per_location=10,
)

em_checkpoint = config['model_paths'][1]
mlm_tokenizer = AutoTokenizer.from_pretrained(em_checkpoint)
mlm_config = AutoConfig.from_pretrained(em_checkpoint)
mlm = RobertaCustomForMaskedLM.from_pretrained(em_checkpoint, config=mlm_config)

# Reuse the LM head of vanilla roberta-base on top of the custom encoder.
original_mlm_model = AutoModelForMaskedLM.from_pretrained('roberta-base', cache_dir='hf_cache')
mlm.lm_head = original_mlm_model.lm_head

https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0


In [86]:
## Add mask token to tokenizers
# The mask token string is borrowed from the vanilla roberta-base tokenizer.
original_mlm_tokenizer = AutoTokenizer.from_pretrained('roberta-base', cache_dir='hf_cache')

# Last expression of the cell, so the notebook echoes its return value (the output shows 1).
mlm_tokenizer.add_special_tokens({'mask_token': original_mlm_tokenizer.mask_token})

https://huggingface.co:443 "HEAD /roberta-base/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/vocab.json HTTP/1.1" 200 0


1

In [87]:
## Add embedding for mask token
## I implemented this method for RobertaCustomPreTrainedModel which RobertaCustomForMaskedLM inherits.
# The echoed return value is the updated input-embedding Sequential; the output
# shows Embedding(50258, 1280) followed by the 1280 -> 768 projection.
mlm.add_mask_token_embedding()

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50258. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/vocab.json HTTP/1.1" 200 0


Sequential(
  (0): Embedding(50258, 1280)
  (1): Linear(in_features=1280, out_features=768, bias=False)
)

In [88]:
# Move the custom MLM (including the swapped-in LM head) to the configured device.
mlm.to(config['device'])
# Switch to evaluation mode; as the last expression, the returned model is echoed by the notebook.
mlm.eval()

RobertaCustomForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Sequential(
        (0): Embedding(50258, 1280)
        (1): Linear(in_features=1280, out_features=768, bias=False)
      )
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (den

In [89]:
## Double checking if the implementation is correct. 
## The code seems correct to me.
# class RobertaCustomPreTrainedModel(PreTrainedModel):

#     """
#     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
#     models.
#     """

#     config_class = RobertaConfig
#     base_model_prefix = "roberta"
#     supports_gradient_checkpointing = True
#     _no_split_modules = ["RobertaEmbeddings", "RobertaSelfAttention"]
        
#     def add_mask_token_embedding(
#         self, pad_to_multiple_of: Optional[int] = None
#     ) -> nn.Sequential:
        
#         model_embeds = self._add_mask_token_embedding(pad_to_multiple_of)

#         # Update base model and current model config
#         self.config.vocab_size = model_embeds[0].weight.shape[0]
#         self.vocab_size = model_embeds[0].weight.shape[0]

#         # Tie weights again if needed
#         self.tie_weights()

#         return model_embeds

#     def _add_mask_token_embedding(self, pad_to_multiple_of=None):
#         old_embeddings = self.get_input_embeddings()
#         new_num_tokens = old_embeddings[0].weight.shape[0] + 1
#         new_embeddings = self._get_embeddings_with_mask_token_embedding(old_embeddings, new_num_tokens, pad_to_multiple_of)
        
#         self.set_input_embeddings(new_embeddings)

#         new_num_tokens = new_embeddings[0].weight.shape[0]

#         # if word embeddings are not tied, make sure that lm head is resized as well
#         if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings:
#             old_lm_head = self.get_output_embeddings()
#             new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens)
#             if hasattr(old_lm_head, "_hf_hook"):
#                 hook = old_lm_head._hf_hook
#                 add_hook_to_module(new_lm_head, hook)
#             old_lm_head_requires_grad = old_lm_head.weight.requires_grad
#             new_lm_head.requires_grad_(old_lm_head_requires_grad)
#             self.set_output_embeddings(new_lm_head)

#         return self.get_input_embeddings()

#     def _get_embeddings_with_mask_token_embedding(
#         self,
#         old_embeddings: nn.Sequential,
#         new_num_tokens: Optional[int] = None,
#         pad_to_multiple_of: Optional[int] = None,
#     ) -> nn.Sequential:

#         if pad_to_multiple_of is not None:
#             if not isinstance(pad_to_multiple_of, int):
#                 raise ValueError(
#                     f"Asking to pad the embedding matrix to a multiple of `{pad_to_multiple_of}`, which is not and integer. Please make sure to pass an integer"
#                 )
#             new_num_tokens = ((new_num_tokens + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
#         else:
#             logger.info(
#                 "You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding"
#                 f" dimension will be {new_num_tokens}. This might induce some performance reduction as *Tensor Cores* will not be available."
#                 " For more details about this, or help on choosing the correct value for resizing, refer to this guide:"
#                 " https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc"
#             )

#         # Sequential(
#         #   (0): Embedding(50257, 1280)
#         #   (1): Linear(in_features=1280, out_features=768, bias=False)
#         # )
#         old_num_tokens, old_intermediate_embedding_dim = old_embeddings[0].weight.size()
        
#         # Build new embeddings
#         new_intermediate_embeddings = nn.Embedding(
#             new_num_tokens,
#             old_intermediate_embedding_dim,
#             device=old_embeddings[0].weight.device,
#             dtype=old_embeddings[0].weight.dtype,
#         )

#         # initialize all new embeddings (in particular added tokens)
#         self._init_weights(new_intermediate_embeddings)

#         # Copy token embeddings from the previous weights

#         # numbers of tokens to copy
#         n = min(old_num_tokens, new_num_tokens)

#         new_intermediate_embeddings.weight.data[:n, :] = old_embeddings[0].weight.data[:n, :]
        
#         # also set the weight of the last token by taking inverse. y W^T (W W^T)^{-1} 
#         ## -> for numerical stability, instead of taking inverse, use linalg.solve
#         ## 1) get the mask token embedding of Roberta
#         mlm = AutoModelForMaskedLM.from_pretrained('roberta-base')
#         mlm_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
#         y = mlm.get_input_embeddings().weight.data[mlm_tokenizer.mask_token_id, :].unsqueeze(-1)
        
#         ## 2) calculate this -> we want to find solution x that satisfies W x = y where W is the embedding projection layer in the custom roberta that projects gpt2 embedding into roberta embedding
#         ## What we hope is to find x such that it is projected to a mask token embedding and EM converted to MLM can hopefully recognize the embedding and treat it properly by generating candidates to fill it in. 
#         ## Since W is not a square matrix, first rewrite the equation as WTW x = WTy -> Now, it can be seen as Ax = B where A = WTW, B = WTy
#         W = old_embeddings[1].weight
#         A = W.t().matmul(W)
#         B = W.t().matmul(y)
#         x = torch.linalg.solve(A, B)
        
#         ## 3) set the weight of the last token
#         new_intermediate_embeddings.weight.data[-1, :] = x.data.squeeze()

#         # update requires_grad
#         old_embeddings_requires_grad = old_embeddings[0].weight.requires_grad
#         new_intermediate_embeddings.requires_grad_(old_embeddings_requires_grad)

#         new_embeddings = nn.Sequential(new_intermediate_embeddings, 
#                                        old_embeddings[1])
            
#         return new_embeddings

In [90]:
## Cross-check: is our mask-token embedding close to RoBERTa's mask-token embedding?
# NOTE(review): original_mlm_model is never moved to config['device'] in the visible
# cells. Presumably this works because its lm_head (now attached to `mlm` and moved
# with it) shares its weight with the input embeddings -- confirm.
custom_mask_ids = torch.LongTensor([mlm_tokenizer.mask_token_id]).to(config['device'])
roberta_mask_ids = torch.LongTensor([original_mlm_tokenizer.mask_token_id]).to(config['device'])

# Each lookup takes a single id, so each result holds exactly one embedding row.
mask_token_embedding = mlm.get_input_embeddings()(custom_mask_ids)
mask_token_embedding_original = original_mlm_model.get_input_embeddings()(roberta_mask_ids)

In [91]:
## A per-dimension difference of about -0.004 ~ 0.005 seems reasonable to me.
embed_diff = mask_token_embedding - mask_token_embedding_original
abs_embed_diff = embed_diff.abs()

# Signed extremes first, then summary statistics of the absolute difference.
print(
    f"max: {embed_diff.max():.4f}",
    f"min: {embed_diff.min():.4f}",
    f"abs max: {abs_embed_diff.max():.4f}",
    f"abs min: {abs_embed_diff.min():.4f}",
    f"abs mean: {abs_embed_diff.mean():.4f}",
    f"abs std: {abs_embed_diff.std():.4f}",
    sep="\n",
)

max: 0.0051
min: -0.0041
abs max: 0.0051
abs min: 0.0000
abs mean: 0.0007
abs std: 0.0006


In [96]:
def cosine_similarity(x, y):
    """Return the cosine similarity of two vectors, printing the intermediate terms.

    Both inputs are squeezed first, so a single embedding with extra singleton
    dimensions is accepted. The dot product and the product of the two norms
    are printed before their ratio is returned.
    """
    u = x.squeeze()
    v = y.squeeze()
    inner = torch.dot(u, v)
    # sqrt(<u,u> * <v,v>) equals ||u|| * ||v||
    norm_product = torch.sqrt(torch.dot(u, u) * torch.dot(v, v))
    print(f"x dot y: {inner:.10f}")
    print(f"||x|| * ||y||: {norm_product:.10f}")
    return inner / norm_product

In [97]:
# Library implementation first ...
builtin_similarity = nn.functional.cosine_similarity(mask_token_embedding, mask_token_embedding_original)
print(f"Cosine similarity: {builtin_similarity}")
# ... then the hand-written one, which also prints the dot product and the norm product.
manual_similarity = cosine_similarity(mask_token_embedding, mask_token_embedding_original)
print(f"Cosine similarity: {manual_similarity}")

Cosine similarity: tensor([1.000], device='cuda:0', grad_fn=<DivBackward0>)
x dot y: 4.0875220299
||x|| * ||y||: 4.0878634453
Cosine similarity: 0.9999164938926697


In [33]:
## How far is the embedding from other embeddings? 
# random_embedding = mlm.get_input_embeddings()(torch.LongTensor([random.randint(0, len(mlm_tokenizer))]).to(config['device']))

In [69]:
## Per-dimension median of the custom MLM's token embeddings, excluding the mask token.
## The ids 0 .. len(mlm_tokenizer) - 2 are looked up in a single batched call under
## no_grad instead of one Python-level lookup per token: this is a pure diagnostic,
## so neither ~50k separate lookups nor autograd history are needed.
## NOTE(review): excluding only the last id assumes the mask token is the last entry
## of the tokenizer -- confirm.
with torch.no_grad():
    token_ids = torch.arange(len(mlm_tokenizer) - 1, device=config['device'])
    # unsqueeze(1) keeps the (num_tokens, 1, dim) layout that stacking single-id lookups produced
    embeddings = mlm.get_input_embeddings()(token_ids).unsqueeze(1)
median_embedding = torch.median(embeddings, dim=0).values

In [70]:
## The gap to the median embedding is larger than the gap to the original mask token embedding.
## -> Could this be why the mask token does not seem to be recognized even after initializing x properly?
## (See the section below, where I check the candidates the EM-based MLM generates.)
embed_diff = mask_token_embedding - median_embedding
abs_embed_diff = embed_diff.abs()

print(
    f"max: {embed_diff.max():.4f}",
    f"min: {embed_diff.min():.4f}",
    f"abs max: {abs_embed_diff.max():.4f}",
    f"abs min: {abs_embed_diff.min():.4f}",
    f"abs mean: {abs_embed_diff.mean():.4f}",
    f"abs std: {abs_embed_diff.std():.4f}",
    sep="\n",
)
# Kept as a separate statement: cosine_similarity prints its own lines, which must come after the statistics.
print(f"Cosine similarity: {cosine_similarity(mask_token_embedding, median_embedding)}")

max: 0.9583
min: -1.0497
abs max: 1.0497
abs min: 0.0002
abs mean: 0.0291
abs std: 0.0720
tensor(-0.112, device='cuda:0', grad_fn=<DotBackward0>)
tensor(1.126, device='cuda:0', grad_fn=<SqrtBackward0>)
Cosine similarity: -0.09932808578014374


In [71]:
## Added 23/01/19: check whether the mask token embedding of the RobertaForSequenceClassification
##                 trained as an EM diverged a lot from the original mask token embedding, and
##                 how different it is from the other token embeddings, which are likely to
##                 have been updated by gradient descent.
config = {
    'device': 'cuda',
    'k_per_location': 3,
    'model_paths': ['gpt2-large', "/shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-energy-training/step_600_best_checkpoint"],
}

# Load the classifier checkpoint into the masked-LM architecture.
em_checkpoint = config['model_paths'][1]
roberta_em_mlm = AutoModelForMaskedLM.from_pretrained(em_checkpoint)

Some weights of the model checkpoint at /shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-energy-training/step_600_best_checkpoint were not used when initializing RobertaForMaskedLM: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at /shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-energy-training/step_600_best_checkpoint and are n

In [72]:
# Tokenizer stored with the EM checkpoint; used below to look up the mask-token id for roberta_em_mlm.
roberta_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][1])

In [73]:
# Load the vanilla roberta-base tokenizer rather than the EM checkpoint's tokenizer
# (which roberta_tokenizer already holds), so the "original" mask-token id used below
# comes from the same source as original_mlm_model.
original_mlm_tokenizer = AutoTokenizer.from_pretrained('roberta-base', cache_dir='hf_cache')

In [74]:
# Vanilla roberta-base MLM: provides the reference mask-token embedding and the LM head.
original_mlm_model = AutoModelForMaskedLM.from_pretrained('roberta-base', cache_dir='hf_cache')
# Replace the EM's LM head with the roberta-base head (same trick as for the custom MLM above).
roberta_em_mlm.lm_head = original_mlm_model.lm_head 

https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0


In [75]:
# Move the EM-based MLM to the configured device; the returned model is echoed by the notebook.
roberta_em_mlm.to(config['device'])

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [76]:
# Mask-token embedding of the EM-based MLM, looked up with a single-id tensor on the configured device.
mask_token_embedding = roberta_em_mlm.get_input_embeddings()(torch.LongTensor([roberta_tokenizer.mask_token_id]).to(config['device']))

In [77]:
# Reference mask-token embedding taken from vanilla roberta-base.
# NOTE(review): original_mlm_model itself is never moved to config['device'] in the
# visible cells; presumably its embeddings follow the shared lm_head weight -- confirm.
mask_token_embedding_original = original_mlm_model.get_input_embeddings()(torch.LongTensor([original_mlm_tokenizer.mask_token_id]).to(config['device']))

In [78]:
## The mask token embedding of the EM barely moved from the original one
## (the recorded output shows differences within about +/-0.0001).
embed_diff = mask_token_embedding - mask_token_embedding_original
abs_embed_diff = embed_diff.abs()

print(
    f"max: {embed_diff.max():.4f}",
    f"min: {embed_diff.min():.4f}",
    f"abs max: {abs_embed_diff.max():.4f}",
    f"abs min: {abs_embed_diff.min():.4f}",
    f"abs mean: {abs_embed_diff.mean():.4f}",
    f"abs std: {abs_embed_diff.std():.4f}",
    sep="\n",
)
# Kept as a separate statement: cosine_similarity prints its own lines, which must come after the statistics.
print(f"Cosine similarity: {cosine_similarity(mask_token_embedding, mask_token_embedding_original)}")

max: 0.0001
min: -0.0001
abs max: 0.0001
abs min: 0.0000
abs mean: 0.0000
abs std: 0.0000
tensor(4.086, device='cuda:0', grad_fn=<DotBackward0>)
tensor(4.086, device='cuda:0', grad_fn=<SqrtBackward0>)
Cosine similarity: 1.0


In [81]:
## Per-dimension median of the RoBERTa EM's token embeddings, excluding the mask token.
## The ids 0 .. len(roberta_tokenizer) - 2 are looked up in a single batched call under
## no_grad instead of one Python-level lookup per token: this is a pure diagnostic,
## so neither ~50k separate lookups nor autograd history are needed.
## NOTE(review): excluding only the last id assumes the mask token is the last entry
## of the tokenizer -- confirm.
with torch.no_grad():
    token_ids = torch.arange(len(roberta_tokenizer) - 1, device=config['device'])
    # unsqueeze(1) keeps the (num_tokens, 1, dim) layout that stacking single-id lookups produced
    embeddings = roberta_em_mlm.get_input_embeddings()(token_ids).unsqueeze(1)
median_embedding = torch.median(embeddings, dim=0).values

In [82]:
## Same comparison for the RoBERTa EM: the gap to the median embedding is again much
## larger than the gap to the original mask token embedding.
## (See the section below, where I check the candidates the EM-based MLM generates.)
embed_diff = mask_token_embedding - median_embedding
abs_embed_diff = embed_diff.abs()

print(
    f"max: {embed_diff.max():.4f}",
    f"min: {embed_diff.min():.4f}",
    f"abs max: {abs_embed_diff.max():.4f}",
    f"abs min: {abs_embed_diff.min():.4f}",
    f"abs mean: {abs_embed_diff.mean():.4f}",
    f"abs std: {abs_embed_diff.std():.4f}",
    sep="\n",
)
# Kept as a separate statement: cosine_similarity prints its own lines, which must come after the statistics.
print(f"Cosine similarity: {cosine_similarity(mask_token_embedding, median_embedding)}")

max: 0.9452
min: -1.0451
abs max: 1.0451
abs min: 0.0000
abs mean: 0.0283
abs std: 0.0716
tensor(-0.090, device='cuda:0', grad_fn=<DotBackward0>)
tensor(1.068, device='cuda:0', grad_fn=<SqrtBackward0>)
Cosine similarity: -0.0846225693821907


# Quickly check the candidate EM-based MLM generates for located indices.

## Results
- original sentence: `wearing games and holy ****ing shit do I hate horse wearing games.`
- masked sentence (masking locations determined manually): `wearing games and<mask><mask><mask><mask> do I hate horse wearing games.`
- roberta custom em (mask token added) turned mlm : `wearing games and indoors indoors indoors indoors do I hate horse wearing games.`
- roberta em turned mlm : `wearing games and beating you. I do I hate horse wearing games.`
- roberta mlm : `wearing games and riding horses. Why do I hate horse wearing games.`

## Codes that are common to below test cases

In [52]:
# Original (unmasked) sentence, kept for reference:
#test_sentence = "wearing games and holy ****ing shit do I hate horse wearing games."
# Manually masked version: four consecutive <mask> tokens replace the span to be rewritten.
test_sentence = "wearing games and<mask><mask><mask><mask> do I hate horse wearing games."

In [51]:
# Vanilla roberta-base tokenizer: source of the mask token string.
original_mlm_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# Primary tokenizer comes from model_paths[0] ('gpt2-large' in the configs above).
primary_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][0])
# Register RoBERTa's mask token with the primary tokenizer and remember the id it receives.
primary_tokenizer.add_special_tokens({'mask_token':original_mlm_tokenizer.mask_token})
primary_mask_token_id = primary_tokenizer.mask_token_id

https://huggingface.co:443 "HEAD /roberta-base/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /roberta-base/resolve/main/vocab.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/vocab.json HTTP/1.1" 200 0


In [225]:
# Settings used when scoring hypotheses, written as a single literal.
# NOTE(review): the model-loading loop further down reads config['model_paths'],
# config['tokenizer_paths'] and config['model_types'], none of which are set here.
config = {
    'device': 'cuda',
    'k_per_location': 3,
    # one target label id per entry in 'losses'
    'target_label_ids': [0, 0],
    'losses': ['gpt2', 'classification_no_prefix'],
    'cache_dir': 'hf_cache',
    'target_type': 'embeds',
    'min_epsilons': [0.75],
    'build_loss_dict': {"coeff_steps": 200, "coeff_pattern": "constant", "loss_type": "xentropy", "length_normalize": "false", "AR_temperature": 1.0, "AR_top_k": 0, "AR_top_p": 0.96, "max_output_length": 20},
}

In [None]:
## Preparation needed to score hypotheses
# Start a wandb run whose config carries closs_weight.
wandb.init(config={"closs_weight": 0.5})
# A 1x1 batch holding only the BOS token id of mlm_tokenizer, placed on the GPU.
source_batch = torch.LongTensor([[mlm_tokenizer.bos_token_id]]).cuda()
class dummyArgs:
    """Minimal attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Same effect as calling setattr(self, name, value) for each keyword.
        self.__dict__.update(kwargs)
# Expose the loss-building options as attributes of an args-like object.
build_loss_args = dummyArgs(**config['build_loss_dict'])

# Registries filled by the loading loop (keyed by model path) ...
name2tokenizer, name2model, name2config = {}, {}, {}
# ... and by the scoring cells (keyed by loss name).
loss2modelname, loss2tokenizer = {}, {}

# Per-model embedding tables / scales and vocabulary bookkeeping.
embed_luts, embed_scales = [], []
prev_vocab_size = vocab_size = primary_vocab_size = None
primary_model = None
gold_losses = []

# Arguments forwarded to score_hypotheses.
label_ids = config['target_label_ids'] # target label's ids for each loss
keywords = ["the"] * len(config['losses'])
new_kweight = 5.0
use_context = 'false'
allsat = True
additional_batch = source_batch
context_batch = [None]

# Load each tokenizer/config/model listed in config (once per distinct path) and
# collect, for every entry, its input embedding table and its embedding scale.
for i, model_path in enumerate(config['model_paths']):
    if model_path not in name2model: #making sure we are not loading the model twice in case some constraints use the same model. 
        
        # Prefer the fast tokenizer; fall back to the slow one if loading it fails.
        try:
            name2tokenizer[model_path] = AutoTokenizer.from_pretrained(config['tokenizer_paths'][i], cache_dir=config['cache_dir'],  use_fast=True)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
            name2tokenizer[model_path] = AutoTokenizer.from_pretrained(config['tokenizer_paths'][i], cache_dir=config['cache_dir'],  use_fast=False)
            
        name2config[model_path] = AutoConfig.from_pretrained(model_path, cache_dir=config['cache_dir'])

        # Pick the model class by its configured type name: sentence-transformers,
        # a "Custom" class looked up on mucoco.utils, or a class from transformers.
        if config['model_types'][i] == "sentence-transformer":
            # NOTE(review): SentenceTransformer is not among the imports at the top of
            # the notebook -- confirm it is imported before enabling this branch.
            name2model[model_path] = lossbuilder.ModelWrapper(SentenceTransformer(model_path))
        elif "Custom" in config['model_types'][i]:
            name2model[model_path] = lossbuilder.ModelWrapper(getattr(utils, config['model_types'][i]).from_pretrained(model_path, config=name2config[model_path], cache_dir=config['cache_dir']))
        else:
            name2model[model_path] = lossbuilder.ModelWrapper(getattr(transformers, config['model_types'][i]).from_pretrained(model_path, config=name2config[model_path], cache_dir=config['cache_dir']))
        name2model[model_path].eval()
        name2model[model_path].cuda()
        embed_lut_ = name2model[model_path].get_input_embeddings()
        if isinstance(embed_lut_, torch.nn.Sequential):
            new_vocab_size = embed_lut_[0].num_embeddings
        else:
            new_vocab_size = embed_lut_.num_embeddings
        # vocab_size is only assigned while prev_vocab_size is still None,
        # i.e. it is taken from the first model that gets loaded.
        if prev_vocab_size is None:
            vocab_size=new_vocab_size
        prev_vocab_size = vocab_size
    
    # When the input embeddings are a Sequential, its first module is used as the table.
    input_embeds = name2model[model_path].get_input_embeddings()
    if isinstance(input_embeds, torch.nn.Sequential):
        input_embeds = input_embeds[0]
    embed_luts.append(input_embeds)
    
    if config['target_type'] == "embeds":
        # NOTE(review): this sets an attribute on the embedding object itself; if the
        # intent is to freeze its parameters, confirm whether requires_grad_(False) was meant.
        embed_luts[-1].requires_grad=False
    
    # The first entry of config['model_paths'] is treated as the primary model.
    if i == 0:
        primary_vocab_size = vocab_size
        primary_embed_dim = embed_luts[-1].embedding_dim
        primary_model = name2model[model_path]
    
    if getattr(name2model[model_path], "get_decoder", None) is None: #this is for MarianMT models which have a weird embedding_scale parameter
        embed_scales.append(1.0)
    else:
        embed_scales.append(getattr(name2model[model_path].get_decoder(), "embed_scale", 1.0))


## RobertaCustom With Mask Token Added

In [None]:
# First entry: the primary LM; second entry: the RobertaCustom sequence-classifier checkpoint.
config.update(model_paths=['gpt2-large', "models/roberta-base-jigsaw-toxicity-mlm-with-gpt2-large-embeds/checkpoint_best"])
# Tokenizers come from the same locations as the models (the very same list object).
config.update(
    tokenizer_paths=config['model_paths'],
    model_types=['AutoModelForCausalLM', 'RobertaCustomForSequenceClassification'],
)

In [118]:
# Tokenize the masked sentence and run the MLM on the configured device.
inputs = mlm_tokenizer(test_sentence, return_tensors="pt")
# Pure inference: no autograd graph is needed for candidate generation.
with torch.no_grad():
    lm_outputs = mlm(**inputs.to(config['device']))

## get locations (indexes) in test_sentence that is filled with mask token.
indices_in_mlm_tokens = (inputs.input_ids == mlm_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(indices_in_mlm_tokens)

## get top k tokens for each index
logits = lm_outputs.logits
# torch.topk returns (values, indices); the later cells read `.indices`.
predicted_token_ids = torch.topk(logits[0, indices_in_mlm_tokens], k=config['k_per_location'], dim=-1)

tensor([4, 5, 6, 7], device='cuda:0')


In [120]:
## Check for candidates
## Candidates at each location are the same -> problematic!
print(test_sentence)
print("-"*50)
# One group per masked position (previously hard-coded to 4 positions).
for i in range(len(indices_in_mlm_tokens)):
    for j in range(config['k_per_location']):
        print(f"Candidate {j} @ {i}: {predicted_token_ids.indices[i, j]}\t-----\t{mlm_tokenizer.decode(predicted_token_ids.indices[i, j])}")
    print("-"*50)

wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
--------------------------------------------------
Candidate 0 @ 0: 20	-----	5
Candidate 1 @ 0: 152	-----	�
Candidate 2 @ 0: 31797	-----	 indoors
--------------------------------------------------
Candidate 0 @ 1: 20	-----	5
Candidate 1 @ 1: 31797	-----	 indoors
Candidate 2 @ 1: 152	-----	�
--------------------------------------------------
Candidate 0 @ 2: 20	-----	5
Candidate 1 @ 2: 31797	-----	 indoors
Candidate 2 @ 2: 152	-----	�
--------------------------------------------------
Candidate 0 @ 3: 20	-----	5
Candidate 1 @ 3: 31797	-----	 indoors
Candidate 2 @ 3: 152	-----	�
--------------------------------------------------


In [121]:
# Collected hypothesis token-id sequences (filled by the next cell).
hypotheses = []
num_located_tokens = len(indices_in_mlm_tokens)
# Every combination of one candidate per masked position.
num_all_cases = config['k_per_location'] ** num_located_tokens
# Scratch buffer: chosen candidate rank for each masked position.
tok_cand_combo = [0] * num_located_tokens

In [122]:
num_cands = config['k_per_location']
for case_id in range(num_all_cases):
    # print(case_id)
    # Read case_id as a base-k number: digit `slot` (least significant first)
    # is the candidate rank chosen for masked position `slot`.
    rest = case_id
    for slot in range(num_located_tokens):
        rest, tok_cand_combo[slot] = divmod(rest, num_cands)

    # Fill each masked position of a copy of the input ids with its chosen candidate.
    tmp_seq = inputs['input_ids'].clone()
    for slot, rank in enumerate(tok_cand_combo):
        tmp_seq[0, indices_in_mlm_tokens[slot]] = predicted_token_ids.indices[slot, rank]

    # need to do decode with RobertaTokenizer and encode with GPT2Tokenizer
    decoded_text = mlm_tokenizer.batch_decode(tmp_seq, skip_special_tokens=True)
    tmp_dec_seq = primary_tokenizer(decoded_text, return_tensors="pt").input_ids.cuda()
    hypotheses.append(tmp_dec_seq.squeeze(0))

In [None]:
## Code block to score hypotheses using GPT2 and EM.
# Build one loss function per configured loss; loss k uses the k-th model path.
lossfns = []
for loss_idx, loss_name in enumerate(config['losses']):
    model_name = config['model_paths'][loss_idx]
    tokenizer_for_loss = name2tokenizer[model_name]
    lossfns.append(lossbuilder.build_loss(loss_name, name2model[model_name], tokenizer_for_loss, build_loss_args))
    loss2modelname[loss_name] = model_name
    loss2tokenizer[loss_name] = tokenizer_for_loss

candidate_total_losses, candidate_primary_losses, candidate_losses_for_loggings = score_hypotheses(
    source_batch,
    hypotheses,
    config,
    lossfns,
    additional_batch=additional_batch,
    context_batch=context_batch,
    use_context=use_context,
    label_ids=label_ids,
    keywords=keywords,
    kweight=new_kweight,
)
# The hypothesis with the smallest total loss is taken as the best one.
best_ix = np.array(candidate_total_losses).argmin()
best_prediction = hypotheses[best_ix]
best_text = primary_tokenizer.decode(best_prediction)

In [153]:
print(f"Original sentence: {test_sentence}")
print("-"*100)
print(f"Best hypothesis: {best_text}")
print("-"*100)
# random.randint(a, b) includes b, so the previous upper bound len(hypotheses)
# could index one past the end; randrange excludes the upper bound.
print(f"Random sample of hypothesis: {primary_tokenizer.decode(hypotheses[random.randrange(len(hypotheses))])}")

Original sentence: wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Best hypothesis: wearing games and indoors indoors indoors indoors do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Random sample of hypothesis: wearing games and��55 do I hate horse wearing games.


## ~RobertaCustom With Mask Token Added + Only use first 11 layers~
- Hypothesis: Since the final layer of SeqClassificationModel is trained for SeqClassification task (using CLS token and what not), the candidate generation can improve if we do not use that final layer and only use the first few layers of transformer blocks when converting EM to MLM.
- Result: Did not help.

In [None]:
# First entry: the primary LM; second entry: the RobertaCustom sequence-classifier checkpoint.
config.update(model_paths=['gpt2-large', "models/roberta-base-jigsaw-toxicity-mlm-with-gpt2-large-embeds/checkpoint_best"])
# Tokenizers come from the same locations as the models (the very same list object).
config.update(
    tokenizer_paths=config['model_paths'],
    model_types=['AutoModelForCausalLM', 'RobertaCustomForSequenceClassification'],
)

In [164]:
# Checkpoint that is converted into an MLM (second entry of config['model_paths']).
em_checkpoint = config['model_paths'][1]
mlm_tokenizer = AutoTokenizer.from_pretrained(em_checkpoint)
mlm_config = AutoConfig.from_pretrained(em_checkpoint)
mlm_config.update({'num_hidden_layers':11}) # Line that does the magic of only using the first 11 transformer blocks.
mlm = RobertaCustomForMaskedLM.from_pretrained(em_checkpoint, config=mlm_config)

# Swap in the LM head of the vanilla roberta-base MLM.
original_mlm_model = AutoModelForMaskedLM.from_pretrained('roberta-base', cache_dir='hf_cache')
mlm.lm_head = original_mlm_model.lm_head

mlm.add_mask_token_embedding()

mlm.to(config['device'])
mlm.eval()

## Add mask token to tokenizers
original_mlm_tokenizer = AutoTokenizer.from_pretrained('roberta-base', cache_dir='hf_cache')
# Kept as the last expression so the cell still displays its return value.
mlm_tokenizer.add_special_tokens({'mask_token': original_mlm_tokenizer.mask_token})

Some weights of the model checkpoint at models/roberta-base-jigsaw-toxicity-mlm-with-gpt2-large-embeds/checkpoint_best were not used when initializing RobertaCustomForMaskedLM: ['roberta.encoder.layer.11.attention.self.value.weight', 'roberta.encoder.layer.11.attention.self.query.weight', 'roberta.encoder.layer.11.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.weight', 'roberta.encoder.layer.11.attention.self.key.weight', 'roberta.encoder.layer.11.output.LayerNorm.weight', 'roberta.encoder.layer.11.intermediate.dense.weight', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.11.attention.output.LayerNorm.bias', 'roberta.encoder.layer.11.attention.output.dense.bias', 'roberta.encoder.layer.11.attention.self.key.bias', 'roberta.encoder.layer.11.attention.output.LayerNorm.weight', 'roberta.encoder.layer.11.output.LayerNorm.bias', 'roberta.encoder.layer.11.attention.self.query.bias', 'roberta.encoder.layer.11.intermediate.dense.bias', 'rober

1

In [165]:
# Tokenize the masked sentence and run the MLM on the configured device.
inputs = mlm_tokenizer(test_sentence, return_tensors="pt")
# Pure inference: no autograd graph is needed for candidate generation.
with torch.no_grad():
    lm_outputs = mlm(**inputs.to(config['device']))

## get locations (indexes) in test_sentence that is filled with mask token.
indices_in_mlm_tokens = (inputs.input_ids == mlm_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(indices_in_mlm_tokens)

## get top k tokens for each index
logits = lm_outputs.logits
# torch.topk returns (values, indices); the later cells read `.indices`.
predicted_token_ids = torch.topk(logits[0, indices_in_mlm_tokens], k=config['k_per_location'], dim=-1)

tensor([4, 5, 6, 7], device='cuda:0')


In [166]:
## Check for candidates
## Candidates at each location are (nearly) the same -> problematic!
print(test_sentence)
print("-"*50)
# One group per masked position (previously hard-coded to 4 positions).
for i in range(len(indices_in_mlm_tokens)):
    for j in range(config['k_per_location']):
        print(f"Candidate {j} @ {i}: {predicted_token_ids.indices[i, j]}\t-----\t{mlm_tokenizer.decode(predicted_token_ids.indices[i, j])}")
    print("-"*50)

wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
--------------------------------------------------
Candidate 0 @ 0: 7351	-----	 serving
Candidate 1 @ 0: 20496	-----	 Latino
Candidate 2 @ 0: 5253	-----	 distance
--------------------------------------------------
Candidate 0 @ 1: 7351	-----	 serving
Candidate 1 @ 1: 20496	-----	 Latino
Candidate 2 @ 1: 1168	-----	 Z
--------------------------------------------------
Candidate 0 @ 2: 7351	-----	 serving
Candidate 1 @ 2: 1168	-----	 Z
Candidate 2 @ 2: 20496	-----	 Latino
--------------------------------------------------
Candidate 0 @ 3: 7351	-----	 serving
Candidate 1 @ 3: 20496	-----	 Latino
Candidate 2 @ 3: 1168	-----	 Z
--------------------------------------------------


In [167]:
# Collected hypothesis token-id sequences (filled by the next cell).
hypotheses = []
num_located_tokens = len(indices_in_mlm_tokens)
# Every combination of one candidate per masked position.
num_all_cases = config['k_per_location'] ** num_located_tokens
# Scratch buffer: chosen candidate rank for each masked position.
tok_cand_combo = [0] * num_located_tokens

In [168]:
num_cands = config['k_per_location']
for case_id in range(num_all_cases):
    # print(case_id)
    # Read case_id as a base-k number: digit `slot` (least significant first)
    # is the candidate rank chosen for masked position `slot`.
    rest = case_id
    for slot in range(num_located_tokens):
        rest, tok_cand_combo[slot] = divmod(rest, num_cands)

    # Fill each masked position of a copy of the input ids with its chosen candidate.
    tmp_seq = inputs['input_ids'].clone()
    for slot, rank in enumerate(tok_cand_combo):
        tmp_seq[0, indices_in_mlm_tokens[slot]] = predicted_token_ids.indices[slot, rank]

    # need to do decode with RobertaTokenizer and encode with GPT2Tokenizer
    decoded_text = mlm_tokenizer.batch_decode(tmp_seq, skip_special_tokens=True)
    tmp_dec_seq = primary_tokenizer(decoded_text, return_tensors="pt").input_ids.cuda()
    hypotheses.append(tmp_dec_seq.squeeze(0))

In [169]:
## Code block to score hypotheses using GPT2 and EM.
# Build one loss function per configured loss; loss k uses the k-th model path.
lossfns = []
for loss_idx, loss_name in enumerate(config['losses']):
    model_name = config['model_paths'][loss_idx]
    tokenizer_for_loss = name2tokenizer[model_name]
    lossfns.append(lossbuilder.build_loss(loss_name, name2model[model_name], tokenizer_for_loss, build_loss_args))
    loss2modelname[loss_name] = model_name
    loss2tokenizer[loss_name] = tokenizer_for_loss

candidate_total_losses, candidate_primary_losses, candidate_losses_for_loggings = score_hypotheses(
    source_batch,
    hypotheses,
    config,
    lossfns,
    additional_batch=additional_batch,
    context_batch=context_batch,
    use_context=use_context,
    label_ids=label_ids,
    keywords=keywords,
    kweight=new_kweight,
)
# The hypothesis with the smallest total loss is taken as the best one.
best_ix = np.array(candidate_total_losses).argmin()
best_prediction = hypotheses[best_ix]
best_text = primary_tokenizer.decode(best_prediction)

In [170]:
print(f"Original sentence: {test_sentence}")
print("-"*100)
print(f"Best hypothesis: {best_text}")
print("-"*100)
# random.randint(a, b) includes b, so the previous upper bound len(hypotheses)
# could index one past the end; randrange excludes the upper bound.
print(f"Random sample of hypothesis: {primary_tokenizer.decode(hypotheses[random.randrange(len(hypotheses))])}")

Original sentence: wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Best hypothesis: wearing games and indoors indoors indoors indoors do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Random sample of hypothesis: wearing games and serving Z Z serving do I hate horse wearing games.


## Use AutoModelForSequenceClassification for energy model instead
What if we do not use the custom RoBERTa EM, and thus remove the need to add a mask token? (That is, what if we just use a regular RoBERTa classifier as our EM and turn it into an MLM?)

In [None]:
# First entry: the primary LM; second entry: the regular RoBERTa classifier checkpoint.
config.update(model_paths=['gpt2-large', "/shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-energy-training/step_600_best_checkpoint"])
# Tokenizers come from the same locations as the models (the very same list object).
config.update(
    tokenizer_paths=config['model_paths'],
    model_types=['AutoModelForCausalLM', 'AutoModelForSequenceClassification'],
)

In [171]:
# Tokenizer of the primary model (first entry of config['model_paths']).
primary_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][0])

# Tokenizer and config of the checkpoint that is loaded as a masked LM.
mlm_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][1])
mlm_config = AutoConfig.from_pretrained(config['model_paths'][1])
# mlm_config.update({'num_hidden_layers':11})

mlm = AutoModelForMaskedLM.from_pretrained(config['model_paths'][1], config=mlm_config)

model_checkpoint = "roberta-base" ## replace the newly initialized lm_head with roberta-base's head
# model_checkpoint is now actually used instead of repeating the literal.
original_mlm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, cache_dir='hf_cache')
mlm.lm_head = original_mlm_model.lm_head
del original_mlm_model

original_mlm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir='hf_cache')

# Register roberta-base's mask token string with both tokenizers.
mlm_tokenizer.add_special_tokens({'mask_token': original_mlm_tokenizer.mask_token})
primary_tokenizer.add_special_tokens({'mask_token':original_mlm_tokenizer.mask_token})
primary_mask_token_id = primary_tokenizer.mask_token_id
mlm.to(config['device'])
# Kept as the last expression so the cell still displays the model summary.
mlm.eval()

https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/vocab.json HTTP/1.1" 200 0
Some weights of the model checkpoint at /shared/s3/lab07/hyeryung/loc_edit/roberta-base-jigsaw-toxicity-classifier-energy-training/step_600_best_checkpoint were not used when initializing RobertaForMaskedLM: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkp

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [172]:
# Tokenize the masked sentence and run the MLM on the configured device.
inputs = mlm_tokenizer(test_sentence, return_tensors="pt")
# Pure inference: no autograd graph is needed for candidate generation.
with torch.no_grad():
    lm_outputs = mlm(**inputs.to(config['device']))

## get locations (indexes) in test_sentence that is filled with mask token.
indices_in_mlm_tokens = (inputs.input_ids == mlm_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

## get top k tokens for each index
logits = lm_outputs.logits
# torch.topk returns (values, indices); the later cells read `.indices`.
predicted_token_ids = torch.topk(logits[0, indices_in_mlm_tokens], k=config['k_per_location'], dim=-1)

In [189]:
## Check for candidates
## Check whether the candidates differ across locations (identical candidates everywhere would be problematic).
print(test_sentence)
print("-"*50)
# One group per masked position (previously hard-coded to 4 positions).
for i in range(len(indices_in_mlm_tokens)):
    for j in range(config['k_per_location']):
        print(f"Candidate {j} @ {i}: {predicted_token_ids.indices[i, j]}\t-----\t{mlm_tokenizer.decode(predicted_token_ids.indices[i, j])}")
    print("-"*50)

wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
--------------------------------------------------
Candidate 0 @ 0: 47	-----	 you
Candidate 1 @ 0: 4108	-----	 beating
Candidate 2 @ 0: 14	-----	 that
--------------------------------------------------
Candidate 0 @ 1: 4	-----	.
Candidate 1 @ 1: 154	-----	ing
Candidate 2 @ 1: 47	-----	 you
--------------------------------------------------
Candidate 0 @ 2: 4	-----	.
Candidate 1 @ 2: 20	-----	 The
Candidate 2 @ 2: 6	-----	,
--------------------------------------------------
Candidate 0 @ 3: 6	-----	,
Candidate 1 @ 3: 370	-----	 You
Candidate 2 @ 3: 38	-----	 I
--------------------------------------------------


In [185]:
# Collected hypothesis token-id sequences (filled by the next cell).
hypotheses = []
num_located_tokens = len(indices_in_mlm_tokens)
# Every combination of one candidate per masked position.
num_all_cases = config['k_per_location'] ** num_located_tokens
# Scratch buffer: chosen candidate rank for each masked position.
tok_cand_combo = [0] * num_located_tokens

In [186]:
num_cands = config['k_per_location']
for case_id in range(num_all_cases):
    # print(case_id)
    # Read case_id as a base-k number: digit `slot` (least significant first)
    # is the candidate rank chosen for masked position `slot`.
    rest = case_id
    for slot in range(num_located_tokens):
        rest, tok_cand_combo[slot] = divmod(rest, num_cands)

    # Fill each masked position of a copy of the input ids with its chosen candidate.
    tmp_seq = inputs['input_ids'].clone()
    for slot, rank in enumerate(tok_cand_combo):
        tmp_seq[0, indices_in_mlm_tokens[slot]] = predicted_token_ids.indices[slot, rank]

    # need to do decode with RobertaTokenizer and encode with GPT2Tokenizer
    decoded_text = mlm_tokenizer.batch_decode(tmp_seq, skip_special_tokens=True)
    tmp_dec_seq = primary_tokenizer(decoded_text, return_tensors="pt").input_ids.cuda()
    hypotheses.append(tmp_dec_seq.squeeze(0))

In [192]:
## Code block to score hypotheses using GPT2 and EM.
# Build one loss function per configured loss; loss k uses the k-th model path.
lossfns = []
for loss_idx, loss_name in enumerate(config['losses']):
    model_name = config['model_paths'][loss_idx]
    tokenizer_for_loss = name2tokenizer[model_name]
    lossfns.append(lossbuilder.build_loss(loss_name, name2model[model_name], tokenizer_for_loss, build_loss_args))
    loss2modelname[loss_name] = model_name
    loss2tokenizer[loss_name] = tokenizer_for_loss

candidate_total_losses, candidate_primary_losses, candidate_losses_for_loggings = score_hypotheses(
    source_batch,
    hypotheses,
    config,
    lossfns,
    additional_batch=additional_batch,
    context_batch=context_batch,
    use_context=use_context,
    label_ids=label_ids,
    keywords=keywords,
    kweight=new_kweight,
)
# The hypothesis with the smallest total loss is taken as the best one.
best_ix = np.array(candidate_total_losses).argmin()
best_prediction = hypotheses[best_ix]
best_text = primary_tokenizer.decode(best_prediction)

In [193]:
print(f"Original sentence: {test_sentence}")
print("-"*100)
print(f"Best hypothesis: {best_text}")
print("-"*100)
# random.randint(a, b) includes b, so the previous upper bound len(hypotheses)
# could index one past the end; randrange excludes the upper bound.
print(f"Random sample of hypothesis: {primary_tokenizer.decode(hypotheses[random.randrange(len(hypotheses))])}")

Original sentence: wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Best hypothesis: wearing games and beating you. I do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Random sample of hypothesis: wearing games and thating,, do I hate horse wearing games.


## ~Use AutoModelForSequenceClassification for energy model instead + Use only 11 layers~
Double checking if the performance can improve by removing the final layer and using the first few layers with AutoModel EM turned MLM.

- Hypothesis: Since the final layer of SeqClassificationModel is trained for SeqClassification task (using CLS token and what not), the candidate generation can improve if we do not use that final layer and only use the first few layers of transformer blocks when converting EM to MLM.
- Result: Got worse. The hypothesis is incorrect.

In [208]:
# Tokenizer of the primary model (first entry of config['model_paths']).
primary_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][0])

# Tokenizer and config of the checkpoint that is loaded as a masked LM.
mlm_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][1])
mlm_config = AutoConfig.from_pretrained(config['model_paths'][1])
# Only build the first 11 transformer blocks.
mlm_config.update({'num_hidden_layers':11})

mlm = AutoModelForMaskedLM.from_pretrained(config['model_paths'][1], config=mlm_config)

model_checkpoint = "roberta-base" ## replace the newly initialized lm_head with roberta-base's head
# model_checkpoint is now actually used instead of repeating the literal.
original_mlm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, cache_dir='hf_cache')
mlm.lm_head = original_mlm_model.lm_head
del original_mlm_model

original_mlm_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir='hf_cache')

# Register roberta-base's mask token string with both tokenizers.
mlm_tokenizer.add_special_tokens({'mask_token': original_mlm_tokenizer.mask_token})
primary_tokenizer.add_special_tokens({'mask_token':original_mlm_tokenizer.mask_token})
primary_mask_token_id = primary_tokenizer.mask_token_id
mlm.to(config['device'])
# Kept as the last expression so the cell still displays the model summary.
mlm.eval()

https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/vocab.json HTTP/1.1" 200 0
Some weights of the model checkpoint at models/roberta-base-jigsaw-toxicity-mlm-with-gpt2-large-embeds/checkpoint_best were not used when initializing RobertaForMaskedLM: ['roberta.encoder.layer.11.attention.self.value.weight', 'roberta.encoder.layer.11.attention.self.query.weight', 'roberta.encoder.layer.11.output.dense.bias', 'roberta.encoder.layer.11.attention.output.dense.weight', 'roberta.encoder.layer.11.attention.self.key.weight', 'roberta.encoder.layer.11.output.LayerNorm.weight', 'roberta.encoder.layer.11.intermediate.dense.weight', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.11.attention

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [209]:
# Tokenize the masked sentence and run the MLM on the configured device.
inputs = mlm_tokenizer(test_sentence, return_tensors="pt")
# Pure inference: no autograd graph is needed for candidate generation.
with torch.no_grad():
    lm_outputs = mlm(**inputs.to(config['device']))

## get locations (indexes) in test_sentence that is filled with mask token.
indices_in_mlm_tokens = (inputs.input_ids == mlm_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

## get top k tokens for each index
logits = lm_outputs.logits
# torch.topk returns (values, indices); the later cells read `.indices`.
predicted_token_ids = torch.topk(logits[0, indices_in_mlm_tokens], k=config['k_per_location'], dim=-1)

In [210]:
## Check for candidates
## Check whether the candidates differ across locations (identical candidates everywhere would be problematic).
print(test_sentence)
print("-"*50)
# One group per masked position (previously hard-coded to 4 positions).
for i in range(len(indices_in_mlm_tokens)):
    for j in range(config['k_per_location']):
        print(f"Candidate {j} @ {i}: {predicted_token_ids.indices[i, j]}\t-----\t{mlm_tokenizer.decode(predicted_token_ids.indices[i, j])}")
    print("-"*50)

wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
--------------------------------------------------
Candidate 0 @ 0: 4555	-----	 eth
Candidate 1 @ 0: 2183	-----	 custom
Candidate 2 @ 0: 38690	-----	starting
--------------------------------------------------
Candidate 0 @ 1: 12018	-----	 ox
Candidate 1 @ 1: 4555	-----	 eth
Candidate 2 @ 1: 38690	-----	starting
--------------------------------------------------
Candidate 0 @ 2: 1088	-----	 around
Candidate 1 @ 2: 12018	-----	 ox
Candidate 2 @ 2: 2183	-----	 custom
--------------------------------------------------
Candidate 0 @ 3: 1088	-----	 around
Candidate 1 @ 3: 12018	-----	 ox
Candidate 2 @ 3: 73	-----	j
--------------------------------------------------


In [211]:
# Collected hypothesis token-id sequences (filled by the next cell).
hypotheses = []
num_located_tokens = len(indices_in_mlm_tokens)
# Every combination of one candidate per masked position.
num_all_cases = config['k_per_location'] ** num_located_tokens
# Scratch buffer: chosen candidate rank for each masked position.
tok_cand_combo = [0] * num_located_tokens

In [212]:
num_cands = config['k_per_location']
for case_id in range(num_all_cases):
    # print(case_id)
    # Read case_id as a base-k number: digit `slot` (least significant first)
    # is the candidate rank chosen for masked position `slot`.
    rest = case_id
    for slot in range(num_located_tokens):
        rest, tok_cand_combo[slot] = divmod(rest, num_cands)

    # Fill each masked position of a copy of the input ids with its chosen candidate.
    tmp_seq = inputs['input_ids'].clone()
    for slot, rank in enumerate(tok_cand_combo):
        tmp_seq[0, indices_in_mlm_tokens[slot]] = predicted_token_ids.indices[slot, rank]

    # need to do decode with RobertaTokenizer and encode with GPT2Tokenizer
    decoded_text = mlm_tokenizer.batch_decode(tmp_seq, skip_special_tokens=True)
    tmp_dec_seq = primary_tokenizer(decoded_text, return_tensors="pt").input_ids.cuda()
    hypotheses.append(tmp_dec_seq.squeeze(0))

In [213]:
## Code block to score hypotheses using GPT2 and EM.
# Build one loss function per configured loss; loss k uses the k-th model path.
lossfns = []
for loss_idx, loss_name in enumerate(config['losses']):
    model_name = config['model_paths'][loss_idx]
    tokenizer_for_loss = name2tokenizer[model_name]
    lossfns.append(lossbuilder.build_loss(loss_name, name2model[model_name], tokenizer_for_loss, build_loss_args))
    loss2modelname[loss_name] = model_name
    loss2tokenizer[loss_name] = tokenizer_for_loss

candidate_total_losses, candidate_primary_losses, candidate_losses_for_loggings = score_hypotheses(
    source_batch,
    hypotheses,
    config,
    lossfns,
    additional_batch=additional_batch,
    context_batch=context_batch,
    use_context=use_context,
    label_ids=label_ids,
    keywords=keywords,
    kweight=new_kweight,
)
# The hypothesis with the smallest total loss is taken as the best one.
best_ix = np.array(candidate_total_losses).argmin()
best_prediction = hypotheses[best_ix]
best_text = primary_tokenizer.decode(best_prediction)

In [214]:
print(f"Original sentence: {test_sentence}")
print("-"*100)
print(f"Best hypothesis: {best_text}")
print("-"*100)
# random.randint(a, b) includes b, so the previous upper bound len(hypotheses)
# could index one past the end; randrange excludes the upper bound.
print(f"Random sample of hypothesis: {primary_tokenizer.decode(hypotheses[random.randrange(len(hypotheses))])}")

Original sentence: wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Best hypothesis: wearing games and custom ox custom ox do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Random sample of hypothesis: wearing games and custom eth customj do I hate horse wearing games.


## Baseline: vanilla MLM

In [216]:
## Load the tokenizer for the first entry of config['model_paths'];
## the request log printed below shows it resolved to gpt2-large in this run.
primary_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][0])

https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2-large/resolve/main/vocab.json HTTP/1.1" 200 0


In [217]:
## Load the tokenizer and the model config for the second entry of config['model_paths'].
## NOTE(review): the baseline model loaded in the next cell is roberta-base, while this tokenizer
## comes from config['model_paths'][1] -- confirm that the two share the same vocabulary.
mlm_tokenizer = AutoTokenizer.from_pretrained(config['model_paths'][1])
mlm_config = AutoConfig.from_pretrained(config['model_paths'][1])

In [218]:
## Baseline masked LM: load the pretrained checkpoint named in `model_checkpoint`
## (weights are cached under ./hf_cache).
model_checkpoint = "roberta-base"
mlm = AutoModelForMaskedLM.from_pretrained(model_checkpoint, cache_dir='hf_cache')

https://huggingface.co:443 "HEAD /roberta-base/resolve/main/config.json HTTP/1.1" 200 0


In [219]:
## Register the MLM tokenizer's mask token as the primary tokenizer's mask token
## and remember the id the primary tokenizer assigns to it.
primary_tokenizer.add_special_tokens({'mask_token': mlm_tokenizer.mask_token})
primary_mask_token_id = primary_tokenizer.mask_token_id

## Move the MLM to the configured device and switch it to eval mode.
## Both calls return the module itself, so the cell still echoes the model summary.
mlm.to(config['device']).eval()

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [227]:
## Tokenize the masked sentence with the MLM tokenizer and run the MLM on it.
inputs = mlm_tokenizer(test_sentence, return_tensors="pt")
## Inference only: no_grad() avoids building an autograd graph for this forward pass
## (mlm.eval() alone does not disable gradient tracking).
with torch.no_grad():
    lm_outputs = mlm(**inputs.to(config['device']))

## get locations (indexes) in test_sentence that are filled with the mask token.
indices_in_mlm_tokens = (inputs.input_ids == mlm_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

## get top k tokens for each index
## NOTE: despite its name, predicted_token_ids holds the whole torch.topk result;
## the token ids are in predicted_token_ids.indices and the scores in .values.
logits = lm_outputs.logits
predicted_token_ids = torch.topk(logits[0, indices_in_mlm_tokens], k=config['k_per_location'], dim=-1)

In [228]:
## Check the candidates proposed at each masked location.
## (In the run recorded below, the candidate sets differ from location to location.)
print(test_sentence)
print("-"*50)
## Iterate over the rows of the top-k result instead of a hard-coded range(4), so the
## printout covers exactly as many locations as there are mask tokens in the sentence.
for i, location_candidates in enumerate(predicted_token_ids.indices):
    for j, token_id in enumerate(location_candidates):
        print(f"Candidate {j} @ {i}: {token_id}\t-----\t{mlm_tokenizer.decode(token_id)}")
    print("-"*50)

wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
--------------------------------------------------
Candidate 0 @ 0: 5253	-----	 horse
Candidate 1 @ 0: 8087	-----	 horses
Candidate 2 @ 0: 5793	-----	 riding
--------------------------------------------------
Candidate 0 @ 1: 8087	-----	 horses
Candidate 1 @ 1: 5253	-----	 horse
Candidate 2 @ 1: 4	-----	.
--------------------------------------------------
Candidate 0 @ 2: 2	-----	</s>
Candidate 1 @ 2: 4	-----	.
Candidate 2 @ 2: 8087	-----	 horses
--------------------------------------------------
Candidate 0 @ 3: 2612	-----	 Why
Candidate 1 @ 3: 596	-----	 why
Candidate 2 @ 3: 7608	-----	Why
--------------------------------------------------


In [229]:
## State for the exhaustive enumeration of candidate combinations:
## one slot per masked location, and k_per_location ** (number of locations) cases in total.
num_located_tokens = len(indices_in_mlm_tokens)
num_all_cases = config['k_per_location'] ** num_located_tokens
tok_cand_combo = [0] * num_located_tokens
hypotheses = []

In [230]:
## Build one hypothesis per combination of top-k candidates over the masked locations.
for case_id in range(num_all_cases):
    ## Read case_id as a base-k number (k = config['k_per_location']):
    ## digit i selects which candidate is used at masked location i.
    for i in range(num_located_tokens):
        tok_cand_combo[i] = (case_id // (config['k_per_location']**i)) % config['k_per_location']

    ## Fill the chosen candidate token ids into a copy of the masked input.
    tmp_seq = inputs['input_ids'].clone()
    for pos_id, tok_cand_id in enumerate(tok_cand_combo):
        tmp_seq[0, indices_in_mlm_tokens[pos_id]] = predicted_token_ids.indices[pos_id, tok_cand_id]

    # need to do decode with RobertaTokenizer and encode with GPT2Tokenizer
    # Use config['device'] (as the other cells do) rather than a hard-coded .cuda().
    tmp_dec_seq = primary_tokenizer(mlm_tokenizer.batch_decode(tmp_seq, skip_special_tokens=True), return_tensors="pt").input_ids.to(config['device'])
    hypotheses.append(tmp_dec_seq.squeeze(0))

In [231]:
## Code block to score hypotheses using GPT2 and EM.
## Build one loss function per entry of config['losses'], pairing the i-th loss with the
## i-th model path, and record which model path / tokenizer each loss uses.
lossfns = []
for i, loss in enumerate(config['losses']):
    loss_model_path = config['model_paths'][i]
    lossfns.append(
        lossbuilder.build_loss(
            loss,
            name2model[loss_model_path],
            name2tokenizer[loss_model_path],
            build_loss_args,
        )
    )
    loss2modelname[loss] = loss_model_path
    loss2tokenizer[loss] = name2tokenizer[loss_model_path]

## Score every hypothesis with the loss functions built above.
(
    candidate_total_losses,
    candidate_primary_losses,
    candidate_losses_for_loggings,
) = score_hypotheses(
    source_batch,
    hypotheses,
    config,
    lossfns,
    additional_batch=additional_batch,
    context_batch=context_batch,
    use_context=use_context,
    label_ids=label_ids,
    keywords=keywords,
    kweight=new_kweight,
)

## Keep the hypothesis with the smallest total loss and decode it with the primary tokenizer.
best_ix = np.asarray(candidate_total_losses).argmin()
best_prediction = hypotheses[best_ix]
best_text = primary_tokenizer.decode(best_prediction)

In [232]:
## Show the masked input, the lowest-loss hypothesis, and one randomly drawn hypothesis.
print(f"Original sentence: {test_sentence}")
print("-"*100)
print(f"Best hypothesis: {best_text}")
print("-"*100)
## random.randint includes its upper bound, so randint(0, len(hypotheses)) could index one
## past the end of the list and raise IndexError; random.choice always picks a valid element.
print(f"Random sample of hypothesis: {primary_tokenizer.decode(random.choice(hypotheses))}")

Original sentence: wearing games and<mask><mask><mask><mask> do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Best hypothesis: wearing games and riding horses. Why do I hate horse wearing games.
----------------------------------------------------------------------------------------------------
Random sample of hypothesis: wearing games and horse horse why do I hate horse wearing games.
