#### Mask Filling (Masked Language Modeling)

In [12]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
model_path = "/workspace/storage/llm_addition/train_add_decomp/xlm-roberta-large_scratch/results/checkpoint-3500"
model = AutoModelForMaskedLM.from_pretrained(model_path)
mask_filler = pipeline("fill-mask", model, tokenizer=tokenizer)

In [7]:
text = "Compute with pipeline 98 plus 45 = <mask>"
mask_filler(text)

[{'score': 0.9882730841636658,
  'token': 19309,
  'token_str': 'late',
  'sequence': 'Compute with pipeline 98 plus 45 =late'},
 {'score': 0.004882052540779114,
  'token': 11062,
  'token_str': 'Trans',
  'sequence': 'Compute with pipeline 98 plus 45 = Trans'},
 {'score': 0.000832964782603085,
  'token': 25072,
  'token_str': 'unit',
  'sequence': 'Compute with pipeline 98 plus 45 = unit'},
 {'score': 0.0006677475175820291,
  'token': 277,
  'token_str': 'com',
  'sequence': 'Compute with pipeline 98 plus 45 =com'},
 {'score': 0.0006235087639652193,
  'token': 14012,
  'token_str': 'number',
  'sequence': 'Compute with pipeline 98 plus 45 = number'}]

### Seq2Seq

In [2]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
from transformers import pipeline

path = "/workspace/storage/llm_addition/train/t5-large_scratch_updated/results/best_checkpoint"
model = T5ForConditionalGeneration.from_pretrained(path).to(device)
tokenizer = AutoTokenizer.from_pretrained("t5-large")

2023-07-28 19:04:29.658618: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-28 19:04:29.864145: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-28 19:04:30.464534: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-28 19:04:30.464606: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [29]:
n1 = "1250"
n2 = "3410"

max_len = 20
n1_filler = "0" * (max_len - len(n1))
n2_filler = "0" * (max_len - len(n2))

prompt = f"{n1_filler}{n1}{n2_filler}{n2}"
prompts = [prompt] * 64
prompts

['0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '0000000000000000125000000000000000003410',
 '00000000

In [30]:
import re

input_ids = tokenizer(prompts, return_tensors="pt")['input_ids'].to(device)
preds = model.generate(input_ids)
decoded_preds = tokenizer.batch_decode(preds)
print(decoded_preds)
#pred_number = int(re.findall(r"\d+", decoded_preds)[0])
#pred_number, int(n1) + int(n2)

['<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000000000662</s>', '<pad> 000000000000