In [14]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [15]:
# Load the vocabulary / tokenizer of the pre-trained uncased BERT base model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input sentence, already wrapped in the [CLS] / [SEP] special tokens
text = "[CLS] I drop the carpet on the floor [SEP]"

# Position of the token that is replaced by [MASK] and later predicted with `BertForMaskedLM`.
# NOTE(review): position 0 is the leading [CLS] marker, not a word of the sentence —
# confirm this is the intended target.
masked_index = 0

# Tokenize, then overwrite the chosen position with the mask token
tokenized_text = tokenizer.tokenize(text)
tokenized_text[masked_index] = '[MASK]'

# Map every token to its vocabulary id
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Single-sentence input: every position is assigned to segment 0 (sentence A)
segments_ids = [0 for _ in tokenized_text]

# Wrap each list in an outer list so the tensors carry a leading batch dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [3]:
# Load pre-trained model (weights) and switch it to evaluation mode
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# If you have a GPU, put everything on cuda
# tokens_tensor = tokens_tensor.to('cuda')
# segments_tensors = segments_tensors.to('cuda')
# model.to('cuda')

# Predict all tokens
with torch.no_grad():
    # The segment ids must be passed by keyword: in `transformers` the second
    # positional parameter of forward() is `attention_mask`, so passing the
    # all-zero segment tensor positionally would mask every token out of attention.
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)

# Highest-scoring vocabulary entry at the masked position.
# predictions[0] holds the prediction scores; [0, masked_index] selects the
# first (only) sequence of the batch and the masked position.
predicted_index = torch.argmax(predictions[0][0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]


In [4]:
# Prediction scores over the vocabulary at the masked position of the first (only) sequence
masked_predictions = predictions[0][0, masked_index]

In [9]:
from scipy.special import softmax
# Normalise the scores at the masked position into probabilities;
# axis=None (the default) normalises over the entire array.
softmax(masked_predictions.numpy(), axis=None)

0.9999999721205413

In [11]:
# Number of highest-scoring candidates to keep. Defined in this cell so it runs
# on a fresh kernel; 10 matches the ten ids in the recorded output.
k = 10
# torch.topk returns a (values, indices) pair; [1] keeps the vocabulary ids
topk_preds = torch.topk(masked_predictions, k)[1].numpy()
topk_preds

array([1012, 1010, 1998, 1011, 2000, 2035, 1997, 1999, 1996, 2005])

In [12]:
# Probabilities of the top-k candidate ids, in the same order as topk_preds
softmax(masked_predictions.numpy(), axis=None)[topk_preds]

array([0.19804391, 0.02162127, 0.01590722, 0.01377165, 0.01135784,
       0.00918572, 0.00785187, 0.00753697, 0.00752699, 0.00594372],
      dtype=float32)

In [7]:
import numpy as np
def load_embedding(glove_file):
    """Read a GloVe-style text embedding file into a dictionary.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_file: Path of a UTF-8 encoded text file to read.

    Returns:
        dict mapping each token (str) to a 1-D ``numpy.ndarray`` of floats.
        If a token appears on several lines, the last one wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove_dict = {}
    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            # Blank or whitespace-only lines (e.g. a trailing empty line) carry
            # no token; skip them instead of failing on fields[0].
            if not fields:
                continue
            word, values = fields[0], fields[1:]
            glove_dict[word] = np.array([float(val) for val in values])

    return glove_dict

In [8]:
# Relative path of the GloVe vectors file
# (the file name suggests the 6B-token, 100-dimensional release — TODO confirm).
glove_file = "./data/glove/glove.6B.100d.txt"
# Parse the whole file into a {word: vector} dictionary
glove_dict = load_embedding(glove_file)

In [16]:
# Display the token list to check which position was replaced by [MASK]
tokenized_text

['[MASK]', 'i', 'drop', 'the', 'carpet', 'on', 'the', 'floor', '[SEP]']

In [21]:
# Order the sample tuples by their second element, largest first
list1 = sorted([(9, 2), (3, 3), (1, 1)], key=lambda pair: pair[1], reverse=True)

In [22]:
# Display the list to check its ordering by each tuple's second element
list1

[(3, 3), (9, 2), (1, 1)]