In [5]:
# Load model directly
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

torch.manual_seed(17)

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small", use_fast=True, 
                                          add_prefix_space=True)
model = AutoModelForCausalLM.from_pretrained("prajjwal1/bert-small", return_dict_in_generate=True, 
                                             pad_token_id=tokenizer.eos_token_id).to(device)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
test_seq = (["Hi","test", "test"])
tokenizer.convert_tokens_to_ids(test_seq)

[100, 3231, 3231]

In [11]:
vocab = tokenizer.get_vocab()
len(vocab)

30522

In [15]:
#get a list of all the tokens in the vocabulary
tokens = list(vocab.keys())
# get the index of the token "test" in the vocabulary
test_token_index = tokens.index("test")
# get the index of the token "test" in the vocabulary
print(test_token_index)
print(tokens[test_token_index])

28369
test


In [94]:
prompt = " a   "
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
print(input_ids)

with torch.no_grad():
    output = model(input_ids)
    logits = output[0]
    probs = logits.softmax(-1)

print(probs.shape)

tensor([[ 101, 1037,  102]])
torch.Size([1, 3, 30522])


In [74]:
for i in range(probs.shape[-1]):
    prob = probs[0, -1, i].item()
    # if prob > 0.0001:
    print(f"Probability of {tokenizer.decode(i)} ({i}) is {prob:.4f}")

Probability of [PAD] (0) is 0.0000
Probability of [unused0] (1) is 0.0000
Probability of [unused1] (2) is 0.0000
Probability of [unused2] (3) is 0.0000
Probability of [unused3] (4) is 0.0000
Probability of [unused4] (5) is 0.0000
Probability of [unused5] (6) is 0.0000
Probability of [unused6] (7) is 0.0000
Probability of [unused7] (8) is 0.0000
Probability of [unused8] (9) is 0.0000
Probability of [unused9] (10) is 0.0000
Probability of [unused10] (11) is 0.0000
Probability of [unused11] (12) is 0.0000
Probability of [unused12] (13) is 0.0000
Probability of [unused13] (14) is 0.0000
Probability of [unused14] (15) is 0.0000
Probability of [unused15] (16) is 0.0000
Probability of [unused16] (17) is 0.0000
Probability of [unused17] (18) is 0.0000
Probability of [unused18] (19) is 0.0000
Probability of [unused19] (20) is 0.0000
Probability of [unused20] (21) is 0.0000
Probability of [unused21] (22) is 0.0000
Probability of [unused22] (23) is 0.0000
Probability of [unused23] (24) is 0.0000


In [86]:
words = ["medicine", "science"]
indexes = [tokenizer.encode(word) for word in words]

print(indexes)

for i in indexes:
    word_prob = torch.prod(torch.stack([probs[0, -1, idx] for idx in i]))
    print(f"Probability of {tokenizer.decode(i)} is {word_prob:.20f}")


[[101, 4200, 102], [101, 2671, 102]]
Probability of [CLS] medicine [SEP] is 0.00000000000000000460
Probability of [CLS] science [SEP] is 0.00000000000000002013


In [87]:
words_2 = ["medicine", "science"]
indexes = [tokenizer.encode(word) for word in words]
#removing the cls and sep tokens
indexes = [i[1:-1] for i in indexes]
print(indexes)

for i in indexes:
    word_prob = torch.prod(torch.stack([probs[0, -1, idx] for idx in i]))
    print(f"Probability of {tokenizer.decode(i)} is {word_prob:.20f}")


[[4200], [2671]]
Probability of medicine is 0.00000477175490232185
Probability of science is 0.00002085873165924568


If we remove the CLS and SEP token from the input the probability 

In [88]:
words_probs = {}

for i in range(probs.shape[-1]):
    prob = probs[0, -1, i].item()
    words_probs[(tokenizer.decode(i)).replace(" ", "")] = prob


print(words_probs)
#get the top 5 words with the highest probability
top_words = sorted(words_probs, key=words_probs.get, reverse=True)[:5]
print(top_words)

print(words_probs["medicine"])
print(words_probs["science"])


['"', 'the', 'he', 'she', 'was']
4.771754902321845e-06
2.0858731659245677e-05


In [85]:
wanted_words_probs = {}
for word in words:
    wanted_words_probs[word] = words_probs[word]

total = sum(wanted_words_probs.values())
for word in wanted_words_probs:
    wanted_words_probs[word] = wanted_words_probs[word] / total

print(wanted_words_probs)

{'medicine': 0.1861749635871919, 'science': 0.8138250364128081}
