In [18]:
from transformers import AutoTokenizer, BertForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("Another of these advantages to which it is alluded to with the fare-play is to avoid that the team of the lawbreaker [MASK] takes advantage unjustly of the opposite team , which did not happen either .", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# Get top predictions
top_k = 20
top_indices = torch.topk(logits[0, mask_token_index], top_k).indices.tolist()

# Convert indices back to tokens
predicted_tokens = [tokenizer.decode(index) for index in top_indices]

print("Top", top_k, "predictions for [MASK]:", predicted_tokens)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top 20 predictions for [MASK]: ['team also then game always automatically only now usually player therefore never still instead again group had winner play,']


In [20]:
candidates = [  "a person who participates in or is skilled at some game",
      "someone who plays a musical instrument (as a profession)",
      "a theatrical performer",
      "a person who pursues a number of different social and sexual partners simultaneously",
      "an important participant (as in a business deal)"]

from sentence_transformers import SentenceTransformer
from torch.nn import CosineSimilarity

# Load Sentence-BERT model
model_name = 'bert-base-nli-mean-tokens'
model_2 = SentenceTransformer(model_name)

embeddings1 = model_2.encode(predicted_tokens, convert_to_tensor=True).reshape(1, -1)

cos_sim_list = []
for sentence2 in candidates:
    embeddings2 = model_2.encode(sentence2, convert_to_tensor=True).reshape(1, -1)  
    cos_sim = CosineSimilarity(dim=1, eps=1e-6)
    similarity_score = cos_sim(embeddings1, embeddings2).item()
    cos_sim_list.append(similarity_score)

[0.275272011756897, 0.13427379727363586, 0.11213163286447525, 0.19085539877414703, 0.17880640923976898]


In [23]:
import json
from tqdm import tqdm
from transformers import AutoTokenizer, BertForMaskedLM
import torch
from sentence_transformers import SentenceTransformer
from torch.nn import CosineSimilarity

In [50]:
# FIRST ATTEMPT FOR A LOOP

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
top_k = 10
model_name = 'bert-base-nli-mean-tokens'
model_2 = SentenceTransformer(model_name)

with open("ALL_preprocessed.json") as f:
    data = json.load(f)
    ris = []
    for i in tqdm(range(len(data))):
        input_sentence = data[i]["text"]
        masked_sentence = input_sentence.replace(data[i]["word"], "[MASK]")
        inputs = tokenizer(masked_sentence, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # retrieve index of [MASK]
        mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        # get top predictions
        top_indices = torch.topk(logits[0, mask_token_index], top_k).indices.tolist()
        # convert indices back to tokens
        predicted_tokens = [tokenizer.decode(index) for index in top_indices]
        
        candidates = data[i]["definitions"]
        embeddings1 = model_2.encode(predicted_tokens, convert_to_tensor=True).mean(axis=0)
        cos_sim_list = []
        for sentence2 in candidates:
            embeddings2 = model_2.encode([sentence2], convert_to_tensor=True).reshape(1,-1)
            cos_sim = CosineSimilarity(dim=1, eps=1e-6)
            similarity_score = cos_sim(embeddings1, embeddings2).item()
            cos_sim_list.append(similarity_score)
        best_candidate_idx = torch.tensor(cos_sim_list).argmax(axis=-1).item()
        ris.append( {"id" : data[i]["id"], "answer" : candidates[best_candidate_idx], "gold" : data[i]["gold_definitions"]} )
        

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 7253/7253 [15:28<00:00,  7.81it/s]


In [51]:
eval_ris = 0
for e in ris:
    if e["answer"] in e["gold"]:
        eval_ris += 1
        
print(f"| ACCURACY is: {eval_ris/len(ris)} |")

| ACCURACY is: 0.5059975182683027 |
