In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[?25l[K     |                                | 10 kB 32.0 MB/s eta 0:00:01[K     |▏                               | 20 kB 32.4 MB/s eta 0:00:01[K     |▎                               | 30 kB 20.3 MB/s eta 0:00:01[K     |▍                               | 40 kB 17.2 MB/s eta 0:00:01[K     |▌                               | 51 kB 12.3 MB/s eta 0:00:01[K     |▌                               | 61 kB 14.3 MB/s eta 0:00:01[K     |▋                               | 71 kB 15.1 MB/s eta 0:00:01[K     |▊                               | 81 kB 15.1 MB/s eta 0:00:01[K     |▉                               | 92 kB 16.6 MB/s eta 0:00:01[K     |█                               | 102 kB 13.9 MB/s eta 0:00:01[K     |█                               | 112 kB 13.9 MB/s eta 0:00:01[K     |█                               | 122 kB 13.9 MB/s eta 0:00:01[K     |█▏                              | 133 kB 13.9

In [3]:
import torch

import csv
import sys
import time
from collections import Counter

from transformers import BertTokenizer, BertModel, BertForMaskedLM
import torch.nn.functional as F

# Explore data set

In [None]:
NUM_ENTITY = 'multiple_entity'
TYPE = 'sra'

In [4]:
#if NUM_ENTITY == 'multiple_entity' and TYPE == 'sra':
#    data_dir = './data/combined_data/multiple_entity_distractor/BertBase/complete_data_For_MultipleEntityObjectDistractorAccuracyBertBase.csv'

data_dir = './complete_data_For_MultipleEntityObjectDistractorAccuracyBertBase.csv'

In [5]:
def ordered_items_to_list(items):
    return candidate.strip('[').strip(']').replace("'",'').replace(' ','').split(',')

In [6]:
f = open(data_dir)
reader = csv.DictReader(f, delimiter='\t')

ct = 0
targets = []
sentences = []
candidates = []
num_attractors = []
pre_pred = []

for row in reader:
    sentence = row['sentence']
    target = row['target_occupation']
    candidate = row['ordered_items']
    n_attractors = row['count_attractors']
    rel_rank = float(row['relative_rank'])
    
    targets.append(target)
    sentences.append(sentence)
    candidates.append(ordered_items_to_list(candidate))
    num_attractors.append(n_attractors)
    if rel_rank == 1:
        pre_pred.append(1)
    else:
        pre_pred.append(0)
    ct += 1
    if ct < 10:
        print("No. {} Sentence: {} Target: {}".format(ct, sentence, target))

No. 1 Sentence: Daniel works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 2 Sentence: Daniel has a sister and now works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 3 Sentence: Daniel has a sister , played basketball , and now works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 4 Sentence: Daniel has a sister , played basketball , sang in a choir , and now works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 5 Sentence: Sebastian works as an optician . For his job , Sebastian sells [MASK] . Target: glasses
No. 6 Sentence: Sebastian has a sister and now works as an optician . For his job , Sebastian sells [MASK] . Target: glasses
No. 7 Sentence: Sebastian has a sister , played basketball , and now works as an optician . For his job , Sebastian sells [MASK] . Target: glasses
No. 8 Sentence: Sebastian has a sister , played basketball , sang in a choir , and now works as an optician . For h

In [7]:
targets_counter = Counter(targets)
attractor_counter = Counter(num_attractors)

candidate_targets = Counter()
for candidate in candidates:
    candidate_targets.update(candidate)


print(f"Number of instances: {ct}")
print(candidate_targets)
print(targets_counter)
print(attractor_counter)

Number of instances: 12896
Counter({'flowers': 4128, 'paintings': 4128, 'fish': 4128, 'glasses': 4128, 'meat': 4128, 'bread': 4128, 'santiago': 4128, 'paris': 4128, 'beijing': 4128, 'warsaw': 4128, 'jakarta': 4128, 'helsinki': 4128, 'India': 4128, 'Egypt': 4128, 'France': 4128, 'Italy': 4128, 'Peru': 4128, 'Russia': 4128, 'goal': 512, 'touchdown': 512, 'run': 512, 'century': 512})
Counter({'flowers': 688, 'glasses': 688, 'meat': 688, 'bread': 688, 'fish': 688, 'paintings': 688, 'santiago': 688, 'beijing': 688, 'helsinki': 688, 'paris': 688, 'jakarta': 688, 'warsaw': 688, 'india': 688, 'france': 688, 'egypt': 688, 'peru': 688, 'italy': 688, 'russia': 688, 'touchdown': 128, 'run': 128, 'goal': 128, 'century': 128})
Counter({'3': 8832, '2': 3072, '1': 816, '0': 176})


# BERT masked word prediction

In [8]:
def prepare_text(text, model):
    """
    Input:
        text: typically an instance of a sentence in the data.
        model: can be 'BERT'
    Output:
        res: a string consisting of orginal tokens and start-of-sentence and sentence separators.
    """
    res = []
    if model == 'BERT':
        res.append("[CLS]")
        res += text.strip().split()        
        if "[mask]" in res:
            res[res.index("[mask]")] = "[MASK]"
        #period_index = [ind for ind, tok in enumerate(res) if tok == '.']
        #for i, ind in enumerate(period_index):
        #    res.insert(ind + 1 + i, "[SEP]")
        res.append("[SEP]")
    return " ".join(res)

In [9]:
print(sentences[10000])
prepare_text(sentences[10000], "BERT")

john visited the tower of pisa , sebastian visited peru , daniel visited france , and joe visited egypt . the country john traveled to was [mask] .


'[CLS] john visited the tower of pisa , sebastian visited peru , daniel visited france , and joe visited egypt . the country john traveled to was [MASK] . [SEP]'

## 1. Bert-Base-Uncased

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to("cuda")
model.eval()

def predict_masked(text, candidates, verbose=False):
    """
    Input:
        text: a prepared instance of a sentence in the data.
        candidates: candidate words for which to calculate probabilities.
        verbose: whether to print text along with predicted probabilities
    Output:
        prediction: one of the candidates with highest predicted probability.
        probs: a tensor of predicted probailities of each candidate.
    """
    
    cand_probs = []
    
    if verbose:
        print(text)
    tokenized_text = tokenizer.tokenize(text)
    if "[MASK]" in tokenized_text:
        masked_index = tokenized_text.index("[MASK]")
    elif "[mask]" in tokenized_text:
        masked_index = tokenized_text.index("[mask]")
    else:
        print("No masks found.")
        return -1, torch.ones(len(candidates)) * (-99)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensors = torch.tensor([indexed_tokens]).cuda()
    
    with torch.no_grad():
        outputs = model(tokens_tensors)
        predictions = outputs[0]
        probs = F.softmax(predictions[0, masked_index], dim=-1)
        
    
    for cand in candidates:
        cand_id = [tokenizer.convert_tokens_to_ids(cand)]
        token_weight = probs[cand_id].float().item()
        if verbose:
            print(f"    {cand} | weights: {token_weight:.4f}")
        cand_probs.append(token_weight)
        
    cand_probs = torch.tensor(cand_probs)
    prediction = candidates[cand_probs.argmax().item()]
    
    return prediction, cand_probs


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
predict_masked(prepare_text(sentences[10000], "BERT"), candidates[10000], True)

[CLS] john visited the tower of pisa , sebastian visited peru , daniel visited france , and joe visited egypt . the country john traveled to was [MASK] . [SEP]
    Italy | weights: 0.0000
    France | weights: 0.0000
    Egypt | weights: 0.0000
    Peru | weights: 0.0000
    Russia | weights: 0.0000
    India | weights: 0.0000


('Italy',
 tensor([3.2347e-05, 3.2347e-05, 3.2347e-05, 3.2347e-05, 3.2347e-05, 3.2347e-05]))

In [17]:
texts = [prepare_text(text, "BERT") for text in sentences]

In [21]:
ct = 0
pred_correct = [0] * len(texts)

for i, (text, cand, target, pp) in enumerate(zip(texts, candidates, targets, pre_pred)):
    ct += 1
    pred, _ = predict_masked(text, cand, False)
    if pred.lower() == target.lower():
        pred_correct[i] = 1
    if pred_correct[i] != pp:
        if pp == 1:
            print(f"No. {ct}. {text} | predicted: {pred} | pretrained: {target}")
        else:
            print(f"No. {ct}. {text} | predicted: {pred} | pretrained: {-1}")
        
    if ct % 200 == 0:
        print("processed: {}/{}".format(ct, len(sentences)))
        print("=" * 60)
    

processed: 200/12896
processed: 400/12896
processed: 600/12896
processed: 800/12896
processed: 1000/12896
processed: 1200/12896
processed: 1400/12896
processed: 1600/12896
processed: 1800/12896
processed: 2000/12896
processed: 2200/12896
processed: 2400/12896
processed: 2600/12896
processed: 2800/12896
processed: 3000/12896
processed: 3200/12896
processed: 3400/12896
processed: 3600/12896
processed: 3800/12896
processed: 4000/12896
processed: 4200/12896
processed: 4400/12896
processed: 4600/12896
processed: 4800/12896
processed: 5000/12896
processed: 5200/12896
processed: 5400/12896
processed: 5600/12896
processed: 5800/12896
processed: 6000/12896
processed: 6200/12896
processed: 6400/12896
processed: 6600/12896
processed: 6800/12896
processed: 7000/12896
processed: 7200/12896
processed: 7400/12896
processed: 7600/12896
processed: 7800/12896
processed: 8000/12896
processed: 8200/12896
processed: 8400/12896
processed: 8600/12896
processed: 8800/12896
processed: 9000/12896
processed: 920

In [22]:
accuracy_0attractor = []
accuracy_1attractor = []
accuracy_2attractor = []
accuracy_3attractor = []

for i in range(len(num_attractors)):
    n = int(num_attractors[i])
    if n == 0:
        accuracy_0attractor += [pred_correct[i]]
    elif n == 1:
        accuracy_1attractor += [pred_correct[i]]
    elif n == 2:
        accuracy_2attractor += [pred_correct[i]]
    elif n == 3:
        accuracy_3attractor += [pred_correct[i]]
    else:
        print("Instance {}: more attractor than 3?".format(i))
        
        
print(f"Accuracy for 0 attractor(s): {sum(accuracy_0attractor) / len(accuracy_0attractor):.4f}")
print(f"Accuracy for 1 attractor(s): {sum(accuracy_1attractor) / len(accuracy_1attractor):.4f}")
print(f"Accuracy for 2 attractor(s): {sum(accuracy_2attractor) / len(accuracy_2attractor):.4f}")
print(f"Accuracy for 3 attractor(s): {sum(accuracy_3attractor) / len(accuracy_3attractor):.4f}")


Accuracy for 0 attractor(s): 0.9091
Accuracy for 1 attractor(s): 0.2451
Accuracy for 2 attractor(s): 0.3516
Accuracy for 3 attractor(s): 0.4250


In [23]:
accuracy_0attractor = []
accuracy_1attractor = []
accuracy_2attractor = []
accuracy_3attractor = []

for i in range(len(num_attractors)):
    n = int(num_attractors[i])
    if n == 0:
        accuracy_0attractor += [pre_pred[i]]
    elif n == 1:
        accuracy_1attractor += [pre_pred[i]]
    elif n == 2:
        accuracy_2attractor += [pre_pred[i]]
    elif n == 3:
        accuracy_3attractor += [pre_pred[i]]
    else:
        print("Instance {}: more attractor than 3?".format(i))
        
        
print(f"Accuracy for 0 attractor(s): {sum(accuracy_0attractor) / len(accuracy_0attractor):.4f}")
print(f"Accuracy for 1 attractor(s): {sum(accuracy_1attractor) / len(accuracy_1attractor):.4f}")
print(f"Accuracy for 2 attractor(s): {sum(accuracy_2attractor) / len(accuracy_2attractor):.4f}")
print(f"Accuracy for 3 attractor(s): {sum(accuracy_3attractor) / len(accuracy_3attractor):.4f}")


Accuracy for 0 attractor(s): 0.9091
Accuracy for 1 attractor(s): 0.2451
Accuracy for 2 attractor(s): 0.3516
Accuracy for 3 attractor(s): 0.4250


## 2. Bert-Large-Uncased

In [27]:
data_dir = './complete_data_For_MultipleEntityObjectDistractorAccuracyBertLarge.csv'
f = open(data_dir)
reader = csv.DictReader(f, delimiter='\t')

ct = 0
targets = []
sentences = []
candidates = []
num_attractors = []
pre_pred = []

for row in reader:
    sentence = row['sentence']
    target = row['target_occupation']
    candidate = row['ordered_items']
    n_attractors = row['count_attractors']
    rel_rank = float(row['relative_rank'])
    
    targets.append(target)
    sentences.append(sentence)
    candidates.append(ordered_items_to_list(candidate))
    num_attractors.append(n_attractors)
    if rel_rank == 1:
        pre_pred.append(1)
    else:
        pre_pred.append(0)
    ct += 1
    if ct < 10:
        print("No. {} Sentence: {} Target: {}".format(ct, sentence, target))

No. 1 Sentence: Daniel works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 2 Sentence: Daniel has a sister and now works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 3 Sentence: Daniel has a sister , played basketball , and now works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 4 Sentence: Daniel has a sister , played basketball , sang in a choir , and now works as a florist . For his job , Daniel sells [MASK] . Target: flowers
No. 5 Sentence: Sebastian works as an optician . For his job , Sebastian sells [MASK] . Target: glasses
No. 6 Sentence: Sebastian has a sister and now works as an optician . For his job , Sebastian sells [MASK] . Target: glasses
No. 7 Sentence: Sebastian has a sister , played basketball , and now works as an optician . For his job , Sebastian sells [MASK] . Target: glasses
No. 8 Sentence: Sebastian has a sister , played basketball , sang in a choir , and now works as an optician . For h

In [28]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertForMaskedLM.from_pretrained('bert-large-uncased').to("cuda")
model.eval()

def predict_masked(text, candidates, verbose=False):
    """
    Input:
        text: a prepared instance of a sentence in the data.
        candidates: candidate words for which to calculate probabilities.
        verbose: whether to print text along with predicted probabilities
    Output:
        prediction: one of the candidates with highest predicted probability.
        probs: a tensor of predicted probailities of each candidate.
    """
    
    cand_probs = []
    
    if verbose:
        print(text)
    tokenized_text = tokenizer.tokenize(text)
    if "[MASK]" in tokenized_text:
        masked_index = tokenized_text.index("[MASK]")
    elif "[mask]" in tokenized_text:
        masked_index = tokenized_text.index("[mask]")
    else:
        print("No masks found.")
        return -1, torch.ones(len(candidates)) * (-99)

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensors = torch.tensor([indexed_tokens]).cuda()
    
    with torch.no_grad():
        outputs = model(tokens_tensors)
        predictions = outputs[0]
    
    probs = F.softmax(predictions[0, masked_index], dim=0)#-1)
    
    for cand in candidates:
        cand_id = [tokenizer.convert_tokens_to_ids(cand)]
        token_weight = probs[cand_id].float().item()
        if verbose:
            print(f"    {cand} | weights: {token_weight:.4f}")
        cand_probs.append(token_weight)
        
    cand_probs = torch.tensor(cand_probs)
    prediction = candidates[cand_probs.argmax().item()]
    
    return prediction, cand_probs


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
predict_masked(prepare_text(sentences[10000], "BERT"), candidates[10000], True)

[CLS] john visited the tower of pisa , sebastian visited peru , daniel visited france , and joe visited egypt . the country john traveled to was [MASK] . [SEP]
    Italy | weights: 0.0000
    India | weights: 0.0000
    Egypt | weights: 0.0000
    France | weights: 0.0000
    Russia | weights: 0.0000
    Peru | weights: 0.0000


('Italy',
 tensor([1.1538e-05, 1.1538e-05, 1.1538e-05, 1.1538e-05, 1.1538e-05, 1.1538e-05]))

In [30]:
texts = [prepare_text(text, "BERT") for text in sentences]
  
ct = 0
pred_correct = [0] * len(texts)

for i, (text, cand, target, pp) in enumerate(zip(texts, candidates, targets, pre_pred)):
    ct += 1
    pred, _ = predict_masked(text, cand, False)
    if pred.lower() == target.lower():
        pred_correct[i] = 1
    if pred_correct[i] != pp:
        if pp == 1:
            print(f"No. {ct}. {text} | predicted: {pred} | pretrained: {target}")
        else:
            print(f"No. {ct}. {text} | predicted: {pred} | pretrained: {-1}")
        
    if ct % 200 == 0:
        print("processed: {}/{}".format(ct, len(sentences)))
        print("=" * 60)
    

processed: 200/12896
processed: 400/12896
processed: 600/12896
processed: 800/12896
processed: 1000/12896
processed: 1200/12896
processed: 1400/12896
processed: 1600/12896
processed: 1800/12896
processed: 2000/12896
processed: 2200/12896
processed: 2400/12896
processed: 2600/12896
processed: 2800/12896
processed: 3000/12896
processed: 3200/12896
processed: 3400/12896
processed: 3600/12896
processed: 3800/12896
processed: 4000/12896
processed: 4200/12896
processed: 4400/12896
processed: 4600/12896
processed: 4800/12896
processed: 5000/12896
processed: 5200/12896
processed: 5400/12896
processed: 5600/12896
processed: 5800/12896
processed: 6000/12896
processed: 6200/12896
processed: 6400/12896
processed: 6600/12896
processed: 6800/12896
processed: 7000/12896
processed: 7200/12896
processed: 7400/12896
processed: 7600/12896
processed: 7800/12896
processed: 8000/12896
processed: 8200/12896
processed: 8400/12896
processed: 8600/12896
processed: 8800/12896
processed: 9000/12896
processed: 920

In [31]:
accuracy_0attractor = []
accuracy_1attractor = []
accuracy_2attractor = []
accuracy_3attractor = []

for i in range(len(num_attractors)):
    n = int(num_attractors[i])
    if n == 0:
        accuracy_0attractor += [pred_correct[i]]
    elif n == 1:
        accuracy_1attractor += [pred_correct[i]]
    elif n == 2:
        accuracy_2attractor += [pred_correct[i]]
    elif n == 3:
        accuracy_3attractor += [pred_correct[i]]
    else:
        print("Instance {}: more attractor than 3?".format(i))
        
        
print(f"Accuracy for 0 attractor(s): {sum(accuracy_0attractor) / len(accuracy_0attractor):.4f}")
print(f"Accuracy for 1 attractor(s): {sum(accuracy_1attractor) / len(accuracy_1attractor):.4f}")
print(f"Accuracy for 2 attractor(s): {sum(accuracy_2attractor) / len(accuracy_2attractor):.4f}")
print(f"Accuracy for 3 attractor(s): {sum(accuracy_3attractor) / len(accuracy_3attractor):.4f}")


Accuracy for 0 attractor(s): 0.9545
Accuracy for 1 attractor(s): 0.2598
Accuracy for 2 attractor(s): 0.3763
Accuracy for 3 attractor(s): 0.4537


In [32]:
accuracy_0attractor = []
accuracy_1attractor = []
accuracy_2attractor = []
accuracy_3attractor = []

for i in range(len(num_attractors)):
    n = int(num_attractors[i])
    if n == 0:
        accuracy_0attractor += [pre_pred[i]]
    elif n == 1:
        accuracy_1attractor += [pre_pred[i]]
    elif n == 2:
        accuracy_2attractor += [pre_pred[i]]
    elif n == 3:
        accuracy_3attractor += [pre_pred[i]]
    else:
        print("Instance {}: more attractor than 3?".format(i))
        
        
print(f"Accuracy for 0 attractor(s): {sum(accuracy_0attractor) / len(accuracy_0attractor):.4f}")
print(f"Accuracy for 1 attractor(s): {sum(accuracy_1attractor) / len(accuracy_1attractor):.4f}")
print(f"Accuracy for 2 attractor(s): {sum(accuracy_2attractor) / len(accuracy_2attractor):.4f}")
print(f"Accuracy for 3 attractor(s): {sum(accuracy_3attractor) / len(accuracy_3attractor):.4f}")


Accuracy for 0 attractor(s): 0.9545
Accuracy for 1 attractor(s): 0.2598
Accuracy for 2 attractor(s): 0.3763
Accuracy for 3 attractor(s): 0.4537
