In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [None]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different vocabularies by accident.
MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
bert = BertForMaskedLM.from_pretrained(MODEL_NAME)

# Short aliases for the tokenizer's conversion helpers used by later cells.
ids2token = tokenizer.convert_ids_to_tokens
token2ids = tokenizer.convert_tokens_to_ids
token2string = tokenizer.convert_tokens_to_string

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
#Reading data
def read_data(filename, encoding='UTF-16'):
  """Read a tab-separated text file into a list of rows.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding of the file. Defaults to 'UTF-16', the encoding
      this notebook's experiment files are read with.

  Returns:
    A list with one entry per non-blank line; each entry is the list of
    strings obtained by splitting that line on tab characters.
  """
  with open(filename, 'r', encoding=encoding) as f:
    lines = f.read().splitlines()
  # Blank (or whitespace-only) lines carry no fields; keeping them would yield
  # one-element rows that break callers indexing fixed columns.
  return [line.split('\t') for line in lines if line.strip()]

# Load the three experiment files (paths are relative to the working directory).
# Each dataN is a list of rows, and each row is a list of tab-separated fields.
# Later cells read column 2 as the sentence containing '[MASK]' and column 3 as
# the word substituted for '[MASK]', and copy columns 0-5 into the results.
data1 = read_data('ex1.txt')
data2 = read_data('ex2.txt')
data3 = read_data('ex3.txt')

In [None]:
#Surprisals
def calc_surp(seq,val):
  """Softmax probability and surprisal of one score within a score vector.

  Args:
    seq: Tensor of raw (unnormalised) scores to normalise over.
    val: Scalar tensor holding the raw score of the item of interest.

  Returns:
    A tuple (portion, surp): portion is exp(val) / sum(exp(seq)) and surp is
    its negative natural logarithm.
  """
  seq = seq.detach().cpu().numpy()
  val = val.detach().cpu().numpy()
  # Subtract the maximum before exponentiating (log-sum-exp trick) so large
  # scores cannot overflow to inf and turn the result into nan.
  shift = np.max(seq)
  log_norm = np.log(np.sum(np.exp(seq - shift)))
  surp = log_norm - (val - shift)
  portion = np.exp(-surp)
  return portion, surp

In [None]:
#Probabilities
def pair_prob_all(s1,arr,s2,mask,model):
  """Score the token that fills the mask position of a sentence.

  Args:
    s1: Sentence passed to the tokenizer to build the model inputs.
    arr: Token strings whose ids replace the tokenizer's "input_ids" for s1.
    s2: Sentence whose token ids are used as the labels.
    mask: Index (int or int-like) of the position to score.
    model: Callable invoked as model(**inputs, labels=labels); element 0 of
      its output is taken as the loss and element 1 as the logits.

  Returns:
    (loss, logits, portion, surp) where portion and surp come from calc_surp
    applied to the logits at the mask position and the logit of the label
    token there. Returns (0, 0, 0, 0) when the input and label sequences
    differ in length.
  """
  inputs = tokenizer(s1, return_tensors="pt")
  inputs["input_ids"] = torch.tensor([token2ids(arr)])
  labels = tokenizer(s2, return_tensors="pt")["input_ids"]

  # Check the lengths before touching labels[0][mask]: when s2 tokenizes to a
  # shorter sequence the mask index may not exist in the labels at all.
  if len(inputs["input_ids"][0]) != len(labels[0]):
    return 0, 0, 0, 0

  position = int(mask)
  label = int(labels[0][position])

  # Only forward-pass values are needed, so skip building the autograd graph.
  with torch.no_grad():
    outputs = model(**inputs, labels=labels)
  loss = outputs[0]
  logits = outputs[1]

  probs = logits[0][position]
  # One call yields both values; no need to recompute the normalisation twice.
  portion, surp = calc_surp(probs, probs[label])
  return loss, logits, portion, surp


In [None]:
#Experiment
def return_probability_all(corpus, model):
  """Append a surprisal score (or "NA") to every row of a corpus.

  For each row, column 2 is tokenized to locate the first '[MASK]' token, a
  filled-in sentence is built by replacing '[MASK]' in column 2 with column 3,
  and pair_prob_all scores that pair with the given model.

  Args:
    corpus: Sequence of rows; each row must provide at least columns 0-5.
    model: Model object forwarded unchanged to pair_prob_all.

  Returns:
    A new list with one list per input row: columns 0-5 of the row followed
    by the surprisal returned by pair_prob_all, or the string "NA" when
    pair_prob_all reports a zero loss.
  """
  scored_rows = []
  for row in corpus:
    masked_sentence = row[2]
    token_ids = tokenizer(masked_sentence, return_tensors="pt")["input_ids"][0]
    tokens = ids2token(token_ids)

    # Carry the first six input columns over to the output row.
    scored = [row[col] for col in range(6)]

    filled_sentence = masked_sentence.replace('[MASK]', row[3])
    mask_position = tokens.index('[MASK]')
    loss, logits, portion, surp = pair_prob_all(
        masked_sentence, tokens, filled_sentence, mask_position, model)

    # pair_prob_all signals "could not score" with a zero loss.
    scored.append(surp if loss != 0 else "NA")
    scored_rows.append(scored)
  return scored_rows

In [None]:
#Result
# Score every dataset with the BERT masked-LM loaded earlier in the notebook.
# Each resultN is a list of rows: the first six input columns followed by the
# surprisal of the target word, or "NA" when the row could not be scored.
result1 = return_probability_all(data1, bert)
result2 = return_probability_all(data2, bert)
result3 = return_probability_all(data3, bert)

In [None]:
#Save
# Open one output file per result set in text write mode; an existing file of
# the same name is truncated. No encoding is given, so the platform default
# text encoding is used.
file1 = open('result1.txt','w')
file2 = open('result2.txt','w')
file3 = open('result3.txt','w')

def write_cases(corpus,wfiles):
    """Write rows to an open text stream as tab-terminated fields.

    Every field of a row is converted with str() and written followed by a
    tab character (so each line ends with a trailing tab), then a newline
    closes the row. Rows with no fields produce no output at all.

    Args:
        corpus: Sequence of rows, each a sequence of values to write.
        wfiles: Writable text stream exposing a write() method.
    """
    for row in corpus:
        for field in row:
            wfiles.write(str(field) + '\t')
        # The newline is only emitted for rows that had at least one field.
        if len(row) > 0:
            wfiles.write('\n')

# Write each result set to its file. The `with` statement closes all three
# handles when the block exits (even if a write fails), which flushes buffered
# data to disk; previously the files were never closed.
with file1, file2, file3:
    write_cases(result1, file1)
    write_cases(result2, file2)
    write_cases(result3, file3)