In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

100%|██████████| 231508/231508 [00:00<00:00, 356327.65B/s]


In [None]:
# Tokenize input
text = "[CLS] I fell down and cried. [SEP] I fell down and cried. [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [None]:
tokenized_text

['[CLS]',
 'i',
 'fell',
 'down',
 'and',
 'cried',
 '.',
 '[SEP]',
 'i',
 'fell',
 'down',
 'and',
 'cried',
 '.',
 '[SEP]']

In [None]:
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = [9,12]
for ind in masked_index:
  tokenized_text[ind] = '[MASK]'

In [None]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
#convert to cuda

model.to('cuda')

100%|██████████| 313/313 [00:00<00:00, 101571.93B/s]
100%|██████████| 440473133/440473133 [00:40<00:00, 10839280.28B/s]


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
def pp_input(sent):
  #text = "[CLS]" + sent + "[SEP]" + sent + "[CLS]"
  tokenized_sent = tokenizer.tokenize(sent)
  tokenized_sent_2 = tokenized_sent.copy()
  verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  mask_indexes = []
  tags = nltk.pos_tag(tokenized_sent)
  #print(tokenized_sent)
  for i in range(len(tags)):
    if tags[i][1] in verb_tags:
      mask_indexes.append(i)
  for ind in mask_indexes:
    tokenized_sent_2[ind] = '[MASK]'
  tokenized_text = ["[CLS]"] + tokenized_sent + ["[SEP]"] + tokenized_sent_2 + ["[SEP]"]
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  segments_ids = [0] + [0 for i in range(len(tokenized_sent))] + [1] + [1 for i in range(len(tokenized_sent))] + [1]
  # Convert inputs to PyTorch tensors
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])
  tokens_tensor = tokens_tensor.to('cuda')
  segments_tensors = segments_tensors.to('cuda')
  #Predict all tokens
  with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
  needed_pred = 3
  suggestions = dict()
  for ind in mask_indexes:
    ind_n = len(tokenized_sent) + 2 + ind
    mask_sorted = sorted(predictions[0, ind_n], reverse=True)
    suggestions[tokenized_sent[ind]] = []
    for i in range(1,needed_pred+1):
      pred_ind = (predictions[0, ind_n] == mask_sorted[i]).nonzero().item()
      pred_token = tokenizer.convert_ids_to_tokens([pred_ind])[0]
      suggestions[tokenized_sent[ind]].append(pred_token)
  return suggestions

In [None]:
sent = "The wizard came and killed the demon."
x = pp_input(sent)

In [None]:
x

{'came': ['went', 'returned', 'turned'],
 'killed': ['released', 'defeated', 'attacked']}

In [None]:
def get_score(sub,vv,obj,vt):
  sent = sub + vv + obj
  tokenized_sent = tokenizer.tokenize(sent)
  tokenized_sent_2 = tokenized_sent.copy()
  verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  mask_indexes = []
  tags = nltk.pos_tag(tokenized_sent)
  #print(tokenized_sent)
  for i in range(len(tags)):
    if tags[i][1] in verb_tags:
      mask_indexes.append(i)
  for ind in mask_indexes:
    tokenized_sent_2[ind] = '[MASK]'
  tokenized_text = ["[CLS]"] + tokenized_sent + ["[SEP]"] + tokenized_sent_2 + ["[SEP]"]
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  segments_ids = [0] + [0 for i in range(len(tokenized_sent))] + [1] + [1 for i in range(len(tokenized_sent))] + [1]
  # Convert inputs to PyTorch tensors
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])
  tokens_tensor = tokens_tensor.to('cuda')
  segments_tensors = segments_tensors.to('cuda')
  #Predict all tokens
  with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]
  
  suggestions = dict()
  for ind in mask_indexes:
    ind_n = len(tokenized_sent) + 2 + ind
    mask_sorted = sorted(predictions[0, ind_n], reverse=True)
    suggestions[tokenized_sent[ind]] = []
    try:
      target_id = tokenizer.convert_tokens_to_ids([vt])
      return predictions[0,ind_n][target_id]
    except:
      return -50 #selected by observation


In [None]:
min([min(predictions[0][i]) for i in range(predictions.shape[1])])

In [None]:
fp = open('/Datasets/TestingDatasets/GS2011data.txt', 'r')
line = fp.readline()
line = fp.readline()

groups = {}

while(line):
    a = line.split()
    v = a[1]
    s = a[2]
    o = a[3]
    vt = a[4]
    if (s,o) in groups:
      if (s,v,o, vt) in groups[(s,o)]:
        groups[(s,o)][(s,v,o, vt)]+=int(a[5])
      else:
        groups[(s,o)][(s,v,o, vt)]=int(a[5])
    else:
      groups[(s,o)] = {}
      groups[(s,o)][(s,v,o, vt)]=int(a[5])
    line = fp.readline()
fp.close()

In [None]:
from scipy.stats import spearmanr

In [None]:
spearman1 = 0
counter = 0
for so in groups:
  model_score_1 = []
  data_score = []
  for svovt in groups[so]:
    sub = lemmatizer.lemmatize(svovt[0])
    obj = lemmatizer.lemmatize(svovt[2])
    vv = lemmatizer.lemmatize(svovt[1])
    vt = lemmatizer.lemmatize(svovt[3])
    s = get_score(sub,vv,obj,vt)
    model_score_1.append(s)
    data_score.append(groups[so][svovt])
  try:
    s1 = spearmanr(np.argsort(np.argsort(model_score_1)), data_score)[0]
  except:
    continue
  if s1 < 2:
    spearman1+=s1
    counter+=1

print(spearman1/counter)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0.14285714285714285


In [None]:
def find_mask_ind(tokenized_text):
  verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
  mask_indexes = []
  tags = nltk.pos_tag(tokenized_text)
  for i in range(len(tags)):
    if (tags[i][1] in verb_tags) and (tags[i][0]!='[SEP]') and (tags[i][0]!='[CLS]'):
      mask_indexes.append(i)
  return mask_indexes

In [None]:
mask_indexes

[3, 10]

In [None]:
tokenized_text

['[CLS]',
 'i',
 'will',
 'go',
 'to',
 'school',
 'today',
 '.',
 '[SEP]',
 'i',
 'played',
 'yesterday',
 '.',
 '[SEP]']

In [None]:
import numpy as np

In [None]:
np.where(tokenized_text=='the')

(array([], dtype=int64),)

In [None]:
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [None]:
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0,1,1,1,1,1,1,1,1]

In [None]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [None]:
import tensorflow as tf

In [None]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

In [None]:
#look at the prediction
predicted_index = torch.argmax(predictions[0, masked_index[0]]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

print('Predicted token is:',predicted_token)

Predicted token is: lay


In [None]:
tokenizer.convert_ids_to_tokens([predicted_index])

['lay']

In [None]:
predictions[0]

tensor([[ -6.5044,  -6.4415,  -6.4657,  ...,  -5.9074,  -5.7146,  -4.0583],
        [-13.5651, -13.6426, -13.7622,  ..., -11.9287, -11.3416, -10.3830],
        [ -8.3596,  -8.4663,  -8.1823,  ...,  -6.4627,  -6.7614,  -6.8012],
        ...,
        [ -3.3134,  -3.0745,  -3.1240,  ...,  -2.2517,  -3.0724,  -1.6352],
        [-14.4288, -14.1552, -14.3316,  ..., -11.3436, -11.5835,  -9.8701],
        [-13.8569, -13.8408, -13.8689,  ..., -11.1805, -10.9464,  -8.5966]],
       device='cuda:0')

In [None]:
torch.argmax(predictions[0, masked_index])

tensor(3216, device='cuda:0')

In [None]:
predictions[0, masked_index].cpu().numpy()

array([ 0.2697877 ,  0.21498415,  0.13474041, ..., -0.85740125,
        1.1795503 , -2.5295548 ], dtype=float32)

In [None]:
mask_sorted = sorted(predictions[0, masked_index[0]], reverse=True)

In [None]:
choose_ind = 0 #index of the word to be chosen from predictions
pred_ind = (predictions[0, masked_index[0]] == mask_sorted[choose_ind]).nonzero().item()

In [None]:
pred_ind

3913

In [None]:
pred_token = tokenizer.convert_ids_to_tokens([pred_ind])[0]

In [None]:
pred_token

'sat'

In [None]:
tokenizer.convert_ids_to_tokens([pred_ind])

['reads']

In [None]:
predictions[0].shape

torch.Size([15, 30522])

In [None]:
tokenizer

<pytorch_transformers.tokenization_bert.BertTokenizer at 0x7f9301336320>

In [None]:
import numpy as np

In [None]:
import nltk

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
import pandas as pd
eval_data_set3 = pd.read_csv('Datasets/TestingDatasets/GS2011data.txt', sep=' ')


In [None]:
ground = eval_data_set3['input']

In [None]:
segments_ids = [0,0,0,0,1,1,1,1,1]

In [None]:
mask_index = 6

In [None]:
from numpy import dot
from numpy.linalg import norm

In [None]:
def get_segments_mask(text):
  token_ll = len(text)
  segments_id = [0]*token_ll
  v = 0
  for i in range(token_ll):
    if text[i]=='SEP':
      v = 1
    segments_id[i] = v
  

In [None]:
model_score = []

for svovt in zip(eval_data_set3['subject'],eval_data_set3['verb'],eval_data_set3['object'],eval_data_set3['landmark']):
  sub = lemmatizer.lemmatize(svovt[0])
  obj = lemmatizer.lemmatize(svovt[2])
  vv = lemmatizer.lemmatize(svovt[1])
  vt = lemmatizer.lemmatize(svovt[3])
  
  sent = ' '.join(['[CLS]',sub,vv,obj,'[SEP]',sub,vv,obj,'[SEP]'])
  sent_t = ' '.join(['[CLS]',sub,vt,obj,'[SEP]',sub,vt,obj,'[SEP]'])
  
  tokenized_text = tokenizer.tokenize(sent)
  tokenized_text_t = tokenizer.tokenize(sent_t)
  
  token_ll = len(tokenized_text)
  segments_ids = [0]*token_ll
  v = 0
  for i in range(token_ll):
    if tokenized_text[i]=='[SEP]':
      v=1
    segments_ids[i]=v
  
  for i in range(token_ll):
    if tokenized_text[i]==vv:
      mask_index=i
  tokenized_text[mask_index] = '[MASK]'
  tokenized_text_t[mask_index] = '[MASK]'
  
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  indexed_tokens_t = tokenizer.convert_tokens_to_ids(tokenized_text_t)
  
  tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
  tokens_tensor_t = torch.tensor([indexed_tokens_t]).to('cuda')
  segments_tensors = torch.tensor([segments_ids]).to('cuda')
  
  try:
    with torch.no_grad():
      outputs = model(tokens_tensor, token_type_ids=segments_tensors)
      predictions = outputs[0]
  except:
    print(tokenized_text)
  try:
    with torch.no_grad():
      outputs_t = model(tokens_tensor_t, token_type_ids=segments_tensors)
      predictions_t = outputs_t[0]
  except:
    print(tokenized_text)
    print(tokenized_text_t)
    break
  
  vec1 = predictions[0, mask_index].cpu().numpy()
  vec2 = predictions_t[0, mask_index].cpu().numpy()
  
  model_score.append(dot(vec1, vec2)/(norm(vec1)*norm(vec2)))

['[CLS]', 'man', 'say', 'success', '[SEP]', 'man', '[MASK]', 'success', '[SEP]']
['[CLS]', 'man', 'all', '##ege', 'success', '[SEP]', '[MASK]', 'all', '##ege', 'success', '[SEP]']
