In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

## Loading and tokenizing sentences

In [2]:
# load Prasada & Dillingham (2006) sentences
df = pd.read_csv('prasada_dillingham_2006_sentences.csv')
display(df.head())

Unnamed: 0,connection.type,connection.subtype,item,in.general,by.virtue.of,causal.essence,statistical,formal,should,prevalence.estimates,word1_singular,word2_singular,word1_plural,word2_plural,singular.googleNews,singular.wiki,plural.googleNews,plural.wiki
0,principled,artifact,airplanes have wings,3.83,6.78,2.5,4.56,4.5,6.82,96.13,airplanes,wings,airplane,wings,0.313458,0.383928,0.254327,0.367735
1,principled,artifact,ambulances have sirens,5.5,5.72,2.39,5.33,5.22,6.44,97.96,ambulances,sirens,ambulance,siren,0.294424,0.342587,0.456853,0.515955
2,principled,artifact,cars have four wheels,4.17,5.83,2.5,5.56,5.39,5.61,98.45,,,,,,,,
3,principled,artifact,diapers are absorbent,5.06,5.44,3.11,4.94,5.72,6.67,95.86,diapers,absorbent,diaper,absorbent,0.409588,0.472291,0.396471,0.454671
4,principled,artifact,fire trucks have hoses,4.61,5.61,2.72,5.17,5.61,6.61,97.84,firetrucks,hoses,firetruck,hose,0.354045,0.369252,0.418823,0.408377


In [3]:
# load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# grab sentences
sentences = list(df['item'])

# add markers
sentences = [f'[CLS] {sentence} [SEP]' for sentence in sentences]
print(sentences[0:2])

# tokenize
tokenized = [tokenizer.tokenize(sentence) for sentence in sentences]
print(tokenized[0:2])

# write to dataframe so we can manually annotate start and end indices of each concept
annotate_tokens = [str(sentence) for sentence in tokenized]
df_annotate = pd.DataFrame(annotate_tokens, columns=['sentence'])
display(df_annotate.head())
df_annotate.to_csv('annotate.tsv', index=False, sep='\t')

['[CLS] airplanes have wings [SEP]', '[CLS] ambulances have sirens [SEP]']
[['[CLS]', 'airplanes', 'have', 'wings', '[SEP]'], ['[CLS]', 'ambulance', '##s', 'have', 'sirens', '[SEP]']]


Unnamed: 0,sentence
0,"['[CLS]', 'airplanes', 'have', 'wings', '[SEP]']"
1,"['[CLS]', 'ambulance', '##s', 'have', 'sirens'..."
2,"['[CLS]', 'cars', 'have', 'four', 'wheels', '[..."
3,"['[CLS]', 'dia', '##pers', 'are', 'absorb', '#..."
4,"['[CLS]', 'fire', 'trucks', 'have', 'hose', '#..."


In [5]:
df = pd.read_csv('annotated.tsv', sep='\t').head(10)
display(df.head())

Unnamed: 0,sentence,word1_start,word1_end,word2_start,word2_end
0,"['[CLS]', 'airplanes', 'have', 'wings', '[SEP]']",1.0,1.0,2.0,3.0
1,"['[CLS]', 'ambulance', '##s', 'have', 'sirens'...",1.0,2.0,3.0,4.0
2,"['[CLS]', 'cars', 'have', 'four', 'wheels', '[...",1.0,1.0,2.0,4.0
3,"['[CLS]', 'dia', '##pers', 'are', 'absorb', '#...",1.0,2.0,3.0,5.0
4,"['[CLS]', 'fire', 'trucks', 'have', 'hose', '#...",1.0,2.0,3.0,5.0


In [6]:
# convert to indices
indices = [tokenizer.convert_tokens_to_ids(sentence) for sentence in tokenized]
print(indices[0:2])

def pad_indices(indices):
    # get length of longest sentence
    max_len = 0
    for sentence in indices:
        if len(sentence) > max_len:
            max_len = len(sentence)
    
    # return list of padded sentences
    return [sentence + [0] * (max_len - len(sentence)) for sentence in indices]

padded = pad_indices(indices)
print(padded[0:2])

# add class (all 1s)
classes = [[1] * len(sentence) for sentence in padded]
print(classes[0:2])

[[101, 24042, 2031, 4777, 102], [101, 10771, 2015, 2031, 20675, 102]]
[[101, 24042, 2031, 4777, 102, 0, 0, 0, 0], [101, 10771, 2015, 2031, 20675, 102, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [7]:
# cast tokens and classes to torch tensors
token_tensor = torch.tensor(padded)
class_tensor = torch.tensor(classes)

## Extracting word embeddings from BERT

In [8]:
# load pretrained BERT in eval mode
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [9]:
# run forward pass of BERT of our token tensors, no_grad means BERT does not construct a full graph
with torch.no_grad():
    outputs = model(token_tensor, class_tensor)
    hidden_states = outputs[2]  # grab hidden layer activations

In [10]:
# stack hidden layer states into a single tensor
token_embeddings = torch.stack(hidden_states, dim=0)
print(token_embeddings.size())

# swap dimensions to order sentence, token, layer, node
token_embeddings = token_embeddings.permute(1,2,0,3)
print(token_embeddings.size())

torch.Size([13, 90, 9, 768])
torch.Size([90, 9, 13, 768])


## Computing cosine similarities between concept embeddings and property embeddings

### Using first and last tokens to represent concepts and properties
To simplify the (complex) mapping from concepts and properties to tokens (and vice versa) we start by using the first and last tokens in the sequence. For some sentences this works okay (e.g. "airplanes have wings"), while for other sentences it is obviously too simplified (e.g. "diapers are absorbent", in which "diapers" and "absorbent" are decomposed into two tokens each).

Still, we can use it as a rough baseline to which we can compare later refinements of this method.

In [11]:
norm = lambda x: x / np.linalg.norm(x)
cos = lambda a, b: np.dot(norm(a), norm(b))

def cos_tensor(tensor, sentence_idx, word1_idx, word2_idx):
    word1 = torch.sum(tensor[sentence_idx, word1_idx, -4:], dim=0)
    word2 = torch.sum(tensor[sentence_idx, word2_idx, -4:], dim=0)
    return cos(word1, word2)

In [12]:
for i, sentence in enumerate(tokenized[0:10]):
    sim = cos_tensor(token_embeddings, i, 1, len(sentence) - 2)
    print(f'{sim:.2f}: {sentence}')

0.72: ['[CLS]', 'airplanes', 'have', 'wings', '[SEP]']
0.63: ['[CLS]', 'ambulance', '##s', 'have', 'sirens', '[SEP]']
0.61: ['[CLS]', 'cars', 'have', 'four', 'wheels', '[SEP]']
0.31: ['[CLS]', 'dia', '##pers', 'are', 'absorb', '##ent', '[SEP]']
0.42: ['[CLS]', 'fire', 'trucks', 'have', 'hose', '##s', '[SEP]']
0.66: ['[CLS]', 'needles', 'are', 'sharp', '[SEP]']
0.39: ['[CLS]', 'rain', '##coat', '##s', 'are', 'water', '##proof', '[SEP]']
0.52: ['[CLS]', 'roller', 'skate', '##s', 'have', 'wheels', '[SEP]']
0.59: ['[CLS]', 'sand', '##paper', 'is', 'rough', '[SEP]']
0.64: ['[CLS]', 'scissors', 'cut', '[SEP]']


### Using multi-word phrases to represent concepts and properties
Instead of using the first and last tokens in the sequence to represent the word and its property (which is obviously oversimplified, e.g. "diapers are absorbent" is reduced to `dia`  and `##ent`) we'll use the sum of all token embeddings generated for a given concept or property.

For example, "diapers are absorbent" is now represented as:  
Concept: `sum(dia, ##pers)`  
Property: `sum(are, absorb, ##ent)`

In [13]:
def cos_tensor_multiword(tensor, df, idx):
    slice1 = slice(int(df.loc[idx, 'word1_start']), int(df.loc[idx, 'word1_end']) + 1, None)
    word1 = torch.sum(tensor[idx, slice1, -4:], dim=(0, 1))
    slice2 = slice(int(df.loc[idx, 'word2_start']), int(df.loc[idx, 'word2_end']) + 1, None)
    word2 = torch.sum(tensor[idx, slice2, -4:], dim=(0, 1))
    return cos(word1, word2)

In [14]:
for i, sentence in enumerate(tokenized[0:10]):
    sim = cos_tensor_multiword(token_embeddings, df, i)
    print(f'{sim:.2f}: {sentence}')

0.71: ['[CLS]', 'airplanes', 'have', 'wings', '[SEP]']
0.74: ['[CLS]', 'ambulance', '##s', 'have', 'sirens', '[SEP]']
0.67: ['[CLS]', 'cars', 'have', 'four', 'wheels', '[SEP]']
0.55: ['[CLS]', 'dia', '##pers', 'are', 'absorb', '##ent', '[SEP]']
0.68: ['[CLS]', 'fire', 'trucks', 'have', 'hose', '##s', '[SEP]']
0.68: ['[CLS]', 'needles', 'are', 'sharp', '[SEP]']
0.69: ['[CLS]', 'rain', '##coat', '##s', 'are', 'water', '##proof', '[SEP]']
0.71: ['[CLS]', 'roller', 'skate', '##s', 'have', 'wheels', '[SEP]']
0.68: ['[CLS]', 'sand', '##paper', 'is', 'rough', '[SEP]']
0.64: ['[CLS]', 'scissors', 'cut', '[SEP]']


## Computing sentence-level loss using BERT
This is work in progress.

In [15]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

with torch.no_grad():
    loss = model(token_tensor, labels=torch.tensor([]))[0]

print(np.exp(loss.detach().numpy()).shape)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: Expected input batch_size (810) to match target batch_size (0).