In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as f

from transformers import BertModel, BertTokenizer

In [2]:
bert_version = 'bert-large-cased'
tokenizer = BertTokenizer.from_pretrained(bert_version)
model = BertModel.from_pretrained(bert_version)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
model.eval()
model.to('cuda:1')

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [4]:
texts  = [
    'Obama speaks to the media in Illinois',
    'The president greets the press in Chicago',
    'Oranges are my favorite fruit',
]

encodings = tokenizer(
    texts, # the texts to be tokenized
    padding=True, # pad the texts to the maximum length (so that all outputs have the same length)
    return_tensors='pt' # return the tensors (not lists)
).to('cuda:1')

encodings

{'input_ids': tensor([[  101,  7661,  8917,  1106,  1103,  2394,  1107,  3461,   102,     0],
        [  101,  1109,  2084, 18884,  1116,  1103,  3181,  1107,  2290,   102],
        [  101,  6309,  1116,  1132,  1139,  5095,  5735,   102,     0,     0]],
       device='cuda:1'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], device='cuda:1')}

In [5]:
with torch.no_grad():
    embeddings = model(**encodings)

embeddings = embeddings['last_hidden_state']

# class token

In [6]:
class_token = embeddings[:, 0, :]

normalized = f.normalize(class_token, p = 2, dim = 1)
class_token_dict = normalized.matmul(normalized.T)

class_token_dict

tensor([[1.0000, 0.8658, 0.7458],
        [0.8658, 1.0000, 0.8063],
        [0.7458, 0.8063, 1.0000]], device='cuda:1')

# Mean of all output

In [8]:
MEANS = embeddings.mean(dim = 1)
normalized = f.normalize(MEANS, p = 2, dim = 1)
mean_dist = normalized.matmul(normalized.T)
mean_dist = mean_dist.new_ones(mean_dist.shape) - mean_dist
mean_dist = mean_dist.cpu().numpy()
mean_dist 

array([[0.        , 0.2313121 , 0.61742276],
       [0.2313121 , 0.        , 0.6111248 ],
       [0.61742276, 0.6111248 , 0.        ]], dtype=float32)

# max over time of the output vector

In [9]:
MAXS, _ = embeddings.max(dim=1)
# normalize the MEANS token embeddings
normalized = f.normalize(MAXS, p=2, dim=1)
# calculate the cosine similarity
max_dist = normalized.matmul(normalized.T)
max_dist = max_dist.new_ones(max_dist.shape) - max_dist
max_dist = max_dist.cpu().numpy()
max_dist

array([[ 0.0000000e+00,  7.0275068e-02,  1.5313721e-01],
       [ 7.0275068e-02, -1.1920929e-07,  1.4794904e-01],
       [ 1.5313721e-01,  1.4794904e-01,  1.1920929e-07]], dtype=float32)