# **Installing the necessary libraries**

In [1]:
!pip install numpy
!pip install torch
!pip install sklearn
!pip install pytorch_transformers
!pip install transformers
!pip install matplotlib




# **Loading the Pre-trained BERT model**

In [2]:
import os

from transformers import AutoTokenizer, AutoModel

# Location of the pre-trained MatSciBERT checkpoint.
# Set the MATSCIBERT_PATH environment variable to point at your own copy;
# when it is not set, the original local path is used as the default.
MATSCIBERT_PATH = os.environ.get(
    "MATSCIBERT_PATH",
    "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matscibert",
)

## Load pretrained model/tokenizer
# (A stock checkpoint name such as 'bert-base-uncased' can be passed to
# `from_pretrained` in place of the local path.)
tokenizer = AutoTokenizer.from_pretrained(MATSCIBERT_PATH)

# `output_hidden_states=True` makes the model return the hidden states of
# every layer in addition to the final one; later cells rely on this.
model = AutoModel.from_pretrained(MATSCIBERT_PATH, output_hidden_states=True)


Some weights of the model checkpoint at /Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matscibert were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /Users/lfoppiano/development/projects/embe

# **Input Formatting (Tokenization)**

In [15]:
# Two example sentences mentioning materials of the same family
# (La3A2Ge2, La3Ir2Ge2, La3Rh2Ge2).
text1 = "We are studying the material La3A2Ge2 (A = Ir, Rh)."
text2 = "The critical temperature T C = 4.7 K discovered for La3Ir2Ge2 in this work is by about 1.2 K higher than that found for La3Rh2Ge2."

# Assemble the sentence-pair input by hand: [CLS] text1 [SEP] text2 [SEP].
marked_text = " ".join(("[CLS]", text1, "[SEP]", text2, "[SEP]"))

# Split the marked text into tokens, then map every token string to its
# vocabulary index.
tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print one row per token: position, token string, vocabulary index.
for position, (piece, vocab_id) in enumerate(zip(tokenized_text, indexed_tokens)):
    print('{:<3} {:<12} {:>6,}'.format(position, piece, vocab_id))

0   [CLS]           102
1   we              185
2   are             220
3   studying      8,467
4   the             111
5   material      1,440
6   la            1,665
7   ##3          30,138
8   ##a          30,110
9   ##2          30,132
10  ##ge            303
11  ##2          30,132
12  (               145
13  a               106
14  =               275
15  ir            1,622
16  ,               422
17  rh            3,645
18  )               546
19  .               205
20  [SEP]           103
21  the             111
22  critical      2,616
23  temperature   1,633
24  t               105
25  c               115
26  =               275
27  4               286
28  .               205
29  7               450
30  k               231
31  discovered    8,847
32  for             168
33  la            1,665
34  ##3          30,138
35  ##ir            211
36  ##2          30,132
37  ##ge            303
38  ##2          30,132
39  in              121
40  this            238
41  work        

# **Running BERT on the text**

In [16]:
import torch

# Turn the list of vocabulary indices into a tensor and add a leading
# batch dimension (a batch containing this single sequence).
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)

# Switch the model to evaluation mode (inference behaviour, e.g. for the
# dropout layers). As the last expression of the cell, the returned model
# is also displayed.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31090, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [17]:
# Run the text through BERT. Gradient tracking is disabled because this is
# a pure inference pass.
with torch.no_grad():
    outputs = model(tokens_tensor)

# What the model returns depends on how it was configured in the earlier
# `from_pretrained` call. Because `output_hidden_states=True` was set, the
# first item is the last hidden state and the third item holds the hidden
# states of all layers (initial embeddings + every encoder layer).
# See the documentation for more details:
# https://huggingface.co/transformers/model_doc/bert.html#bertmodel
last_hidden_state, hidden_states = outputs[0], outputs[2]

# The 0th entry of the hidden states is the initial embedding output.
initial_embeddings = hidden_states[0]

# Element-wise sum over all hidden states.
sum_all_hidden_states = torch.stack(hidden_states, dim=0).sum(dim=0)

# Element-wise sum over the last four layers only.
sum_last_four_layers = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)

# Last four layers joined along the hidden dimension, final layer first.
concat_last_four_layers = torch.cat(
    tuple(hidden_states[k] for k in (-1, -2, -3, -4)),
    dim=-1,
)

In [18]:
# Dimensions of the summed last-four-layers representation.
sum_last_four_layers.shape

torch.Size([1, 62, 768])

In [19]:
# Dimensions of the concatenated last-four-layers representation.
concat_last_four_layers.shape

torch.Size([1, 62, 3072])

# Output Layers

In [20]:
# Indices used to drill down into `hidden_states`: layer -> batch -> token.
layer_i, batch_i, token_i = 0, 0, 0

# Peel off one nesting level at a time so each length can be reported.
layer_states = hidden_states[layer_i]
sequence_states = layer_states[batch_i]
token_state = sequence_states[token_i]

print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
print ("Number of batches:", len(layer_states))
print ("Number of tokens:", len(sequence_states))
print ("Number of hidden units:", len(token_state))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 62
Number of hidden units: 768


# Analysis

In [21]:
# `hidden_states` is a tuple holding one torch tensor per layer.
print('      Type of hidden_states: ', type(hidden_states))
print('Tensor shape for each layer: ', hidden_states[0].size())

# 1. stack the per-layer tensors along a new leading "layers" dimension,
# 2. drop the batch dimension (dimension 1 after stacking),
# 3. swap dimension 0 (layers) and 1 (tokens) so tokens come first.
token_embeddings = (
    torch.stack(hidden_states, dim=0)
    .squeeze(dim=1)
    .permute(1, 0, 2)
)

token_embeddings.shape


      Type of hidden_states:  <class 'tuple'>
Tensor shape for each layer:  torch.Size([1, 62, 768])


torch.Size([62, 13, 768])

# Word vectors

## Concatenate last 4 hidden layers

In [22]:
# Build one vector per token by concatenating (appending together) that
# token's vectors from the last four layers, final layer first.
# Iterating over `token_embeddings` ([tokens x layers x hidden]) yields one
# [layers x hidden] tensor per token; each layer vector has 768 values, so
# every concatenated vector has length 4 * 768 = 3,072.
token_vecs_cat = [
    torch.cat([layers[k] for k in (-1, -2, -3, -4)], dim=0)
    for layers in token_embeddings
]

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

Shape is: 62 x 3072


## Sum last 4 hidden layers

In [23]:
# Build one vector per token by summing that token's vectors from the last
# four layers.
# Iterating over `token_embeddings` ([tokens x layers x hidden]) yields one
# [layers x hidden] tensor per token, so each summed vector keeps the
# hidden size of 768.
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 62 x 768


# Analysis meaning

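We check whether the contextual embedding of the same sub-word token (`la`) differs between its occurrences in the text, by comparing the vectors obtained at each position.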

In [24]:
# Show the token table again (position, token string, vocabulary index) so
# the positions used in the following cells can be looked up.
for position, (piece, vocab_id) in enumerate(zip(tokenized_text, indexed_tokens)):
    print('{:<3} {:<12} {:>6,}'.format(position, piece, vocab_id))

0   [CLS]           102
1   we              185
2   are             220
3   studying      8,467
4   the             111
5   material      1,440
6   la            1,665
7   ##3          30,138
8   ##a          30,110
9   ##2          30,132
10  ##ge            303
11  ##2          30,132
12  (               145
13  a               106
14  =               275
15  ir            1,622
16  ,               422
17  rh            3,645
18  )               546
19  .               205
20  [SEP]           103
21  the             111
22  critical      2,616
23  temperature   1,633
24  t               105
25  c               115
26  =               275
27  4               286
28  .               205
29  7               450
30  k               231
31  discovered    8,847
32  for             168
33  la            1,665
34  ##3          30,138
35  ##ir            211
36  ##2          30,132
37  ##ge            303
38  ##2          30,132
39  in              121
40  this            238
41  work        

In [27]:
# Find every position of the token "la" in the tokenized input instead of
# hard-coding the indices, so the cell stays correct if the example
# sentences (and therefore the token positions) change.
la_positions = [pos for pos, piece in enumerate(tokenized_text) if piece == "la"]

print('First 5 vector values for each instance of "materials".')
print('')

# For each occurrence, show the first five components of its
# summed-last-four-layers vector.
for pos in la_positions:
    print("la (pos %d)" % pos, str(token_vecs_sum[pos][:5]))

First 5 vector values for each instance of "materials".

la (pos 6) tensor([ 8.7684, -0.8787,  2.3963,  6.9049, -1.5473])
la (pos 33) tensor([ 2.8659, -2.0691,  1.8861, 10.5369,  0.8178])
la (pos 54) tensor([ 4.5353,  0.0495,  2.9821, 10.7882, -2.3673])


In [28]:
from scipy.spatial.distance import cosine

# Pairwise comparison of the summed-last-four-layers vectors of the token
# "la" at positions 6, 33 and 54.
# Each entry: (message template, position of first vector, position of second).
comparisons = (
    ('Vector similarity for la (pos 6) and la (pos 33) meanings:  %.2f', 6, 33),
    ('Vector similarity for  la (pos 6) and la (pos 54) meanings:  %.2f', 6, 54),
    ('Vector similarity for la (pos 33) and la (pos 54) meanings:  %.2f', 33, 54),
)

for template, first, second in comparisons:
    # scipy's `cosine` is a distance, so 1 - distance is the cosine similarity.
    similarity = 1 - cosine(token_vecs_sum[first], token_vecs_sum[second])
    print(template % similarity)

Vector similarity for la (pos 6) and la (pos 33) meanings:  0.80
Vector similarity for  la (pos 6) and la (pos 54) meanings:  0.84
Vector similarity for la (pos 33) and la (pos 54) meanings:  0.88
