# **Installing the necessary libraries**

In [1]:
!pip install numpy
!pip install torch
!pip install sklearn
!pip install pytorch_transformers
!pip install transformers



# **Loading the Pre-trained BERT model**

In [4]:
import os

from transformers import AutoTokenizer, AutoModel, BertTokenizer

# Directory holding the pre-trained checkpoint. Set the MATSCIBERT_DIR
# environment variable to point somewhere else; the default is the original
# location used by this notebook.
MODEL_DIR = os.environ.get(
    "MATSCIBERT_DIR",
    "/Users/lfoppiano/development/projects/embeddings/pre-trained-embeddings/matscibert_tpu/",
)

## Load pretrained model/tokenizer
# Stock-checkpoint alternative:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True)

# NOTE(review): the outputs recorded further down show 'Ġ'-prefixed tokens and a
# RobertaModel, which suggests a RoBERTa-style checkpoint -- confirm that
# BertTokenizer with this vocabulary file is the matching tokenizer.
tokenizer = BertTokenizer.from_pretrained(os.path.join(MODEL_DIR, "scivocab.myvocab.txt"))

# `model` is required by the cells below (`model.eval()`, `model(tokens_tensor)`),
# so it has to be loaded here. `output_hidden_states=True` makes the forward pass
# also return the hidden states that those cells read.
model = AutoModel.from_pretrained(MODEL_DIR, output_hidden_states=True)

# **Input Formatting (Tokenization)**

In [31]:
# Example passage (materials-science text) to run through the tokenizer.
text = "We are studying the material La 3 A 2 Ge 2 (A = Ir, Rh). The critical temperature T C = 4.7 K discovered for La 3 Ir 2 Ge 2 in this work is by about 1.2 K higher than that found for La 3 Rh 2 Ge 2."

# Add the special tokens. They are taken from the tokenizer itself instead of
# hard-coding "[CLS]"/"[SEP]": a tokenizer that does not define those literals
# splits them into ordinary sub-tokens ('[', 'C', 'LS', ']').
marked_text = f"{tokenizer.cls_token} {text} {tokenizer.sep_token}"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

# Map the token strings to their vocabulary indices.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the tokens next to their indices.
for token, token_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(token, token_id))

[                63
C                39
LS            3,148
]                65
ĠWe             784
Ġare            325
Ġstudying     7,866
Ġthe            216
Ġmaterial       715
ĠLa           3,575
Ġ3              370
ĠA              302
Ġ2              322
ĠGe           3,273
Ġ2              322
Ġ(              249
A                37
Ġ=              557
ĠIr           5,302
,                16
ĠRh           4,982
).              356
ĠThe            275
Ġcritical     2,328
Ġtemperature    505
ĠT              255
ĠC              303
Ġ=              557
Ġ4              436
.                18
7                27
ĠK              680
Ġdiscovered   9,238
Ġfor            271
ĠLa           3,575
Ġ3              370
ĠIr           5,302
Ġ2              322
ĠGe           3,273
Ġ2              322
Ġin             234
Ġthis           445
Ġwork         1,020
Ġis             268
Ġby             316
Ġabout        1,276
Ġ1              278
.                18
2                22
ĠK              680


# **Running BERT on the text**

In [27]:
import torch

# Turn the token ids into a tensor and prepend an axis of size 1, so the single
# sentence is presented to the model as a batch of one.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)

# Switch the model to evaluation mode (feed-forward only). Left as the last
# expression so the cell displays the model's structure.
model.eval()


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(32768, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [28]:
# Feed the token ids to the model with gradient tracking disabled, then derive
# several alternative word-embedding tensors from what it returns.
with torch.no_grad():

    outputs = model(tokens_tensor)

    # Variant 1: the first returned item, i.e. the last hidden state.
    word_embed_1 = last_hidden_state = outputs[0]

    # The number of returned items depends on how the model was configured in
    # `from_pretrained`. This indexing assumes `output_hidden_states=True` was
    # set there, so that the third item holds the hidden states of all layers.
    # See: https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

    # Variant 2: entry 0 of the hidden states (the initial embeddings).
    word_embed_2 = hidden_states[0]

    # Variant 3: element-wise sum over every hidden state.
    word_embed_3 = torch.stack(hidden_states, dim=0).sum(dim=0)

    # Variant 4: element-wise sum over hidden states from index 2 through the last.
    word_embed_4 = torch.stack(hidden_states[2:], dim=0).sum(dim=0)

    # Variant 5: element-wise sum over the last four hidden states.
    word_embed_5 = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)

    # Variant 6: last four hidden states concatenated along the feature axis,
    # ordered from the last layer backwards.
    word_embed_6 = torch.cat(
        (hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]),
        dim=-1,
    )




In [29]:
# Shape of the summed last-four-layers embedding.
word_embed_5.shape

torch.Size([1, 19, 768])

In [30]:
# Shape of the concatenated last-four-layers embedding.
word_embed_6.shape

torch.Size([1, 19, 3072])