In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# BERT Embeddings

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Fetch the pre-trained 'bert-base-uncased' checkpoint: the encoder and its
# matching tokenizer are both resolved from the same checkpoint name.
model_name = "bert-base-uncased"
bert_model = BertModel.from_pretrained(model_name)
bert_tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Two sentences describing the same scene with different wording;
# their embeddings are compared further down.
sentence1, sentence2 = (
    "The cat chased the mouse across the room.",
    "The mouse ran across the room, pursued by the cat.",
)

In [None]:
# Turn each sentence into a PyTorch tensor of token ids.
# add_special_tokens=True asks the tokenizer to include the model's special tokens.
input_ids_1, input_ids_2 = (
    bert_tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')
    for text in (sentence1, sentence2)
)

In [None]:
# Forward pass through BERT for each sentence. Gradient tracking is switched
# off because the model is only used for inference here.
with torch.no_grad():
    outputs_1, outputs_2 = bert_model(input_ids_1), bert_model(input_ids_2)

# Element 0 of each output holds the per-token embeddings; they are
# mean-pooled into sentence vectors in a later cell.
    

In [None]:
# outputs_1  # the full model output, which holds the last hidden state values

In [None]:
# Element 0 of the model output: the token-level hidden states of sentence1,
# still carrying the leading batch axis.
outputs_1[0]

tensor([[[-0.1933,  0.1243, -0.1650,  ..., -0.4874,  0.4708,  0.8474],
         [-0.1992,  0.1454, -0.6586,  ..., -0.3850,  1.1582, -0.0479],
         [-0.2293,  0.0326,  0.1207,  ..., -0.4336,  0.6552,  0.9609],
         ...,
         [-0.0749,  0.2701, -0.7780,  ...,  0.0389,  0.5268,  0.1165],
         [ 0.6536,  0.3500, -0.3956,  ...,  0.2424,  0.0666, -0.3547],
         [ 0.1260,  0.4227,  0.3613,  ...,  0.0253, -0.0594, -0.0079]]])

In [None]:
 outputs_1[0].shape  # expected layout: (batch, tokens, hidden size)


torch.Size([1, 11, 768])

In [None]:
# Index away the batch axis: one row of hidden-state values per token of sentence1.
outputs_1[0][0]

tensor([[-0.1933,  0.1243, -0.1650,  ..., -0.4874,  0.4708,  0.8474],
        [-0.1992,  0.1454, -0.6586,  ..., -0.3850,  1.1582, -0.0479],
        [-0.2293,  0.0326,  0.1207,  ..., -0.4336,  0.6552,  0.9609],
        ...,
        [-0.0749,  0.2701, -0.7780,  ...,  0.0389,  0.5268,  0.1165],
        [ 0.6536,  0.3500, -0.3956,  ...,  0.2424,  0.0666, -0.3547],
        [ 0.1260,  0.4227,  0.3613,  ...,  0.0253, -0.0594, -0.0079]])

In [None]:
# With the batch axis gone the tensor is 2-D: (tokens, hidden size).
outputs_1[0][0].shape

torch.Size([11, 768])

In [None]:
# Mean-pool over the token axis to get one vector per sentence.
# The batch axis was already indexed away, so the tensor is 2-D and the tokens
# sit on dim=0 (a 3-D tensor that keeps the batch axis would be averaged over dim=1).
embeddings1 = torch.mean(outputs_1[0][0], dim=0)
embeddings2 = torch.mean(outputs_2[0][0], dim=0)

In [None]:
# Sanity check: each pooled sentence embedding should be a flat 1-D vector.
print(embeddings1.shape, embeddings2.shape, sep="\n")

torch.Size([768])
torch.Size([768])


In [None]:
# Add a leading axis so each embedding becomes a single-row 2-D array,
# the (n_samples, n_features) layout that cosine_similarity works on.
embeddings1 = torch.reshape(embeddings1, (1, -1))
embeddings2 = torch.reshape(embeddings2, (1, -1))

In [None]:
# Confirm that both embeddings are now single-row matrices.
print(embeddings1.shape, embeddings2.shape, sep="\n")

torch.Size([1, 768])
torch.Size([1, 768])


In [None]:
# Pairwise cosine similarity returns a matrix; with one row on each side it is
# 1x1, so pull out its only entry as the similarity score.
similarity = cosine_similarity(X=embeddings1, Y=embeddings2)[0, 0]
similarity

0.9428967

# BioBERT Embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the encoder for the BioBERT v1.1 checkpoint;
# both are resolved from the same checkpoint identifier.
biobert_checkpoint = "dmis-lab/biobert-v1.1"
biobert_tokenizer = AutoTokenizer.from_pretrained(biobert_checkpoint)
biobert_model = AutoModel.from_pretrained(biobert_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [None]:
# The same two paraphrased sentences used in the BERT section, restated here
# so the BioBERT section can be read on its own.
sentence1, sentence2 = (
    "The cat chased the mouse across the room.",
    "The mouse ran across the room, pursued by the cat.",
)

In [None]:
# Turn each sentence into a PyTorch tensor of token ids using the BioBERT tokenizer.
# add_special_tokens=True asks the tokenizer to include the model's special tokens.
input_ids_1, input_ids_2 = (
    biobert_tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')
    for text in (sentence1, sentence2)
)

In [None]:
# Encode both sentences with BioBERT (inference only, so no gradient tracking).
with torch.no_grad():
    outputs_1, outputs_2 = biobert_model(input_ids_1), biobert_model(input_ids_2)

# Sentence vector = average of the token vectors
# (element 0 of the output, with the batch axis indexed away).
embeddings1 = torch.mean(outputs_1[0][0], dim=0)
embeddings2 = torch.mean(outputs_2[0][0], dim=0)
    

In [None]:
# Sanity check: each pooled BioBERT sentence embedding should be a flat 1-D vector.
print(embeddings1.shape, embeddings2.shape, sep="\n")

torch.Size([768])
torch.Size([768])


In [None]:
# Add a leading axis so each embedding becomes a single-row 2-D array,
# the (n_samples, n_features) layout that cosine_similarity works on.
embeddings1 = torch.reshape(embeddings1, (1, -1))
embeddings2 = torch.reshape(embeddings2, (1, -1))

In [None]:
# Confirm that both embeddings are now single-row matrices.
print(embeddings1.shape, embeddings2.shape, sep="\n")

torch.Size([1, 768])
torch.Size([1, 768])


In [None]:
# Pairwise cosine similarity returns a matrix; with one row on each side it is
# 1x1, so pull out its only entry as the BioBERT similarity score.
similarity = cosine_similarity(X=embeddings1, Y=embeddings2)[0, 0]
similarity

0.961789