In [3]:
import random 
import torch

## NLP - Aug 1, 2024

In [4]:
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Pin every random number generator to one seed so the notebook gives the
# same results on each Restart & Run All.
random_seed = 42
for seed_fn in (random.seed, torch.manual_seed):
    seed_fn(random_seed)

# The CUDA generators are seeded too, but only when a GPU is actually present.
cuda_present = torch.cuda.is_available()
if cuda_present:
    print("is available")
    torch.cuda.manual_seed_all(random_seed)

is available


In [7]:
# Tokenizer and encoder must come from the same checkpoint so that the
# vocabulary IDs produced by one are the IDs the other was trained on.
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertModel.from_pretrained(checkpoint_name)



In [8]:
text = "The AIMT is a fantastic program at the Lambton College"

# Run the tokenizer on the sentence and ask for PyTorch tensors back.
encoding = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Pull out the two tensors the model call needs: token IDs and attention mask.
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

# Show both tensors.
print(f"Input ID: {input_ids}")
print(f"Attention mask: {attention_mask}")

Input ID: tensor([[  101,  1996,  6614,  2102,  2003,  1037, 10392,  2565,  2012,  1996,
         12559,  2669,  2267,   102]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
# Forward pass through BERT. Gradients are switched off because the model is
# only used for inference here.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Final-layer hidden states, one vector per input token.
# Shape: [1, num_tokens, 768]
word_embeddings = outputs.last_hidden_state

print(f"Shpae of Word Embedding: {word_embeddings.shape}")
    

Shpae of Word Embedding: torch.Size([1, 14, 768])


In [10]:
# Map the IDs of the first (and only) sequence back to their token strings.
first_sequence_ids = input_ids[0]
tokens = tokenizer.convert_ids_to_tokens(first_sequence_ids)
print("Tokens:", tokens)

Tokens: ['[CLS]', 'the', 'aim', '##t', 'is', 'a', 'fantastic', 'program', 'at', 'the', 'lamb', '##ton', 'college', '[SEP]']


In [11]:
# Decode the token IDs back to text
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

print(f"Decoded Text: {decoded_text}")

tokenized_text = tokenizer.tokenize(decoded_text)

print(f"tokenized Text: {tokenized_text}")

encoded_text = tokenizer.encode(text, return_tensors="pt") # Returns a tensor

print(f"Encoded Text: {encoded_text}")

Decoded Text: the aimt is a fantastic program at the lambton college
tokenized Text: ['the', 'aim', '##t', 'is', 'a', 'fantastic', 'program', 'at', 'the', 'lamb', '##ton', 'college']
Encoded Text: tensor([[  101,  1996,  6614,  2102,  2003,  1037, 10392,  2565,  2012,  1996,
         12559,  2669,  2267,   102]])


In [12]:
# print word embedding for each token
for token, embedding in zip(tokenized_text,word_embeddings[0]):
    print(f"Token: {token}")
    print(f"Embedding: {embedding}")
    print("\n")

Token: the
Embedding: tensor([-1.8596e-02,  1.5096e-02,  2.6542e-01,  1.3411e-02, -6.4132e-02,
        -2.9608e-01,  2.6305e-01,  5.5107e-01, -2.2020e-01, -5.5321e-01,
         1.3403e-01, -6.8338e-02,  4.8061e-01,  4.3570e-01,  1.3534e-01,
        -2.2642e-01, -8.4224e-02,  5.1563e-01,  4.3876e-02, -3.8735e-01,
        -1.6000e-01, -5.2518e-01,  2.0668e-01,  2.9923e-02,  6.1750e-02,
         8.2011e-03,  1.8064e-01, -1.0371e-01,  2.2538e-01,  1.4485e-01,
         1.6966e-01,  8.4841e-02, -1.0171e-01, -1.5347e-01,  1.5465e-01,
        -8.1628e-02,  5.3127e-02, -2.2059e-01,  2.6349e-01,  7.7967e-02,
         6.9181e-02,  3.1834e-02,  1.0810e-01,  1.3221e-02, -1.2235e-01,
        -9.0888e-02, -2.7399e+00, -1.0898e-01, -1.1064e-01, -1.4294e-01,
         1.4440e-01, -1.2264e-03,  1.4719e-01,  1.2014e-01,  2.0760e-01,
         2.6918e-01, -3.3129e-01,  6.4679e-01,  3.0875e-02, -2.5256e-01,
        -1.8341e-02, -4.4547e-02, -3.3403e-01,  1.8550e-01, -1.6647e-01,
        -1.1039e-01,  2.2750e

In [13]:
# Mean pooling: average the per-token vectors along the token axis (dim 1) to
# get a single vector for the sentence. Every position of `word_embeddings`
# takes part in the average.
sentence_embedding = torch.mean(word_embeddings, dim=1)

print("Sentence Embedding:", sentence_embedding, sep="\n")

print(f"Shape of Sentence Embedding: {sentence_embedding.shape}")

Sentence Embedding:
tensor([[-2.0340e-02,  7.2589e-02,  1.7413e-01,  1.0753e-01,  2.9440e-01,
         -1.0248e-01,  2.5406e-01,  6.9635e-01, -3.2303e-01, -3.1278e-01,
          2.3739e-01, -1.4498e-01,  4.1507e-01,  5.6953e-01,  6.8873e-02,
         -1.2940e-01,  3.0408e-01,  3.5122e-02, -2.3468e-01, -1.3765e-01,
         -2.0795e-01, -2.2657e-01, -9.1484e-02,  5.3153e-01,  4.5044e-01,
          2.0673e-01,  3.3365e-01,  7.0274e-02, -1.5105e-01,  1.1173e-01,
          1.2948e-01, -6.0593e-02, -2.1415e-01, -2.6945e-01, -2.0657e-02,
          9.4398e-02, -3.6323e-01, -3.4001e-01, -1.7670e-01,  1.1183e-02,
         -4.2568e-01, -4.5948e-01, -1.4922e-02,  6.0407e-02, -1.2631e-01,
         -3.3636e-01, -1.8430e-01, -1.4505e-01,  1.0150e-01, -2.1398e-01,
         -1.5419e-01,  2.0304e-01, -1.7404e-02, -2.4991e-01,  5.6454e-02,
          4.8624e-01, -3.8084e-01, -2.0731e-01, -2.0960e-01, -9.5725e-02,
         -9.3199e-02,  9.6667e-02, -3.9997e-01,  3.7793e-02, -3.3783e-02,
          9.2909e-

## Computing the similarity metric using cosine similarity

In [14]:
example_sentence = "The Lambton college is a great place and AIMT program opens the door to future opportunies"

# Tokenize the comparison sentence with the same settings as the first one.
example_encoding = tokenizer(example_sentence, padding=True, truncation=True, return_tensors="pt")
example_input_ids, example_attention_mask = (
    example_encoding["input_ids"],
    example_encoding["attention_mask"],
)

# Inference only, so gradient tracking stays off for the forward pass and the
# pooling step.
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)
    example_hidden_states = example_outputs.last_hidden_state
    example_sentence_embedding = torch.mean(example_hidden_states, dim=1)

# Both inputs are single-row matrices, so the result is a 1x1 array and the
# score is its only entry.
similarity_score = cosine_similarity(sentence_embedding, example_sentence_embedding)

print("Cosine Similarity Score", similarity_score[0, 0])
    

Cosine Similarity Score 0.8240335
