In [42]:
!pip install transformers



In [43]:
import torch

In [44]:
from transformers import BertTokenizer, BertModel

In [45]:
import random

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
# Single seed value shared by every RNG seeded in the cells below.
random_seed = 42

In [48]:
# Seed Python's built-in `random` module with the shared seed.
random.seed(random_seed)

In [49]:
# Seed PyTorch's RNG with the shared seed. The call returns the Generator
# object, which is why the notebook echoes its repr as this cell's output.
torch.manual_seed(random_seed)

<torch._C.Generator at 0x786380125270>

In [50]:
# When a GPU is present, seed the RNG of every CUDA device as well so that
# GPU-side randomness uses the same seed as the CPU generators.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [51]:
# Load the tokenizer that matches the "bert-base-uncased" checkpoint
# (fetched from the Hugging Face hub on first use, then cached locally).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [52]:
# Load the pretrained BERT encoder for the same checkpoint as the tokenizer.
# Per the Transformers docs, from_pretrained returns the model in eval mode,
# so the Dropout layers are inactive during the forward pass further down.
model = BertModel.from_pretrained("bert-base-uncased")

In [53]:
# Echo the module tree of the loaded model (embeddings + 12 encoder layers).
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [54]:
# Two sample sentences that are tokenized together as one batch below.
text = "ineuron is for the best AI Courses."
text2 = "sunny savita is a mentor of genai course."

In [55]:
# Tokenize both sentences as one batch. Calling the tokenizer directly is the
# supported API; `batch_encode_plus` is documented as deprecated in favour of
# `__call__` and takes the same arguments here.
#   padding=True          -> pad the shorter sentence to the longest in the batch
#   truncation=True       -> cut inputs that exceed the model's maximum length
#   return_tensors="pt"   -> return PyTorch tensors
#   add_special_tokens=True -> add the model's special tokens to each sentence
encoding = tokenizer(
    [text, text2],
    padding=True,
    truncation=True,
    return_tensors="pt",
    add_special_tokens=True,
)

In [56]:
# Token-id tensor for the batch: one row per sentence, padded to equal length.
input_ids = encoding["input_ids"]

In [57]:
# Inspect the token ids; the first row ends in 0 because it was padded.
input_ids

tensor([[  101,  1999, 11236,  2239,  2003,  2005,  1996,  2190,  9932,  5352,
          1012,   102,     0],
        [  101, 11559,  7842, 28403,  2003,  1037, 10779,  1997,  8991,  4886,
          2607,  1012,   102]])

In [58]:
# Mask with the same layout as input_ids: 1 for real tokens, 0 for padding.
attention_mask = encoding["attention_mask"]

In [59]:
# Inspect the mask; the single 0 lines up with the pad id in the first row.
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [60]:
# Run the encoder without gradient tracking: this is inference only, so no
# autograd graph is needed.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Final-layer hidden state: one vector per token position, per sentence.
# These are contextual token embeddings, and the padded position in the first
# sentence is included as well (mask it out before pooling or comparing).
word_embeddings = outputs.last_hidden_state

In [61]:
# Check the dimensions: 2 sentences x 13 token positions x 768 hidden units.
word_embeddings.shape

torch.Size([2, 13, 768])

In [62]:
# NOTE(review): this echoes the whole tensor; a small slice such as
# word_embeddings[0, :3, :5] would keep the notebook output readable.
word_embeddings

tensor([[[-4.2705e-01, -1.6661e-01,  1.9086e-01,  ..., -2.9867e-01,
           6.3564e-01,  6.9628e-01],
         [-4.5551e-01, -5.8428e-01,  3.3123e-01,  ..., -1.4749e-01,
           5.6046e-01,  1.2290e+00],
         [-7.1737e-01, -4.1263e-01,  1.0531e+00,  ...,  3.0627e-01,
           3.9835e-01,  2.6603e-01],
         ...,
         [ 5.1638e-01,  1.5779e-01, -7.2684e-02,  ...,  2.1025e-01,
          -4.6511e-01, -2.4941e-01],
         [ 2.9626e-01,  2.9635e-01,  2.6037e-01,  ...,  3.1854e-01,
          -6.8196e-01, -1.1987e-01],
         [-3.3972e-01, -2.1031e-04,  1.9611e-01,  ...,  1.7365e-01,
           9.6430e-02,  3.9116e-01]],

        [[-5.9909e-01, -7.7465e-02, -5.5465e-02,  ..., -1.6245e-01,
           4.0960e-01,  2.1757e-01],
         [ 9.7091e-02, -2.4665e-01, -4.4398e-01,  ..., -1.8114e-01,
           7.5230e-01, -4.2935e-01],
         [ 2.2273e-01, -4.8634e-01, -5.8741e-01,  ..., -4.2234e-01,
           2.2519e-01, -3.1184e-01],
         ...,
         [ 4.5262e-01, -6

In [63]:
# Token ids of the first sentence, including the trailing pad id 0.
input_ids[0]

tensor([  101,  1999, 11236,  2239,  2003,  2005,  1996,  2190,  9932,  5352,
         1012,   102,     0])

In [64]:
# Map the first row of ids back to text, dropping special tokens. The result
# is lowercased relative to the input sentence (uncased vocabulary).
decode = tokenizer.decode(input_ids[0], skip_special_tokens=True)

In [65]:
# Show the text recovered from the most recent tokenizer.decode call.
decode

'ineuron is for the best ai courses.'

In [66]:
# Map the second row of ids back to text, dropping special tokens.
# Note: this rebinds `decode`, replacing the first sentence's decoded text.
decode = tokenizer.decode(input_ids[1], skip_special_tokens=True)

In [67]:
# Show the text recovered from the most recent tokenizer.decode call.
decode

'sunny savita is a mentor of genai course.'