In [None]:
# Alternative: Google's EmbeddingGemma model can also be used for sentence embeddings:
# https://huggingface.co/google/embeddinggemma-300m

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F


In [2]:

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model result; element 0 is taken as the
            per-token embeddings (presumably shaped (batch, seq_len, hidden)
            -- confirm against the model in use).
        attention_mask: Mask with one fewer trailing dimension than the
            embeddings. It is expanded across the last axis and used as
            per-token weights (for a 0/1 mask: 1 = keep, 0 = ignore).

    Returns:
        The mask-weighted sum of the embeddings along dim 1 divided by the
        mask total along dim 1.
    """
    hidden_states = model_output[0]
    # Stretch the mask over the embedding axis so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked row yields zeros rather than NaN.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts



In [3]:

# Load model from HuggingFace Hub
# Single source of truth for the checkpoint id, so the tokenizer and the model
# weights cannot drift apart when the checkpoint is swapped.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


In [4]:

# Texts to embed; the two resulting vectors are compared for similarity later on
sentences = [
    'I need a backend developer with JavaScript experience',
    'I have 6 years of experience in FastAPI',
]


In [5]:

# Tokenize the whole batch at once, with padding and truncation enabled,
# and request PyTorch tensors ('pt') as the return format.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)


In [6]:

# Compute token embeddings
# The forward pass runs under no_grad: gradient tracking is disabled because
# the outputs are only read here, never back-propagated through.
with torch.no_grad():
    model_output = model(**encoded_input)


In [7]:

# Collapse the per-token embeddings into one vector per sentence (mask-aware mean)
attention_mask = encoded_input['attention_mask']
sentence_embeddings = mean_pooling(model_output, attention_mask)


In [8]:

# L2-normalize along dim 1 so every sentence vector has unit length
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# One print call; sep="\n" keeps the header and the tensor on separate lines.
print("Sentence embeddings:", sentence_embeddings, sep="\n")


Sentence embeddings:
tensor([[-1.1849e-01, -1.0214e-01,  1.1694e-02,  5.4410e-02,  3.6486e-03,
         -3.8296e-02,  1.4635e-02,  4.5787e-02, -3.6352e-02, -8.9998e-02,
         -8.5973e-02,  3.7943e-02, -2.6027e-03, -2.7832e-02,  8.1116e-03,
          4.7520e-02,  5.2576e-03,  2.0542e-02,  7.7834e-02, -1.0203e-01,
         -4.4725e-02, -5.4466e-02,  3.0470e-02, -6.2458e-02,  4.3915e-02,
         -1.4086e-03,  5.1297e-02,  2.1621e-02,  2.2369e-02, -3.5547e-02,
         -7.5415e-03, -9.0370e-02,  1.5723e-02, -2.6063e-02, -9.1835e-02,
          1.3786e-01, -4.4649e-02, -1.5893e-02, -6.9422e-03, -8.7589e-03,
         -1.4987e-01,  1.7440e-02, -5.6121e-02, -3.2696e-02,  5.0552e-02,
         -1.3214e-01, -4.4603e-02, -3.8776e-02, -1.8744e-02,  4.6950e-02,
          2.6847e-02, -9.6546e-02, -1.7238e-02, -6.6890e-02, -1.0547e-01,
          1.3676e-02, -6.3358e-02,  3.2819e-02,  1.3582e-03,  2.1050e-02,
          3.2441e-02,  2.1329e-02,  3.0195e-02,  1.2800e-02,  6.2946e-02,
         -3.8189e

In [9]:
# Sanity check: show the shape of each of the two sentence vectors side by side
sentence_embeddings[0].shape, sentence_embeddings[1].shape

(torch.Size([384]), torch.Size([384]))

In [10]:
import torch.nn.functional as F

In [11]:
# Cosine similarity between the two sentence vectors; each is 1-D, so reduce over dim 0
F.cosine_similarity(sentence_embeddings[0], sentence_embeddings[1], dim=0)

tensor(0.2542)

In [19]:
# Convert the first sentence embedding into a plain Python list of floats
sentence_embeddings[0].tolist()

[-0.11848553270101547,
 -0.10213883221149445,
 0.011694430373609066,
 0.054409824311733246,
 0.003648617072030902,
 -0.03829596936702728,
 0.014635247178375721,
 0.045787375420331955,
 -0.03635245934128761,
 -0.0899975448846817,
 -0.08597302436828613,
 0.03794262930750847,
 -0.002602653345093131,
 -0.027832288295030594,
 0.008111556991934776,
 0.04752005264163017,
 0.005257603246718645,
 0.020541541278362274,
 0.07783418893814087,
 -0.10203251242637634,
 -0.044725075364112854,
 -0.054465703666210175,
 0.03047044947743416,
 -0.062457695603370667,
 0.04391477257013321,
 -0.0014085537986829877,
 0.051297422498464584,
 0.021621346473693848,
 0.02236897312104702,
 -0.03554705157876015,
 -0.007541509345173836,
 -0.09037034213542938,
 0.015723377466201782,
 -0.026063403114676476,
 -0.09183461964130402,
 0.13786092400550842,
 -0.044648900628089905,
 -0.015893302857875824,
 -0.006942196749150753,
 -0.008758945390582085,
 -0.14986753463745117,
 0.017440248280763626,
 -0.05612139031291008,
 -0.03

In [20]:
from torch import tensor

In [22]:
# Round trip: rebuild a tensor from the list form of the first embedding
tensor(sentence_embeddings[0].tolist())

tensor([-1.1849e-01, -1.0214e-01,  1.1694e-02,  5.4410e-02,  3.6486e-03,
        -3.8296e-02,  1.4635e-02,  4.5787e-02, -3.6352e-02, -8.9998e-02,
        -8.5973e-02,  3.7943e-02, -2.6027e-03, -2.7832e-02,  8.1116e-03,
         4.7520e-02,  5.2576e-03,  2.0542e-02,  7.7834e-02, -1.0203e-01,
        -4.4725e-02, -5.4466e-02,  3.0470e-02, -6.2458e-02,  4.3915e-02,
        -1.4086e-03,  5.1297e-02,  2.1621e-02,  2.2369e-02, -3.5547e-02,
        -7.5415e-03, -9.0370e-02,  1.5723e-02, -2.6063e-02, -9.1835e-02,
         1.3786e-01, -4.4649e-02, -1.5893e-02, -6.9422e-03, -8.7589e-03,
        -1.4987e-01,  1.7440e-02, -5.6121e-02, -3.2696e-02,  5.0552e-02,
        -1.3214e-01, -4.4603e-02, -3.8776e-02, -1.8744e-02,  4.6950e-02,
         2.6847e-02, -9.6546e-02, -1.7238e-02, -6.6890e-02, -1.0547e-01,
         1.3676e-02, -6.3358e-02,  3.2819e-02,  1.3582e-03,  2.1050e-02,
         3.2441e-02,  2.1329e-02,  3.0195e-02,  1.2800e-02,  6.2946e-02,
        -3.8189e-02, -4.4997e-02,  3.5909e-02,  8.9

In [23]:
# Display the original first embedding, for comparison with the tensor rebuilt from its list form
sentence_embeddings[0]

tensor([-1.1849e-01, -1.0214e-01,  1.1694e-02,  5.4410e-02,  3.6486e-03,
        -3.8296e-02,  1.4635e-02,  4.5787e-02, -3.6352e-02, -8.9998e-02,
        -8.5973e-02,  3.7943e-02, -2.6027e-03, -2.7832e-02,  8.1116e-03,
         4.7520e-02,  5.2576e-03,  2.0542e-02,  7.7834e-02, -1.0203e-01,
        -4.4725e-02, -5.4466e-02,  3.0470e-02, -6.2458e-02,  4.3915e-02,
        -1.4086e-03,  5.1297e-02,  2.1621e-02,  2.2369e-02, -3.5547e-02,
        -7.5415e-03, -9.0370e-02,  1.5723e-02, -2.6063e-02, -9.1835e-02,
         1.3786e-01, -4.4649e-02, -1.5893e-02, -6.9422e-03, -8.7589e-03,
        -1.4987e-01,  1.7440e-02, -5.6121e-02, -3.2696e-02,  5.0552e-02,
        -1.3214e-01, -4.4603e-02, -3.8776e-02, -1.8744e-02,  4.6950e-02,
         2.6847e-02, -9.6546e-02, -1.7238e-02, -6.6890e-02, -1.0547e-01,
         1.3676e-02, -6.3358e-02,  3.2819e-02,  1.3582e-03,  2.1050e-02,
         3.2441e-02,  2.1329e-02,  3.0195e-02,  1.2800e-02,  6.2946e-02,
        -3.8189e-02, -4.4997e-02,  3.5909e-02,  8.9