In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling: average the token embeddings, weighted by the attention mask
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings (assumed layout: batch x tokens x hidden -- TODO confirm).
        attention_mask: Mask with one entry per token; it is broadcast over the
            last dimension of the token embeddings and cast to float.

    Returns:
        Tensor of the mask-weighted sum over dimension 1 divided by the mask
        total over dimension 1.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the trailing dimension so it lines up element-wise.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # Floor the denominator so a fully masked row does not divide by zero.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts


# Input texts: one embedding is produced per sentence
sentences = [
    'This is an example sentence',
    'Each sentence is converted',
]

# Fetch the pretrained tokenizer and encoder from the HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')

# Turn the sentences into a padded, truncated batch of PyTorch tensors
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Run the encoder without tracking gradients
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling followed by L2 normalization along dim 1
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)

# Header and tensor on separate lines
print("Sentence embeddings:", sentence_embeddings, sep="\n")

Sentence embeddings:
tensor([[-2.0266e-04,  8.1480e-02,  3.1362e-02,  2.9206e-03,  2.6156e-02,
          2.9074e-02,  7.8262e-02, -1.8042e-03,  1.0134e-01, -4.5171e-02,
          5.8435e-02, -1.5320e-02,  5.4996e-02, -9.8643e-02, -3.5025e-02,
          8.4568e-03,  1.5861e-02,  1.0563e-02, -3.4271e-02, -4.7506e-03,
          9.9902e-02, -2.0602e-02, -4.4784e-02,  3.1214e-02, -1.1924e-02,
         -5.1502e-02, -1.3361e-02,  1.8962e-02,  9.7681e-02, -5.4411e-02,
         -3.4331e-02,  8.1291e-02,  4.8812e-02, -1.1028e-02,  2.1352e-02,
          1.2719e-02, -1.4397e-02,  3.6286e-02, -7.6123e-02,  3.2329e-02,
          2.0810e-02, -4.2202e-02,  9.1291e-02,  2.0853e-02, -3.0802e-02,
         -8.3851e-02,  1.3089e-02, -3.0063e-02,  4.1123e-02, -1.2750e-01,
         -7.7803e-02, -3.9341e-02,  1.5260e-03, -2.8011e-02,  3.4166e-02,
          1.4671e-02, -7.7165e-02,  1.6362e-01,  4.1129e-02, -5.2446e-02,
         -4.1877e-02,  1.8053e-02, -1.3892e-02, -3.6819e-02,  6.9498e-02,
         -2.5709e

In [5]:
class Embedding():
    """Sentence embedder: tokenize, encode, mean-pool, then L2-normalize.

    Args:
        docs: Optional text(s) to embed; may instead be passed to ``forward``.
        tokenizer: Optional ready-made tokenizer. Loaded from the HuggingFace
            Hub (``DEFAULT_MODEL_NAME``) when omitted.
        model: Optional ready-made model. Loaded from the HuggingFace Hub
            (``DEFAULT_MODEL_NAME``) when omitted.
    """

    # Checkpoint used for whichever of tokenizer/model the caller did not supply.
    DEFAULT_MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'

    def __init__(self, docs=None, tokenizer=None, model=None):
        self.docs = docs
        # Supplied objects are kept as-is; only the missing ones are loaded.
        self.tokenizer, self.model = self._model_tokenizer(tokenizer, model)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings over dimension 1, ignoring masked positions.

        Args:
            model_output: Indexable model result; element 0 holds the token
                embeddings.
            attention_mask: Per-token mask, broadcast over the last dimension.

        Returns:
            Mask-weighted sum over dimension 1 divided by the mask total.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp keeps the denominator non-zero for a fully masked row.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def _model_tokenizer(self, tokenizer=None, model=None):
        """Return a ``(tokenizer, model)`` pair, loading whichever is missing.

        Both parameters default to ``None`` so the method can be called with no
        arguments; previously they were required, which made ``Embedding()``
        raise ``TypeError``.

        Args:
            tokenizer: Existing tokenizer to reuse, or ``None`` to load one.
            model: Existing model to reuse, or ``None`` to load one.

        Returns:
            Tuple ``(tokenizer, model)``.
        """
        # Load from the HuggingFace Hub only what the caller did not provide.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.DEFAULT_MODEL_NAME)
        if model is None:
            model = AutoModel.from_pretrained(self.DEFAULT_MODEL_NAME)
        return tokenizer, model

    def forward(self, docs=None):
        """Compute L2-normalized sentence embeddings.

        Args:
            docs: Text(s) to embed. When given, replaces ``self.docs``; when
                omitted, the previously stored ``self.docs`` is used.

        Returns:
            Tensor of pooled embeddings, normalized with p=2 along dim 1.

        Raises:
            ValueError: If no documents were passed here or to the constructor.
        """
        if docs is not None:
            self.docs = docs
        if self.docs is None:
            raise ValueError(
                "No documents to embed: pass `docs` to forward() or to the constructor."
            )

        # Tokenize sentences
        encoded_input = self.tokenizer(self.docs, padding=True, truncation=True, return_tensors='pt')

        # Compute the token embeddings without tracking gradients
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling
        sentence_embeddings = self._mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        return F.normalize(sentence_embeddings, p=2, dim=1)

In [3]:
# Sentences we want sentence embeddings for: a list of two example strings,
# later passed to Embedding.forward
sentences = ['This is an example sentence', 'Each sentence is converted']

In [11]:
# Instantiate the embedder without passing docs, a tokenizer, or a model
embedding = Embedding()

In [12]:
# Embed the example sentences; as the cell's last expression, the returned tensor is displayed
embedding.forward(sentences)

tensor([[-2.0266e-04,  8.1480e-02,  3.1362e-02,  2.9206e-03,  2.6156e-02,
          2.9074e-02,  7.8262e-02, -1.8042e-03,  1.0134e-01, -4.5171e-02,
          5.8435e-02, -1.5320e-02,  5.4996e-02, -9.8643e-02, -3.5025e-02,
          8.4568e-03,  1.5861e-02,  1.0563e-02, -3.4271e-02, -4.7506e-03,
          9.9902e-02, -2.0602e-02, -4.4784e-02,  3.1214e-02, -1.1924e-02,
         -5.1502e-02, -1.3361e-02,  1.8962e-02,  9.7681e-02, -5.4411e-02,
         -3.4331e-02,  8.1291e-02,  4.8812e-02, -1.1028e-02,  2.1352e-02,
          1.2719e-02, -1.4397e-02,  3.6286e-02, -7.6123e-02,  3.2329e-02,
          2.0810e-02, -4.2202e-02,  9.1291e-02,  2.0853e-02, -3.0802e-02,
         -8.3851e-02,  1.3089e-02, -3.0063e-02,  4.1123e-02, -1.2750e-01,
         -7.7803e-02, -3.9341e-02,  1.5260e-03, -2.8011e-02,  3.4166e-02,
          1.4671e-02, -7.7165e-02,  1.6362e-01,  4.1129e-02, -5.2446e-02,
         -4.1877e-02,  1.8053e-02, -1.3892e-02, -3.6819e-02,  6.9498e-02,
         -2.5709e-02,  3.5855e-02,  2.