# Sentence Transformers Demonstrations

## Imports and setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

## Basic usage

Compute the embeddings of several sentences:

In [2]:
# Example sentences to embed.
sentences = [
    "Warm beer on a cold day isn't my idea of fun.",
    "The old rusted farm equipment surrounded the house predicting its demise.",
    "She thought there'd be sufficient time if she hid her watch.",
    "He told us a very exciting adventure story.",
]

# Load the model and encode every sentence in a single call.
embeddings = SentenceTransformer('all-MiniLM-L6-v2').encode(sentences)

# One row per sentence: its text, its embedding vector and that vector's shape.
pd.DataFrame({
    'sentence': sentences,
    'embedding': list(embeddings),
    'embedding shape': [vector.shape for vector in embeddings],
})

Unnamed: 0,sentence,embedding,embedding shape
0,Warm beer on a cold day isn't my idea of fun.,"[0.032820337, 0.010522877, -0.0042745704, 0.08...","(384,)"
1,The old rusted farm equipment surrounded the h...,"[-0.022551935, 0.07862607, 0.06436973, 0.06431...","(384,)"
2,She thought there'd be sufficient time if she ...,"[-0.0057829944, 0.043506794, 0.034075283, 0.09...","(384,)"
3,He told us a very exciting adventure story.,"[0.051544324, 0.07973126, 0.039082285, 0.07299...","(384,)"


Compute the embedding of a single paragraph with several sentences:

In [3]:
# A multi-sentence paragraph, written as adjacent string literals that Python
# joins into one single string.
pooh_first_paragraph = (
    "Here is Edward Bear, coming downstairs now, bump, bump, bump, on the back of his head, behind Christopher Robin. "
    "It is, as far as he knows, the only way of coming downstairs, but sometimes he feels that there really is another way, if only he could stop bumping for a moment and think of it. "
    "And then he feels that perhaps there isn't. "
    "Anyhow, here he is at the bottom, and ready to be introduced to you. "
    "Winnie-the-Pooh."
)

vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

# The whole paragraph is passed to encode() as one string.
embedding = vectorizer.encode(pooh_first_paragraph, show_progress_bar=True)

# Single-row frame, transposed so that each field is displayed on its own row.
pd.DataFrame(
    {
        'text': [pooh_first_paragraph],
        'embedding vector': [embedding],
        'shape': [embedding.shape],
    }
).T

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,0
text,"Here is Edward Bear, coming downstairs now, bu..."
embedding vector,"[0.029747749, -0.06632457, 0.03771893, -0.0041..."
shape,"(384,)"


## Model properties

In [4]:
# Load the model whose properties are reported below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Show the values returned by get_max_seq_length() and
# get_sentence_embedding_dimension() as a labelled Series.
# A Series is one-dimensional, so no transpose is needed to display it.
pd.Series({
    'Sequence length': model.get_max_seq_length(),
    'Number of components of the embedding vector': model.get_sentence_embedding_dimension(),
})

Sequence lenght                                 256
Number of components of the embedding vector    384
dtype: int64

## Cosine similarity (sentence transformers)

In [5]:
# Two identical vectors.
util.cos_sim(
    [1.0, 0.0],
    [1.0, 0.0],
)

tensor([[1.]])

In [6]:
# Same direction, different magnitude: only the first component differs
# (1.0 vs. 0.6) and the second component is zero in both vectors.
util.cos_sim(
    [1.0, 0.0],
    [0.6, 0.0],
)

tensor([[1.]])

In [7]:
# Two identical vectors with one positive and one negative component.
util.cos_sim(
    [2.3, -1.8],
    [2.3, -1.8],
)

tensor([[1.]])

In [8]:
# The vectors differ only in their first component (2.3 vs. 2.7).
util.cos_sim(
    [2.3, -1.8],
    [2.7, -1.8],
)

tensor([[0.9971]])

In [9]:
# The first vector is the second one with every component negated.
util.cos_sim(
    [-2.3, 1.8],
    [2.3, -1.8],
)

tensor([[-1.]])

## Cosine similarity (sklearn)

In [10]:
# Two identical vectors; each argument is a list holding a single vector.
cosine_similarity(
    [[1.0, 0.0]],
    [[1.0, 0.0]],
)

array([[1.]])

In [11]:
# The vectors differ only in their first component (2.3 vs. 2.7);
# each argument is a list holding a single vector.
cosine_similarity(
    [[2.3, -1.8]],
    [[2.7, -1.8]],
)

array([[0.99711008]])

## Testing pooling 

Check how similar the following two vectors are:
- the average of the embeddings for 'carrot' and 'pie'
- the embedding for 'carrot pie'

The two are not completely similar, because the model (BERT-based?) returns an embedding for each word IN THE CONTEXT of the sentence.

In [12]:
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

# Each phrase is encoded in its own call.
phrases = ['carrot', 'pie', 'carrot pie']
embeddings = [vectorizer.encode(phrase) for phrase in phrases]

# Name the three vectors in the same order as `phrases`.
carrot_embedding, pie_embedding, carrot_pie_embedding = embeddings

# Compare the average of the 'carrot' and 'pie' embeddings
# with the embedding of the combined phrase 'carrot pie'.
util.cos_sim(
    (carrot_embedding + pie_embedding) / 2,
    carrot_pie_embedding,
)

tensor([[0.9204]])

## Word similarity

In [13]:
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

# The same string is encoded in two separate calls and the results are compared.
# NOTE(review): 'cucamber' looks like a misspelling of 'cucumber' — confirm it is intentional.
first_encoding = vectorizer.encode('cucamber')
second_encoding = vectorizer.encode('cucamber')

util.cos_sim(first_encoding, second_encoding)

tensor([[1.]])

## Sentence similarity

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Two differently worded sentences, each encoded in its own call.
embedding1, embedding2 = (
    model.encode(sentence)
    for sentence in (
        'I enjoy walking in the mountains.',
        'I like strolling in parks and forests.',
    )
)

util.cos_sim(embedding1, embedding2)

tensor([[0.7057]])

In [15]:
# The same sentence is encoded twice and the two results are compared.
repeated_sentence = 'I enjoy walking in the mountains.'
embedding1 = model.encode(repeated_sentence)
embedding2 = model.encode(repeated_sentence)

util.cos_sim(embedding1, embedding2)

tensor([[1.0000]])

In [16]:
# The two sentences differ in a single word: 'enjoy' vs. 'hate'.
embedding1, embedding2 = map(
    model.encode,
    [
        'I enjoy walking in the mountains.',
        'I hate walking in the mountains.',
    ],
)

util.cos_sim(embedding1, embedding2)

tensor([[0.7463]])

## Comparing model performance

In [18]:
# The two sentences whose similarity is measured with every model.
sentence1 = 'I bought a new car, it is really fast.'
sentence2 = 'Myself recently got a the latest ride. It is not slow at all!'

# Names of the models to compare.
model_names = [
    'bert-base-nli-mean-tokens',
    'roberta-base-nli-mean-tokens',
    'distilbert-base-nli-mean-tokens',
]

for model_name in model_names:
    model = SentenceTransformer(model_name)

    # Both sentences are encoded together: row 0 is sentence1, row 1 is sentence2.
    embeddings = model.encode([sentence1, sentence2], show_progress_bar=True)

    # One-row slices keep both inputs two-dimensional for cosine_similarity.
    similarity = cosine_similarity(embeddings[:1], embeddings[1:])

    print(f"Similarity using {model_name:<60}: {similarity[0][0]:.2f}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Similarity using bert-base-nli-mean-tokens                                   : 0.59


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Similarity using roberta-base-nli-mean-tokens                                : 0.70


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Similarity using distilbert-base-nli-mean-tokens                             : 0.77


## Heading

## Heading