# Introduction to Vector Databases with Sentence Embeddings

Vector databases are designed to store and query high-dimensional vectors, such as embeddings generated from sentences or documents. These embeddings can capture the semantic meaning of the text and are useful for a variety of natural language processing tasks.

## What are vector embeddings?

Vector embeddings are fixed-size vectors that represent sentences in a high-dimensional space. These embeddings are generated such that semantically similar sentences are close to each other in the embedding space.

## Using the `sentence-transformers` Library

The `sentence-transformers` library allows us to easily generate sentence embeddings with pre-trained models. These models are trained on large text corpora and can generate high-quality embeddings for a wide range of sentences. For more information, see https://sbert.net/index.html


In [None]:
%%html 
<iframe width="560" height="315" src="https://www.youtube.com/embed/klTvEwg3oJ4?si=TEc_pmoM5I3TWCwn" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>

In [None]:
import numpy as np
from collections import Counter
import string

class SimpleTextEncoder:
    """ A naive text encoder that counts the occurrences of each letter in the text. """
    def __init__(self):
        pass
    
    def encode(self, texts: str | list[str], batch_size=None) -> np.ndarray:
        """
        Encodes a list of texts to fixed-size vectors where each dimension
        corresponds to the count of a specific letter in the text.
        
        :param texts: A list of strings to encode.
        :param batch_size: Not used in this simple encoder, just for compatibility.
        :return: A numpy array of shape (len(texts), len(self.dimensions))
        """
        # If a single string is provided, wrap it in a list
        if isinstance(texts, str):
            texts = [texts]
            
        # Define the columns to be the alphabet (a-z)
        dimensions = string.ascii_lowercase    
        
        # Initialize an array to hold the encodings
        encodings = np.zeros((len(texts), len(dimensions)), dtype=float)
        
        # Encode each text
        for i, text in enumerate(texts):
            # Normalize the text to lower case and filter out non-alphabetic characters
            text = text.lower()
            text = filter(str.isalpha, text)

            # Count the occurrences of each letter
            letter_counts = Counter(text)
            
            # Fill the encoding array with the counts for each letter
            for j, letter in enumerate(dimensions):
                encodings[i, j] = float(letter_counts[letter])
        
        return encodings

homemade_model = SimpleTextEncoder()

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model by name
# (presumably fetched and cached by the library on first use — TODO confirm)
pretrained_model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:

# Choose the encoder to use (swap the comments to try the homemade one)
#model = homemade_model
model = pretrained_model

# The example corpus
sentences = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "Someone in a gorilla costume is playing a set of drums."
]

# Turn every sentence into a vector (one row per sentence)
embeddings = model.encode(sentences)

# Report the embedding size and preview the first sentence's vector
first_embedding = embeddings[0]
print("Dimensions in embedding:", len(first_embedding))

print(sentences[0])
print("Start of embedding of the first sentence:", first_embedding[:5])

# For the letter-count encoder, label the first five columns with their letters
if isinstance(model, SimpleTextEncoder):
    for label, count in zip("ABCDE", first_embedding):
        print(f"{label}: {count}")


In [None]:
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
from ipywidgets import interactive, HBox, VBox

# Assuming 'embeddings' is a 2D numpy array and 'sentences' is the list of sentences
# We will create a function to plot any two dimensions

def plot_embeddings(dim1, dim2):
    """Scatter-plot the sentences using two embedding dimensions as the axes.

    Reads the module-level ``embeddings`` array and ``sentences`` list.

    :param dim1: Zero-based index of the embedding column shown on the x-axis.
    :param dim2: Zero-based index of the embedding column shown on the y-axis.
    """
    # One row per sentence: the two selected coordinates plus the text label
    df = pd.DataFrame({
        'Dim 1': embeddings[:, dim1],
        'Dim 2': embeddings[:, dim2],
        'Sentence': sentences,
    })

    # The title shows the dimensions 1-based, hence the +1
    title = f'Sentence Embeddings Visualization (Dimensions {dim1+1} vs {dim2+1})'

    # Draw the labelled 2D scatter plot
    fig = px.scatter(df, x='Dim 1', y='Dim 2', text='Sentence', title=title)
    fig.update_traces(textposition='top center').update_layout(transition_duration=500)
    fig.show()

# Wire plot_embeddings to two integer sliders (indices 0-9), one per plotted axis
slider_1 = interactive(plot_embeddings, dim1=(0, 9), dim2=(0, 9))

# Lay out the slider controls side by side, with the last child (the plot) underneath
*slider_controls, plot_output = slider_1.children
display(VBox([HBox(slider_controls), plot_output]))
slider_1.update()

In [None]:
from sentence_transformers import util

# Pairwise cosine similarities: entry [i][j] compares sentence i with sentence j
cos_sim = util.cos_sim(embeddings, embeddings)

# Collect every unordered pair (i < j) once, together with its similarity score
n_sentences = len(cos_sim)
all_sentence_combinations = [
    [cos_sim[i][j], i, j]
    for i in range(n_sentences - 1)
    for j in range(i + 1, n_sentences)
]

# Rank the pairs from most to least similar
all_sentence_combinations.sort(key=lambda pair: pair[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[:5]:
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))

In [None]:
from sentence_transformers import util

# Pick ONE query. The alternative stays commented out so it is obvious which
# one is active (previously the first assignment was silently overwritten).
#query = "A man is eating food." # Perfect match
query = "jam" # Rely on the model's pre-training

# Encode the query sentence
query_embedding = model.encode(query)

# Compute cosine similarity between the query embedding and all sentence embeddings;
# [0] takes the scores belonging to the single query
cos_similarities = util.cos_sim(query_embedding, embeddings)[0]

# Find the index of the sentence with the highest similarity
closest_idx = cos_similarities.argmax()

# Look up the closest sentence and its similarity score
closest_sentence = sentences[closest_idx]
similarity_score = cos_similarities[closest_idx].item()

print("Closest sentence:", closest_sentence)
print("Similarity score:", similarity_score)
    

# Task 1: Implement your own embedding model. Bonus points for using ChatGPT! (15 minutes)