In [30]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Two paraphrases of the same opinion, used to sanity-check the embedding model.
sentences = ['The new movie is awesome', 'This recent movie is so good']

# 'Supabase/gte-small' is the Hugging Face model id of the embedding model.
model = SentenceTransformer('Supabase/gte-small')

# Encode both sentences in one batch, then compare the two resulting vectors.
embeddings = model.encode(sentences)
similarity = cos_sim(embeddings[0], embeddings[1])
print(similarity)

No sentence-transformers model found with name Supabase/gte-small. Creating a new one with MEAN pooling.


tensor([[0.8980]])


In [31]:
# Encode a single string (not a list); this rebinds `embeddings` to one vector.
embeddings = model.encode(sentences[0])
# len() of that vector is the embedding size (384 in the recorded output).
print(f'the length is {len(embeddings)}')
# Bare last expression so the notebook displays the vector itself.
embeddings

the length is 384


array([-7.12436140e-01, -3.50771216e-03,  1.76455945e-01, -1.33495301e-01,
        4.07740384e-01,  3.17245930e-01,  4.26221997e-01,  2.44826242e-01,
       -1.64955840e-01, -2.92247772e-01,  4.60436910e-01, -4.21748430e-01,
        1.09906569e-01,  3.93392622e-01,  2.84923017e-02, -4.13255662e-01,
        7.51393318e-01, -3.56660157e-01, -6.55718803e-01,  1.43724188e-01,
        1.76584885e-01, -3.10462266e-01,  1.59971640e-01, -5.56617916e-01,
        1.34771049e-01,  3.68468553e-01, -2.38507062e-01, -2.50233263e-01,
       -3.96164477e-01, -1.82644939e+00, -1.02361999e-01, -3.07933509e-01,
        4.00420189e-01, -1.31331339e-01, -4.28220749e-01, -3.80465984e-01,
       -1.56796917e-01,  4.16277677e-01, -2.50473171e-01,  9.31331590e-02,
       -1.10003598e-01,  2.37232283e-01, -1.55720711e-01, -6.32615507e-01,
       -2.44624645e-01, -1.58663377e-01,  4.78389598e-02, -4.13466357e-02,
        5.62601388e-01, -3.75526935e-01,  3.29896390e-01, -7.05497622e-01,
        1.53233483e-01, -

In [32]:
# Encode each sentence with its own encode() call (lower-cased variants of the
# sentences used in the first cell), then compare the two vectors.
embeddings1, embeddings2 = (
    model.encode(text)
    for text in ("the new movie is awesome", "this recent movie is so good")
)
cos_sim(embeddings1, embeddings2)

tensor([[0.8980]])

In [33]:
# Sanity check: the same sentence encoded twice should compare as identical.
same_sentence = "the new movie is awesome"
embeddings1 = model.encode(same_sentence)
embeddings2 = model.encode(same_sentence)
cos_sim(embeddings1, embeddings2)

tensor([[1.]])

In [34]:
# Two parallel lists: sentences1[k] is compared against sentences2[k].
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The new movie is awesome",
]
sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
]

# Encode each list as one batch, asking for tensors instead of the default arrays.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Similarity of every sentence in the first list against every one in the second.
cosine_scores = cos_sim(embeddings1, embeddings2)

# Report only the diagonal, i.e. the score of each aligned pair.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for i, first in enumerate(sentences1):
    print(row_template.format(first, sentences2[i], cosine_scores[i][i]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.8200
A man is playing guitar 		 A woman watches TV 		 Score: 0.7016
The new movie is awesome 		 The new movie is so great 		 Score: 0.9697


In [50]:
# Two parallel lists: sentences1[k] is compared against sentences2[k].
# The input text is kept verbatim so the recorded scores stay reproducible.
sentences1 = [
    "I got a great UXD work",
    "A man is singing",
    "TI have a hambuger for lunch",
]
sentences2 = [
    "Kevin got a great SDE job",
    "A woman is playing violin",
    "The new song is so great",
]

# Encode each list as one batch, asking for tensors instead of the default arrays.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Similarity of every sentence in the first list against every one in the second.
cosine_scores = cos_sim(embeddings1, embeddings2)

# Report only the diagonal, i.e. the score of each aligned pair.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for i, first in enumerate(sentences1):
    print(row_template.format(first, sentences2[i], cosine_scores[i][i]))

I got a great UXD work 		 Kevin got a great SDE job 		 Score: 0.8084
A man is singing 		 A woman is playing violin 		 Score: 0.7399
TI have a hambuger for lunch 		 The new song is so great 		 Score: 0.7397


In [35]:
# Sample inputs for the embedding model.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# A single encode() call embeds the whole list.
embeddings = model.encode(sentences)

# Show each sentence next to its vector, with an empty line between entries.
for text, vector in zip(sentences, embeddings):
    print("Sentence:", text)
    print("Embedding:", vector)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-4.56867069e-01 -6.03743754e-02  2.77015604e-02 -1.49323896e-01
 -2.58533675e-02  3.99035245e-01 -1.55862188e-02  2.39103243e-01
  9.95151699e-02  1.49657741e-01 -3.45251888e-01 -4.33489054e-01
  6.84537232e-01  2.49792948e-01  3.92542332e-01  3.05619389e-01
 -2.38010958e-01  3.97295952e-01 -4.60436285e-01 -1.37540236e-01
  5.90817750e-01 -2.84304023e-01  1.05978601e-01 -5.92266202e-01
 -1.59350500e-01  4.13091540e-01 -1.64931938e-01 -7.34145567e-02
 -3.01011831e-01 -1.89854705e+00  2.36649904e-02 -5.51726222e-01
  7.99842119e-01 -4.33843397e-02 -2.60188192e-01 -1.74996153e-01
 -4.91537094e-01  4.09644186e-01 -1.80871382e-01  2.30171129e-01
  2.36194983e-01  2.71462083e-01  2.17981711e-02 -6.09191656e-01
 -2.04823703e-01 -5.56082666e-01 -6.08014226e-01  7.79330730e-05
 -8.24697167e-02 -2.05188528e-01 -7.09771067e-02 -4.21118557e-01
 -9.76329967e-02  8.62648115e-02  2.12224156e-01  1.12527296e-01
  2.59943

In [36]:
from sentence_transformers.util import semantic_search

# Small corpus to search over.
docs = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

# Embed the corpus once, as tensors, so it can be searched directly.
docs_embeddings = model.encode(docs, convert_to_tensor=True)

query = "tell me about music"
query_embedding = model.encode(query, convert_to_tensor=True)

# Keep only the 2 best-scoring documents for the query.
hits = semantic_search(query_embedding, docs_embeddings, top_k=2)

# hits[0] holds the matches for our single query; each hit carries the index of
# the matched document ('corpus_id') and its similarity ('score').
for hit in hits[0]:
    print(docs[hit['corpus_id']], "(Score: %.4f)" % hit['score'])

A woman is playing violin. (Score: 0.7764)
A monkey is playing drums. (Score: 0.7570)


In [51]:
# Change the query to ask about food and add more food-related documents to the corpus; the top hits change accordingly.
from sentence_transformers.util import semantic_search

# Corpus to search over, including several food-related sentences.
docs = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is eating a piece of potato.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "The burger king is so delicious.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

# Embed the corpus once, as tensors, so it can be searched directly.
docs_embeddings = model.encode(docs, convert_to_tensor=True)

query = "tell me about food"
query_embedding = model.encode(query, convert_to_tensor=True)

# Keep only the 4 best-scoring documents for the query.
hits = semantic_search(query_embedding, docs_embeddings, top_k=4)

# hits[0] holds the matches for our single query; each hit carries the index of
# the matched document ('corpus_id') and its similarity ('score').
for hit in hits[0]:
    print(docs[hit['corpus_id']], "(Score: %.4f)" % hit['score'])

A man is eating food. (Score: 0.8346)
The burger king is so delicious. (Score: 0.8088)
A man is eating a piece of potato. (Score: 0.7784)
A man is eating a piece of bread. (Score: 0.7758)


In [38]:

import tiktoken

def split_large_text(large_text: str, max_tokens: int, encoding_name: str = "cl100k_base") -> list:
    """Split ``large_text`` into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized with the tiktoken encoding named ``encoding_name``,
    the token ids are cut into runs of ``max_tokens`` (the last run may be
    shorter), and each run is decoded back to a string.

    Trailing spaces and the characters ``.``, ``,`` and ``;`` are stripped from
    the end of every chunk, so joining the chunks does not necessarily
    reproduce the input. Cuts are made purely on token boundaries: a chunk may
    begin with a space or in the middle of a sentence.

    NOTE(review): a token boundary can presumably fall inside a multi-byte
    character, in which case the decoded chunk may contain replacement
    characters -- confirm against tiktoken's decode behaviour.

    Args:
        large_text: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be a positive
            integer.
        encoding_name: Name of the tiktoken encoding to use. Defaults to
            ``"cl100k_base"``.

    Returns:
        List of chunk strings in document order; empty if the text produces
        no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    # A non-positive budget used to degrade silently into one chunk per token.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    enc = tiktoken.get_encoding(encoding_name)
    tokenized_text = enc.encode(large_text)

    # Step through the token ids in strides of max_tokens; the final slice
    # naturally picks up any shorter remainder.
    return [
        enc.decode(tokenized_text[start:start + max_tokens]).rstrip(' .,;')
        for start in range(0, len(tokenized_text), max_tokens)
    ]

Why use tokens?

> By breaking words into smaller parts (tokens), LLMs can better handle new or unusual words by understanding their building blocks. It also helps the model grasp the nuances of language, such as different word forms and contextual meanings.

[source](https://kelvin.legal/understanding-large-language-models-words-versus-tokens/#:~:text=By%20breaking%20words%20into%20smaller,word%20forms%20and%20contextual%20meanings.)

In [39]:
import tiktoken

sent = "If we split a text by number of characters, it is not obvious how many tokens these chunks will be."

# Whitespace-separated word count, for comparison with the token count below.
word_count = len(sent.split())
print(word_count)

# Tokenize the same sentence and count the resulting token ids.
enc = tiktoken.get_encoding("cl100k_base")
encoded = enc.encode(sent)
print(len(encoded))

# Raw bytes behind each token id, to show where the tokenizer cut the text.
tokens = list(map(enc.decode_single_token_bytes, encoded))
print(tokens)
print(len(tokens))

# Decode the ids back to text and count its words again.
decoded = enc.decode(encoded)
print(len(decoded.split()))
decoded


20
22
[b'If', b' we', b' split', b' a', b' text', b' by', b' number', b' of', b' characters', b',', b' it', b' is', b' not', b' obvious', b' how', b' many', b' tokens', b' these', b' chunks', b' will', b' be', b'.']
22
20


'If we split a text by number of characters, it is not obvious how many tokens these chunks will be.'

In [52]:
# Chunk a two-sentence sample with a budget of 20 tokens per chunk.
# As the recorded output shows, cuts fall on token boundaries, so a chunk can
# start with a space and can span the newline between the two sentences.
doc = """If we split a text by number of characters, it is not obvious how many tokens these chunks will be.
And at the same time if we want to split a text into bigger possible chunks and keep these chunks under certain LLM tokens limit, we cannot operate by number of characters."""
split_large_text(doc, 20)

['If we split a text by number of characters, it is not obvious how many tokens these chunks will',
 ' be.\nAnd at the same time if we want to split a text into bigger possible chunks and keep',
 ' these chunks under certain LLM tokens limit, we cannot operate by number of characters']

In [None]:
# database - Postgres
# doc - use split_large_text(large_text, max_tokens) to split the text