# Embeddings and Sentence Similarities

In [None]:
!pip install sentence-transformers tiktoken

In [4]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

In [5]:
# Load the embedding model.
# This notebook uses gte-small as published by Supabase on Hugging Face:
# https://huggingface.co/Supabase/gte-small
# Any other Hugging Face embedding model name can be substituted here.

MODEL_NAME = 'Supabase/gte-small'
model = SentenceTransformer(MODEL_NAME)

No sentence-transformers model found with name Supabase/gte-small. Creating a new one with MEAN pooling.


In [6]:
"""
Encode one sentence with the loaded model and inspect the resulting vector
"""

embeddings = model.encode("I like Python programing")

# Report the dimensionality first, then display the raw vector itself
embedding_length = len(embeddings)
print(f"the length of the embedding vector is {embedding_length}")
embeddings

the length of the embedding vector is 384


array([-6.73097491e-01, -4.71553117e-01,  7.92207941e-02, -4.01285559e-01,
       -7.17622265e-02, -1.38925672e-01,  3.35828155e-01,  2.56369591e-01,
        2.25289520e-02,  3.36204357e-02, -2.89838374e-01, -6.86789393e-01,
        4.86794055e-01,  2.61979014e-01, -9.76839513e-02, -2.52809286e-01,
        2.07800120e-02, -1.89304184e-02, -3.48913461e-01,  8.48381668e-02,
        1.59663633e-01, -2.12455124e-01, -4.23649013e-01, -1.02164638e+00,
       -1.95467919e-02,  8.37713778e-01, -2.95442641e-01, -3.57815683e-01,
       -1.60214499e-01, -1.29963005e+00,  1.48330862e-02, -2.96227276e-01,
        6.15166247e-01, -2.47573540e-01, -1.04925102e-02,  9.18918923e-02,
       -7.85714239e-02,  7.93742761e-02, -5.00591576e-01,  4.96645123e-01,
        3.19249660e-01,  1.01673068e-03, -1.99556425e-01, -3.76710683e-01,
       -2.29374900e-01, -7.59295404e-01, -4.37365860e-01, -2.10281089e-01,
        1.39861107e-01, -3.03207129e-01, -1.64122246e-02, -3.51231426e-01,
       -1.48414329e-01,  

In [7]:
"""
Embeddings let us measure how similar two sentences are.
In this notebook the measure is cosine similarity.

Cosine similarity is the cosine of the angle between the two embedding vectors.

It ranges from -1 (minimum similarity) to 1 (maximum similarity).
"""

first_sentence = 'The new movie is awesome'
second_sentence = 'This recent movie is so good'

embeddings1 = model.encode(first_sentence)
embeddings2 = model.encode(second_sentence)

cos_sim(embeddings1, embeddings2)

tensor([[0.8980]])

In [8]:
# Sentences about unrelated topics

movie_sentence = 'The new movie is awesome'
python_sentence = 'I like Python programming'

embeddings1 = model.encode(movie_sentence)
embeddings2 = model.encode(python_sentence)

cos_sim(embeddings1, embeddings2)

tensor([[0.7360]])

In [30]:
# The same sentence encoded twice and compared with itself

repeated_sentence = 'The new movie is awesome'

embeddings1 = model.encode(repeated_sentence)
embeddings2 = model.encode(repeated_sentence)

cos_sim(embeddings1, embeddings2)

tensor([[1.0000]])

In [31]:
# Pairwise comparison: sentence k of the first list vs. sentence k of the second
sentences1 = [
    "The cat sits outside",
    "A man is playing guitar",
    "The new movie is awesome",
]

sentences2 = [
    "The dog plays in the garden",
    "A woman watches TV",
    "The new movie is so great",
]

# Embed each list in one call, keeping the results as tensors
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Similarity scores between the two sets of embeddings
cosine_scores = cos_sim(embeddings1, embeddings2)

# Print only the diagonal entries, i.e. the score of each aligned pair
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for idx, (left, right) in enumerate(zip(sentences1, sentences2)):
    print(row_template.format(left, right, cosine_scores[idx][idx]))

The cat sits outside 		 The dog plays in the garden 		 Score: 0.8200
A man is playing guitar 		 A woman watches TV 		 Score: 0.7016
The new movie is awesome 		 The new movie is so great 		 Score: 0.9697


# Semantic Search

In [18]:
"""
Semantic search is a common application of sentence embeddings.

We embed a small list of documents plus a query, then look up the documents
that are most similar to the query.
"""

from sentence_transformers.util import semantic_search

docs = [
    "A man ate food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

docs_embeddings = model.encode(docs, convert_to_tensor=True)

# Swap in any of the alternative queries below and compare the results
query = "I am hungry"
#query = "Tell me about music"
#query = "What is moving?"

query_embedding = model.encode(query, convert_to_tensor=True)

# top_k sets how many results are requested
hits = semantic_search(query_embedding, docs_embeddings, top_k=2)

# hits[0] is iterated here: each entry carries a corpus_id and a score
for match in hits[0]:
    matched_doc = docs[match['corpus_id']]
    print(matched_doc, "(Score: %.4f)" % match['score'])

A man ate food. (Score: 0.8466)
A man is eating a piece of bread. (Score: 0.8153)


In [None]:
"""
Look into ReRanking for better results
https://sbert.net/examples/applications/retrieve_rerank/README.html
"""

# Sentence Tokenization

Why use tokens?

> By breaking words into smaller parts (tokens), LLMs can better handle new or unusual words by understanding their building blocks. It also helps the model grasp the nuances of language, such as different word forms and contextual meanings.

[source](https://kelvin.legal/understanding-large-language-models-words-versus-tokens/#:~:text=By%20breaking%20words%20into%20smaller,word%20forms%20and%20contextual%20meanings.)

In [37]:
import tiktoken

sent = "If we split a text by number of characters, it is not obvious how many tokens these chunks will be."

# Naive word count: split on whitespace
whitespace_words = sent.split()
print("Split by whitespace: %s" % len(whitespace_words))

# Token ids under the cl100k_base encoding
enc = tiktoken.get_encoding("cl100k_base")
encoded = enc.encode(sent)
print("encoded sentence: %s" % len(encoded))

# Raw bytes behind each token id
tokens = list(map(enc.decode_single_token_bytes, encoded))
print("tokens: %s" % tokens)
print(len(tokens))

# Decode the ids back to text and count its whitespace-separated words
decoded = enc.decode(encoded)
print("reconstructed words: %s" % len(decoded.split()))
decoded


Split by whitespace: 20
encoded sentence: 22
tokens: [b'If', b' we', b' split', b' a', b' text', b' by', b' number', b' of', b' characters', b',', b' it', b' is', b' not', b' obvious', b' how', b' many', b' tokens', b' these', b' chunks', b' will', b' be', b'.']
22
reconstructed words: 20


'If we split a text by number of characters, it is not obvious how many tokens these chunks will be.'

In [None]:
import tiktoken

def split_large_text(large_text, max_tokens):
    """Split a text into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized with tiktoken's ``cl100k_base`` encoding, cut into
    runs of ``max_tokens`` tokens (the last run may be shorter), and each run
    is decoded back into a string.

    Args:
        large_text: The text to split.
        max_tokens: Maximum number of tokens per chunk; an integer of at least 1.

    Returns:
        A list of chunk strings in document order. Trailing spaces, periods,
        commas and semicolons are stripped from every chunk, so joining the
        chunks does not necessarily reproduce ``large_text``. An empty token
        list yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    # Validate before touching the tokenizer: a limit below 1 cannot be
    # honoured and previously degraded silently into one-token chunks.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be at least 1, got {max_tokens!r}")

    enc = tiktoken.get_encoding("cl100k_base")
    tokenized_text = enc.encode(large_text)

    # Walk the token list in strides of max_tokens instead of counting by hand.
    # NOTE(review): cuts fall on token boundaries, so a multi-byte character
    # whose bytes span two tokens may not decode cleanly - confirm for
    # non-ASCII input.
    return [
        enc.decode(tokenized_text[start:start + max_tokens]).rstrip(' .,;')
        for start in range(0, len(tokenized_text), max_tokens)
    ]

In [15]:
# Two-sentence sample text, split into chunks of at most 10 tokens each
doc = """If we split a text by number of characters, it is not obvious how many tokens these chunks will be.
And at the same time if we want to split a text into bigger possible chunks and keep these chunks under certain LLM tokens limit, we cannot operate by number of characters."""
split_large_text(large_text=doc, max_tokens=10)

['If we split a text by number of characters',
 ' it is not obvious how many tokens these chunks will',
 ' be.\nAnd at the same time if we want',
 ' to split a text into bigger possible chunks and keep',
 ' these chunks under certain LLM tokens limit, we',
 ' cannot operate by number of characters']

# RAG workflow

In [38]:
# Prompt template for the RAG step: `{context}` is the slot for the retrieved
# passages and `{question}` the slot for the user's question; fill both with
# prompt_template.format(context=..., question=...).
# The template is bound to a name because a bare string expression in the
# middle of a cell is evaluated and then discarded.
prompt_template = """Read the following context, then answer the question based on the context only

Context:

{context}


Question:

{question}

"""

# Ask the user for the question interactively; input() waits for a line of text.
question = input("Enter the question: ")

In [39]:
# Display the question captured by the input() cell.
question

'Tell me about the new iPad'

In [None]:
# semantic_search compares embedding tensors, not raw text, so embed the
# question first and search it against the document embeddings.
question_embedding = model.encode(question, convert_to_tensor=True)
question_hits = semantic_search(question_embedding, docs_embeddings, top_k=2)

# question_hits[0] holds the matches for our single query; map each hit back
# to the text of the document it refers to.
context = [docs[hit['corpus_id']] for hit in question_hits[0]]

# Illustrative shape of the result for a corpus about the iPad:
# ["features are xxxxxx", "difference from previous versions xxxxx"]
context



In [None]:
"""Read the following context, then answer the question based on the context only

Context:

Apple released the iPad with xxxxxx

It is different from previous....


Question:

Tell me about the new iPad
"""

# How to read PDF in Python

Here we are only reading the text. For images, tables, and formulas, we need to use OCR-based solutions such as Nougat from Facebook.

In [2]:
!pip install pymupdf

Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/10/be/c1a8afad3a3c1a10023548dc037c6b86b5ab8c234b6b8bc53a89c8d26051/PyMuPDF-1.24.3-cp311-none-macosx_11_0_arm64.whl.metadata
  Downloading PyMuPDF-1.24.3-cp311-none-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Obtaining dependency information for PyMuPDFb==1.24.3 from https://files.pythonhosted.org/packages/7e/4a/27e4e2ce8f5d0ed1d1b2a1f7807f6158db1e8e547a7bf76ac462a800a4b4/PyMuPDFb-1.24.3-py3-none-macosx_11_0_arm64.whl.metadata
  Downloading PyMuPDFb-1.24.3-py3-none-macosx_11_0_arm64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.3-cp311-none-macosx_11_0_arm64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.24.3-py3-none-macosx_11_0_arm64.whl (14.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [3]:
import fitz

# Open the PDF with PyMuPDF (imported as `fitz`).
# NOTE(review): this rebinds `doc`, which held a plain string in the chunking
# example earlier, and no close() call is visible in this part of the notebook.
doc = fitz.open("icis24a-sub2705-i8.pdf")

In [5]:
# Concatenate the text of every page, in page order
text = ''.join(page.get_text() for page in doc)

In [6]:
# Display the full extracted text (the output is long for this paper).
text

'Generative AI and Evaluations of Early-Stage Innovations\nGenerative AI and Evaluations of Early-Stage\nInnovations\nShort Paper\nIntroduction\nThe rapid advancement of AI is creating unprecedented opportunities for generating novel ideas\ncost-effectively and at scale across a range of innovative contexts, such as crowdsourcing (Boussioux et al.,\n2023), consumer products (Girotra et al., 2023), knowledge work (Dell’Acqua et al., 2023), and creative\nwriting (Doshi & Hausman, 2023). The unparalleled growth in ideas necessitates alternative screening\nmethods. Large Language Models (LLMs) offer a promising approach to assist experts in filtering and\nprioritizing ideas with the potential for high impact. This research investigates how the use and design of\nAI-driven evaluation tools influence experts’ ability to discern and make pivotal decisions on advancing or\nrejecting novel ideas.\nRecent studies have investigated the potential of human-AI collaboration in decision-making proces