<a href="https://colab.research.google.com/github/juyalm/AITest/blob/main/vectorDb/vector_hello_world.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/supabase/supabase/blob/master/examples/ai/vector_hello_world.ipynb)

### Calculate Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# The sentences to encode.
sentences = ["Hello", "Hi", "Namaste"]

# 2. Calculate embeddings by calling model.encode().
#    The result has one row per sentence, so len(embeddings) is the number of
#    sentences; the length of each embedding vector is the second dimension.
embeddings = model.encode(sentences)
print(f"Number of embeddings: {embeddings.shape[0]}")
print(f"Your embedding length is {embeddings.shape[1]}")
print(embeddings)

# 3. Calculate the pairwise embedding similarities. Comparing the embeddings
#    with themselves yields a square matrix with one row/column per sentence.
similarities = model.similarity(embeddings, embeddings)
print(similarities)


Your embedding length is 3
[[-0.06277177  0.05495887  0.05216487 ...  0.05090595  0.05148351
   0.00709218]
 [-0.09047623  0.04043961  0.02390564 ...  0.06815039  0.01786564
   0.01127624]
 [-0.0806225   0.04730476  0.01811497 ... -0.05458119  0.01905414
   0.04276026]]
tensor([[1.0000, 0.8072, 0.3188],
        [0.8072, 1.0000, 0.2664],
        [0.3188, 0.2664, 1.0000]])


### Semantic or similarity search



In [None]:
from sentence_transformers import SentenceTransformer

# Switch to a different pretrained model for the query/passage example.
model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")

# Candidate passages that will be scored against the query.
passages = [
    "London is known for its financial district",
    "London has 9,787,426 inhabitants at the 2011 census",
    "The United Kingdom is the fourth largest exporter of goods in the world",
]

query_embedding = model.encode("How big is London")
passage_embeddings = model.encode(passages)

# One similarity score per passage, in the same order as `passages`.
similarity = model.similarity(query_embedding, passage_embeddings)
print(similarity)

tensor([[0.5517, 0.5627, 0.3164]])


### Get similarity score between 2 sentences using cosine similarity

In [None]:
from sentence_transformers import InputExample, SentenceTransformer, losses, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode each sentence separately, then score the pair.
first_sentence = "This is a red cat with a hat."
second_sentence = "Have you seen my red cat?"
emb1 = model.encode(first_sentence)
emb2 = model.encode(second_sentence)

# Cosine similarity between the two sentence embeddings.
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.6153]])


### Semantic Search 2

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# First list: each sentence becomes one row of the similarity matrix.
sentences1 = [
    "India is a big country",
    "India is diverified",
    "Australia is even bigger than India",
]

# Second list: each sentence becomes one column of the similarity matrix.
sentences2 = [
    "India is amongst the vast countries in the world",
    "Australia is a continent",
    "Nepal is a tiney country",
]

# Embed both lists, then score every sentence1/sentence2 combination.
embeddings1 = model.encode(sentences1)
embeddings2 = model.encode(sentences2)
similarities = model.similarity(embeddings1, embeddings2)

# Print each sentence from the first list followed by its score against every
# sentence from the second list. `<30` left-aligns and pads to 30 characters;
# longer sentences are printed in full.
for sentence1, score_row in zip(sentences1, similarities):
    print(sentence1)
    for sentence2, score in zip(sentences2, score_row):
        print(f" - {sentence2: <30}: {score:.4f}")

India is a big country
 - India is amongst the vast countries in the world: 0.8375
 - Australia is a continent      : 0.3187
 - Nepal is a tiney country      : 0.3991
India is diverified
 - India is amongst the vast countries in the world: 0.6734
 - Australia is a continent      : 0.2808
 - Nepal is a tiney country      : 0.3424
Australia is even bigger than India
 - India is amongst the vast countries in the world: 0.6213
 - Australia is a continent      : 0.6686
 - Nepal is a tiney country      : 0.2844


# HuggingFace Embeddings
Hugging Face sentence-transformers is a Python framework for state-of-the-art sentence, text and image embeddings. You can use these embedding models from the HuggingFaceEmbeddings class.

In [None]:
# Quietly install/upgrade (-q, -U) the package that provides the
# `langchain_huggingface` module imported in the next cell.
%pip install -qU langchain-huggingface

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

text = "This is a test document."

# Wrap the sentence-transformers model behind LangChain's embeddings interface
# and embed a single piece of text.
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
query_result = hf_embeddings.embed_query(text)

# Show the vector size and a short preview instead of dumping every value,
# which floods the cell output and bloats the saved notebook.
print(f"Embedding dimension: {len(query_result)}")
print(query_result[:5])

[-0.038338541984558105, 0.12346471846103668, -0.02864297851920128, 0.05365270376205444, 0.008845366537570953, -0.03983934596180916, -0.07300589233636856, 0.04777132719755173, -0.030462471768260002, 0.05497974902391434, 0.08505292981863022, 0.03665666654706001, -0.005319973453879356, -0.002233141800388694, -0.06071099638938904, -0.027237920090556145, -0.01135166734457016, -0.042437683790922165, 0.00912993960082531, 0.10081552714109421, 0.07578728348016739, 0.06911715865135193, 0.009857431054115295, -0.0018377641681581736, 0.02624903991818428, 0.03290243074297905, -0.07177437096834183, 0.028384247794747353, 0.06170954555273056, -0.052529532462358475, 0.033661652356386185, 0.07446812838315964, 0.07536034286022186, 0.03538404777646065, 0.06713404506444931, 0.010798045434057713, 0.08167017996311188, 0.016562897711992264, 0.03283063694834709, 0.036325663328170776, 0.0021727988496422768, -0.09895738214254379, 0.0050467848777771, 0.05089650675654411, 0.009287580847740173, 0.024507684633135796,

# Similarity Search with a document

In [None]:
%pip install -qU langchain-community
%pip install -qU langchain-community unstructured
!pip install unstructured
!pip install pdfminer
!pip install pdfminer.six
!pip install pi_heif
!pip install unstructured_inference
!apt-get install poppler-utils
!pip install unstructured_pytesseract
!apt-get install tesseract-ocr
!apt-get install libtesseract-dev


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.

Load environment variables

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pi_heif import register_heif_opener
from dotenv import load_dotenv
import os
#Instead of importing open_filename from the old pdfminer library
#we will rely on the unstructured library and other supporting libraries to do the heavy lifting
#All the required dependencies should have already been installed

In [None]:


# Read credentials from a local .env file into the process environment, then
# pick up the three settings this notebook uses.
load_dotenv('.env')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
open_ai_key = os.getenv('OPENAI_API_KEY')
pinecone_env_key = os.getenv('PINECONE_ENV_KEY')

# Never print secret values: cell output is saved inside the notebook file and
# is published with it. Report only whether each variable was found.
for _name, _value in (
    ('PINECONE_API_KEY', pinecone_api_key),
    ('OPENAI_API_KEY', open_ai_key),
    ('PINECONE_ENV_KEY', pinecone_env_key),
):
    print(f"{_name}: {'set' if _value else 'MISSING'}")

[REDACTED: Pinecone API key removed from saved output; rotate this key]
[REDACTED: OpenAI API key removed from saved output; rotate this key]
us-east-1


In [None]:
# Absolute path of the PDF to index.
# NOTE(review): this looks like a Colab runtime path; adjust when running elsewhere.
pdf_path = "/content/sample_data/Java_8_in_Action.pdf"

# Parse the PDF into LangChain document objects.
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()

In [None]:
# Summarize what the loader returned: the document count, the size of the
# first document's text, and its first 300 characters.
first_doc_text = data[0].page_content
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(first_doc_text)} characters in your sample document')
print(f'Here is a sample: {first_doc_text[:300]}')

You have 1 document(s) in your data
There are 833480 characters in your sample document
Here is a sample: IN ACTION

ld

Mario Fusco Alan My

p

Raoul-Gabr

Ba MANNING

www. it-ebooks.info

Java 8 in Action: Lambdas, streams, and functional-style programming

Raoul-Gabriel Urma, Mario Fusco, and Alan Mycroft

(BY Mannine pusticarions

2

www. it-ebooks.info

Copyright

For online information and orderin


In [None]:
# Break the loaded document into chunks of at most 500 characters, with 50
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(data)

In [None]:
# Report how many chunks the splitter produced.
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 2337 documents


In [None]:
!pip install pinecone-client -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained model loaded into `model` for the cells that follow.
embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import pinecone
from pinecone import Pinecone

# Build a client object instead of calling the module-level `pinecone.init`.
# NOTE(review): this rebinds the name `pinecone` from the imported module to
# the client instance; later code that uses `pinecone` gets the client.
pinecone = Pinecone(
    api_key=pinecone_api_key,
    environment=pinecone_env_key,
)

# Open a handle to the index named "demo".
# NOTE(review): presumably the index already exists; the original notebook kept
# `pinecone.create_index("demo", dimension=384)` commented out for creating it.
index = pinecone.Index("demo")

In [None]:
# Upload the document chunks to Pinecone in batches. Sending everything in a
# single upsert call creates a request payload above the 4MB limit.
batch_size = 100  # Adjust this value as needed

# Read the current vector count ONCE and number the new vectors from there.
# Querying it per chunk costs one remote call per chunk, and because the count
# changes while batches are being written, it makes the generated ids skip
# unpredictably.
# NOTE(review): re-running this cell appends the same chunks again under new ids.
start_id = index.describe_index_stats()['total_vector_count']

for batch_start in range(0, len(texts), batch_size):
    batch_docs = texts[batch_start:batch_start + batch_size]
    batch_contents = [doc.page_content for doc in batch_docs]

    # Encode the whole batch in one call rather than one chunk at a time.
    batch_vectors = model.encode(batch_contents).tolist()

    # Each record is (id, embedding values, metadata); the chunk text is stored
    # as metadata under the 'content' key.
    upserted_data = [
        (str(start_id + batch_start + offset), vector, {'content': content})
        for offset, (vector, content) in enumerate(zip(batch_vectors, batch_contents))
    ]
    index.upsert(vectors=upserted_data)

In [None]:
query = "explain lambda functions"

# Embed the query text with `model` and convert it to a plain list for the API.
query_em = model.encode(query).tolist()

# Request the 2 closest vectors, using keyword arguments. Metadata is not
# requested, so the stored chunk text is not part of the response.
result = index.query(
    vector=query_em,
    top_k=2,
    include_metadata=False,
)
print(result)

{'matches': [{'id': '2268', 'score': 0.69456166, 'values': []},
             {'id': '491', 'score': 0.683633804, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}


In [None]:
# Install git through apt-get; -qq suppresses most of apt's output.
!apt-get install -qq git