In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.vector_stores.pinecone import PineconeVectorStore
from uuid import uuid4

import pinecone
# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time
import json


In [2]:
pinecone_api_key = "<api_key>"
# Initialize a Pinecone client with your API key
pc = Pinecone(api_key=pinecone_api_key)

In [5]:
with open('app/context.json', 'r') as j:
     dict_doc = json.loads(j.read())

In [25]:
all_docs_combs = [x.replace("â€¢","") for x in list(dict_doc.values())]

In [26]:
# Convert the text into numerical vectors that Pinecone can index
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=all_docs_combs,
    parameters={
        "input_type": "passage", 
        "truncate": "END"
    }
)

print(embeddings)

EmbeddingsList(
  model='multilingual-e5-large',
  vector_type='dense',
  data=[
    {'vector_type': dense, 'values': [0.03155517578125, -0.024444580078125, ..., -0.042755126953125, -0.016937255859375]},
    {'vector_type': dense, 'values': [0.019195556640625, -0.02618408203125, ..., -0.050506591796875, -0.0233306884765625]},
    ... (17 more embeddings) ...,
    {'vector_type': dense, 'values': [0.0196380615234375, -0.021942138671875, ..., -0.060333251953125, -0.0205230712890625]},
    {'vector_type': dense, 'values': [0.0182952880859375, -0.0257415771484375, ..., -0.059906005859375, -0.033355712890625]}
  ],
  usage={'total_tokens': 3078}
)


In [24]:
# Create a serverless index
index_name = "genaihackathon"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    ) 

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [27]:
ids = [str(uuid4()) for x in range(0,len(all_docs_combs))]
dict_doc = {ids[x]:all_docs_combs[x] for x in range(0,len(all_docs_combs))}

In [28]:
# Target the index
# In production, target an index by its unique DNS host, not by its name
# See https://docs.pinecone.io/guides/data/target-an-index
index = pc.Index(index_name)

# Prepare the records for upsert
# Each contains an 'id', the vector 'values', 
# and the original text and category as 'metadata'
records = []
for d, e in zip(dict_doc, embeddings):
    records.append({
        "id":d,
        "values": e["values"]
    })

# Upsert the records into the index
index.upsert(
    vectors=records,
    namespace=index_name
)

upserted_count: 21

In [29]:
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'genaihackathon': {'vector_count': 0}},
 'total_vector_count': 0}


In [36]:
# Define your query
query = "Could you tell me about payment methods and return policies?"

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace=index_name,
    vector=query_embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)
print('Results obtained.')


Results obtained.


In [37]:
top_result_uuid = results['matches'][0]['id']
print(dict_doc[top_result_uuid])
print(results)

 Refund Policy  Refunds will be issued to the original payment method.  Shipping costs are non-refundable unless the return is due to a manufacturing defect or an error on our part.  If the original payment method is no longer available, store credit will be issued.6  Payment Policies  We accept major credit/debit cards, PayPal, and other payment methods as listed at checkout.  Orders are charged at the time of purchase.  In case of a refund, the amount will be credited back to the original payment method within 7-10 business days.
{'matches': [{'id': '50422b96-cc14-45be-b3f3-1619facbe523',
              'metadata': None,
              'score': 0.87214035,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '2f7b0306-e726-4beb-85f2-03b3a8e787bd',
              'metadata': None,
              'score': 0.8676379,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '921ca8