In [78]:
import getpass
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
# Merge any variables from a local .env file into the process environment.
load_dotenv()

# If no key is configured (unset or empty), ask for it interactively so the
# secret never has to be written into the notebook.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

# The variable is guaranteed to exist here: it was either already set or was
# just assigned by the prompt above.
pinecone_api_key = os.environ["PINECONE_API_KEY"]

# Client handle used by the later cells to list, create and open indexes.
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model shared by the rest of the notebook: it is handed to the
# vector store, which uses it for both the uploaded chunks and the queries.
# NOTE(review): the Pinecone index below is declared with dimension=1536, so
# the model named here must produce vectors of that size - confirm before
# switching to a different model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [80]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

# Folder containing the source PDFs; change this to match your own layout.
data_path = "./books"

# Hand every file in the folder that matches the glob to PyPDFLoader.
loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)

# Loading yields one document per PDF page.
documents = loader.load()

In [67]:
# Sanity check: report how many page-level documents the loader produced.
page_count = len(documents)
print(f"Loaded {page_count} pages from PDF(s)")

Loaded 480 pages from PDF(s)


In [81]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings: chunks of at most 750 units with a 30-unit overlap between
# neighbours; length is measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=30, length_function=len)

In [82]:
# Break the page-level documents into smaller chunks using the splitter
# configured above.
chunk_documents = text_splitter.split_documents(documents)
# Left as the cell's final expression so the notebook displays the chunk count.
len(chunk_documents) 

# The result is a list of documents that are much smaller than whole pages.

1425

In [83]:
import time

# Name of the Pinecone index that stores this notebook's vectors.
index_name = "sapiens"

# Names of the indexes that already exist for this API key.
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Create the index only on first use, so re-running the cell is harmless.
if index_name not in existing_indexes:
    # The dimension has to equal the size of the vectors that will be stored.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=serverless_spec)

    # Poll once per second until the index reports that it is ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle to the (now existing) index, used by the vector store below.
index = pc.Index(index_name)

In [84]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index in a LangChain vector store; the embedding model is
# passed in so the store can embed both the documents it adds and the queries.
vector_store = PineconeVectorStore(
    embedding=embeddings,
    index=index,
)

In [85]:
import uuid

# One ID per chunk, in the same order as chunk_documents.
#
# The IDs are derived deterministically (uuid5) from each chunk's position and
# text rather than drawn at random (uuid4). Random IDs make the upload
# non-idempotent: every re-run of the notebook would insert all chunks again
# under brand-new IDs and duplicate the index contents. With stable IDs a
# re-run writes to the same records instead.
# NOTE(review): if the splitter settings or the source PDFs change, the IDs
# change too and records from the previous run stay in the index - clear the
# index first in that case.
doc_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{chunk.page_content}"))
    for position, chunk in enumerate(chunk_documents)
]

In [86]:

# How many chunks go to the vector store per request.
batch_size = 50  # Start small and adjust as needed

# Ceiling division, computed once: the number of batches needed for all chunks.
total_batches = (len(chunk_documents) + batch_size - 1) // batch_size

# Walk the chunks in steps of batch_size; enumerate supplies the 1-based batch number.
for batch_number, start in enumerate(range(0, len(chunk_documents), batch_size), start=1):
    stop = start + batch_size

    # Slice documents and IDs with the same bounds so they stay aligned.
    vector_store.add_documents(documents=chunk_documents[start:stop], ids=doc_ids[start:stop])

    print(f"Processed batch {batch_number}/{total_batches}")

Processed batch 1/29
Processed batch 2/29
Processed batch 3/29
Processed batch 4/29
Processed batch 5/29
Processed batch 6/29
Processed batch 7/29
Processed batch 8/29
Processed batch 9/29
Processed batch 10/29
Processed batch 11/29
Processed batch 12/29
Processed batch 13/29
Processed batch 14/29
Processed batch 15/29
Processed batch 16/29
Processed batch 17/29
Processed batch 18/29
Processed batch 19/29
Processed batch 20/29
Processed batch 21/29
Processed batch 22/29
Processed batch 23/29
Processed batch 24/29
Processed batch 25/29
Processed batch 26/29
Processed batch 27/29
Processed batch 28/29
Processed batch 29/29


In [101]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage

# Raw prompt text for the question-answering step. Everything between the
# triple quotes (including the indentation and line breaks) is part of the
# prompt. {question} and {context} are the two placeholders that must be
# supplied when the template is filled in.
template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

    Question: {question} 
    Context: {context} 
"""

# Turn the raw string into a chat prompt template for use in the chain.
prompt_template = ChatPromptTemplate.from_template(template)

In [102]:
def get_context(query: str, k: int = 4) -> str:
    """Retrieve the chunks most similar to ``query`` and join them into one string.

    Args:
        query: The user's question; passed to the vector store's similarity search.
        k: Number of chunks to retrieve. Defaults to 4, which matches the
            vector store's own default, so existing one-argument calls behave
            as before.

    Returns:
        The ``page_content`` of each retrieved chunk, separated by blank lines.
        An empty string if the search returns nothing.
    """
    # Named `matches` rather than `documents` so it does not shadow the
    # notebook-level list of loaded PDF pages.
    matches = vector_store.similarity_search(query=query, k=k)
    return "\n\n".join(match.page_content for match in matches)

In [104]:
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough

# Chat model that writes the final answer.
llm = ChatOpenAI(model="gpt-4o")

# First stage: turn the incoming query into the two variables the prompt needs.
# "context" is looked up via get_context; "question" is the query passed through as-is.
prompt_inputs = {
    "context": lambda user_query: get_context(user_query),
    "question": RunnablePassthrough(),
}

# Full pipeline: build the prompt variables, fill the template, call the model.
chain = prompt_inputs | prompt_template | llm