In [14]:
# import Libraries

import hashlib
import os

import langchain
import openai
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# Load environment variables
# Must run before the os.getenv(...) lookups in the following cells, which
# read the OpenAI / Pinecone settings from the environment.
load_dotenv()

True

In [15]:
# Credentials and Pinecone settings, looked up in the process environment.
# A variable that is not set yields None rather than raising.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")

# Echo only the (non-secret) index name so the reader can see which index is targeted.
print(f"Using Pinecone index: {PINECONE_INDEX_NAME}")

Using Pinecone index: ai-chatbot-2


In [16]:
# Initialize OpenAI and Pinecone
# Store the key as a module-level setting on the `openai` package.
openai.api_key = OPENAI_API_KEY

# Initialize Pinecone client
# NOTE(review): `pc` is not referenced by any later cell visible in this
# notebook -- confirm whether the client object is still needed.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Short alias consumed by the vector-store cell further down.
index_name = PINECONE_INDEX_NAME

# Note: We don't need to get the index here since PineconeVectorStore.from_documents will handle it

In [17]:
## OpenAI embedding model, built through langchain_openai
# Troubleshooting: on a RateLimitError, check the OpenAI account usage page at
# https://platform.openai.com/account/usage
embeddings = OpenAIEmbeddings(
    api_key=os.environ.get("OPENAI_API_KEY"),
    model="text-embedding-3-small",  # newer embedding model
)

# Alternative: Use HuggingFace embeddings (free)
# from langchain_community.embeddings import HuggingFaceEmbeddings
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Last expression of the cell: shows the configured embedding object.
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x00000251B1025F90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x00000251B1026AD0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [18]:
## Lets Read the document
def read_doc(directory):
    """Load the PDF files found in ``directory``.

    Args:
        directory: Path handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call produces (a collection of
        documents; presumably one entry per PDF page -- verify against the
        loader's documentation).
    """
    return PyPDFDirectoryLoader(directory).load()

In [19]:
# Read every PDF in the local "documents/" folder, then show how many
# entries the loader returned.
doc = read_doc("documents/")
len(doc)

28

In [20]:
## Divide the docs into chunks - Updated with best practices
### https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html#
def chunk_data(docs, chunk_size=1000, chunk_overlap=200):
    """Break loaded documents into smaller, overlapping chunks.

    Args:
        docs: Documents to split (as returned by ``read_doc``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as its
            ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)

In [21]:
# Split the loaded pages with the default chunk settings, then show the
# resulting number of chunks.
documents = chunk_data(docs=doc)
len(documents)

43

In [22]:
# Optional sanity check (disabled): embed a sample query and inspect the vector length
# vectors=embeddings.embed_query("How are you?")
# len(vectors)

In [23]:
# Embed the chunked documents and upsert them into the Pinecone index.
print(f"Creating vector store with {len(documents)} documents...")


def _stable_chunk_ids(chunks):
    """Return one deterministic ID per chunk.

    The ID is a SHA-256 digest of the chunk's ``source`` and ``page``
    metadata plus its text, followed by an occurrence counter so that two
    identical chunks still receive distinct IDs.
    """
    occurrences = {}
    ids = []
    for chunk in chunks:
        fingerprint = "|".join(
            (
                str(chunk.metadata.get("source", "")),
                str(chunk.metadata.get("page", "")),
                chunk.page_content,
            )
        )
        digest = hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()
        ordinal = occurrences.get(digest, 0)
        occurrences[digest] = ordinal + 1
        ids.append(f"{digest}-{ordinal}")
    return ids


# from_documents always embeds and uploads the chunks. Without explicit IDs
# every run stores them under fresh random IDs, so re-running this cell would
# duplicate the whole corpus in the index. Content-derived IDs make a re-run
# overwrite the existing vectors instead.
vector_store = PineconeVectorStore.from_documents(
    documents=documents,  # Use the chunked documents
    embedding=embeddings,
    index_name=index_name,
    ids=_stable_chunk_ids(documents),
)

print(f"Vector store created/connected successfully with index: {index_name}")

Creating vector store with 43 documents...
Vector store created/connected successfully with index: ai-chatbot-2


In [24]:
## Cosine Similarity Retrieve Results from VectorDB
def retrieve_query(query, k=2):
    """Run a similarity search for ``query`` against the notebook's vector store.

    Args:
        query: Text to search for.
        k: Number of results requested from ``similarity_search``.

    Returns:
        The search results exactly as returned by the vector store.
    """
    return vector_store.similarity_search(query, k=k)

In [25]:
# Completion model that generates the final answer
llm = OpenAI(temperature=0.5, model="gpt-3.5-turbo-instruct")

# QA prompt: {context} receives the retrieved documents, {input} the user question
prompt = ChatPromptTemplate.from_template(
    "\n"
    "Answer the following question based only on the provided context:\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {input}\n"
    "\n"
    "Answer:\n"
)

# Chain that fills the prompt with the documents and passes it to the LLM
document_chain = create_stuff_documents_chain(llm, prompt)

In [26]:
## Search answers from VectorDB using modern chain
def retrieve_answers(query, k=2, verbose=True):
    """Answer ``query`` from the documents retrieved out of the vector store.

    Args:
        query: The user's question.
        k: Number of documents to retrieve and hand to the chain as context.
            Defaults to 2, matching ``retrieve_query``'s own default.
        verbose: When True (the default, preserving the previous behaviour)
            the retrieved documents are printed before the chain runs. Pass
            False to avoid dumping full document reprs into the cell output.

    Returns:
        The value returned by ``document_chain.invoke``.
    """
    # Get relevant documents
    doc_search = retrieve_query(query, k=k)
    if verbose:
        print("Retrieved documents:", doc_search)

    # The chain expects the documents under "context" and the question under "input"
    return document_chain.invoke({
        "context": doc_search,
        "input": query
    })

In [31]:
# First example question: run retrieval + QA and print the generated answer
our_query = "What are the products available?"
answer = retrieve_answers(our_query)
print(answer)

Retrieved documents: [Document(id='388d2736-aba7-4bc2-91f9-25bbe7c70cbf', metadata={'author': 'Maurice Igugu', 'creationdate': '2019-03-22T15:41:09+01:00', 'creator': 'CorelDRAW 2019', 'moddate': '2019-03-24T15:34:42+01:00', 'page': 15.0, 'page_label': '16', 'producer': 'Corel PDF Engine Version 21.0.0.593', 'source': 'documents\\At-The-HEART-of-Sterling.pdf', 'title': 'At The HEART of Sterling - Old March 2019.cdr', 'total_pages': 28.0}, page_content='era where innovation meets opportunity.\nOUR TARGET MARKETS\nOur choice of market \nsegments is based on \nthe understanding of \nemerging trends in \nthe macroeconomic \nenvironment and \nopportunities in the \nsectors of interest. \nSecondary\nMarkets\nPublic Sector Mining Oil & Gas\n(Upstream, \ndownstream\n& services) \nTelecom-\nmunications\nManufacturing Real Estate Wholesale\n& Trading\nPower\n(Generation &\nDistribution\nHealth AgricultureEducation Renewable\nEnergy\nTransportation\n& Logistics\nPrimary \nMarkets\n15'), Document(

In [None]:
# Example question about performance indicators for the agriculture sector
our_query = "What are the performance indicators for the agriculture sector?"
answer = retrieve_answers(our_query)
print(answer)

In [33]:
# Example of a very short, one-word query ("Highlights?")
our_query = "Highlights?"
answer = retrieve_answers(our_query)
print(answer)

Retrieved documents: [Document(id='0dffe3b5-1381-42e4-8eec-7aba933619aa', metadata={'author': 'Maurice Igugu', 'creationdate': '2019-03-22T15:41:09+01:00', 'creator': 'CorelDRAW 2019', 'moddate': '2019-03-24T15:34:42+01:00', 'page': 0.0, 'page_label': '1', 'producer': 'Corel PDF Engine Version 21.0.0.593', 'source': 'documents\\At-The-HEART-of-Sterling.pdf', 'title': 'At The HEART of Sterling - Old March 2019.cdr', 'total_pages': 28.0}, page_content='At The Heart Of Sterling'), Document(id='5f0651ac-7c67-49ac-8a76-eb19bbfb5b68', metadata={'author': 'Maurice Igugu', 'creationdate': '2019-03-22T15:41:09+01:00', 'creator': 'CorelDRAW 2019', 'moddate': '2019-03-24T15:34:42+01:00', 'page': 21.0, 'page_label': '22', 'producer': 'Corel PDF Engine Version 21.0.0.593', 'source': 'documents\\At-The-HEART-of-Sterling.pdf', 'title': 'At The HEART of Sterling - Old March 2019.cdr', 'total_pages': 28.0}, page_content='digitization are to:\nSpecta is an online community lending solution \ntargeted at

In [28]:
# Alternative: Using ChatOpenAI for more modern chat-based models
# Uncomment below to use ChatOpenAI instead of OpenAI completion model

# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# 
# # Use ChatOpenAI with gpt-3.5-turbo or gpt-4
# chat_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.5)
# 
# # Create a chat prompt template
# chat_system_prompt = (
#     "You are an assistant for question-answering tasks. "
#     "Use the following pieces of retrieved context to answer the question. "
#     "If you don't know the answer, just say that you don't know. "
#     "Use three sentences maximum and keep the answer concise.\n\n"
#     "{context}"
# )
# 
# chat_prompt = ChatPromptTemplate.from_messages([
#     ("system", chat_system_prompt),
#     ("human", "{input}"),
# ])
# 
# # Create chat-based document processing chain
# chat_question_answer_chain = create_stuff_documents_chain(chat_llm, chat_prompt)
# 
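# # Create chat-based retrieval chain
# # (needs an extra import and a retriever, neither of which is defined above)
# from langchain.chains import create_retrieval_chain
# retriever = vector_store.as_retriever(search_kwargs={"k": 2})
# chat_rag_chain = create_retrieval_chain(retriever, chat_question_answer_chain)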