In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import os
from typing import List
from langchain_core.documents import Document

In [2]:
def batch_documents(documents: List[Document], batch_size: int) -> List[List[Document]]:
    """Split ``documents`` into consecutive batches of at most ``batch_size`` items.

    Args:
        documents: Items to batch, in the order they should be processed.
        batch_size: Maximum number of items per batch; must be positive.

    Returns:
        A list of batches that preserves the input order. Every batch holds
        exactly ``batch_size`` items except possibly the last one, which holds
        the remainder. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop every document,
    # and a zero step fails with an unrelated-looking range() error, so reject
    # both up front with a message that names the real problem.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return [
        documents[start:start + batch_size]
        for start in range(0, len(documents), batch_size)
    ]

In [3]:
# Load every PDF found directly inside `pdf_folder` into one flat list.
pdf_folder = "NEW_DATA"
documents = []

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore the downstream chunk and batch order) stable across
# runs and machines.
for filename in sorted(os.listdir(pdf_folder)):
    if filename.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(pdf_folder, filename))
        # load() returns a list of Document objects for this file.
        documents.extend(loader.load())

In [4]:
# Sanity check: how many Document objects the PDF loaders produced in total.
print(len(documents))

5422


In [5]:
# Break the loaded documents into smaller, overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,    # upper bound on the size of each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
)
splits = text_splitter.split_documents(documents)

In [6]:
# Sanity check: number of chunks produced by the text splitter.
print(len(splits))

15952


In [7]:
# Embedding model used to vectorise the chunks. The same model name is used
# again when the persisted store is reopened for querying later in the notebook.
# NOTE(review): the recorded output shows a deprecation warning for this
# import/class — presumably it has moved to a newer LangChain package; confirm.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [10]:
# Open a Chroma vector store backed by the "./chroma_db" directory, using
# `embeddings` to embed whatever is added to it.
# NOTE(review): if "./chroma_db" already holds data from an earlier run, the
# store is presumably not empty and re-running the ingestion loop would add the
# same chunks again — confirm before re-executing.
vectorstore = Chroma(embedding_function=embeddings, persist_directory="./chroma_db")

  warn_deprecated(


In [11]:
# How many chunks are handed to the vector store per call; tune if needed.
batch_size = 50

# Slice the chunk list into consecutive groups of `batch_size`.
batches = batch_documents(splits, batch_size=batch_size)

In [12]:
# Sanity check: number of batches the ingestion loop will process.
print(len(batches))

320


In [13]:
# Feed the chunks to the vector store one batch at a time, logging progress.
total_batches = len(batches)
for batch_number, batch in enumerate(batches, start=1):
    print(f"Processing batch {batch_number}/{total_batches}")
    vectorstore.add_documents(batch)

Processing batch 1/320
Processing batch 2/320
Processing batch 3/320
Processing batch 4/320
Processing batch 5/320
Processing batch 6/320
Processing batch 7/320
Processing batch 8/320
Processing batch 9/320
Processing batch 10/320
Processing batch 11/320
Processing batch 12/320
Processing batch 13/320
Processing batch 14/320
Processing batch 15/320
Processing batch 16/320
Processing batch 17/320
Processing batch 18/320
Processing batch 19/320
Processing batch 20/320
Processing batch 21/320
Processing batch 22/320
Processing batch 23/320
Processing batch 24/320
Processing batch 25/320
Processing batch 26/320
Processing batch 27/320
Processing batch 28/320
Processing batch 29/320
Processing batch 30/320
Processing batch 31/320
Processing batch 32/320
Processing batch 33/320
Processing batch 34/320
Processing batch 35/320
Processing batch 36/320
Processing batch 37/320
Processing batch 38/320
Processing batch 39/320
Processing batch 40/320
Processing batch 41/320
Processing batch 42/320
P

In [14]:
# Persist the vector store (created with persist_directory="./chroma_db").
# NOTE(review): the recorded output shows a deprecation warning for this call;
# presumably newer Chroma versions persist automatically — confirm before
# removing it.
vectorstore.persist()

  warn_deprecated(


In [36]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma  # Or your chosen vector store
from langchain.embeddings import HuggingFaceEmbeddings
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI

import os  # stdlib; imported here so this cell also runs on a fresh kernel

# Read the Gemini API key from the environment instead of embedding it in the
# notebook, where it ends up in version control and shared copies.
# SECURITY: a literal key was previously committed in this cell — treat that
# key as compromised and revoke/rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Define it in the environment before "
        "starting Jupyter (e.g. `export GOOGLE_API_KEY=...`)."
    )
genai.configure(api_key=GOOGLE_API_KEY)

# Gemini chat model shared by the query-expansion and answer chains below.
model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, api_key=GOOGLE_API_KEY)

# Reopen the persisted Chroma index with the same embedding model name that was
# used when the index was built.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
loaded_vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)



# Prompt that asks the LLM to rewrite the user's question into a more specific
# English search query before it is sent to the vector store.
query_expansion_template = """You are an AI assistant tasked with improving search queries. Given the original query, please reformulate it in English language to be more specific and comprehensive. The goal is to create a query that will yield the most relevant and accurate results in a similarity search from a vectorstore of English Translation of Quran and Hadiths.

Original query: {original_query}

Expanded query:"""

query_expansion_prompt = PromptTemplate(
    template=query_expansion_template,
    input_variables=["original_query"],
)

# Chain binding the expansion prompt to the shared chat model.
query_expansion_chain = LLMChain(prompt=query_expansion_prompt, llm=model)


# Prompt that asks the LLM to answer a query from the supplied context (plus its
# own knowledge), replying "Don't know." when no answer is available.
query_result_template = """You are an AI assistant tasked with providing accurate answers based on the given context and your existing knowledge. If the answer is not found in either the context or your knowledge base, respond with "Don't know." Avoid providing incorrect or inappropriate answers.

Context: {context}

Query: {query}

Answer:"""

query_result_prompt = PromptTemplate(
    template=query_result_template,
    input_variables=["context", "query"],
)

# Chain binding the answer prompt to the shared chat model.
query_result_chain = LLMChain(prompt=query_result_prompt, llm=model)

def expand_query(original_query):
    """Rewrite ``original_query`` through the query-expansion chain.

    Args:
        original_query: The question as supplied by the caller.

    Returns:
        The chain's output with leading and trailing whitespace removed.
    """
    return query_expansion_chain.run(original_query=original_query).strip()

def enhanced_similarity_search(original_query, k=3):
    """Expand the query, then run a similarity search with the expanded text.

    Args:
        original_query: The question as supplied by the caller.
        k: Number of results requested from the vector store (default 3).

    Returns:
        A ``(expanded_query, results)`` tuple: the rewritten query string and
        whatever ``loaded_vectorstore.similarity_search`` returned for it.
    """
    search_query = expand_query(original_query)
    matches = loaded_vectorstore.similarity_search(search_query, k=k)
    return search_query, matches

def get_answer(original_query):
    """Answer ``original_query`` using chunks retrieved from the vector store.

    The query is first expanded, the 10 most similar chunks are retrieved for
    the expanded query, and their text is passed as context to the answer chain
    together with the expanded query.

    Args:
        original_query: The question as supplied by the caller.

    Returns:
        The answer chain's output with surrounding whitespace stripped.
    """
    expanded_query, results = enhanced_similarity_search(original_query, k=10)

    # Join chunks with a blank line between them. Appending them back to back
    # fused the last word of one chunk with the first word of the next and
    # left the model with no indication of where one passage ended.
    context = "\n\n".join(result.page_content for result in results)

    answer = query_result_chain.run(context=context, query=expanded_query)
    return answer.strip()

In [39]:
# Smoke test of the full pipeline: expand the question, retrieve, then answer.
# NOTE(review): the recorded output for this call is "Don't know." — presumably
# the retrieved context did not contain the answer; worth inspecting the
# expanded query and the retrieved chunks.
get_answer("how many companions slept in kahf")

"Don't know."