### Load PDF Document from Google Drive

In [2]:
!pip install gdown -q 
!pip install groq  -q
!pip install qdrant-client -q
!pip install sentence-transformers -q
!pip install langchain -q
!pip install pypdf  -q 

In [3]:
import os
import gdown
from groq import Groq
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
load_dotenv('.env')  
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
PDF_ID = os.getenv("PDF_ID")

In [5]:
# Google Drive PDF file ID
pdf_id = PDF_ID  

# Destination path for the downloaded PDF
pdf_path = "./dataset/downloaded_document.pdf"  

# Download the PDF file from Google Drive
gdown.download(f"https://drive.google.com/uc?id={pdf_id}", pdf_path, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1Tlf5jxYlvScKUAhNkUxYDkWrcXp1gEIM
To: /home/game/ai_innovator_llm_rag/notebook/dataset/downloaded_document.pdf
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 88.2k/88.2k [00:00<00:00, 1.08MB/s]


'./dataset/downloaded_document.pdf'

In [6]:
# Initialize embedding model using Sentence Transformers
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_size = embedding_model.embed_query("test").__len__()

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [7]:
# Initialize Qdrant Client (using in-memory or adjust to your needs)
qdrant = QdrantClient(":memory:")

# Create a Qdrant collection to hold document vectors
qdrant.recreate_collection(
    collection_name="documents",
    vectors_config={"size": vector_size, "distance": "Cosine"}
)

  qdrant.recreate_collection(


True

In [8]:
# Load PDF document
loader = PyPDFLoader(pdf_path) 
# Load the downloaded PDF
documents = loader.load()

In [9]:
# Split the documents into smaller chunks
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
document_chunks = splitter.split_documents(documents)

# Extract text from document chunks
texts = [doc.page_content for doc in document_chunks]

# Convert texts into vectors
vectors = embedding_model.embed_documents(texts)

# Upsert data into Qdrant
points = [PointStruct(id=i, vector=vectors[i], payload={"text": texts[i]}) for i in range(len(texts))]
qdrant.upsert(collection_name="documents", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [10]:
def search_documents(query):
    # Convert query to vector
    query_vector = embedding_model.embed_query(query)
    
    # Search Qdrant for similar documents
    search_results = qdrant.search(
        collection_name="documents",
        query_vector=query_vector,
        limit=3  # Retrieve top 3 relevant documents
    )
    
    # Check if results are found
    if not search_results:
        return []  # Return empty if no documents found

    # Extract text from results
    return [hit.payload.get("text", "‡πÄ‡∏≠‡∏Å‡∏™‡∏≤‡∏£‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏Ñ‡∏ß‡∏≤‡∏°") for hit in search_results]

In [11]:
def generate_answer(query):
    # Retrieve relevant documents from Qdrant
    retrieved_docs = search_documents(query)
    
    # Check if any documents were retrieved
    if not retrieved_docs:
        return "‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á"  # "No relevant information found"

    # Create the context for the language model
    context = "\n".join([str(doc) for doc in retrieved_docs if isinstance(doc, str)])
    
    if not context.strip():  # Check if context has content
        return "‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Ç‡πâ‡∏≠‡∏á"  # "No relevant information found"

    prompt = f"‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏≠‡πâ‡∏≤‡∏á‡∏≠‡∏¥‡∏á:\n{context}\n\n‡∏Ñ‡∏≥‡∏ñ‡∏≤‡∏°: {query}\n\n‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö:"
    
    # Initialize Groq API client
    groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

    try:
        # Requesting completion from Groq API
        response = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏ú‡∏¥‡∏î‡∏û‡∏•‡∏≤‡∏î‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏Ñ‡∏≥‡∏ï‡∏≠‡∏ö: {str(e)}"  # "Error occurred in generating the answer"

In [12]:
# üî• Test the question!
query = "‡∏°‡∏µ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏≠‡∏∞‡πÑ‡∏£‡∏ö‡πâ‡∏≤‡∏á"  # "What are the contents?"
answer = generate_answer(query)
print(answer)

  search_results = qdrant.search(


‡∏°‡∏µ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ó‡∏µ‡πà 25 ‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£ ‡πÉ‡∏ô‡πÉ‡∏ö‡πÅ‡∏à‡πâ‡∏á‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠‡∏¢‡∏≤‡∏ó‡∏µ‡πà‡∏Ñ‡∏∏‡∏ì‡πÉ‡∏´‡πâ‡∏°‡∏≤


In [13]:
# üî• Test the question!
query = "‡∏Ç‡∏≠‡∏£‡∏≤‡∏¢‡∏≠‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î‡∏Ç‡∏≠‡∏á‡∏¢‡∏≤  ‡∏û‡∏≤‡∏£‡∏≤‡πÄ‡∏ã‡∏ï‡∏≤‡∏°‡∏≠‡∏•(Paracetamol)"  
answer = generate_answer(query)
print(answer)

  search_results = qdrant.search(


‡∏¢‡∏≤‡∏†‡∏≤‡∏£‡∏≤‡πÄ‡∏ã‡∏ï‡∏≤‡∏°‡∏≠‡∏• (Paracetamol) ‡πÄ‡∏õ‡πá‡∏ô‡∏¢‡∏≤‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏•‡∏î‡∏Å‡∏≤‡∏£‡πÅ‡∏û‡∏á‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏¥‡∏î‡∏à‡∏≤‡∏Å‡∏†‡∏≤‡∏ß‡∏∞‡πÑ‡∏Ç‡πâ, ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î, ‡πÅ‡∏•‡∏∞‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏≠‡πà‡∏≠‡∏ô‡πÄ‡∏û‡∏•‡∏µ‡∏¢ ‡∏°‡∏µ‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏∞‡∏Ç‡∏≠‡∏á‡∏¢‡∏≤‡πÉ‡∏ô‡∏£‡∏π‡∏õ‡πÅ‡∏ö‡∏ö‡∏Ç‡∏≠‡∏á‡πÄ‡∏°‡πá‡∏î, ‡∏™‡∏≤‡∏£‡∏£‡∏±‡∏Å‡∏©‡∏≤‡πÇ‡∏£‡∏Ñ‡∏õ‡∏£‡∏∞‡πÄ‡∏†‡∏ó‡πÇ‡∏≠‡∏õ‡∏¥‡∏≠‡∏¥‡∏î‡∏Ç‡∏≠‡∏á‡∏ï‡πâ‡∏ô‡πÑ‡∏°‡πâ‡πÅ‡∏•‡∏∞‡∏Ç‡∏≠‡∏á‡∏û‡∏£‡∏£‡∏ì‡πÑ‡∏°‡πâ‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ö‡∏≥‡∏ö‡∏±‡∏î‡∏õ‡∏ß‡∏î‡∏ó‡πâ‡∏≠‡∏á‡πÅ‡∏•‡∏∞‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏≠‡∏≤‡∏Å‡∏≤‡∏£‡∏õ‡∏ß‡∏î‡∏Ç‡∏≠‡∏á‡∏Å‡∏•‡πâ‡∏≤‡∏°‡πÄ‡∏ô‡∏∑‡πâ‡∏≠

‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡πÉ‡∏ä‡πâ‡∏û‡∏≤‡∏£‡∏≤‡πÄ‡∏ã‡∏ï‡∏≤‡∏°‡∏≠‡∏•:

* ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏¢‡∏≤‡∏ó‡∏µ‡πà‡∏≠‡∏≠‡∏Å‡∏ï‡∏≤‡∏°‡∏™‡∏π‡∏ï‡∏£‡∏ú‡∏™‡∏°: ‡πÉ‡∏ä‡πâ‡∏û‡∏≤‡∏£‡∏≤‡πÄ‡∏ã‡∏ï‡∏≤‡∏°‡∏≠‡∏• 500-1000 ‡∏°‡∏¥‡∏•‡∏•‡∏¥‡∏Å‡∏£‡∏±‡∏° ‡∏ó‡∏∏‡∏Å‡πÜ 4-6 ‡∏ä‡∏±‡πà‡∏ß‡πÇ‡∏°‡∏á ‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏¥‡∏ô 4 ‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏ï‡πà‡∏≠‡∏ß‡∏±‡∏ô
* ‡∏™‡∏≥‡∏´‡∏£‡