In [1]:
# Required imports
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from transformers import AutoModelForCausalLM, AutoTokenizer , TextStreamer, pipeline
from langchain import HuggingFaceHub
from chromadb import PersistentClient 
from langchain.llms import HuggingFacePipeline
from huggingface_hub import login

import os
import shutil


## Initialize Variable

In [2]:
 # Set up directories and initialize variables
data_directory = "Data/"
persist_directory = "./db"
# Clear the persist directory if it exists
if os.path.exists(persist_directory):
    shutil.rmtree(persist_directory)

## Load the PDF file 

In [3]:
# Load PDF documents
def load_pdf(data_directory):
    loader = DirectoryLoader(data_directory, glob="*.pdf", loader_cls=PyMuPDFLoader)
    documents = loader.load()
    return documents

extracted_data = load_pdf(data_directory)

extracted_data

[Document(metadata={'source': 'Data\\The Thirsty Crow.pdf', 'file_path': 'Data\\The Thirsty Crow.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Haseeb Abid', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20240828150536+05'00'", 'modDate': "D:20240828150536+05'00'", 'trapped': ''}, page_content="The Thirsty Crow - Short Stories for Kids \nIt was a hot summer afternoon. \nThe crow was very thirsty he flew here and there in search of water, but he couldn't find \nwater anywhere. As he looked around, all the lakes were dry. He could not see even a drop \nof water. \nThe crow decided to fly in another direction. As he flew a mile, He saw a pot lying on the \nground. \nThe crow happily flew down to the pot and peaked in with great hope but the water was at \nthe bottom of the pot. The thirty crow tried hard to put his beak inside the pot but could not \nreach 

## Split the Document into chunks

In [4]:
# Split documents into smaller chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 30)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks


text_chunks = text_split(extracted_data)
print("length of chunks:", len(text_chunks))

text_chunks

length of chunks: 3


[Document(metadata={'source': 'Data\\The Thirsty Crow.pdf', 'file_path': 'Data\\The Thirsty Crow.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': '', 'author': 'Haseeb Abid', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20240828150536+05'00'", 'modDate': "D:20240828150536+05'00'", 'trapped': ''}, page_content="The Thirsty Crow - Short Stories for Kids \nIt was a hot summer afternoon. \nThe crow was very thirsty he flew here and there in search of water, but he couldn't find \nwater anywhere. As he looked around, all the lakes were dry. He could not see even a drop \nof water. \nThe crow decided to fly in another direction. As he flew a mile, He saw a pot lying on the \nground. \nThe crow happily flew down to the pot and peaked in with great hope but the water was at"),
 Document(metadata={'source': 'Data\\The Thirsty Crow.pdf', 'file_path': 'Data\\The Thirsty Crow.pdf',

## Download the Embedding to apply on the Chunks of Data

In [5]:
# Download Hugging Face embeddings
def download_HF_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

embeddings = download_HF_embeddings()

  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


## Initialize Chroma DB

In [6]:

client = PersistentClient(path=persist_directory)
collection = client.create_collection(name="API_EXP")


db = Chroma.from_documents(text_chunks, embeddings, persist_directory=persist_directory, client=client)


client.delete_collection("API_EXP")

In [7]:
query = "What is the crow looking for?"

matching_docs = db.similarity_search_with_score(query, k=2)
matching_docs

[(Document(metadata={'author': 'Haseeb Abid', 'creationDate': "D:20240828150536+05'00'", 'creator': 'Microsoft® Word for Microsoft 365', 'file_path': 'Data\\The Thirsty Crow.pdf', 'format': 'PDF 1.7', 'keywords': '', 'modDate': "D:20240828150536+05'00'", 'page': 0, 'producer': 'Microsoft® Word for Microsoft 365', 'source': 'Data\\The Thirsty Crow.pdf', 'subject': '', 'title': '', 'total_pages': 1, 'trapped': ''}, page_content="The Thirsty Crow - Short Stories for Kids \nIt was a hot summer afternoon. \nThe crow was very thirsty he flew here and there in search of water, but he couldn't find \nwater anywhere. As he looked around, all the lakes were dry. He could not see even a drop \nof water. \nThe crow decided to fly in another direction. As he flew a mile, He saw a pot lying on the \nground. \nThe crow happily flew down to the pot and peaked in with great hope but the water was at"),
  0.8898072261132589),
 (Document(metadata={'author': 'Haseeb Abid', 'creationDate': "D:2024082815053

## Create a Proper Template

In [8]:
# Prompt template for detailed QA
query = "What is the crow looking for?"


prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""


PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {
    "prompt": PROMPT
}


## Login using Hugging Face API key

In [9]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Download the Model from the Hugging Face

In [10]:

cache_dir='D:/huggingface_models'

model = "meta-llama/Llama-3.1-8B"


tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model,cache_dir=cache_dir , device_map="auto")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


## Create a pipeline 

In [11]:
streamer = TextStreamer(tokenizer) # to stream the output

pipeline = pipeline("text-generation" , 
            model=model , 
            tokenizer= tokenizer ,
            streamer = streamer,
            max_length=500,
            # temperature=0.1,
            # pad_token_id=tokenizer.eos_token_id,
            # top_p=0.95,
            # repetition_penalty=1.2
            )

local_llm = HuggingFacePipeline(pipeline=pipeline )

  local_llm = HuggingFacePipeline(pipeline=pipeline )


## Define the Question Answer Retrieval 

In [12]:
 qa = RetrievalQA.from_chain_type(
        llm=local_llm,
        chain_type="stuff",
        retriever=db.as_retriever(),
        chain_type_kwargs= chain_type_kwargs
    )

## Query the Model to get Response

In [13]:
# Loop to take user input and provide response
# query = "what did the crow do to raise the water level"

qa.run(query)

  qa.run(query)
Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|>
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: The Thirsty Crow - Short Stories for Kids 
It was a hot summer afternoon. 
The crow was very thirsty he flew here and there in search of water, but he couldn't find 
water anywhere. As he looked around, all the lakes were dry. He could not see even a drop 
of water. 
The crow decided to fly in another direction. As he flew a mile, He saw a pot lying on the 
ground. 
The crow happily flew down to the pot and peaked in with great hope but the water was at

the bottom of the pot. The thirty crow tried hard to put his beak inside the pot but could not 
reach the water. 
The water in the pot was very low. He tried to move the pot but it was heavy. He could not 
even tilt the pot alone. The crow began thinking of ways to get water from the pot. 
When he looked around, he saw few stones lying on the groun

KeyboardInterrupt: 