In [1]:
# Install PyTorch & other libraries
# Install the Hugging Face libraries (pinned) together with LangChain,
# sentence-transformers and FAISS.
# NOTE(review): "langchain" and "sentence-transformers" are not pinned, and
# "faiss-cpu" is requested again by the next command - consider pinning and
# de-duplicating so that a fresh environment resolves the same versions.
!pip install  --upgrade  --quiet\
  "transformers==4.38.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \
  "trl==0.7.11" \
  "peft==0.8.2" \
  "langchain" \
  "sentence-transformers" \
  "faiss-cpu"
# PDF parsing helpers, the pinned torch build and tensorboard.
# NOTE(review): both "pdfminer" and "pdfminer.six" are installed here -
# presumably only one of them is needed; confirm before keeping both.
!pip install unstructured pdfminer pdfminer.six faiss-cpu "torch==2.1.2" tensorboard --quiet
# Pin langchain-community last so this version is the one left installed.
!pip install -U langchain-community==0.2.4  --quiet



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook: anything committed here is
# readable by everyone who can see the file (and its saved outputs).
# NOTE: the token that used to be embedded in this cell had write permission
# and must be revoked in the Hugging Face account settings.
#
# The token is read from the HF_TOKEN environment variable; when that is not
# set (or empty) the user is prompted for it without echoing the input.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

login(
  token=hf_token,
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


In [3]:
import torch
from IPython.display import display_markdown
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline
import transformers
import time
from langchain.document_loaders import UnstructuredPDFLoader,PDFMinerLoader,TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Hub identifier of the checkpoint served by the text-generation pipeline
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Build the generation pipeline; the model is loaded with float16 as its
# torch dtype, a 4-bit quantization config and low CPU memory usage enabled.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    model_kwargs=dict(
        torch_dtype=torch.float16,
        quantization_config=dict(load_in_4bit=True),
        low_cpu_mem_usage=True,
    ),
)

# Token ids passed as `eos_token_id` when generating: the tokenizer's own EOS
# token plus the id of the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

### for semantic cache
# vector_store = FAISS()



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from langchain_community.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
import faiss
# Embedding model shared by the semantic-cache vector store and the generator
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
# Initialize an empty FAISS index whose dimensionality is taken from the
# underlying sentence-embedding model, so stored vectors and the index agree
dimension = embeddings.client.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(dimension)

  warn_deprecated(


In [5]:
# In-memory document store handed to the FAISS vector store built next
docstore = InMemoryDocstore()

In [6]:
# Vector store used as the semantic cache. It starts out empty: the index has
# no vectors yet and the index -> docstore id mapping is an empty dict.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={}
)

In [7]:
### PDF file path for RAG (the document that is loaded, chunked and indexed)
# NOTE(review): this is an absolute, environment-specific path - adjust it to
# wherever the PDF lives before running the notebook elsewhere.
pdf_file_path = "/opt/app-root/src/Gemma2-9B-Llama3-8B-Finetune-and-RAG/DeepLearningBook.pdf"

In [8]:
### This class loads the text of a PDF, chunks it, indexes the chunks in FAISS
### and returns the chunks most similar to a query.
class Langchain_RAG:
    """Retriever over a single PDF document.

    On construction the PDF is loaded with ``PDFMinerLoader``, split into
    chunks, embedded and stored in a FAISS index. Calling the instance with a
    query returns the text of the ``k`` most similar chunks.

    Args:
        pdf_file_path: Path of the PDF to index.
        embeddings: Optional embedding model to reuse. When ``None`` (the
            default) a new ``HuggingFaceEmbeddings`` instance for
            ``BAAI/bge-small-en-v1.5`` is created, as before.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        k: Number of chunks returned per query.
    """

    def __init__(self, pdf_file_path, embeddings=None, chunk_size=1000, chunk_overlap=0, k=4):
        # Reuse an already-loaded embedding model when the caller has one, so
        # the same weights are not loaded a second time.
        if embeddings is None:
            embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")
        self.embeddings = embeddings
        self.pdf_file_path = pdf_file_path

        print("Loading PDF file, this may take time to process...")
        self.loader = PDFMinerLoader(self.pdf_file_path)
        self.data = self.loader.load()
        print("PDF file loaded.")

        print("Chunking...")
        # NOTE(review): the separator list is kept as in the original; the
        # splitter presumably tries them in the listed order, so " " would
        # take precedence over "\n" - confirm this ordering is intended.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" ", ",", "\n"],
        )
        self.texts = text_splitter.split_documents(self.data)
        print("Chunking completed.")

        self.get_vec_value = FAISS.from_documents(self.texts, self.embeddings)
        print("Vector values saved.")
        self.retriever = self.get_vec_value.as_retriever(search_kwargs={"k": k})

    def __call__(self, query):
        """Return the text of the chunks most relevant to ``query``.

        The chunks are separated by a blank line. Joining them with an empty
        string (the previous behaviour) glued the last word of one passage to
        the first word of the next, corrupting the context given to the LLM.
        Returns an empty string when nothing is retrieved.
        """
        relevant_docs = self.retriever.get_relevant_documents(query)
        return "\n\n".join(doc.page_content for doc in relevant_docs)


In [9]:
import time

# This class is used to generate responses from an LLM model
class Llama3_8B_gen:
    """Answer generator with a semantic cache in front of the LLM.

    A query is embedded and looked up in ``vector_store``. If the closest
    stored query has a score below ``threshold`` its stored response is
    returned; otherwise the LLM pipeline is called and the new query/response
    pair is added to the store.

    Args:
        pipeline: Text-generation pipeline. It must be callable with a prompt
            and expose a ``tokenizer`` attribute.
        embeddings: Embedding model providing ``embed_documents``.
        vector_store: Store providing ``similarity_search_with_score_by_vector``
            and ``add_texts``.
        threshold: Scores strictly below this value count as a cache hit.
        terminators: Optional list of token ids passed as ``eos_token_id``.
            When ``None`` it is derived from the pipeline's tokenizer (its EOS
            token id plus the id of ``"<|eot_id|>"``).
    """

    def __init__(self, pipeline, embeddings, vector_store, threshold, terminators=None):
        self.pipeline = pipeline
        self.embeddings = embeddings
        self.vector_store = vector_store
        self.threshold = threshold
        # Derive the stop tokens from the injected pipeline instead of relying
        # on a module-level `terminators` variable defined in another cell.
        if terminators is None:
            tokenizer = pipeline.tokenizer
            terminators = [
                tokenizer.eos_token_id,
                tokenizer.convert_tokens_to_ids("<|eot_id|>"),
            ]
        self.terminators = terminators

    @staticmethod
    def generate_prompt(query, retrieved_text, tokenizer=None):
        """Render the chat prompt for ``query`` given the retrieved context.

        Args:
            query: The user question.
            retrieved_text: Context text inserted into the system message.
            tokenizer: Tokenizer whose chat template is applied. When omitted,
                the module-level ``pipeline.tokenizer`` is used so existing
                two-argument callers keep working.

        Returns:
            The prompt as a string (not tokenized), ending with the generation
            prompt.
        """
        if tokenizer is None:
            # Backward-compatible fallback to the notebook-global pipeline.
            tokenizer = pipeline.tokenizer
        messages = [
            {"role": "system", "content": "Answer the Question for the Given below context and information and not prior knowledge, only give the output result \n\ncontext:\n\n{}".format(retrieved_text)},
            {"role": "user", "content": query},
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    def semantic_cache(self, query, prompt):
        """Return the cached response for ``query`` or generate and cache one.

        The cache is keyed on the query text only; ``prompt`` is used solely
        when the LLM has to be called.
        """
        query_embedding = self.embeddings.embed_documents([query])
        similar_docs = self.vector_store.similarity_search_with_score_by_vector(query_embedding[0], k=1)

        # Each hit is a (document, score) pair; a score below the threshold
        # means the stored query is close enough to reuse its response.
        if similar_docs and similar_docs[0][1] < self.threshold:
            self.print_bold_underline("---->> From Cache")
            return similar_docs[0][0].metadata['response']

        self.print_bold_underline("---->> From LLM")
        output = self.pipeline(
            prompt,
            max_new_tokens=512,
            eos_token_id=self.terminators,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

        # The pipeline output starts with the prompt; keep only the completion.
        response = output[0]["generated_text"][len(prompt):]
        self.vector_store.add_texts(texts=[query], metadatas=[{'response': response}])
        return response

    def generate(self, query, retrieved_context):
        """Answer ``query`` using ``retrieved_context`` and report the elapsed time.

        The prompt is built with this instance's own tokenizer, so the result
        does not depend on which pipeline happens to be bound to the global
        name ``pipeline``.
        """
        # perf_counter is monotonic, unlike time.time(), so the measured
        # duration cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()

        prompt = self.generate_prompt(query, retrieved_context, tokenizer=self.pipeline.tokenizer)
        res = self.semantic_cache(query, prompt)

        execution_time = time.perf_counter() - start_time
        self.print_bold_underline(f"LLM generated in {execution_time:.6f} seconds")

        return res

    @staticmethod
    def print_bold_underline(text):
        """Print ``text`` wrapped in ANSI bold + underline escape codes."""
        print(f"\033[1m\033[4m{text}\033[0m")

 


In [10]:
# Generator with the semantic cache: a cached answer is reused only when the
# similarity-search score of the closest stored query is below `threshold`.
text_gen = Llama3_8B_gen(pipeline=pipeline,embeddings=embeddings,
                         vector_store=vector_store,threshold=0.1)
# Retriever over the PDF; constructing it loads, chunks and indexes the file.
retriever = Langchain_RAG(pdf_file_path=pdf_file_path)

Loading PDF file, this may take time to process...
PDF file loaded.
Chunking...
Chunking completed.
Vector values saved.


In [11]:
def Rag_qa(query):
    """Answer ``query`` with retrieval-augmented generation.

    The notebook-level ``retriever`` supplies the context for the query and
    the notebook-level ``text_gen`` produces (or fetches from its cache) the
    answer, which is returned.
    """
    return text_gen.generate(query, retriever(query))

In [12]:
# First time this question is asked: the output below shows it served by the LLM.
Rag_qa("What is Deep learning ?")

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m




[1m[4mLLM generated in 5.835521 seconds[0m


'According to the given context, Deep Learning is an approach to machine learning that has drawn heavily on our knowledge of the human brain, statistics and applied math. It is a particular kind of machine learning that achieves great power and flexibility by learning to represent the world as a nested hierarchy of concepts, with each concept defined in relation to simpler concepts, and more abstract representations computed in terms of less abstract ones.'

In [13]:
# Same question again: the output below shows it served from the cache.
Rag_qa("What is Deep learning ?")

[1m[4m---->> From Cache[0m
[1m[4mLLM generated in 0.009813 seconds[0m


'According to the given context, Deep Learning is an approach to machine learning that has drawn heavily on our knowledge of the human brain, statistics and applied math. It is a particular kind of machine learning that achieves great power and flexibility by learning to represent the world as a nested hierarchy of concepts, with each concept defined in relation to simpler concepts, and more abstract representations computed in terms of less abstract ones.'

In [14]:
# A new question: the output below shows it served by the LLM.
Rag_qa("Explain back propagation algorithm.")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 21.266562 seconds[0m


'The back-propagation algorithm is a method for efficiently computing the gradient of the loss function with respect to the model\'s parameters, which is an essential step in training neural networks. The algorithm is called "back-propagation" because it propagates the error backwards through the network, from the output layer to the input layer.\n\nHere\'s a step-by-step explanation of the back-propagation algorithm:\n\n1. **Forward Pass**: Start by performing a forward pass through the network, computing the output for a given input and storing the intermediate results.\n2. **Error Calculation**: Calculate the error between the predicted output and the actual output.\n3. **Backward Pass**: Traverse the network in reverse order, from the output layer to the input layer, and compute the gradients of the loss function with respect to the model\'s parameters.\n4. **Gradient Computation**: At each layer, compute the gradient of the loss function with respect to the layer\'s output, using 

In [15]:
# Reworded version of the previous question: per the output below it was still
# answered by the LLM rather than from the cache.
Rag_qa("back propagation algorithm.")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[1m[4m---->> From LLM[0m
[1m[4mLLM generated in 5.019125 seconds[0m


'The back-propagation algorithm involves performing a forward propagation pass moving left to right through the graph, followed by a backward propagation pass moving right to left through the graph. The runtime is O(τ) and cannot be reduced by parallelization because the forward propagation graph is inherently sequential; each time step may only be computed after the previous one.'

In [16]:
# Exact repeat of the previous query: the output below shows a cache hit.
Rag_qa("back propagation algorithm.")

[1m[4m---->> From Cache[0m
[1m[4mLLM generated in 0.009828 seconds[0m


'The back-propagation algorithm involves performing a forward propagation pass moving left to right through the graph, followed by a backward propagation pass moving right to left through the graph. The runtime is O(τ) and cannot be reduced by parallelization because the forward propagation graph is inherently sequential; each time step may only be computed after the previous one.'

### Explanation
When text is generated directly by the Large Language Model (LLM), each request takes several seconds (roughly 5–21 seconds in the runs above). By caching the generated text, subsequent requests for the same query are answered in about 0.01 seconds. The cache stores each query together with its generated response in a vector store, so a repeated query is retrieved directly instead of being regenerated. This reduces response times for repetitive requests, improving performance and user experience by minimizing wait times for text generation.