In [1]:
# Mount Google Drive at /content/drive (Colab-only module); the PDF directory
# loaded later in this notebook lives under /content/drive/MyDrive/RAG/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the notebook's third-party dependencies quietly (-q).
# NOTE(review): no versions are pinned, so a fresh run may resolve to releases
# that differ from the ones this notebook was executed with — consider pinning.
!pip install langchain transformers chromadb accelerate bitsandbytes sentence_transformers pypdf unstructured gradio -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m809.1/809.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.1/286.1 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

# 🦅PDF🔗Assistant🤗

This notebook uses open-source tools to answer a user's domain-specific questions with a **Large Language Model (LLM)** and **Retrieval-Augmented Generation (RAG)**.

Open-Source Tools used:

    Hugging Face 🤗
    Langchain 🔗
    Chroma DB 📓

    Here, we ask questions whose supporting context is retrieved from the uploaded PDFs.

**Why RAG?**

LLMs are trained on historical data available on the internet, so they may not have seen newer information. As a result, a model's response
may be inaccurate, or the model may not be able to respond at all; in other words, an LLM can be out of date. In addition, the response comes with no source to support it.

**Fine-tuning** on new data is one option, but the model may lose old knowledge because its parameters are updated, and it can be resource-intensive.

Another option is **prompt engineering**: along with the query, context (information that supports the query) is given to the LLM in the prompt, and the LLM
responds using it. **RAG** builds on this idea to overcome the drawbacks above.

### 1. Importing Packages

In [1]:
"""Importing required packages."""
import os
import glob
import textwrap
import torch
import time

# For model and inference pipeline.
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# For PDF based RAG.
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA

# For web based app.
import gradio as gr

In [2]:
"""To check for GPU device."""
def get_default_device():
    """Pick the torch device for this session.

    Returns:
        torch.device('cuda') when CUDA is available, otherwise
        torch.device('cpu').

    Side effects:
        When CUDA is available, prints the properties of every visible CUDA
        device before returning.
    """
    # Guard clause: nothing to enumerate without CUDA.
    if not torch.cuda.is_available():
        return torch.device('cpu')

    gpu_index = 0
    while gpu_index < torch.cuda.device_count():
        print(torch.cuda.get_device_properties(gpu_index))
        gpu_index += 1
    return torch.device('cuda')
# Resolve the device once for the session and report which one was chosen.
device = get_default_device()
print(f'Using device: {device}')

_CudaDeviceProperties(name='Tesla V100-SXM2-16GB', major=7, minor=0, total_memory=16151MB, multi_processor_count=80)
Using device: cuda


### 2. Vector Database

In [3]:
"""Loading of PDF files present in the Directory."""
# Load every PDF matched by the glob inside the Drive folder, parsing each file
# with PyPDFLoader. Progress reporting and multithreaded loading are enabled.
# NOTE(review): "./*.pdf" presumably matches only files directly in the folder
# (no sub-directories) — confirm if nested PDFs are expected.
loader = DirectoryLoader(
    '/content/drive/MyDrive/RAG/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True,
)
documents = loader.load()

100%|██████████| 2/2 [01:30<00:00, 45.03s/it]


In [4]:
# `documents` is what loader.load() returned; its length is reported as the page count.
print(f'Total Pages = {len(documents)}')

Total Pages = 1558


In [5]:
# Spot-check the raw extracted text of one arbitrary entry (index 100).
print(documents[100].page_content)

2.3. The Gaussian Distribution 81
Figure 2.7 The red curve shows the ellip-
tical surface of constant proba-
bility density for a Gaussian in
a two-dimensional space x=
(x1,x2)on which the density
isexp(−1/2)of its value at
x=µ. The major axes of
the ellipse are deﬁned by the
eigenvectors uiof the covari-
ance matrix, with correspond-
ing eigenvalues λi.
x1x2
λ1/2
1λ1/2
2y1y2u1u2
µ
whereUis a matrix whose rows are given by uT
i. From (2.46) it follows that Uis
anorthogonal matrix, i.e., it satisﬁes UUT=I, and hence also UTU=I, where I Appendix C
is the identity matrix.
The quadratic form, and hence the Gaussian density, will be constant on surfaces
for which (2.51) is constant. If all of the eigenvalues λiare positive, then these
surfaces represent ellipsoids, with their centres at µand their axes oriented along ui,
and with scaling factors in the directions of the axes given by λ1/2
i, as illustrated in
Figure 2.7.
For the Gaussian distribution to be well deﬁned, it is necessary for a

In [6]:
"""Splitting the Documents into chunks for Embeddings."""
# Split the loaded pages into chunks of at most 500 units with an overlap of 10
# between neighbouring chunks (units come from the splitter's default length
# function — presumably characters; confirm). Separators are treated as regexes.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
    is_separator_regex=True,
)

# `texts` is the chunk list that gets embedded and indexed later.
texts = text_splitter.split_documents(documents)

print(f'Chunks of {len(texts)} from {len(documents)} pages.')

Chunks of 7608 from 1558 pages.


In [7]:
print(texts[100].page_content)

2N∑
n=1{y(xn,w)−tn}2(1.2)
where the factor of 1/2is included for later convenience. We shall discuss the mo-
tivation for this choice of error function later in this chapter. For the moment wesimply note that it is a nonnegative quantity that would be zero if, and only if, the


In [8]:
"""
State-of-the-Art Text Embedding Model by Hugging Face.
This model is used to create sentence embeddings for the chunks of Texts.

"""
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformer;
# it is handed to the Chroma vector store when the database is built.
embeddings = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
"""
Using Chroma DB to create custom vector Database of embeddings.
It uses sentence transformer model to create embeddings for the PDF texts and store it as Vector Database.
Ref: https://python.langchain.com/docs/integrations/vectorstores/chroma

"""
# Embed every chunk in `texts` with `embeddings` and index the result in Chroma.
# NOTE(review): no persist directory is passed, so the index presumably lives
# only for this session and is rebuilt on every run — confirm.
db = Chroma.from_documents(texts, embeddings)

In [10]:
"""Getting context for query from the Database using similarity search."""
# Sample question used to sanity-check retrieval before the LLM is involved.
query = "What is Backpropagation?"
# similarity_search is called with its default settings (no explicit result count).
matching_docs = db.similarity_search(query)

In [11]:
# Show the best match in full: page_content together with its metadata.
print(matching_docs[0])

page_content='backpropagation is also used to describe the training of a multilayer perceptron us-ing gradient descent applied to a sum-of-squares error function. In order to clarify\nthe terminology, it is useful to consider the nature of the training process more care-\nfully. Most training algorithms involve an iterative procedure for minimization of anerror function, with adjustments to the weights being made in a sequence of steps. At' metadata={'page': 260, 'source': '/content/drive/MyDrive/RAG/Bishop - Pattern Recognition And Machine Learning - Springer 2006.pdf'}


In [12]:
# Show only the text of the best match, without metadata.
print(matching_docs[0].page_content)

backpropagation is also used to describe the training of a multilayer perceptron us-ing gradient descent applied to a sum-of-squares error function. In order to clarify
the terminology, it is useful to consider the nature of the training process more care-
fully. Most training algorithms involve an iterative procedure for minimization of anerror function, with adjustments to the weights being made in a sequence of steps. At


### 3. Open-source LLM (Falcon-7B🦅)

In [13]:
"""
Initializing the open-source LLM from Hugging Face models repo.
Model used is Falcon-7B Instruct model mainly used for chat based applications.
Ref: https://huggingface.co/tiiuae/falcon-7b-instruct

"""
# Hugging Face Hub checkpoint shared by the tokenizer and the model.
model_id = "tiiuae/falcon-7b-instruct"

# Tokenizer that belongs to the Falcon checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit NF4 quantization with double quantization enabled and float16 as the
# compute dtype.
# Ref: https://huggingface.co/docs/transformers/quantization#bitsandbytes
# Note: This works in case of GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the pre-trained Falcon weights with the quantization config applied and
# let device_map='auto' decide placement.
# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repo (the download log itself recommends pinning a revision).
# NOTE(review): cache_dir is the relative path './content/', so the weights are
# cached under the current working directory — confirm '/content/' was not meant.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    cache_dir='./content/',
    quantization_config=bnb_config,
    device_map='auto',
)

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/7.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.



modeling_falcon.py:   0%|          | 0.00/56.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b-instruct:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [14]:
# Put the model in evaluation mode; as the last expression of the cell, the
# returned module is also displayed, which gives the architecture summary.
model.eval()

FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (maybe_rotary): FalconRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)

In [15]:
"""Creating Text generation pipeline with above model and tokenizer with config."""
# Text-generation pipeline around the quantized model and its tokenizer.
# Sampling is switched on, but the temperature is close to zero; a repetition
# penalty of 1.45 is applied and a single sequence is returned per call.
# NOTE(review): max_length presumably counts prompt tokens as well as generated
# ones — confirm; max_new_tokens may be what is wanted once long retrieved
# context is placed in the prompt.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    max_length=1024,
    temperature=0.001,
    top_p=0.95,
    num_return_sequences=1,
    repetition_penalty=1.45,
)

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

In [16]:
# Ask the bare LLM directly (no retrieved context) as a baseline.
# Note: `query` is a notebook-level variable that later cells reuse.
query = "How Explicit Memory is used in LSTM?"
print(llm.invoke(query))


Explicit memory is used in LSTM to store and retrieve specific information from the network. It allows for more efficient and targeted learning, as the network can be optimized to focus on specific tasks.


In [18]:
"""Creation of Custom template for chat task."""
# Prompt text with two placeholders: {context} for the supporting passages and
# {question} for the user's query. The wording is part of the model input.
prompt_template = """
Don't try to hallucinate an answer, if you don't know just say that you don't know.
Use only the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer:"""

# Template object declaring the two placeholders used above.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

### 4. Retrieval Augmented Generation(RAG)

In [20]:
"""
Creating the Retriever from the Vector Database.
This Retriever fetches from the vector database the text chunks whose embeddings are most similar to the query.
The retrieved chunks are then placed into the prompt as context and sent to the LLM, so the LLM has supporting context to answer the user query.
 
"""
# Retriever view over the Chroma index, created with default settings.
retriever = db.as_retriever()

# Question-answering chain: "stuff" is selected from the available chain types
# (map_reduce, map_rerank, stuff, refine), the custom PROMPT is supplied to it,
# and source documents are returned alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)

In [21]:
# Inspect the documents returned by a max-marginal-relevance search (k = 6) for
# the current notebook-level `query`.
print(db.max_marginal_relevance_search(query, k = 6))

[Document(page_content='CHAPTER10.SEQUENCEMODELING:RECURRENTANDRECURSIVENETS\niswhatgatedRNNsdo.\n10.10.1LSTM\nThecleverideaofintroducingself-loopstoproducepathswherethegradient\ncanﬂowforlongdurationsisacorecontributionoftheinitiallongshort-term\nmemory(LSTM)model(HochreiterandSchmidhuber1997,).Acrucialaddition\nhasbeentomaketheweightonthisself-loopconditionedonthecontext,ratherthan\nﬁxed(,).Bymakingtheweightofthisself-loopgated(controlled Gers e t a l .2000', metadata={'page': 424, 'source': '/content/drive/MyDrive/RAG/deeplearningbook.pdf'}), Document(page_content='Limited\xa0Memory\xa0BFGS\xa0(or\xa0L-BFGS)The\xa0memory costs\xa0ofthe\xa0BFGS\nalgorithmcanbesigniﬁcantlydecreasedbyavoidingstoringthecompleteinverse\nHessianapproximationM.TheL-BFGSalgorithmcomputestheapproximationM\nusingthesamemethodastheBFGSalgorithm,butbeginningwiththeassumption\nthatM( 1 ) t −istheidentitymatrix,ratherthanstoringtheapproximation fromone\nsteptothenext.Ifusedwithexactlinesearches,thedirectionsdeﬁne

In [23]:
"""Functions to print the response and sources."""
def wrap_text_preserve_newlines(text, width = 700):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    Args:
        text: String to wrap; it is split on '\\n' and each piece is wrapped
            independently with textwrap.fill.
        width: Maximum line width passed to textwrap.fill (default 700).

    Returns:
        The wrapped pieces joined back together with '\\n'.
    """
    return '\n'.join(
        textwrap.fill(segment, width = width) for segment in text.split('\n')
    )


def process_llm_response(llm_response):
    """Format a chain response as the answer text followed by its sources.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            iterable. Each source document must expose a `metadata` mapping
            that contains 'source' (a '/'-separated file path) and 'page'.

    Returns:
        The wrapped answer, then '\\n\\nSources: \\n', then one
        '<file name without extension> - page: <page>' entry per source
        document, with entries separated by ' \\n'.

    Raises:
        KeyError: If 'result', 'source_documents', 'source' or 'page' is missing.
    """
    ans = wrap_text_preserve_newlines(llm_response['result'])

    source_lines = []
    for source in llm_response['source_documents']:
        file_name = source.metadata['source'].split('/')[-1]
        # Drop the actual extension rather than a fixed four characters, so
        # names such as 'notes.docx' or extension-less files are not mangled.
        stem = os.path.splitext(file_name)[0]
        source_lines.append(stem + ' - page: ' + str(source.metadata['page']))

    sources_used = ' \n'.join(source_lines)
    return ans + '\n\nSources: \n' + sources_used

In [24]:
def LLM_response(query):
    """Answer `query` with the retrieval chain and append the elapsed time.

    Args:
        query: Question passed to `chain.invoke`.

    Returns:
        The text produced by process_llm_response for the chain's output,
        followed by '\\n\\nTime elapsed: <seconds> s', where the seconds are
        wall-clock time rounded to a whole number.
    """
    started_at = time.time()
    # The timed span covers both the chain call and the formatting step.
    formatted = process_llm_response(chain.invoke(query))
    time_elapsed = int(round(time.time() - started_at, 0))
    return formatted + f'\n\nTime elapsed: {time_elapsed} s'

In [25]:
# End-to-end check of the retrieval chain on the current notebook-level `query`.
ans = LLM_response(query)
print(ans)

 Explicit memory is used in LSTM to capture the temporal dynamics of the network. The LSTM architecture is designed to capture the temporal dynamics of the network by using explicit memory. The explicit memory is used to store the current state of the network and the previous states of the network. The explicit memory is used to store the current state of the network and the previous states of the network. The explicit memory is used to store the current state of the network and the previous states of the network. The explicit memory is used to store the current state of the network and the previous states of the network. The explicit memory is used to store the current state of the network
and the previous states of the network. The explicit memory is used to store the current state of the network and the previous states of the network. The explicit memory is used to store the current state of the network and the previous states of the network. The explicit memory is used to store the

### 5. Gradio App

In [27]:
# Title passed to the Gradio chat interface.
title = '🦅PDF🔗Assistant🤗'
# Description passed to the Gradio chat interface. Adjacent string literals are
# joined at compile time; the previous backslash continuation inside the literal
# embedded the next line's indentation spaces in the text.
description = (
    'This application demonstrates the use of the open-source tools to '
    'chat with PDF.'
)

In [28]:
# Create generate function - this will be called when a user runs the gradio app
def generate(query, history):
    """Chat callback: return the LLM_response text for `query`.

    Args:
        query: The user's message; forwarded unchanged to LLM_response.
        history: Accepted as the second callback argument but not used here.

    Returns:
        The string returned by LLM_response(query).
    """
    return LLM_response(query)

In [29]:
# Build gradio Chat interface
# Build the chat UI around `generate`, using the title and description defined
# earlier and a named theme; then enable queuing and launch with share=True.
gr.ChatInterface(
    fn=generate,
    title=title,
    description=description,
    theme='finlaymacklon/boxy_violet',
).queue().launch(share=True)

themes/theme_schema@0.0.2.json:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://53491f27b0ce0a7ee3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




![App](./Others/Screenshot%20(120).png)

![App2](./Others/Screenshot%20(121).png)

### 6. References
1. [RAG](https://youtu.be/rhZgXNdhWDY?si=OkI_bXiaR7ihLzBy).
2. [Notebook](https://www.kaggle.com/code/hinepo/q-a-chatbot-with-llms-harry-potter/notebook).