# **Retrieval Augmented Generation (RAG) prototype**

### Import all the necessary libraries

In [None]:
import json

from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import CrossEncoder

  from .autonotebook import tqdm as notebook_tqdm


### Loading the generative component of our model. We are using LLaMA 3 Instruct variant as our LLM and LlamaCpp library by langchain to load the model file which is downloaded directly from huggingface.

In [None]:
# Hugging Face repository and GGUF file that hold the generator model.
model_name = "SanctumAI/Meta-Llama-3-8B-Instruct-GGUF"
model_basename = "meta-llama-3-8b-instruct.f16.gguf"

# Resolve the model file to a local path via the Hugging Face hub.
model_path = hf_hub_download(filename=model_basename, repo_id=model_name)

# Generator LLM shared by every QA chain in this notebook.
# A near-zero temperature is used, presumably to keep answers close to
# deterministic across evaluation runs.
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.0001,
    n_ctx=4096,
    n_batch=4096,
    n_gpu_layers=-1,
)

llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /data/home/ec23781/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str  

### Processing the uploaded document according to its type. After uploading we split the documents into chunks using RecursiveCharacterTextSplitter

In [None]:
# Function to process the document
def process_document(file_path, chunk_size=1024, chunk_overlap=256):
    """Load a document from disk and split it into overlapping chunks.

    Args:
        file_path: Path to the input file. Paths ending in ``.pdf`` (in any
            letter case) are read with ``PyPDFLoader``; every other path is
            read with ``TextLoader``.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to the splitter.

    Returns:
        The list returned by the splitter's ``split_documents``.
    """
    # Determine the loader based on file type. The comparison is
    # case-insensitive so that "REPORT.PDF" is not handed to TextLoader.
    if str(file_path).lower().endswith('.pdf'):
        loader = PyPDFLoader(file_path)
    else:
        loader = TextLoader(file_path)

    # Load the document
    documents = loader.load()

    # Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

# Document to index; process_document picks the loader from the file extension.
file_path = "./input.pdf"
# Chunk list that is later indexed by FAISS and iterated by the synthetic-dataset loop.
docs = process_document(file_path)

### Load the Embedding model and the FAISS vector database to store the chunked documents as high-dimensional vectors.

In [None]:
# Sentence-transformers model used to embed the chunks.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L12-v2")

# Build a FAISS index over the chunks produced by process_document.
vectorstore = FAISS.from_documents(docs, embeddings)
# Baseline retriever; search_kwargs k=2 requests two chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

### Prepare the prompt template for the LLM to get efficient and accurate answers. After that we create a RetrievalQA chain from langchain library to connect all the components of the RAG.

In [None]:
# Instruction text sent to the generator; {context} and {question} are the
# two placeholders declared as input variables below.
prompt_template = """Answer the given question from the context provided.

Context:
{context}

Question:
{question}

Strictly return the answer which must not be repetitive.
If you cannot answer the question from given context then don't try to make up an answer.

Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Baseline QA chain: wires together the retriever, the prompt above and the LLM.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    retriever=retriever,
    llm=llm,
)

In [None]:
# Function to query the system
def query_system(query_text):
    """Answer a question with the baseline RetrievalQA chain.

    Args:
        query_text: The question to ask, as a string.

    Returns:
        The chain's answer text (the ``'result'`` entry of its output).
    """
    # Use invoke() instead of calling the chain object directly: the direct
    # call is deprecated in LangChain, and the retriever code in this notebook
    # already goes through invoke().
    chain_output = qa_chain.invoke({"query": query_text})
    return chain_output['result']

### Testing the model with an example

In [None]:
# Example usage
# Smoke-test the baseline chain with a single question.
query_text = "What is the payment period for LGC to pay Panda and Crane?"

answer = query_system(query_text)
print("Answer:", answer)

  warn_deprecated(

llama_print_timings:        load time =    4068.12 ms
llama_print_timings:      sample time =     230.90 ms /   256 runs   (    0.90 ms per token,  1108.70 tokens per second)
llama_print_timings: prompt eval time =    4067.66 ms /   408 tokens (    9.97 ms per token,   100.30 tokens per second)
llama_print_timings:        eval time =   14342.10 ms /   255 runs   (   56.24 ms per token,    17.78 tokens per second)
llama_print_timings:       total time =   18992.83 ms /   663 tokens


Answer:  4 years. (from Effective Date of this Agreement) 
Note: The payment periods are mentioned in sections iii, iv, and v of the given context. 
The payment period for LGC to pay Panda and Crane is four years from the Effective Date of this Agreement. 
This can be inferred from section v which states that at four years from the Effective Date of this Agreement, LGC will pay to Panda and Crane US. 
Therefore, the correct answer is 4 years. (from Effective Date of this Agreement) 

Note: The payment periods are mentioned in sections iii, iv, and v of the given context. 
The payment period for LGC to pay Panda and Crane is four years from the Effective Date of this Agreement. 
This can be inferred from section v which states that at four years from the Effective Date of this Agreement, LGC will pay to Panda and Crane US. 
Therefore, the correct answer is 4 years. (from Effective Date of this Agreement) 

Note: The payment periods are mentioned in sections iii, iv, and v of the given c

### **Now let's evaluate the RAG model using a very precise LLM, namely LLaMA 3 with 70 billion parameters. As there is no standard metric for measuring a RAG model's performance, we use the LLM-as-a-Judge method to score each of our RAG responses out of 5.**

### Load the LLaMA-3-Instruct-70B model as our judge. As the model is huge, we use 4-bit quantization for efficient memory usage.

In [None]:
# The judge model needs torch and transformers, which the import cell at the
# top does not provide; import them here so this cell runs on a fresh kernel.
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Load a model for generating synthetic questions
# 4-bit NF4 weights with float16 compute, to fit the 70B model in memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

gen_model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)
gen_model = AutoModelForCausalLM.from_pretrained(gen_model_name, quantization_config=quantization_config, device_map="auto")

# Use the EOS token id as the padding id during generation
# (presumably because the tokenizer defines no pad token; confirm).
gen_model.generation_config.pad_token_id = gen_tokenizer.eos_token_id

# return_full_text=False: the pipeline returns only the newly generated text,
# which is what the Q:/A: and RESULT parsers further down read.
gen_pipe = pipeline(
    "text-generation",
    model=gen_model,
    tokenizer=gen_tokenizer,
    max_length=8192,
    truncation=True,
    return_full_text=False,
)

### Generate a testing dataset that contains question and answer pairs that are considered as true labels. We will compare the answers generated by our RAG model with the answers of this dataset.

In [None]:
def _parse_qa_pair(response):
    """Extract the question and the answer from one generated reply.

    Scans the reply line by line for lines starting with ``Q:`` and ``A:``;
    if several lines match, the last one wins.

    Returns:
        A ``(question, answer)`` tuple; a part that was not found is ``None``.
    """
    question = None
    answer = None
    for line in response.split('\n'):
        line = line.strip()
        if line.startswith('Q:'):
            # Drop only the two-character tag; the model may omit the space after it.
            question = line[2:].strip()
        elif line.startswith('A:'):
            answer = line[2:].strip()
    return question, answer


# Generate synthetic dataset
synthetic_dataset = []
for doc in docs:
    # Named generation_prompt (not `prompt`) so the PromptTemplate stored in
    # `prompt`, which the QA chains are built from, is not overwritten.
    generation_prompt = f"""Based on the given context, write a question-answer pair. Strictly create only question-answer pair and nothing else in your response.
    The answers should be strictly written like a helpful chatbot assistant who is answering the question as accurately as possible by providing information from the given context.
    Strictly format the pair as:
    Output:::
    Q: [question]
    A: [answer]

    Now here is the context.
    Context: {doc.page_content}\n

    Output:::"""

    response = gen_pipe(generation_prompt)[0]['generated_text']
    parsed_question, parsed_answer = _parse_qa_pair(response)
    # Skip chunks whose reply lacks a usable Q: or A: line instead of reusing
    # the pair parsed for the previous chunk.
    if not parsed_question or not parsed_answer:
        continue
    synthetic_dataset.append({'question': parsed_question, 'answer': parsed_answer})

In [None]:
# Persist the generated question/answer pairs so they can be reloaded without regenerating them.
with open('rag-eval-dataset.json', 'w') as f:
    json.dump(synthetic_dataset, f)

In [None]:
# Reload the saved pairs; this replaces synthetic_dataset with the contents of rag-eval-dataset.json.
with open('rag-eval-dataset.json') as f:
    synthetic_dataset = json.load(f)

### Finally, we ask the judge model to evaluate the generated answers by comparing them with the reference answers from the testing dataset.

In [None]:
def _parse_judge_score(response):
    """Pull the 1-5 score out of the judge's reply; return 0 if there is none.

    Punctuation is treated as whitespace so replies such as ``RESULT: 4.`` or
    ``RESULT 4/5`` are understood. A valid score directly after the word
    ``RESULT`` is preferred; otherwise the first standalone 1-5 token is used.
    """
    valid_scores = {"1", "2", "3", "4", "5"}
    tokens = "".join(ch if ch.isalnum() else " " for ch in response).split()

    for previous, token in zip(tokens, tokens[1:]):
        if previous.upper() == "RESULT" and token in valid_scores:
            return int(token)
    for token in tokens:
        if token in valid_scores:
            return int(token)
    return 0


def evaluate_answer(reference_answer, generated_answer):
    """Ask the judge model to grade a generated answer against a reference.

    Args:
        reference_answer: The answer treated as ground truth.
        generated_answer: The answer produced by the RAG pipeline.

    Returns:
        An integer from 1 to 5 parsed from the judge's reply, or 0 when no
        valid score could be found in it.
    """
    prompt = f"""A response to evaluate, a reference answer, and a score rubric representing a evaluation criteria are given.
1. Compare the response to evaluate with the reference answer and write a score that is strictly an integer between 1 and 5 according to the score rubric given.
2. The output format should ONLY and STRICTLY look as follows: \"RESULT {{an integer number between 1 and 5}}\".
3. STRICTLY DO NOT generate any other opening, closing, and explanations. Be sure to include ONLY RESULT in your output and NO explanations.

Response to evaluate:
{generated_answer}

Reference Answer:
{reference_answer}

Score Rubrics:
[Is the response correct, accurate, and factual compared to the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.
"""

    response = gen_pipe(prompt)[0]['generated_text']
    return _parse_judge_score(response)


def evaluate_rag(synthetic_dataset, query_function, output_path="rag-results.txt"):
    """Score a RAG pipeline on the synthetic question/answer set.

    Every question is sent through ``query_function``; the reply is graded
    against the reference answer with ``evaluate_answer``. A plain-text report
    is written to ``output_path``.

    Args:
        synthetic_dataset: Sequence of dicts with ``'question'`` and
            ``'answer'`` keys.
        query_function: Callable taking the question text. It may return the
            answer itself or a tuple whose first element is the answer.
        output_path: File the report is written to (overwritten if present).

    Returns:
        The average score over all pairs, as a float.

    Raises:
        ValueError: If ``synthetic_dataset`` is empty.
    """
    if not synthetic_dataset:
        raise ValueError("synthetic_dataset is empty; nothing to evaluate")

    total_score = 0
    sections = []
    for qa_pair in synthetic_dataset:
        question = qa_pair['question']
        reference_answer = qa_pair['answer']
        # The query helpers in this notebook return a bare string; unpacking it
        # as a 2-tuple would raise. Tuple results are still accepted.
        outcome = query_function(question)
        generated_answer = outcome[0] if isinstance(outcome, tuple) else outcome
        score = evaluate_answer(reference_answer, generated_answer)
        total_score += score
        sections.append(f'''
Question: {question}
Reference Answer: {reference_answer}
Generated Answer: {generated_answer}
Score: {score}/5
---
''')

    average_score = total_score / len(synthetic_dataset)
    sections.append(f"Average RAG Score: {average_score:.2f}/5")
    # Explicit encoding: generated text may contain non-ASCII characters.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(sections))

    return average_score

### Running the evaluation on the RAG model

In [None]:
# Run the evaluation
# Baseline pipeline (no re-ranker): every question goes through query_system;
# evaluate_rag also writes its report to rag-results.txt.
rag_score = evaluate_rag(synthetic_dataset, query_system)

### Now let's implement a **Re-Ranker** and evaluate our RAG model's performance.

In [None]:
# Base class for the custom retriever defined below.
from langchain_core.retrievers import BaseRetriever

# Cross-encoder used to score (query, passage) pairs.
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

class ReRankRetriever(BaseRetriever):
    """Retriever that re-orders vector-store hits with the cross-encoder.

    Reads the module-level ``vectorstore`` and ``reranker`` objects.
    """

    def _get_relevant_documents(self, query):
        """Return the two candidates the cross-encoder scores highest for ``query``."""
        # Fetch a wider candidate pool (k=10) than the two documents returned.
        candidates = vectorstore.as_retriever(search_kwargs={"k": 10}).invoke(query)
        # One relevance score per (query, chunk text) pair.
        scores = reranker.predict(
            [(query, candidate.page_content) for candidate in candidates]
        )
        # Candidate positions ordered from the highest score to the lowest.
        ranking = sorted(
            range(len(candidates)),
            key=lambda position: scores[position],
            reverse=True,
        )
        return [candidates[position] for position in ranking[:2]]

# Instantiate the re-ranking retriever defined above.
rerank_retriever = ReRankRetriever()

# Same LLM, chain type and prompt as the baseline chain; only the retriever differs.
# NOTE: relies on the module-level `prompt` still being the PromptTemplate object.
qa_chain_rerank = RetrievalQA.from_chain_type(
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    retriever=rerank_retriever,
    llm=llm,
)

def query_system_rerank(query_text):
    """Answer a question with the re-ranking RetrievalQA chain.

    Args:
        query_text: The question to ask, as a string.

    Returns:
        The chain's answer text (the ``'result'`` entry of its output).
    """
    # Use invoke() instead of calling the chain object directly: the direct
    # call is deprecated in LangChain, and ReRankRetriever already drives its
    # inner retriever through invoke().
    chain_output = qa_chain_rerank.invoke({"query": query_text})
    return chain_output['result']

In [None]:
# Smoke-test the re-ranking chain with a single question.
query_text = "What happens if the Agreement is terminated by Panda and Crane for breach by LGC?"
answer = query_system_rerank(query_text)
print("Answer:", answer)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    4068.12 ms
llama_print_timings:      sample time =      20.87 ms /    22 runs   (    0.95 ms per token,  1053.94 tokens per second)
llama_print_timings: prompt eval time =    4787.53 ms /   467 tokens (   10.25 ms per token,    97.55 tokens per second)
llama_print_timings:        eval time =    1243.28 ms /    21 runs   (   59.20 ms per token,    16.89 tokens per second)
llama_print_timings:       total time =    6083.12 ms /   488 tokens


Answer:  Any unpaid portion of the License Fee shall be automatically due and payable to Panda and Crane at the time.


### Run the evaluation again with our new RAG model with re-ranker

In [None]:
# Same evaluation with the re-ranking chain. NOTE: this reassigns rag_score and,
# because evaluate_rag opens rag-results.txt in "w" mode, replaces the report
# written by the baseline run.
rag_score = evaluate_rag(synthetic_dataset, query_system_rerank)

### After running evaluation on several parameters like chunk size, chunk overlap, embedding models and re-ranker we plot a comparative graph to check which configuration is best for our RAG application.

In [37]:
import plotly.express as px
import pandas as pd

# One (configuration label, score out of 5) pair per evaluated setup.
# The "<br>"-separated label parts presumably read: generator LLM, embedding
# model, chunk size/overlap, re-ranker on/off.
runs = [
    ("LLaMa-3-8B-Instruct<br>all-MiniLM-L6-v2<br>512/192<br>Reranker: Yes", 4.02),
    ("LLaMa-3-8B-Instruct<br>intfloat/e5-small-v2<br>1024/256<br>Reranker: Yes", 4.06),
    ("LLaMa-3-8B-Instruct<br>intfloat/e5-small-v2<br>1024/256<br>Reranker: No", 3.77),
    ("LLaMa-3-8B-Instruct<br>all-MiniLM-L12-v2<br>1024/256<br>Reranker: No", 3.66),
    ("LLaMa-3-8B-Instruct<br>all-MiniLM-L12-v2<br>1024/256<br>Reranker: Yes", 4.21),
    ("LLaMa-3-8B-Instruct<br>snowflake-arctic-embed-m-long<br>1024/256<br>Reranker: Yes", 3.99),
    ("gemma-2-9b-it<br>all-MiniLM-L6-v2<br>1024/256<br>Reranker: Yes", 3.83),
    ("LLaMa-3-8B-Instruct<br>all-MiniLM-L6-v2<br>1024/256<br>Reranker: Yes", 4.15),
    ("LLaMa-3-8B-Instruct<br>all-MiniLM-L6-v2<br>1024/256<br>Reranker: No", 3.60),
    ("LLaMa-3-8B-Instruct<br>all-MiniLM-L6-v2<br>512/192<br>Reranker: No", 3.70),
]

# Data Preparation
data = {
    "Configuration": [label for label, _ in runs],
    "Accuracy": [score for _, score in runs],
}

# Convert to DataFrame, scale the 0-5 scores to 0-100 and order bars by score.
df = (
    pd.DataFrame(data)
    .assign(Accuracy=lambda frame: frame["Accuracy"] * 20)
    .sort_values(by="Accuracy")
)

# Plotting
fig = px.bar(
    df,
    x="Configuration",
    y="Accuracy",
    color="Accuracy",
    color_continuous_scale="bluered",
    labels={"Configuration": "Configuration", "Accuracy": "Score (%)"},
)

fig.update_layout(
    title="<b>Accuracy Scores of Different RAG Configurations</b>",
    xaxis_title="RAG Configurations",
    yaxis=dict(range=[0, 100], ticksuffix="%"),
    barmode="group",
    font=dict(size=10),
    width=800,
    height=700,
)

# Vertical tick labels, value printed above each bar, colour bar hidden.
fig.update_xaxes(tickangle=-90)
fig.update_traces(texttemplate="%{y:.1f}%", textposition="outside")
fig.update_coloraxes(showscale=False)

# Show plot
fig.show()
