In [None]:
import torch

LLM_MODEL_PATH = "/home/strange/models/llama-3.2-3b-Instruct-f16.gguf"
EMBEDDING_MODEL_PATH = "/home/strange/models/all-mpnet-base-v2"
EMBEDDING_TOKEN_LIMIT = 300

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

cuda
Available GPU memory: 8 GB


In [2]:
%load_ext gradio

## requirements

In [None]:
%pip install -q tqdm
%pip install -q pandas numpy matplotlib
%pip install -q PyMuPDF
%pip install -q sentence-transformers

# see https://spacy.io/usage
%pip install -q spacy[cuda12x]   
!python -m spacy download en_core_web_sm
!python -m spacy download es_core_news_sm

# faiss cpu/cuda
%pip install -q faiss-cpu
%pip install -q faiss-gpu-cu12

# pytorch cpu/cuda
%pip install -q torch
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# llamacpp inference cpu/cuda
%pip install llama-cpp-python \
  -C cmake.args="-DGGML_BLAS=ON;-DGGML_BLAS_VENDOR=OpenBLAS"
%pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

# evaluation
%pip install "giskard[llm]" --upgrade

## 1. Indexing

### 1.1. Chunking

In [2]:
import re

class RecursiveTextSplitter:
    def __init__(self, max_length, overlap=0, overlap_by_words=True):
        self.max_length = max_length
        self.overlap = overlap
        self.overlap_by_words = overlap_by_words
    
    def split_text(self, text: str):
        return self._recursive_split(text)
    
    def _recursive_split(self, text):
        if len(text) <= self.max_length:
            return [text]
        
        # Attempt to split by paragraphs first
        paragraphs = text.split("\n\n")
        if len(paragraphs) > 1:
            return self._split_and_combine(paragraphs)
        
        # Split by sentences if paragraphs don't reduce size enough
        sentences = re.split(r'(?<=[.!?])\s+', text)
        if len(sentences) > 1:
            return self._split_and_combine(sentences)
        
        # Split by words if still necessary
        words = text.split()
        if len(words) > 1:
            return self._split_and_combine(words, join_char=" ")
        
        return [text]  # return as-is if we can't split further

    def _split_and_combine(self, segments, join_char=" "):
        chunks = []
        current_chunk = ""

        for segment in segments:
            # Check if adding the segment would exceed max_length
            if len(current_chunk) + len(segment) + (len(join_char) if current_chunk else 0) > self.max_length:
                # Finalize the current chunk if it's non-empty
                if current_chunk:
                    chunks.append(current_chunk)
                
                # Set up overlap for the new chunk if applicable
                current_chunk = self._get_overlap(current_chunk, join_char)
            
            # Add the segment to the current chunk
            if current_chunk:
                current_chunk += join_char
            current_chunk += segment

            # If current chunk exceeds max_length after adding, split immediately
            if len(current_chunk) > self.max_length:
                chunks.extend(self._split_long_chunk(current_chunk, join_char))
                current_chunk = ""

        # Add any remaining text as the last chunk
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks

    def _split_long_chunk(self, text, join_char):
        """Split a chunk that exceeds max_length into smaller pieces."""
        words = text.split(join_char) if self.overlap_by_words else list(text)
        chunks = []
        current_chunk = ""
        
        for word in words:
            if len(current_chunk) + len(word) + (len(join_char) if current_chunk else 0) > self.max_length:
                chunks.append(current_chunk)
                current_chunk = ""
            
            if current_chunk:
                current_chunk += join_char
            current_chunk += word
        
        if current_chunk:
            chunks.append(current_chunk)
        
        return chunks

    def _get_overlap(self, last_chunk, join_char):
        """Get overlap from the end of the last chunk."""
        if not last_chunk or self.overlap == 0:
            return ""
        
        if self.overlap_by_words:
            # Overlap by words
            words = last_chunk.split()
            overlap_words = words[-self.overlap:] if len(words) >= self.overlap else words
            return join_char.join(overlap_words)
        else:
            # Overlap by characters
            return last_chunk[-self.overlap:]


In [3]:
from pymupdf import Document
from tqdm import tqdm
import pandas as pd

def split_document(document: Document, splitter) -> list[dict]:
    """
    Splits a PDF document into smaller text chunks based on a specified token limit.

    Each chunk is created from the sentences in the document, ensuring that no chunk exceeds 
    the maximum token size defined by `chunk_size`. The function processes each page of the 
    document and formats the text accordingly.

    Args:
        document (Document): The PDF document to be split.
        chunk_size (int): The maximum number of tokens allowed in each chunk.
        splitter (callable): A function to split text into sentences (default is nlp_splitter).

    Returns:
        pd.DataFrame: A DataFrame containing the text chunks, their source document name, 
                      page number, and token count.
    """
    all_chunks = [] 

    for page in tqdm(document):
        # Get page text and strip unnecessary whitespace
        page_text = page.get_text().strip()

        # Split page_text into sentences using the splitter
        sentences = splitter.split_text(page_text)

        # Create chunks directly and extend all_chunks
        all_chunks += [{
            "text": sentence,
            "source": document.name,
            "page_number": page.number + 1, # start page 1
            "chunk_token_count": len(sentence) // 4,  # Approximate token count
        } for sentence in sentences if len(sentence) > 10]

    return all_chunks


def split_documents(documents: list[Document], splitter) -> pd.DataFrame:
    """
    Splits multiple PDF documents into smaller text chunks based on a specified token limit.

    This function iterates through a list of documents and calls `split_document` for each one,
    concatenating all resulting DataFrames into a single DataFrame.

    Args:
        documents (list[Document]): A list of PDF documents to be split.
        chunk_size (int): The maximum number of tokens allowed in each chunk.
        splitter (callable): A function to split text into sentences (default is nlp_splitter).

    Returns:
        pd.DataFrame: A DataFrame containing all text chunks from the provided documents,
                      including their source document name, page number, and token count.
    """
    all_chunks = []
    for document in documents:
        chunks = split_document(document=document, splitter=splitter)
        all_chunks += chunks
    return pd.DataFrame(all_chunks)

In [7]:
import pymupdf

splitter = RecursiveTextSplitter(max_length=EMBEDDING_TOKEN_LIMIT * 4, overlap=3)

chunks = split_documents(
    documents=[pymupdf.open("../docs/attention-is-all-you-need.pdf")],
    splitter=splitter,
)
chunks.describe()

100%|██████████| 15/15 [00:03<00:00,  4.70it/s]


Unnamed: 0,page_number,chunk_token_count
count,40.0,40.0
mean,7.025,250.125
std,3.912423,53.07575
min,1.0,97.0
25%,4.0,219.0
50%,7.0,279.0
75%,10.0,290.5
max,15.0,299.0


#### Gradio Interface

In [None]:
import spacy

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")


def visualize_splits(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]

    # Create a formatted output with different background colors for each sentence
    colored_sentences = [
        f"<div style='background-color: hsl({i * 30}, 70%, 90%); padding: 10px; margin: 5px 0;'>{sent}</div>"
        for i, sent in enumerate(sentences)
    ]

    # Join sentences without additional line breaks since <div> handles it
    output = "".join(colored_sentences)
    return output

In [None]:
%%blocks

import gradio as gr

# Create the Gradio interface
demo = gr.Interface(
    fn=visualize_splits,
    inputs=gr.Textbox(
        label="Input Text", placeholder="Type or paste your text here..."
    ),
    outputs="html",  # Change output type to HTML
    title="SpaCy Text Splitter",
)

#### Data Analysis Exploration

https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them

https://platform.openai.com/tokenizer


The propuse of this analysis is to compare the average token count per page with the maximun allow token size of the embedded model

Many embedding models have limits on the size of texts they can ingest, for example, the `sentence-transformers` model `all-mpnet-base-v2` has an input size of 384 tokens.

This means that the model has been trained in ingest and turn into embeddings texts with 384 tokens (1 token ~= 4 characters ~= 0.75 words).

Texts over 384 tokens which are encoded by this model will be auotmatically reduced to 384 tokens in length, potentially losing some information.

In [None]:
chunks.head()

In [None]:
chunks.describe().round(2)

the average token count per page is **297**

it means we could embed an average whole page with the `all-mpnet-base-v2` model (of capacity **384**)

The **max** is 563 meaning a page has 563 tokens, since this is bigger that the embedding model capacity tokens will be loss if we embed a hole page, this is another reason for splitting the page into chunks


Now we break down our list of sentences/text into smaller chunks

Why?
- Is easer to manage similar sized chunks of text
- Don't overload the embedding models capacity for tokens (e.g. there could be information loss if you try to embed a sequence of 400+ tokens into a embedding model with less token capacity)
- LLM context window (the amount of tokens an LLM can take in) we want to be sure is well used

For now, we're break our pages of sentences into groups of 10 (because it seemed to line up well with our embedding model capacity of **384**)

On average each page has 10 sentences and ~287 tokens

the average number of chunks is 1.7, as expected since our pages contain an average of 10 sentences

### 1.2. Embedding our text chunks

In [4]:
from sentence_transformers import SentenceTransformer
from torch import Tensor
from pathlib import Path
from typing import Literal

class EmbeddingModel:
    _instance = None

    def __new__(cls, model_path: Path = EMBEDDING_MODEL_PATH):
        if cls._instance is None:
            cls._instance = super(EmbeddingModel, cls).__new__(cls)
            cls._instance._model = SentenceTransformer(model_path, device=device)
        return cls._instance

    def embed(self, sentences: list[str] | str) -> Tensor:
        if isinstance(sentences, str):
            sentences = [sentences]

        # check if the embedding model can embed the sentences
        larger_than_embedding_limit = list(filter(lambda x: len(x) // 4 > EMBEDDING_TOKEN_LIMIT, sentences))
        assert len(larger_than_embedding_limit) == 0, 'Texts are larger than what the embedding model could digest'

        return self._model.encode(sentences, 
                                  batch_size=32, 
                                  convert_to_tensor=True, # keep the tensors on GPU
                                  normalize_embeddings=True) 
    
    def to(self, device: Literal['cuda', 'cpu']):
        self._model.to(device)

  from tqdm.autonotebook import tqdm, trange


In [6]:
embedding_model = EmbeddingModel()

No sentence-transformers model found with name /home/strange/models/all-mpnet-base-v2. Creating a new one with mean pooling.


In [8]:
%%time

from sentence_transformers import util

embeddings = embedding_model.embed(chunks['text'].tolist())
query_embedings = embedding_model.embed("What are the main benefits of using self-attention in the Transformer over recurrent layers?")

util.semantic_search(query_embedings, embeddings, score_function=util.dot_score)

CPU times: user 633 ms, sys: 384 ms, total: 1.02 s
Wall time: 4.81 s


[[{'corpus_id': 5, 'score': 0.6031705141067505},
  {'corpus_id': 18, 'score': 0.5572218894958496},
  {'corpus_id': 7, 'score': 0.5567423701286316},
  {'corpus_id': 17, 'score': 0.5485848188400269},
  {'corpus_id': 6, 'score': 0.5376397371292114},
  {'corpus_id': 12, 'score': 0.4966805577278137},
  {'corpus_id': 0, 'score': 0.48151886463165283},
  {'corpus_id': 8, 'score': 0.4710034728050232},
  {'corpus_id': 25, 'score': 0.4666858911514282},
  {'corpus_id': 22, 'score': 0.4659819006919861}]]

In [5]:
import os
import faiss
from pathlib import Path
from typing import Sequence
import numpy as np

def create_index(embeddings: np.ndarray):
    dimensions = embeddings.shape[1]

    # creating an index (L2 (Euclidean) distance)
    index = faiss.IndexFlatL2(dimensions)

    # Adding the embeddings to the index
    index.add(embeddings)

    print(f"Total sentences indexed: {index.ntotal}")
    return index


def load_index(
    index_path: Path,
    sentences: Sequence[str], 
    embedding_model: EmbeddingModel = None
) -> faiss.IndexFlatL2:
    """
    Creates a FAISS index from the given embedding model and documents chunks
    or loads the index directly if it already exists.
    """
    # check if embeddings already exists
    if os.path.isfile(index_path):
        print("Index FOUND, loading it...")
        return faiss.read_index(index_path)

    else:
        print("Index not found, generating it...")

        # creating the word embeddings
        embeddings = embedding_model.embed(sentences)

        # creating a FAISS index
        embeddings = embeddings.cpu().numpy().astype('float32')
        index = create_index(embeddings)

        # saving the FAISS index
        print("Saving index...")
        faiss.write_index(index, index_path)
        
        return index

In [None]:
index = load_index(index_path=f"../indexes/embeddings-{len(chunks)}.index",
                   sentences=chunks['text'].tolist(),
                   embedding_model=embedding_model)

Index not found, generating it...
Total sentences indexed: 38
Saving index...


### Benchmark: `CPU` vs `GPU` vs `GPU + batched predictions`

In [None]:
sentences = chunks['text'].tolist()
embedding_model = SentenceTransformer(EMBEDDING_MODEL_PATH)

No sentence-transformers model found with name models/all-mpnet-base-v2. Creating a new one with mean pooling.


In [None]:
%%time

embedding_model.to("cpu")
embeddings = embedding_model.encode(sentences)

CPU times: user 55.8 s, sys: 4.29 s, total: 1min
Wall time: 16.6 s


In [None]:
%%time

embedding_model.to("cuda")
embeddings = embedding_model.encode(sentences)

CPU times: user 740 ms, sys: 85 ms, total: 825 ms
Wall time: 820 ms


In [None]:
%%time

embedding_model.to("cuda")
embeddings = embedding_model.encode(sentences, batch_size=32, convert_to_tensor=True)

CPU times: user 742 ms, sys: 26.7 ms, total: 769 ms
Wall time: 719 ms


| cpu   | gpu   | gpu + batched predictions |
| ----- | ----- | ------------------------- |
| 2.3 m | 7.93 s | 7.3 s                       |

<br>

> note: 
> **batched predictions** means computing on multiple samples at once
> We can perform batched operations by turning our target text samples into a single list and then passing that list to our embedding model.

## 2. Retrieval

**TODO**: 
- [ ] Reranking
- [ ] Implement Hybrid Search using BM25

> note: 
> **Reranking** is to use a separated model that has been trained specifically to take search results (e.g. the top 25 semantic results) and rank them in order from most likely top-1 to least likely (e.g. [MixedBread Models](https://www.mixedbread.ai/blog/mxbai-rerank-v1))

### 2.1. Semantic Search




In [6]:
import faiss
import pandas as pd

def join_scores_chunks(chunks_df: pd.DataFrame, indices: list[int], scores: list[float]) -> pd.DataFrame:
    '''This function gets the chunks with the same indices as the attribute indices and 
    returns a DataFrame with another column added "distances"'''
    # gets a copy from the elements of the chunks list
    subset_df = chunks_df.iloc[indices].copy()
    subset_df['score'] = scores
    return subset_df

def semantic_search(
    query: str,
    chunks: pd.DataFrame,
    faiss_index: faiss.IndexFlatL2,
    embedding_model: EmbeddingModel,
    n_retrieved_docs: int = 30,
    n_docs_final: int = 7,
    reranker=None,
    threashold: float = 0.95
) -> pd.DataFrame:
    query_embeddings = embedding_model.embed(query).cpu()

    # perform semantic-search (distances, indices)
    scores, indices = faiss_index.search(query_embeddings, k=n_retrieved_docs)
    scores, indices = scores[0], indices[0]

    # join scores with chunk info
    relevant_docs = join_scores_chunks(chunks, indices=indices, scores=scores)

    # TODO: filter by a threashold (score must be normalized)
    # relevant_docs = relevant_docs[relevant_docs['score'] >= threashold]

    # reranker step
    if reranker:
        # relevant_docs = reranker.rerank(query, relevant_docs, k=n_docs_final)
        pass

    return relevant_docs[:n_docs_final]

In [15]:
query="What are the main benefits of using self-attention in the Transformer over recurrent layers?"

relevant_docs = semantic_search(query=query, 
                                chunks=chunks, 
                                faiss_index=index, 
                                embedding_model=embedding_model)
relevant_docs.head()

Unnamed: 0,text,source,page_number,chunk_token_count,score
7,Figure 1: The Transformer - model architecture...,docs/attention-is-all-you-need.pdf,3,353,0.90158
5,"in section 3.2. Self-attention, sometimes call...",docs/attention-is-all-you-need.pdf,2,343,0.903466
16,in the network. The shorter these paths betwee...,docs/attention-is-all-you-need.pdf,6,152,0.912788
15,10000 · 2π. We\nchose this function because we...,docs/attention-is-all-you-need.pdf,6,369,0.915001
11,output values. These are concatenated and once...,docs/attention-is-all-you-need.pdf,5,366,1.026594


### Print Results

In [None]:
import textwrap


def print_wrapped(text, wrap_length=80):
    """Limit the line width to 80"""
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)


def print_top_results_and_scores(query: str, relevant_docs: list[tuple[int, str]]):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.
    Note: Requires chunks to be formatted in a specific way (see above for reference).
    """
    print(f"Query: '{query}'\n")
    print("Results:\n")

    # loop through zipped together scores and indicies
    for distance, chunk in relevant_docs:
        page_number = chunk["metadata"]["page_number"]
        text = chunk["text"]

        print(f"Distance: {distance:.4f}")
        print(f"Page Num: {page_number}")
        print("Text:")
        print_wrapped(text)

        print("\n")

In [None]:
print_top_results_and_scores(query, relevant_docs)

In [None]:
import pymupdf
import numpy as np
import matplotlib.pyplot as plt

doc = pymupdf.open(pdf_path)
# Open PDF and load target page
page = doc.load_page(73)  # first ranked page

# Get the image of the page
img = page.get_pixmap(dpi=300)

# Convert the Pixmap to a numpy array
img_array = np.frombuffer(img.samples_mv, dtype=np.uint8).reshape((img.h, img.w, img.n))

# Display the image using Matplotlib
plt.figure(figsize=(13, 10))
plt.imshow(img_array)
plt.title(f"Query: '{query}' | Most relevant page:")
plt.axis("off")  # Turn off axis
plt.show()

## 3. Generation

In [7]:
from llama_cpp import Llama, ChatCompletionRequestMessage, CreateChatCompletionResponse, Iterator, CreateChatCompletionStreamResponse
from typing import Literal
from pathlib import Path

class LLM:
    _instance = None
    def __new__(cls, 
                path: Path = LLM_MODEL_PATH, 
                n_ctx: int = 4096,
                temperature: float = 0.75,
                cpu_threads: int = 8,
                device: Literal['cuda', 'cpu'] = device,
                verbose: bool = False):
        if cls._instance is None:
            cls._instance = super(LLM, cls).__new__(cls)
            cls._instance._model = Llama(
                model_path=path,
                chat_format="llama-3",
                n_threads=cpu_threads,
                n_gpu_layers=-1 if device == 'cuda' else 0,
                n_ctx=n_ctx,
                temperature=temperature,
                f16_kv=True,
                verbose=verbose
            )
        return cls._instance

    def chat_completion(self, 
                        messages: list[ChatCompletionRequestMessage],
                        temperature: bool = 0.75,
                        max_tokens: int = None,
                        top_p: int = None,
                        seed: int = None,
                        response_format: str = None,
                        stream: bool = True
                        ) -> (CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]):
        return self._model.create_chat_completion(messages, 
                                                  temperature=temperature, 
                                                  max_tokens=max_tokens,
                                                  top_p=top_p or 0.95,
                                                  seed=seed,
                                                  response_format={"type": "json_object"} if response_format == "json" else None,
                                                  stream=stream)

In [12]:
llm = LLM()

response = llm.chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are an assistant who perfectly describes images.",
        },
        {"role": "user", "content": "what is the meaning of life?"},
], stream=False)


print(response["choices"][0]["message"]['content'])

A question that has puzzled philosophers, theologians, and everyday humans for centuries!

To describe the meaning of life, I would attempt to put it into words like this:

Imagine a majestic, sprawling garden, filled with vibrant colors, diverse textures, and an intricate network of paths that invite exploration. The garden represents the complexity and richness of human existence.

At the center of the garden stands a magnificent tree, its trunk strong and sturdy, its branches stretching towards the sky, and its leaves rustling with an gentle, soothing melody. This tree is life itself, the core of existence.

The tree's roots dig deep into the earth, symbolizing our connection to our ancestors, our heritage, and our shared experiences. Its trunk represents resilience, strength, and the ability to weather life's storms. The branches, representing the paths we choose, symbolize our growth, evolution, and the diverse experiences that shape us.

As we wander through the garden, we come a

In [8]:
SYSTEM_PROMPT = (
    "I'm an intelligent chatbot designed to assist users by providing accurate information,"
    "answering questions, and engaging in meaningful conversations."
    "My primary goal is to enhance user experience through helpful, informative, and friendly interactions."
)

BASE_PROMPT = (
    "You are an AI assistant providing clear, concise, and informative answers "
    "to user queries. Respond using Markdown for formatting and wrap any equations "
    "in double dollar signs (`$$`). Keep the answer well-structured and avoid unnecessary information.\n"
)

EXAMPLE_OUTPUT = (
    "## Example Outputs"
    "**Example 1**"
    "**Q:** What is Newton's second law?"
    "**A:**"
    "Newton's second law of motion states that:"
    "$$ F = ma $$"
    "where:"
    "- **F** is the force applied to an object."
    "- **m** is the mass of the object."
    "- **a** is the acceleration of the object."
    "This law explains how the velocity of an object changes when it is subjected to an external force."
)

REWRITE_PROMPT = (
    "Rewrite this question as a simple, concise query without adding explanations,"
    "examples, or extra text. "
    "Keep the meaning exactly the same:\nOriginal: '{query}'\nRephrased:"
)

CONTEXT_COMPRESS_PROMPT = (
    "Summarize the following context, retaining only the information that directly supports an answer to the user's question. "
    "If the context does not contain relevant information, respond simply with: 'The answer to the user's question is not provided in the given context.' Avoid any additional reasoning or unnecessary steps.\n\n"
    "**Context:**\n{context}\n\n"
    "**User Question:** {query}\n\n"
    "### Concise Summary:\n"
)

In [9]:
def create_prompt(query: str, 
                  history: str, 
                  context: str, 
                  base_prompt: str = BASE_PROMPT,
                  example_output: str = EXAMPLE_OUTPUT, 
                  max_prompt_length: int = (4096 - 512) * 4 # (context windows - reserved output tokens) * 4
) -> str:
    # TODO: define limit lengths for history, context, query  
    prompt = f"{base_prompt}\n\n## Chat History:\n{history}\n\n## Retrieved Context:\n{context}\n\n## User's Query:\n\"{query}\""

    # Add example responses only if space allows
    if len(prompt) < max_prompt_length:
        prompt += example_output

    return prompt[:max_prompt_length]

In [27]:
query = "What are the main benefits of using self-attention in the Transformer over recurrent layers?"
history = 'The chat history only contains one question, which asks about the largest continent in the world. The response provided is a single sentence stating that Asia is the largest continent.'
context = 'In Germany still live NAZIs'

create_prompt(query, history, context)

'You are an AI assistant providing clear, concise, and informative answers to user queries. Respond using Markdown for formatting and wrap any equations in double dollar signs (`$$`). Keep the answer well-structured and avoid unnecessary information.\n\n\n## Chat History:\nThe chat history only contains one question, which asks about the largest continent in the world. The response provided is a single sentence stating that Asia is the largest continent.\n\n## Retrieved Context:\nIn Germany still live NAZIs\n\n## User\'s Query:\n"What are the main benefits of using self-attention in the Transformer over recurrent layers?"## Example Outputs**Example 1****Q:** What is Newton\'s second law?**A:**Newton\'s second law of motion states that:$$ F = ma $$where:- **F** is the force applied to an object.- **m** is the mass of the object.- **a** is the acceleration of the object.This law explains how the velocity of an object changes when it is subjected to an external force.'

In [10]:
def rewrite_query(query: str, llm: LLM, rewrite_prompt: str = REWRITE_PROMPT):
    prompt = rewrite_prompt.format(query=query)
    response = llm._model(prompt, max_tokens=50, temperature=0.3, stop=["\n"])
    return response['choices'][0]['text'].strip()

In [29]:
rewrite_query("cual es el continente mas grande del planeta", llm)

"'What is the largest continent on Earth?'"

In [11]:
import torch.nn.functional as F

def extract_relevant_history(history: list[dict], query: str, embedding_model: EmbeddingModel, threshold=0.9) -> list[dict]:
    if len(history) < 2: return []

    questions = [history[i].get("content") for i in range(0, len(history), 2)]
    answers   = [history[i].get("content") for i in range(1, len(history), 2)]

    # embed the previos questions & user's query
    questions_embeddings = embedding_model.embed(questions)
    query_embedding = embedding_model.embed(query)

    # perform cosine product for finding the similarity
    similarities = F.cosine_similarity(questions_embeddings, query_embedding)

    # filter relevant items
    relevant_messages = [
        {'question': questions[i], 'answer': answers[i]} 
        for i in range(len(similarities)) 
        if similarities[i] >= threshold
    ]
    
    return relevant_messages

def compress_history(relevant_messages: list[dict], llm: LLM) -> str | None:
    if len(relevant_messages) == 0: return None

    # Concatenate questions and answers for context
    context = "\n".join([f"Q: {entry['question']}\nA: {entry['answer']}" for entry in relevant_messages])
    
    # Create a prompt to summarize the concatenated context
    prompt = (f"Summarize the following chat history, keeping only key details:\n"
              f"{context}\n\nConcise summary:")

    # Generate the compressed history
    response = llm._model(prompt, max_tokens=50, temperature=0.3)
    compressed_history = response['choices'][0]['text'].strip()
    
    return compressed_history


In [31]:
embedding_model = EmbeddingModel()

history = [
    {"role": "user", "content": "¿Cuál es el continente más grande del mundo?"},
    {"role": "assistant", "content": "Asia."},
    {"role": "user", "content": "¿Qué fruta es conocida por tener muchas semillas en su exterior?"},
    {"role": "assistant", "content": "La fresa."},
    {"role": "user", "content": "¿Cuál es el animal terrestre más rápido?"},
    {"role": "assistant", "content": "El guepardo."}
]

# TODO: The lang has to be the same from the source 
query = "Cual es el continente mas pequeño del mundo?"

relevant_messages = extract_relevant_history(history=history, query=query, embedding_model=embedding_model)
print ("relevant_messages", relevant_messages)

compress_history(relevant_messages, llm)

relevant_messages [{'question': '¿Cuál es el continente más grande del mundo?', 'answer': 'Asia.'}]


'The chat history shows that the user asked about the largest continent in the world, and the correct answer was Asia.'

In [12]:
def compress_context(relevant_context: pd.DataFrame, query: str, llm: LLM, prompt: str = CONTEXT_COMPRESS_PROMPT) -> str:
    if len(relevant_context) == 0: return None

    # Concatenate questions and answers for context
    context = "\n".join([f"{item['text']}\n" 
                         for _, item in relevant_context.iterrows()])
    
    # Create a prompt to summarize the concatenated context
    prompt = prompt.format(context=context, query=query)

    # Generate the compressed history
    response = llm._model(prompt, max_tokens=100, temperature=0.3)
    compressed_history = response['choices'][0]['text'].strip()
    
    return compressed_history

In [88]:
compress_context(relevant_docs, query, llm)

'The main benefits of using self-attention in the Transformer over recurrent layers are:\n- Reduced computational cost\n- Improved scalability\n- Enhanced flexibility in modeling tasks\n\n### Response:\nThe main benefits of using self-attention in the Transformer over recurrent layers are:\n- Reduced computational cost\n- Improved scalability\n- Enhanced flexibility in modeling tasks'

In [None]:
import faiss
import pandas as pd
 

def answer_with_rag(
    query: str,
    chunks: pd.DataFrame,
    index: faiss.IndexFlatL2,
    llm: LLM,
    embedding_model: EmbeddingModel,
    history: list[str] = [],
    n_retrieved_docs: int = 30,
    n_docs_final: int = 10,
    system_prompt: str = SYSTEM_PROMPT,
    base_prompt: str = BASE_PROMPT,
    rewrite_prompt: str = REWRITE_PROMPT,
    temperature: float = 0.75,
    verbose: bool = False,
):
    """
    Takes a query, finds relevant resources/context, and generates an answer to
    the query based on the relevant resources.
    """
    if verbose: print (f"query: {query}")

    # step 1: rewrite the query
    rewrited_query = rewrite_query(query=query, llm=llm, rewrite_prompt=rewrite_prompt)
    if verbose: print (f"rewrited query: {rewrited_query}")
    
    # step 2: get relevant previous messages
    relevant_messages = extract_relevant_history(history=history, query=query, embedding_model=embedding_model)
    if verbose: print (f"relevant prev messages: {relevant_messages}")

    # step 2.1: compress previous chat history
    compressed_history = compress_history(relevant_messages, llm=llm)
    if verbose: print (f"compressed history: {compressed_history}")

    # step 3: get context (score, chunk)
    relevant_context = semantic_search(
        query=rewrited_query,
        chunks=chunks,
        faiss_index=index,
        embedding_model=embedding_model,
        n_retrieved_docs=n_retrieved_docs,
        n_docs_final=n_docs_final,
    )
    if verbose: print (f"retrieved {len(relevant_context)} chunks as the context:\n {relevant_context}")

    # TODO:
    # step 3.1: rerank (k=30 -> k=10)
    
    # step 3.2: fusion/compress context
    compressed_context = compress_context(relevant_context, query=query, llm=llm)
    if verbose: print (f"compressed context:\n {compressed_context}")

    # step 4: create the prompt (w/ chat history, context, query)
    prompt = create_prompt(query=rewrited_query, 
                           history=compressed_history,
                           context=compressed_context, 
                           base_prompt=base_prompt)
    if verbose: print (f"prompt - |{len(prompt)}|:\n {prompt}")

    # stream response
    if verbose: print ("\nanswer:\n")
    output = llm.chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        stream=True,
    )

    for chunk in output:
        response_text = chunk["choices"][0]["delta"].get("content", "")
        yield response_text

In [90]:
llm = LLM()

In [94]:
query = "How does the Transformer model handle position information without recurrence?"

answer_generator = answer_with_rag(
    query=query,
    chunks=chunks,
    index=index,
    embedding_model=embedding_model,
    llm=llm,
    verbose=True
)

print ("".join(list(answer_generator)))
# print_wrapped(answer[0]["message"]["content"])

# print("\nContext items:")
# print(context)

query: How does the Transformer model handle position information without recurrence?
rewrited query: 'What is the Transformer model's approach to handling position information without using recurrence?'
relevant prev messages: []
compressed history: None
retrieved 10 chunks as the context:
                                                   text  \
45     Figure 1: The Transformer - model architecture.   
34   distant positions [12]. In the Transformer thi...   
176  Table 3: Variations on the Transformer archite...   
46   - model architecture. The Transformer follows ...   
177  the Transformer architecture. Unlisted values ...   
38   modeling tasks [34]. To the best of our knowle...   
39   RNNs or convolution. In the following sections...   
29   a recurrent network. In this work we propose t...   
86   with full dimensionality. 3.2.3\nApplications ...   
11   evaluate this idea. Ashish, with Illia, design...   

                                 source  page_number  chunk_token_co

## Gradio

In [16]:
embedding_model = EmbeddingModel()

In [17]:
llm = LLM()

In [18]:
import gradio as gr
import pymupdf

splitter = RecursiveTextSplitter(max_length=100, overlap=2)
chunks, index = None, None

def add_message(message: dict, history: list):
    global chunks, index

    history = history or []

    knowledge_base = []
    for file_path in message["files"]:
        history.append({"role": "user", "content": {"path": file_path}})
        knowledge_base.append(file_path)

    if message.get("text"):
        history.append({"role": "user", "content": message["text"]})

    if len(knowledge_base) > 0:
        print ("EMBEDDING...")
        chunks = split_documents(documents=[pymupdf.open(path) for path in knowledge_base], 
                                 splitter=splitter)  
        
        index_path = f"../indexes/gradio-{len(chunks)}.index"
        index = load_index(index_path, 
                           sentences=chunks['text'].tolist(), 
                           embedding_model=embedding_model)

    return gr.MultimodalTextbox(value=None, interactive=False), history

def bot(history: list[dict], temperature: float, system_prompt: str, base_prompt: str):
    global chunks, index

    if chunks is None or index is None:
        print ("no context")
        history.append({"role": "assistant", "content": "Please add a PDF to ask questions"})
        return history 
    
    # get the query
    if len(history) == 0:
        return history
    
    query = history[-1]["content"]

    # stream response
    response = answer_with_rag(
        query=query,
        chunks=chunks,
        index=index,
        llm=llm,
        embedding_model=embedding_model,
        temperature=temperature,
        system_prompt=system_prompt,
        base_prompt=base_prompt,
    )

    history.append({"role": "assistant", "content": ""})
    for character in response:
        history[-1]["content"] += character
        yield history

In [19]:
%%blocks

with gr.Blocks(fill_height=True) as demo:
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        type="messages",
        bubble_full_width=False,
        scale=1,
        show_copy_button=True,
    )

    chat_input = gr.MultimodalTextbox(
        interactive=True,
        show_label=False,
        placeholder="Enter message or upload file...",
        file_count="single",
        file_types=[".pdf"],
    )

    # Model parameters
    with gr.Accordion("Edit Model Parameters", open=False):
        temperature = gr.Slider(label="Temperature", minimum=0, maximum=1, value=0.7)
        system_prompt = gr.TextArea(SYSTEM_PROMPT, label="System Prompt")
        base_prompt = gr.TextArea(BASE_PROMPT, label="Base Prompt")

    chat_msg = chat_input.submit(add_message, [chat_input, chatbot], [chat_input, chatbot], queue=True)
    bot_msg = chat_msg.then(bot, [chatbot, temperature, system_prompt, base_prompt], chatbot)
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


EMBEDDING...


100%|██████████| 15/15 [00:00<00:00, 40.60it/s]


Index not found, generating it...
Total sentences indexed: 546
Saving index...
EMBEDDING...


100%|██████████| 15/15 [00:00<00:00, 94.87it/s] 

Index FOUND, loading it...





## Evaluation

For the evaluation we'll be using 3 different datasets of 20 questions each

- **"Attention is all You Need"** - AI (15 pages)
- **"SOBERANA-02"** - Medicine (13 pages)
- **"Climate Change 2023 Synthesis Report"** - Climate Science (81 pages)

**Griskard** will generate answers for each question using our defined RAG pipeline and then will use an **LLM** to judge the answers with the reference_answers, and compute the **accuracy** score

- [x] generate datasets
- [ ] run tests over each one and save to json
- [ ] get the metrics

In [14]:
import pandas as pd
import warnings
import giskard
import os

os.environ["OPENAI_API_KEY"] = "sk-…"
giskard.llm.set_llm_api("openai")

pd.set_option("display.max_colwidth", 400)
warnings.filterwarnings("ignore")

first we define our "Judge LLM" and "embedding model"

In this case we're using the same **llama3.2-3B** model, and embedding model

In [15]:
from typing import Sequence, Optional
from giskard.llm.client.base import LLMClient, ChatMessage

class LLMClient(LLMClient):
    def __init__(self, model: LLM = None):
        self.client = model or LLM(n_ctx=4096)

    def complete(
        self,
        messages: Sequence[ChatMessage],
        temperature: float = 1,
        max_tokens: Optional[int] = None,
        caller_id: Optional[str] = None,
        seed: Optional[int] = None,
        format: Optional[str] =None,
    ) -> ChatMessage:
        prompt = []
        for msg in messages:
            role = "assistant" if msg.role.lower() == "assistant" else "user"
            prompt.append({"role": role, "content": msg.content})
        prompt.append({"role":"assistant", "content":""})

        response = self.client.chat_completion(
            messages=prompt,
            temperature=temperature,
            max_tokens=max_tokens or 1000,
            top_p=0.9,
            seed=seed,
            response_format=format,
            stream=False
        )["choices"][0]["message"]['content']

        return ChatMessage(role="assistant", content=response)

In [16]:
from giskard.llm.client import set_default_client

llm = LLM(n_ctx=4096)
llm_client = LLMClient(llm)

set_default_client(llm_client)

load embedding_model + custom_embedding client

In [17]:
from giskard.llm.embeddings.base import BaseEmbedding
import numpy as np
from typing import Sequence

class EmbeddingClient(BaseEmbedding):
    def __init__(self, model: EmbeddingModel = None, **kwargs):
        super().__init__(**kwargs)
        self._model = model or EmbeddingModel()

    def embed(self, texts: Sequence[str]) -> np.ndarray:
        try:
            embeddings = self._model.embed(texts).cpu().numpy()
            return embeddings
        except Exception as err:
            print (err)

In [18]:
from giskard.llm.embeddings import set_default_embedding

embedding_model = EmbeddingModel()
custom_embedding_model = EmbeddingClient(embedding_model)

set_default_embedding(custom_embedding_model)



In [71]:
# fn that calls the rag pipeline (used in evaluate fn)
def answer_fn(question, history):
    assert isinstance(question, str), 'question is not of type str'

    # TODO: check this
    if history is not None:
        history = [str(message) for message in history]

    answer = answer_with_rag(
        query=question,
        history=history,
        chunks=chunks,
        index=index,
        llm=llm,
        embedding_model=embedding_model,
    )

    return "".join(list(answer))

### Dataset Generation (**RAGET**)

In [None]:
import pymupdf
import pandas as pd
from giskard.rag import KnowledgeBase, generate_testset

# load chunks
splitter = RecursiveTextSplitter(max_length=1000, overlap=20)
chunks = split_documents(
    documents=[pymupdf.open("../docs/ipcc_report.pdf")],
    splitter=splitter,
)

# create the knowledge base
knowledge_base_df = pd.DataFrame(chunks, columns=["text"])
knowledge_base = KnowledgeBase(knowledge_base_df,
                               llm_client=llm_client,
                               min_topic_size=4,
                               embedding_model=custom_embedding_model)

100%|██████████| 81/81 [00:01<00:00, 42.84it/s]


In [None]:
testset = generate_testset(knowledge_base,
                           num_questions=4,
                           language="en")

testset.save("ipcc_testset.jsonl")

Generating questions:   0%|          | 0/4 [00:00<?, ?it/s]

In [107]:
testset.to_pandas().head(3)

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
f371ba22-9338-4bcc-a1e1-0ecc32088764,What is the output dimension of each sub-layer in the Transformer model?,512,"Document 10: Figure 1: The Transformer - model architecture. The Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively. 3.1\nEncoder and Decoder Stacks\nEncoder:\nThe encoder is composed of a stack of N = 6 identical layers. Each layer has...",[],"{'question_type': 'simple', 'seed_document_id': 10, 'topic': 'Others'}"
5be9f7fa-d5c3-4db6-8b5b-e2381a7ffbca,Who are the authors of the End-to-end memory networks paper?,"Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus","Document 47: 15(1):1929–1958, 2014. [34] Sainbayar Sukhbaatar, Arthur Szlam, Jason Weston, and Rob Fergus. End-to-end memory networks. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett, editors,\nAdvances in Neural Information Processing Systems 28, pages 2440–2448. Curran Associates,\nInc., 2015. [35] Ilya Sutskever, Oriol Vinyals, and Quoc VV Le. Sequence to sequence learn...",[],"{'question_type': 'simple', 'seed_document_id': 47, 'topic': 'Others'}"
a9c574f8-e51b-4d06-bfd3-95a4c0487e93,"What is the maximum output length used in the Transformer model when the input length is increased by 300, as in the experiment reported in Table 4?",increased the maximum output length to input length + 300,"Document 37: Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23\nof WSJ)\nParser\nTraining\nWSJ 23 F1\nVinyals & Kaiser el al. (2014) [37]\nWSJ only, discriminative\n88.3\nPetrov et al. (2006) [29]\nWSJ only, discriminative\n90.4\nZhu et al. (2013) [40]\nWSJ only, discriminative\n90.4\nDyer et al. (2016) [8]\nWSJ only, discriminative\n91.7\nTra...",[],"{'question_type': 'complex', 'seed_document_id': 37, 'topic': 'Others'}"


### Evaluation over **IPCC** Testset

In [None]:
!git clone https://github.com/Giskard-AI/raget_demo.git

In [74]:
import pymupdf

# load the document and split it
splitter = RecursiveTextSplitter(max_length=1000, overlap=20)
chunks = split_documents(
    documents=[pymupdf.open("../docs/ipcc_report.pdf")],
    splitter=splitter,
)

# load or create the index (involves embeddings creation)
index = load_index(f"../indexes/ipcc-eval-{len(chunks)}.index",
                   sentences=chunks['text'].tolist(), 
                   embedding_model=embedding_model)

100%|██████████| 81/81 [00:01<00:00, 45.44it/s]

Index FOUND, loading it...





In [75]:
from giskard.rag import KnowledgeBase, QATestset

# create knowledge base
knowledge_base_df = pd.DataFrame(chunks, columns=["text"])
knowledge_base = KnowledgeBase(knowledge_base_df, 
                               llm_client=llm_client, 
                               min_topic_size=8,
                               embedding_model=custom_embedding_model)

# load the test set
testset = QATestset.load("../evaluation/testset/ipcc_testset.jsonl")
testset.to_pandas().head(3)

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3a651c11-f02a-4063-be95-a397100320f1,What is the scope of the GHG emissions covered by this category?,All GHG emissions,"Document 37: use, land use change, forestry (CO2LULUCF)\nOther GHG emissions\nFossil fuel and industry (CO2FFI)\nAll GHG emissions\nGHG emissions per year (GtCO2-eq/yr)",[],"{'question_type': 'simple', 'seed_document_id': 37, 'topic': 'Greenhouse Gas Emissions'}"
fd768253-2908-4067-9dec-30b919551b27,What type of renewable energy has a capacity to generate approximately twice as much electricity as solar photovoltaic (PV) systems for the same amount of installed capacity?,Wind,"Document 102: vehicles for battery-electric vehicles. A vertical dashed line is placed in 2010 to indicate the change over the past decade. The electricity production share reﬂects \ndifferent capacity factors; for example, for the same amount of installed capacity, wind produces about twice as much electricity as solar PV. Renewable energy and battery \ntechnologies were selected as illustrat...",[],"{'question_type': 'complex', 'seed_document_id': 102, 'topic': 'Renewable Energy Costs'}"
08d9e898-5a1c-43a7-b559-98c4da8fe151,"What specific strategies can be employed by the private sector to increase its commitment to addressing climate change, particularly in terms of building business cases, accountability mechanisms, and monitoring progress?","Accelerating commitment and follow-through by the private sector is promoted for instance by building business cases for adaptation, accountability and transparency mechanisms, and monitoring and evaluation of adaptation progress.","Document 437: due to climate change is possible through cooperative, international efforts to enhance institutional adaptive capacity and sustainable development (high conﬁdence). Increasing adaptive capacity minimises \nrisk associated with involuntary migration and immobility and improves \nthe degree of choice under which migration decisions are made, while \npolicy interventions can remove...",[],"{'question_type': 'distracting element', 'seed_document_id': 437, 'distracting_context': 'pathways] C1 [97] C1a [50] C1b [47] C2 [133] C3 [311] C3a [204] C3b [97] C4 [159] C5 C6 [97] Table 3.1: Key characteristics of the modelled global emissions pathways.', 'topic': 'Sustainable Development and Climate Action'}"


#### ChatPDF

In [33]:
import pandas as pd
from giskard.rag import AgentAnswer

responses = pd.read_json("../evaluation/chatpdf-results/ipcc_testset_results.json")
answers = responses['answer'].tolist()
contexts = responses['context'].tolist()

answers = [AgentAnswer(message=answer, documents=context) for answer, context in zip(answers, contexts)]

In [36]:
from giskard.rag import evaluate

report = evaluate(
    answers,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [37]:
from datetime import date

report.to_html(f"../evaluation/reports/ipcc-chatpdf-report-({str(date.today())}).html", embed=True)

2024-11-16 18:10:19,443 pid:16623 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-16 18:13:12,996 pid:16623 MainThread giskard.rag  INFO     Found 17 topics in the knowledge base.


#### Humata

In [64]:
import pandas as pd
from giskard.rag import AgentAnswer

responses = pd.read_json("../evaluation/humata-results/ipcc_testset_results.json")
answers = responses['answer'].tolist()
contexts = responses['context'].tolist()

answers = [AgentAnswer(message=answer, documents=context) for answer, context in zip(answers, contexts)]

In [65]:
from giskard.rag import evaluate

report = evaluate(
    answers,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [66]:
from datetime import datetime
import hashlib

# Obtener hora actual
fecha_hora = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Crear un hash basado en la fecha y hora
hash_object = hashlib.sha256(fecha_hora.encode())
hash_hex = hash_object.hexdigest()[:8]  # Tomar solo los primeros 8 caracteres del hash

file_path = f"../evaluation/reports/ipcc-humata-report-({str(date.today())})-{hash_hex}.html"

report.to_html(file_path, embed=True)

2024-11-17 17:48:00,853 pid:41126 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-17 17:51:44,765 pid:41126 MainThread giskard.rag  INFO     Found 17 topics in the knowledge base.


#### My RAG

In [76]:
from giskard.rag import evaluate

report = evaluate(
    answer_fn,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

Asking questions to the agent:   0%|          | 0/20 [00:00<?, ?it/s]

CUDA error: out of memory
  current device: 0, in function alloc at /home/runner/work/llama-cpp-python/llama-cpp-python/vendor/llama.cpp/ggml/src/ggml-cuda.cu:394
  cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1)
/home/runner/work/llama-cpp-python/llama-cpp-python/vendor/llama.cpp/ggml/src/ggml-cuda.cu:102: CUDA error
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


: 

In [39]:
from datetime import date

report.to_html(f"../evaluation/reports/ipcc-report-({str(date.today())}).html", embed=True)

In [24]:
display(report)

2024-11-03 19:27:37,570 pid:29012 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-03 19:32:26,846 pid:29012 MainThread giskard.rag  INFO     Found 17 topics in the knowledge base.


### Evaluation over **Soberana 02** Testset 

In [50]:
import pymupdf

# load the document and split it
splitter = RecursiveTextSplitter(max_length=1000, overlap=20)
chunks = split_documents(
    documents=[pymupdf.open("../docs/soberana-02.pdf")],
    splitter=splitter,
)

# load or create the index (involves embeddings creation)
index = load_index(f"../indexes/soberana-eval-{len(chunks)}.index",
                   sentences=chunks['text'].tolist(), 
                   embedding_model=embedding_model)

100%|██████████| 13/13 [00:00<00:00, 54.84it/s]

Index FOUND, loading it...





In [51]:
from giskard.rag import KnowledgeBase, QATestset, evaluate

# create knowledge base
knowledge_base_df = pd.DataFrame(chunks, columns=["text"])
knowledge_base = KnowledgeBase(knowledge_base_df, 
                               llm_client=llm_client, 
                               embedding_model=custom_embedding_model,
                               min_topic_size=4)

# load the test set
testset = QATestset.load("../evaluation/testset/soberana_testset.jsonl")
testset.to_pandas().head(3)

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
06824818-c2fa-4328-bc76-100d558c14e6,What was the vaccine efficacy against symptomatic disease in the heterologous combination?,92.0% (95%CI 80.4–96.7),"Document 6: Findings We included 44,031 participants (52.0% female, 48.0% male; median age 50 years, range 19–80 years; 7.0%\nblack, 24.0% mixed-race, 59.0% white) in a context of initial Beta VOC predominance, with this variant being partially\nreplaced by Delta near the trial’s end. Vaccine efﬁcacy in the heterologous combination was 92.0% (95%CI 80.4–96.7)\nagainst symptomatic disease. Ther...",[],"{'question_type': 'simple', 'seed_document_id': 6, 'topic': 'Covid-19 Vaccine'}"
c80fc415-e111-4e80-9f78-67800acde3ad,What percentage of participants had serologic evidence of a previous SARS-CoV-2 infection at baseline?,0.3% of participants,"Document 38: 2021, 45.184 volunteers were screened and 44,031 were randomly assigned to So2 (14.679), So2P (14,677) or placebo (14,675) (Fig. 1). Participants’ characteristics were balanced between the\nstudy arms (Table 1) and the gender and racial distri-\nbution were representative of the Cuban demographic\nstructure (see also S4-Table S3 for characteristics by age\nand risk of severe COVID...",[],"{'question_type': 'simple', 'seed_document_id': 38, 'topic': 'Covid-19 Vaccine'}"
55933395-bcf9-41a3-bf0e-f4e76aea0f8f,"What is the efficacy of the vaccine in preventing severe COVID-19, considering only the cases where hospitalization or death occurred?",The vaccine efficacy for prevention of severe COVID-19 was 74.9% (95%CI: 33.7–90.5),"Document 49: for So2 and 2.2% for So2P) and headache (1.0% for placebo, 1.2% for So2 and 1.5% for So2P) (S9-Table S8). The occurrence of\nserious (<0.1%) and severe (0.1%) VAAE was equal be-\ntween groups (S8-Table S7, S10-Table S9). 3.9% of all\nthe AEs were grade 3 in the placebo group against 3.4%\nand 4.2% for groups So2 and So2P, respectively and\n7.9%, 6.7% and 6.7% were serious in these...",[],"{'question_type': 'complex', 'seed_document_id': 49, 'topic': 'Others'}"


#### ChatPDF

In [42]:
import pandas as pd
from giskard.rag import AgentAnswer

responses = pd.read_json("../evaluation/chatpdf-results/soberana_testset_results.json")
answers = responses['answer'].tolist()
contexts = responses['context'].tolist()

answers = [AgentAnswer(message=answer, documents=context) for answer, context in zip(answers, contexts)]

In [44]:
from giskard.rag import evaluate

report = evaluate(
    answers,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [45]:
from datetime import date
report.to_html(f"../evaluation/reports/soberana-chatpdf-report-({str(date.today())}).html", embed=True)

2024-11-16 19:30:14,905 pid:16623 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-16 19:31:10,527 pid:16623 MainThread giskard.rag  INFO     Found 6 topics in the knowledge base.


#### Humata

In [52]:
import pandas as pd
from giskard.rag import AgentAnswer

responses = pd.read_json("../evaluation/humata-results/soberana_testset_results.json")
answers = responses['answer'].tolist()
contexts = responses['context'].tolist()

answers = [AgentAnswer(message=answer, documents=context) for answer, context in zip(answers, contexts)]

In [53]:
from giskard.rag import evaluate

report = evaluate(
    answers,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [54]:
from datetime import datetime
import hashlib

# Obtener hora actual
fecha_hora = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Crear un hash basado en la fecha y hora
hash_object = hashlib.sha256(fecha_hora.encode())
hash_hex = hash_object.hexdigest()[:8]  # Tomar solo los primeros 8 caracteres del hash

file_path = f"../evaluation/reports/soberana-humata-report-({str(date.today())})-{hash_hex}.html"

report.to_html(file_path, embed=True)

2024-11-17 17:36:38,658 pid:41126 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-17 17:37:37,538 pid:41126 MainThread giskard.rag  INFO     Found 6 topics in the knowledge base.


#### My RAG

In [46]:
report = evaluate(
    answer_fn,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

Asking questions to the agent:   0%|          | 0/20 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [47]:
from datetime import date
report.to_html(f"../evaluation/reports/soberana-report-({str(date.today())}).html", embed=True)

In [28]:
display(report)

2024-11-03 19:41:46,712 pid:31607 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-03 19:42:54,701 pid:31607 MainThread giskard.rag  INFO     Found 6 topics in the knowledge base.


### Evaluation over **Attention-Is-All-You-Need** Testset

In [44]:
import pymupdf

# load the document and split it
splitter = RecursiveTextSplitter(max_length=1000, overlap=20)
chunks = split_documents(
    documents=[pymupdf.open("../docs/attention-is-all-you-need.pdf")],
    splitter=splitter,
)

# load or create the index (involves embeddings creation)
index = load_index(f"../indexes/att-eval-{len(chunks)}.index",
                   sentences=chunks['text'].tolist(), 
                   embedding_model=embedding_model)

100%|██████████| 15/15 [00:02<00:00,  5.98it/s]

Index FOUND, loading it...





In [45]:
from giskard.rag import KnowledgeBase, QATestset

# create knowledge base
knowledge_base_df = pd.DataFrame(chunks, columns=["text"])
knowledge_base = KnowledgeBase(knowledge_base_df, 
                               llm_client=llm_client,
                               min_topic_size=5,
                               embedding_model=custom_embedding_model)

# load the test set
testset = QATestset.load("../evaluation/testset/attention_testset.jsonl")
testset.to_pandas().head(3)

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9fcff25f-6c10-4f12-98da-38177d319501,Who are the authors of the paper titled 'Learning phrase representations using rnn encoder-decoder for statistical machine translation'?,"Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk, and Yoshua Bengio.","Document 41: [5] Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Fethi Bougares, Holger Schwenk,\nand Yoshua Bengio. Learning phrase representations using rnn encoder-decoder for statistical\nmachine translation. CoRR, abs/1406.1078, 2014. [6] Francois Chollet. Xception: Deep learning with depthwise separable convolutions. arXiv\npreprint arXiv:1610.02357, 2016. [7] Junyoung Chung, Çagla...",[],"{'question_type': 'simple', 'seed_document_id': 41, 'topic': 'Deep Learning'}"
2596ef34-a984-4dab-b890-e124a9bf0a1c,What type of operations are required for the Positional Encoding?,"The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed.","Document 21: Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations\nfor different layer types. n is the sequence length, d is the representation dimension, k is the kernel\nsize of convolutions and r the size of the neighborhood in restricted self-attention. Layer Type\nComplexity per Layer\nSequential\nMaximum Path Length\nOperations\nSelf-Attention\n...",[],"{'question_type': 'simple', 'seed_document_id': 21, 'topic': 'Self-Attention Mechanism'}"
bf220878-95d1-4e4b-b22a-dda37cf82427,"Can you provide a brief summary of the context in which the papers listed were published, specifically highlighting any relevant conferences or journal names?",The papers listed are related to Natural Language Processing (NLP) and Computational Linguistics.,"Document 45: [25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated\ncorpus of english: The penn treebank. Computational linguistics, 19(2):313–330, 1993. [26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In\nProceedings of the Human Language Technology Conference of the NAACL, Main Conference,\npages 152–15...",[],"{'question_type': 'complex', 'seed_document_id': 45, 'topic': 'Others'}"


#### ChatPDF

In [52]:
import pandas as pd
from giskard.rag import AgentAnswer

responses = pd.read_json("../evaluation/chatpdf-results/attention_testset_results.json")
answers = responses['answer'].tolist()
contexts = responses['context'].tolist()

answers = [AgentAnswer(message=answer, documents=context) for answer, context in zip(answers, contexts)]

In [54]:
from giskard.rag import evaluate

report = evaluate(
    answers,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
from datetime import date
report.to_html(f"../evaluation/reports/attention-chatpdf-report-({str(date.today())}).html", embed=True)

2024-11-16 20:22:25,945 pid:16623 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-16 20:22:43,803 pid:16623 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


#### Humata

In [46]:
import pandas as pd
from giskard.rag import AgentAnswer

responses = pd.read_json("../evaluation/humata-results/attention_testset_results.json")
answers = responses['answer'].tolist()
contexts = responses['context'].tolist()

answers = [AgentAnswer(message=answer, documents=context) for answer, context in zip(answers, contexts)]

In [47]:
from giskard.rag import evaluate

report = evaluate(
    answers,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
from datetime import datetime
import hashlib

# Obtener hora actual
fecha_hora = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Crear un hash basado en la fecha y hora
hash_object = hashlib.sha256(fecha_hora.encode())
hash_hex = hash_object.hexdigest()[:8]  # Tomar solo los primeros 8 caracteres del hash

file_path = f"../evaluation/reports/attention-humata-report-({str(date.today())})-{hash_hex}.html"


report.to_html(file_path, embed=True)

2024-11-17 17:06:54,776 pid:41126 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-17 17:07:19,865 pid:41126 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


#### My RAG

In [56]:
report = evaluate(
    answer_fn,
    testset=testset,
    knowledge_base=knowledge_base,
    llm_client=llm_client,
)

Asking questions to the agent:   0%|          | 0/20 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
from datetime import date
report.to_html(f"../evaluation/reports/attention-report-({str(date.today())}).html", embed=True)

In [26]:
display(report)

2024-11-03 19:56:03,370 pid:34526 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-11-03 19:56:35,368 pid:34526 MainThread giskard.rag  INFO     Found 3 topics in the knowledge base.


## Extensions



* **better pdf preprocessing**: May want to improve text extraction with something like Marker - https://github.com/VikParuchuri/marker
* **better pdf preprocessing**: Guide to more advanced PDF extraction - https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517
* **prompt engineering**: See the following prompt engineering resources for more prompting techniques - [Prompt Guide](https://www.promptingguide.ai/es), Brex's Prompt Engineering Guide
* **not in the text case**: What happens when a query comes through that there isn't any context in the textbook on?
* **better embedding model**: Try another embedding model (e.g. Mixed Bread AI large, `mixedbread-ai/mxbai-embed-large-v1`, see: https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
* **better llm model**: Try another LLM... (e.g. Mistral-Instruct)
* **add figures & images**: Our example only focuses on text from a PDF, however, we could extend it to include figures and images
* [x] **faster index db**: Vector database/index for larger setup (e.g. 100,000+ chunks)

* **Optimizations** for speed
    * See Hugging Face docs for recommended speed ups on GPU - https://huggingface.co/docs/transformers/perf_infer_gpu_one
    * Optimum NVIDIA - https://huggingface.co/blog/optimum-nvidia, GitHub: https://github.com/huggingface/optimum-nvidia
    * See NVIDIA TensorRT-LLM - https://github.com/NVIDIA/TensorRT-LLM
    * See GPT-Fast for PyTorch-based optimizations - https://github.com/pytorch-labs/gpt-fast
    * Flash attention 2 - https://github.com/Dao-AILab/flash-attention

* [x] **gui**: Turn the workflow into an app, see Gradio type chatbots for this - https://www.gradio.app/guides/creating-a-chatbot-fast, see local example: https://www.gradio.app/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face


 1. **Chunking Strategies**

- **Optimal Chunk Size**: Adjust the chunk size of documents to balance between retrieval accuracy and memory usage. Aim for chunk sizes between **100-600 tokens**, depending on your specific content and model context length. Smaller chunks can improve retrieval performance but may require more processing time[1][2].
- **Chunk Overlap**: Implement a slight overlap between chunks to ensure that important information is not lost during retrieval. A typical overlap might be around **20-50 tokens**[3].

 2. **Re-Ranking Techniques**

- **Implement Re-Ranking**: Use a re-ranking model to prioritize the most relevant document chunks after initial retrieval. This can significantly improve the quality of information fed into the LLM, ensuring that it works with the most pertinent data. You can use lightweight models that are suitable for your hardware, such as LoRA fine-tuned models[1][2].
- **Combine Multiple Retrieval Methods**: Consider integrating results from different retrieval methods (e.g., BM25 and semantic search) and then apply re-ranking to combine these results effectively[1].

 3. **Query Enhancement**

- **Refine Queries**: Modify and enhance user queries to better express intent, which can lead to improved retrieval results. Techniques like query expansion or reformulation can help in this regard[2][3].
- **Self-Reflection Mechanism**: Implement a self-reflection step where the system reassesses the relevance of retrieved chunks based on their ability to answer the query directly, potentially using Natural Language Inference (NLI) models[2].

 4. **Generator Optimization**

- **Prompt Compression**: Reduce noise in the input prompt by compressing irrelevant details from retrieved document chunks. Focus on emphasizing key paragraphs to fit within the model's context window effectively[2][3].
- **Fine-tuning Prompts**: Experiment with different prompt structures and formats to see which yields better responses from your LLaMA model[2].

 5. **Hardware Utilization**

- **Batch Processing**: Optimize how you process requests by using batch processing techniques where possible. This can help maximize GPU utilization on your RTX 2070.
- **Hybrid RAG Approach**: If feasible, consider a hybrid setup where you perform embedding and retrieval locally but offload LLM inference to a more powerful remote server or cloud service. This can alleviate some computational burdens while still allowing for effective RAG performance[4].

 6. **Monitoring and Evaluation**

- **Use Evaluation Metrics**: Regularly assess your RAG pipeline's performance using metrics such as context recall, answer relevancy, and faithfulness to identify areas needing improvement[1][5].
- **Iterative Improvements**: Based on evaluation results, iteratively refine your chunking strategies, re-ranking methods, and prompt designs to enhance overall performance.

By implementing these strategies tailored to your specific hardware capabilities, you can significantly improve the efficiency and effectiveness of your RAG pipeline on an RTX 2070.

Citations:
[1] https://developer.nvidia.com/blog/enhancing-rag-pipelines-with-re-ranking/
[2] https://zilliz.com/learn/how-to-enhance-the-performance-of-your-rag-pipeline
[3] https://www.datacamp.com/tutorial/how-to-improve-rag-performance-5-key-techniques-with-examples
[4] https://developer.nvidia.com/blog/optimize-ai-model-performance-and-maintain-data-privacy-with-hybrid-rag/
[5] https://playbooks.capdev.govtext.gov.sg/improving_rag/
[6] https://towardsdatascience.com/evaluating-rag-applications-with-ragas-81d67b0ee31a?gi=351ae2ced8db
[7] https://langfuse.com/guides/cookbook/evaluation_of_rag_with_ragas
[8] https://blog.relari.ai/a-practical-guide-to-rag-pipeline-evaluation-part-1-27a472b09893?gi=5f76cf419bfd

> **Note:** The process of augmenting or changing a prompt to an LLM is known as prompt engineering. And the best way to do it is an active area of research. For a comprehensive guide on different prompt engineering techniques ([promptingguide.ai](https://www.promptingguide.ai/)), [Brex's Prompt Engineering Guide](https://github.com/brexhq/prompt-engineering) and the paper [Prompt Design and Engineering: Introduction and Advanced Models](https://arxiv.org/abs/2401.14423).