In [None]:
# Add autoreload
%reload_ext autoreload
%autoreload 2

In [None]:
from uuid import UUID
import json
import pandas as pd
import pickle
from dataclasses import asdict
from pathlib import Path
import jsonlines
from elasticsearch import Elasticsearch

from redbox.models import Settings
from redbox.models.settings import ElasticLocalSettings
from redbox.storage.elasticsearch import hit_to_chunk
from redbox.models import Settings

from langchain_community.chat_models import ChatLiteLLM
from langchain.globals import set_verbose

from collections.abc import Callable

from elasticsearch import Elasticsearch
from langchain_community.chat_models import ChatLiteLLM
from langchain_core.retrievers import BaseRetriever
from langchain_core.runnables import ConfigurableField

from redbox.models import Settings
from redbox.models.file import UUID
from redbox.storage.elasticsearch import hit_to_chunk


# Turn off LangChain's global verbose setting for the rest of the notebook.
set_verbose(False)

from dotenv import find_dotenv, load_dotenv

# Load environment variables from a .env file located by find_dotenv().
_ = load_dotenv(find_dotenv())

# Do not truncate long text columns when displaying DataFrames.
pd.set_option("display.max_colwidth", None)

# Notebook settings, with the MinIO and Elasticsearch hosts pointed at localhost.
ENV = Settings(minio_host="localhost", elastic=ElasticLocalSettings(host="localhost"))

☝️⚠️ _expand cell to run imports_

# Experimentation notebook for trying to improve Redbox RAG chat  <a class="anchor" id="title"></a>

-----------

**Evaluate Redbox RAG chat on one stable, numbered version of these data**

-----------------------

## Table of Contents <a class="anchor" id="toc"></a>

1. Introduction

* [Overview](#overview)
* [Metrics](#metrics)
    - [Contextual Precision]()
    - [Contextual Recall]()
    - [Contextual Relevancy]()
    - [Faithfulness]()
    - [Answer Relevancy]()
    - [Hallucination]()
    
2. Setup

* [Set version of the evaluation dataset](#setversion)
* [Run Redbox locally](#run-redbox)
* [Load embeddings into the index](#load-embeddings)

3. Experiment

* [a. Get files that correspond to the version of evaluation dataset](#files)
* [b. Load Evaluation Dataset into test cases](#load-test-cases)
* [c. Generate `actual_output` using RAG and evaluation dataset](#evaluate)
    - [Retrieval Evaluation Metrics]()
    - [Generation Evaluation Metrics]()
* [d. Analyse evaluation results](#analysis)

------------

## 1. Introduction

### Overview <a class="anchor" id="overview"></a>

This notebook allows you to experiment with the retrieval and generation parts of Redbox RAG chat **WITHIN** the notebook, and get back evaluation metrics quickly. This allows you to test if the changes you make can improve the evaluation metrics, compared against the current/deployed RAG chat endpoint.

Redbox RAG chat is made up of many components that work together to give the final RAG pipeline. Each component can be optimised, to hopefully improve the overall performance of the RAG pipeline for Redbox tasks. In order to track if changes made are improving or degrading Redbox performance, we need to establish an evaluation framework. The overall RAG pipeline can be broken down into two main parts:

1. Retrieval - searching and returning the most relevant documents to answer a user question
2. Generation - the output of the LLM after considering the retrieved documents, any prompts provided and the user question

This notebook tests both the retrieval and generation sides of the RAG pipeline using specific metrics for each, using the `DeepEval` framework.


For consistency across the team, it is important to evaluate Redbox RAG chat on one stable, numbered version of these data.


[Back to top](#title)

### Metrics <a class="anchor" id="metrics"></a>

Retrieval metrics
- Contextual Precision
- Contextual Recall
- Contextual Relevancy

Generation metrics
- Faithfulness
- Answer Relevancy
- Hallucination


#### Contextual Precision

The contextual precision metric measures your RAG pipeline's retriever by evaluating whether nodes in your `retrieval_context` that are relevant to the given `input` are ranked higher than irrelevant ones.

#### Contextual Recall

The contextual recall metric measures the quality of your RAG pipeline's retriever by evaluating the extent to which the `retrieval_context` aligns with the `expected_output`.

#### Contextual Relevancy

The contextual relevancy metric measures the quality of your RAG pipeline's retriever by evaluating the overall relevance of the information presented in your `retrieval_context` for a given `input`.

#### Faithfulness

The faithfulness metric measures the quality of your RAG pipeline's generator by evaluating whether the `actual_output` factually aligns with the contents of your `retrieval_context`. `deepeval`'s faithfulness metric is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score.

##### Required Arguments
To use the `FaithfulnessMetric`, you need to provide the following arguments when creating an LLMTestCase:

- `input`
- `actual_output`
- `retrieval_context`

[Back to top](#title)

#### Answer Relevancy
The answer relevancy metric measures the quality of your RAG pipeline's generator by evaluating how relevant the actual_output of your LLM application is compared to the provided `input`. `deepeval`'s answer relevancy metric is a self-explaining LLM-Eval, meaning it outputs a reason for its metric score.

##### Required Arguments
To use the AnswerRelevancyMetric, you'll have to provide the following arguments when creating an LLMTestCase:

- `input`
- `actual_output`

[Back to top](#title)

#### Hallucination
The hallucination metric determines whether your LLM generates factually correct information by comparing the `actual_output` to the provided `context`.

##### Required Arguments
To use the HallucinationMetric, you'll have to provide the following arguments when creating an LLMTestCase:

- `input`
- `actual_output`
- `context`

[Back to top](#title)

-------------------

## 2. Setup

**Evaluate Redbox RAG chat on one stable, numbered version of these data**

**Set the version of the evaluation dataset you are using to evaluate Redbox in the cell below**   <a class="anchor" id="setversion"></a>

In [None]:
# Version of the evaluation dataset; used below to build the index name and the data folder paths.
DATA_VERSION = "0.2.0"

Embedding and retrieval are locked to a particular embedding model, which should be tied to a single index in the vector store. Here we default to the `EMBEDDING_MODEL` environment variable, which will match production if set via `.env.example`.

In [None]:
# Embedding model name taken from the notebook settings.
MODEL = ENV.embedding_model
# Index name combines the dataset version and the embedding model, lower-cased.
INDEX = f"{DATA_VERSION}-{MODEL}".lower()

Run the cell below to set up the required folder structure (it will not overwrite folders and files if they already exist)

In [None]:
# Two levels above the current working directory (presumably the repository root
# when the notebook is run from its own folder).
ROOT = Path.cwd().parents[1]
EVALUATION_DIR = ROOT / "notebooks/evaluation"

# Versioned data folders, all children of the version root.
V_ROOT = EVALUATION_DIR / f"data/{DATA_VERSION}"
V_RAW, V_SYNTHETIC, V_CHUNKS, V_RESULTS, V_EMBEDDINGS = (
    V_ROOT / _subfolder for _subfolder in ("raw", "synthetic", "chunks", "results", "embeddings")
)

# Create any folder that is missing; existing folders and their files are left untouched.
for _folder in (V_ROOT, V_RAW, V_SYNTHETIC, V_CHUNKS, V_RESULTS, V_EMBEDDINGS):
    _folder.mkdir(parents=True, exist_ok=True)

To save on API costs, we only need to generate a particular version of the evaluation dataset once. If you are using a previously generated evaluation dataset, **please download it from the shared team location (Google Drive).**

It's helpful for all calls to share a dummy user. Set that here.

In [None]:
# Fixed dummy user identifier shared by every call made from this notebook.
USER_UUID = UUID("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa")

Set up the clients to connect to the backend.

In [None]:
# Object-storage and Elasticsearch clients built from the notebook settings.
S3_CLIENT = ENV.s3_client()
ES_CLIENT = ENV.elasticsearch_client()

### Start Redbox locally <a id="run-redbox"></a>

Start the Docker runtime, likely with Docker Desktop. However, if you are using colima, run the following terminal command:

```bash
colima start --memory 8
``` 

#### First-time setup

First time users need to do the following

```bash
poetry install
```

Ensure your `.env` file has an OpenAI API key in, and has the following settings.

Note `EMBEDDING_MODEL` must also be set, and should match both production, and the embeddings shared with the dataset. If not, go back to dataset creation, and embed the documents using a model that matches production.

```bash
# === Object Storage ===

MINIO_HOST=minio
MINIO_PORT=9000
MINIO_ACCESS_KEY=minioadmin
MINIO_SECRET_KEY=minioadmin
AWS_ACCESS_KEY=minioadmin
AWS_SECRET_KEY=minioadmin

AWS_REGION=eu-west-2

# minio or s3
OBJECT_STORE=minio
BUCKET_NAME=redbox-storage-dev
```

#### Run Redbox locally

**Every time you start Redbox for evaluation (no Django frontend required), please run the following command**

```bash
make eval_backend
```

The above command will bring up everything you need for the backend (`core-api`, `worker`, `minio`, `elasticsearch` and `redis`), then create the MinIO bucket needed to store raw files

[Back to top](#title)

### Load embeddings into the index and get file UUIDs <a id="load-embeddings"></a>

In [None]:
def load_chunks_from_jsonl_to_index(file_path: Path, es_client: Elasticsearch, index: str) -> set:
    """Index every chunk in a JSON Lines file and collect the parent file UUIDs.

    Each line may hold either a JSON object or a JSON-encoded string that itself
    contains an object (the double-encoded form the previous implementation required).

    Args:
        file_path: Path to the JSON Lines file of chunks.
        es_client: Elasticsearch client used to index each chunk.
        index: Name of the index the chunks are written to.

    Returns:
        The set of distinct ``parent_file_uuid`` values seen across all chunks.

    Raises:
        KeyError: If a chunk has no ``uuid`` or ``parent_file_uuid`` key.
    """
    file_uuids = set()

    with jsonlines.open(file_path, mode="r") as reader:
        for chunk_raw in reader:
            # jsonlines has already decoded the line once. Decode again only when
            # the line held a JSON string rather than an object; calling
            # json.loads on an already-decoded dict would raise TypeError.
            chunk = json.loads(chunk_raw) if isinstance(chunk_raw, str) else chunk_raw
            es_client.index(
                index=index,
                id=chunk["uuid"],
                body=chunk,
            )

            file_uuids.add(chunk["parent_file_uuid"])

    return file_uuids

In [None]:
# Index the embeddings file for the current model and keep the UUIDs of the files the chunks belong to.
FILE_UUIDS = load_chunks_from_jsonl_to_index(file_path=V_EMBEDDINGS / f"{MODEL}.jsonl", es_client=ES_CLIENT, index=INDEX)

[Back to top](#title)

----------

## 3. Experiment

Use code below to experiment, in order to improve evaluation metrics or address a performance issue

For setting an initial baseline with the existing Redbox Core API endpoint, please start [HERE](#baseline)

If you've already uploaded documents you can skip to [the experimentation phase](#evaluate).

### 3a. Generate `actual_output` and `retrieval_context` using in-notebook `rag_chat()` function <a id="evaluate"></a>

In [None]:
# Load the synthetic evaluation dataset for this data version.
df = pd.read_csv(f"{V_SYNTHETIC}/ragas_synthetic_data.csv")
# Questions to send through the RAG chain, in dataset order.
inputs = df["input"].tolist()

##### Using a `rag_chat()` function

We can conceptualise RAG as having four mechanisms we might tune:

* Chunking
* Embedding
* Retriever
* Prompts

The below `rag_chat()` function replicates the internal logic of the RAG endpoint. By editing and using it here, you can quickly iterate and test the retriever and prompt mechanisms using your stable, versioned data, giving sharable, reproducible results.

As long as `rag_chat()` takes a question (and history) and produces an answer, it's a testable process that could be used in Redbox. Everything within the function is yours to play with -- prompts, retriever, everything.

**Set the experiment name that you want to associate with your experiment in the cell below** - this will save the output scores with this experiment in the file name

In [None]:
# Label for this experiment; used as a prefix in the names of the saved dataset and result files.
EXPERIMENT_NAME = "original_prompt"

In [None]:
from core_api.src import dependencies
from redbox.models import ChatRoute, Chunk, Settings
from redbox.models.chain import ChainInput

from typing import Annotated, Any
from fastapi import Depends
from tiktoken import Encoding
from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough
from langchain_core.vectorstores import VectorStoreRetriever
from operator import itemgetter
from langchain.schema import StrOutputParser

from core_api.src.format import format_documents
from core_api.src.runnables import make_chat_prompt_from_messages_runnable
from langchain_community.chat_models import ChatLiteLLM
from core_api.src.dependencies import get_parameterised_retriever, get_tokeniser
from core_api.src.retriever import ParameterisedElasticsearchRetriever

# Chat model passed to the retrieval chain in this notebook.
LLM = ChatLiteLLM(
    model="gpt-4o",
    streaming=True,
)

# Notebook-local system prompt for retrieval chat; edit here when experimenting with prompts.
RETRIEVAL_SYSTEM_PROMPT = (
    "Given the following conversation and extracted parts of a long document and a question, create a final answer. \n"
    "If you don't know the answer, just say that you don't know. Don't try to make up an answer. "
    "If a user asks for a particular format to be returned, such as bullet points, then please use that format. "
    "If a user asks for bullet points you MUST give bullet points. "
    "If the user asks for a specific number or range of bullet points you MUST give that number of bullet points. \n"
    "Use **bold** to highlight the most question relevant parts in your response. "
    "If dealing dealing with lots of data return it in markdown table format. "
)

# Notebook-local question template; expects `question` and `formatted_documents` placeholders to be filled.
RETRIEVAL_QUESTION_PROMPT = "{question} \n=========\n{formatted_documents}\n=========\nFINAL ANSWER: "

def get_parameterised_retriever(
    env: Annotated[Settings, Depends(ENV)], es: Annotated[Elasticsearch, Depends(dependencies.get_elasticsearch_client)]
) -> BaseRetriever:
    """Build a configurable Elasticsearch retriever over the notebook's index.

    The returned runnable takes a dict keyed to question, file_uuids and
    user_uuid, and returns a list of Chunks.

    Args:
        env: Settings supplying the retrieval sizes and the embedding model.
        es: Elasticsearch client the retriever queries.

    Returns:
        A ``ParameterisedElasticsearchRetriever`` bound to ``INDEX`` whose
        ``params`` dict is exposed as a configurable field with id ``"params"``.
    """
    # NOTE(review): the `Depends(...)` annotations are not resolved when this
    # function is called directly, as it is in this notebook.
    starting_params = dict(
        size=env.ai.rag_k,
        num_candidates=env.ai.rag_num_candidates,
        match_boost=1,
        knn_boost=1,
        similarity_threshold=0,
    )
    base_retriever = ParameterisedElasticsearchRetriever(
        es_client=es,
        index_name=INDEX,
        params=starting_params,
        embedding_model=dependencies.get_embedding_model(env),
    )
    # Expose `params` so it can be overridden per invocation through the runnable config.
    params_field = ConfigurableField(
        id="params", name="Retriever parameters", description="A dictionary of parameters to use for the retriever."
    )
    return base_retriever.configurable_fields(params=params_field)
      
              
def build_retrieval_chain(
    llm: Annotated[ChatLiteLLM, Depends(dependencies.get_llm)],
    retriever: Annotated[VectorStoreRetriever, Depends(dependencies.get_parameterised_retriever)],
    tokeniser: Annotated[Encoding, Depends(dependencies.get_tokeniser)],
    env: Annotated[Settings, Depends(dependencies.get_env)],
) -> Runnable:
    """Assemble the retrieval (RAG) chain used for in-notebook experiments.

    The chain adds ``documents`` (from ``retriever``) and ``formatted_documents``
    (from ``format_documents``) to its input, then produces a dict with the keys
    ``response``, ``source_documents`` and ``route_name``.

    The prompts come from the notebook-level ``RETRIEVAL_SYSTEM_PROMPT`` and
    ``RETRIEVAL_QUESTION_PROMPT`` constants rather than from ``env``, so that
    editing those constants actually changes the experiment being run.

    Args:
        llm: Chat model that generates the answer.
        retriever: Runnable that returns the documents for the input.
        tokeniser: Tokeniser handed to the prompt builder.
        env: Settings supplying the context window size and max output tokens.

    Returns:
        A runnable implementing the pipeline described above.
    """
    return (
        RunnablePassthrough.assign(documents=retriever)
        | RunnablePassthrough.assign(
            formatted_documents=(RunnablePassthrough() | itemgetter("documents") | format_documents)
        )
        | {
            "response": make_chat_prompt_from_messages_runnable(
                # Notebook-local prompts: previously these were read from
                # env.ai, which left the constants above unused.
                system_prompt=RETRIEVAL_SYSTEM_PROMPT,
                question_prompt=RETRIEVAL_QUESTION_PROMPT,
                # Input budget is the context window minus the tokens reserved for the answer.
                input_token_budget=env.ai.context_window_size - env.llm_max_tokens,
                tokeniser=tokeniser,
            )
            | llm
            | StrOutputParser(),
            "source_documents": itemgetter("documents"),
            "route_name": RunnableLambda(lambda _: ChatRoute.search.value),
        }
    )


def get_rag_results(question: str) -> dict:
    """Run one question through the in-notebook RAG chain.

    Args:
        question: The user question to answer.

    Returns:
        A dict with ``output_text`` (the chain's response) and
        ``source_documents``, a list of dicts holding ``page_content``,
        ``page_number`` and ``parent_file_uuid`` for each retrieved document.
    """
    # The retriever and chain are rebuilt on every call, so edits to either are picked up immediately.
    rag_chain = build_retrieval_chain(
        llm=LLM,
        retriever=get_parameterised_retriever(es=ES_CLIENT, env=ENV),
        tokeniser=get_tokeniser(),
        env=ENV,
    )

    chain_input = ChainInput(
        question=question,
        chat_history=[{"text": "", "role": "user"}],
        file_uuids=list(FILE_UUIDS),
        user_uuid=USER_UUID,
    )
    result = rag_chain.invoke(input=chain_input.model_dump())

    # Keep only the fields of each source document that the evaluation needs.
    trimmed_documents = [
        {
            "page_content": fields["page_content"],
            "page_number": fields["metadata"]["page_number"],
            "parent_file_uuid": fields["metadata"]["parent_file_uuid"],
        }
        for fields in map(dict, result["source_documents"])
    ]

    return {"output_text": result["response"], "source_documents": trimmed_documents}

In [None]:
%%capture

# Ask every question once, in dataset order.
rag_results = [get_rag_results(question=question) for question in inputs]

retrieval_context = [result["source_documents"] for result in rag_results]
actual_output = [result["output_text"] for result in rag_results]

# Copy of the dataset with the generated answers and their retrieved context attached.
df_function = df.assign(actual_output=actual_output, retrieval_context=retrieval_context)

#### Confirm `actual_output` and `retrieval_context` added to the dataframe

In [None]:
df_function.head()

#### Remove rows containing NaN to prevent Pydantic validation errors

In [None]:
# Drop every row that has a NaN in any column.
df_function_clean = df_function.dropna()
# Save the completed dataset under the experiment name so it can be loaded as test cases.
df_function_clean.to_csv(f"{V_SYNTHETIC}/{EXPERIMENT_NAME}_complete_ragas_synthetic_data.csv", index=False)

[Back to top](#title)

### 3b. Load evaluation dataset into test cases <a class="anchor" id="load-test-cases"></a>

Put the CSV file that you want to use for evaluation into the `/notebooks/evaluation/data/{DATA_VERSION}/synthetic/` directory

Import test cases from CSV

In [None]:
from deepeval.dataset import EvaluationDataset

# Build DeepEval test cases from the completed dataset saved for this experiment.
dataset = EvaluationDataset()
# NOTE(review): `retrieval_context` is produced in this notebook as a list of dicts
# and written out with DataFrame.to_csv; splitting that text on ";" may not recover
# one entry per retrieved chunk -- confirm the test cases look as intended.
dataset.add_test_cases_from_csv_file(
    file_path=f"{V_SYNTHETIC}/{EXPERIMENT_NAME}_complete_ragas_synthetic_data.csv",  # function
    input_col_name="input",
    actual_output_col_name="actual_output",
    expected_output_col_name="expected_output",
    context_col_name="context",
    context_col_delimiter=";",
    retrieval_context_col_name="retrieval_context",
    retrieval_context_col_delimiter=";",
)

[Back to top](#title)

### 3c. Evaluate RAG pipeline <a id="evaluate"></a>

DeepEval imports

In [None]:
from deepeval import evaluate
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)

Instantiate retrieval and generation evaluation metrics

In [None]:
# Instantiate retrieval metrics
# Options shared by all three retrieval metrics (0.5 is the default threshold).
retrieval_metric_options = {
    "threshold": 0.5,
    "model": "gpt-4o",
    "include_reason": True,
}

# Instantiate retrieval metrics
contextual_precision = ContextualPrecisionMetric(**retrieval_metric_options)
contextual_recall = ContextualRecallMetric(**retrieval_metric_options)
contextual_relevancy = ContextualRelevancyMetric(**retrieval_metric_options)

In [None]:
# Instantiate generation metrics
# Options shared by all three generation metrics (0.5 is the default threshold).
generation_metric_options = {
    "threshold": 0.5,
    "model": "gpt-4o",
    "include_reason": True,
}

# Instantiate generation metrics
answer_relevancy = AnswerRelevancyMetric(**generation_metric_options)
faithfulness = FaithfulnessMetric(**generation_metric_options)
hallucination = HallucinationMetric(**generation_metric_options)

#### View test cases

In [None]:
dataset.test_cases

#### Evaluation

In [None]:
# Score every test case with the three retrieval metrics followed by the three generation metrics.
eval_results = evaluate(
    test_cases=dataset,
    metrics=[
        contextual_precision,
        contextual_recall,
        contextual_relevancy,
        answer_relevancy,
        faithfulness,
        hallucination,
    ],
)

#### Save evaluation results

In [None]:
# Persist the raw evaluation results so the analysis can be re-run without re-scoring.
with open(f"{V_RESULTS}/{EXPERIMENT_NAME}_generation_eval_results", "wb") as f:
    pickle.dump(eval_results, f)

In [None]:
# Reload the saved results for this experiment.
# NOTE: unpickling can execute arbitrary code; only load result files produced by this notebook.
with open(f"{V_RESULTS}/{EXPERIMENT_NAME}_generation_eval_results", "rb") as f:
    eval_results = pickle.load(f)

### 3d. Analyse evaluation results <a id="analysis"></a>

In [None]:
# Lookup of each metric name to the part of the pipeline it measures.
metric_type = {
    "metric_name": [
        "Contextual Precision",
        "Contextual Recall",
        "Contextual Relevancy",
        "Answer Relevancy",
        "Faithfulness",
        "Hallucination",
    ],
    "metric_type": ["retrieval"] * 3 + ["generation"] * 3,
}

# One row per (test case, metric) pair.
per_metric = (
    pd.DataFrame.from_records(asdict(result) for result in eval_results)
    .explode("metrics_metadata")
    .reset_index(drop=True)
)

# Pull the name, score and reason out of each metric metadata object.
for column, attribute in (("metric_name", "metric"), ("score", "score"), ("reason", "reason")):
    per_metric[column] = per_metric["metrics_metadata"].apply(getattr, args=[attribute])

# Attach the metric type; metrics missing from the lookup are dropped by the merge.
evaluation = per_metric.merge(pd.DataFrame(metric_type), on="metric_name").drop(
    columns=["success", "metrics_metadata"]
)

evaluation.to_csv(f"{V_RESULTS}/{EXPERIMENT_NAME}_generation_eval_results.csv", index=False)
evaluation.head()

#### Evaluation results

In [None]:
# Mean score per metric. The `score` column is selected explicitly: the first
# positional argument of groupby().mean() is `numeric_only`, not a column name,
# so mean("score") averaged every numeric column instead.
evaluation.groupby(["metric_name", "metric_type"])[["score"]].mean()

#### Compare experiments against baseline or visualise baseline alone
Note: there are some complexities that could require additional analysis, such as the uncertainty associated with each individual LLM judge score and the wide range of scores (0 to 1 for some metrics; see boxplot below)

In [None]:
%config InlineBackend.figure_format = 'retina'
import scipy.stats as stats
import seaborn as sns

# Result tables to compare, each tagged with an `experiment_name` column.
experiments = []

# The baseline results are always included.
baseline = pd.read_csv(f"{V_RESULTS}/baseline.csv")
baseline["experiment_name"] = "baseline"
experiments.append(baseline)

# Comment out if you only want to view baseline statistics
# or populate with experiment names
experiment_names = ['original_prompt']
for experiment_name in experiment_names:
    # Each experiment is read from the per-metric CSV saved by the analysis step.
    experiment = pd.read_csv(f"{V_RESULTS}/{experiment_name}_generation_eval_results.csv")
    experiment['experiment_name'] = experiment_name
    experiments.append(experiment)

# Stack baseline and experiments into one long table.
experiments_df = pd.concat(experiments)

def empirical_ci(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise scores per experiment and metric with 95% confidence intervals.

    Args:
        df: Table with ``experiment_name``, ``metric_name`` and ``score`` columns.

    Returns:
        One row per (experiment_name, metric_name) with the ``mean``, ``sem``,
        ``min``, ``max`` and ``count`` of the scores, plus ``ci_low`` and
        ``ci_high``: the bounds of a 95% Student's t interval centred on the
        mean, scaled by the standard error, with ``count - 1`` degrees of freedom.
    """
    summary = (
        df.groupby(["experiment_name", "metric_name"])["score"]
        .agg(["mean", "sem", "min", "max", "count"])
        .reset_index()
    )

    lower, upper = stats.t.interval(
        confidence=0.95,
        df=summary["count"] - 1,
        loc=summary["mean"],
        scale=summary["sem"],
    )
    summary["ci_low"] = lower
    summary["ci_high"] = upper

    return summary

# Note that the confidence intervals in sns.barplot are calculated by bootstrapping.
# See empirical_ci() above for empirical confidence interval calculation.
sns.barplot(experiments_df, x="score", y="metric_name", hue="experiment_name", errorbar=("ci", 95))

experiment_metrics = empirical_ci(experiments_df)
# Name the summary after EXPERIMENT_NAME rather than the loop variable left over
# from loading the experiments: that variable is undefined when the loop is
# commented out, and otherwise holds whichever experiment happened to be last.
experiment_metrics.to_csv(f"{V_RESULTS}/{EXPERIMENT_NAME}_eval_results_full.csv")
experiment_metrics

#### Boxplot of results

In [None]:
# Distribution of scores per metric, split by experiment.
sns.boxplot(experiments_df, x="score", y="metric_name", hue="experiment_name")

**THIS IS THE END OF THE EXPERIMENT**

Return to [HERE](#evaluate) to try a new experiment

The section below is to establish a baseline with the current Core API endpoint only

[Back to top](#title)

-------