In [None]:
from tqdm.auto import tqdm
import pandas as pd
import typing as t
import json
import jsonlines
import pickle

from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

from redbox.model_db import SentenceTransformerDB
from redbox.models import Settings
from redbox.parsing import chunk_file
from redbox.models.file import File, Chunk

from elasticsearch import Elasticsearch

from mypy_boto3_s3.client import S3Client

from dotenv import find_dotenv, load_dotenv
# Load environment variables from the nearest .env file before Settings is built
_ = load_dotenv(find_dotenv())

# Do not truncate long text columns when displaying DataFrames
pd.set_option("display.max_colwidth", None)

# Project settings, overridden so that MinIO and Elasticsearch are reached on localhost
ENV = Settings()
ENV.minio_host = "localhost"
ENV.elastic.host = "localhost"

# Shared clients reused by later cells (S3_CLIENT is the default client of embed_file)
S3_CLIENT = ENV.s3_client()
ES_CLIENT = ENV.elasticsearch_client()

# Start the backing services from the repository root.
# NOTE(review): the recorded output of this cell shows make reporting that there is no
# rule for the `eval_backend` target - confirm the target name against the Makefile.
!cd ../.. && make eval_backend

INFO:root:Connecting to self managed Elasticsearch
INFO:root:Elasticsearch host = localhost


make: *** No rule to make target `eval_backend'.  Stop.


# Create evaluation dataset for Redbox RAG chat  <a class="anchor" id="title"></a>

------

**Evaluate Redbox RAG chat on one stable, numbered version of these data**

----------------

**Before running this notebook**

Set the version of the evaluation dataset you are creating **[HERE](#setversion)**

## Table of Contents <a class="anchor" id="toc"></a>
* [Overview](#overview)
* [Set version of the evaluation dataset](#setversion)
* [Select files for creating evaluation dataset](#files)
* [Imports](#imports)
* [Generate evaluation dataset](#ragas)
* [Save evaluation dataset](#save)
* [Pre-embed the documents](#embed)
* [Troubleshooting](#troubleshooting)

--------

## Overview <a class="anchor" id="overview"></a>

It is really important to version the evaluations we are doing, including the input data used to generate evaluation datasets.

This notebook uses the files you select in combination with the RAGAS framework to generate synthetic data. Two different LLMs are used, one for the 'generator' and one for the 'critic'.

Please be aware that generating synthetic data will incur LLM API costs.

The purpose of this notebook is to **create a filesystem with a versioned dataset**. This means:

* Raw files, like documents and PDFs
* An evaluation dataset, containing questions and answers
* Embedded chunks for those raw files

There is a troubleshooting section at the end of this notebook [Troubleshooting](#troubleshooting)

[Back to top](#title)

-----------

**Evaluate Redbox RAG chat on one stable, numbered version of these data**

**Set the version of the evaluation dataset you will be creating in this notebook in the cell below**  <a class="anchor" id="setversion"></a>

In [1]:
# Version label of the evaluation dataset being created; it names the data/{DATA_VERSION}
# folder and is passed to embed_file as the key prefix for uploaded files.
DATA_VERSION = "0.1.0"

In [None]:






# Embedding model taken from the project settings
MODEL = ENV.embedding_model
# Lower-cased name combining the dataset version and the embedding model name.
# NOTE(review): INDEX is not referenced by any other visible cell of this notebook -
# confirm it is still needed here.
INDEX = f"{DATA_VERSION}-{MODEL.embedding_model_name}".lower()

Run the cell below to set up the required folder structure (it will not overwrite folders and files if they already exist)

In [2]:
from pathlib import Path

# Two levels above the current working directory
# (assumes the notebook is run from notebooks/evaluation - TODO confirm)
ROOT = Path.cwd().parents[1]
EVALUATION_DIR = ROOT / "notebooks" / "evaluation"

# Everything belonging to one dataset version lives under data/<DATA_VERSION>/
V_ROOT = EVALUATION_DIR / "data" / DATA_VERSION
V_RAW = V_ROOT / "raw"
V_SYNTHETIC = V_ROOT / "synthetic"
V_CHUNKS = V_ROOT / "chunks"
V_RESULTS = V_ROOT / "results"
V_EMBEDDINGS = V_ROOT / "embeddings"

# exist_ok=True keeps this cell safe to re-run: existing folders are left untouched
for version_dir in (V_ROOT, V_RAW, V_SYNTHETIC, V_CHUNKS, V_RESULTS, V_EMBEDDINGS):
    version_dir.mkdir(parents=True, exist_ok=True)

It's helpful for all calls to share a dummy user. Set that here.

In [3]:
from uuid import UUID

# Fixed dummy user shared by all calls; embed_file uses it as the default creator
# of the File records it builds.
USER_UUID = UUID("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa")

[Back to top](#title)

---------

#### Select files that you will use to generate versioned evaluation dataset   <a class="anchor" id="files"></a>

Now copy all the files you want to use to generate **THIS VERSION** of the evaluation dataset into `notebooks/evaluation/data/{DATA_VERSION}/raw/`

Also upload these files to the shared Google Drive, in the location corresponding to this version number

--------------

#### Imports <a id="imports"></a>

In [4]:
from tqdm.auto import tqdm
import pandas as pd
import typing as t
import json
import jsonlines
import pickle

# Do not truncate long text columns when displaying DataFrames
pd.set_option("display.max_colwidth", None)

In [5]:
from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

In [6]:
from redbox.model_db import SentenceTransformerDB
from redbox.models import Settings
from redbox.parsing import chunk_file
from redbox.models.file import File, Chunk

from elasticsearch import Elasticsearch

from mypy_boto3_s3.client import S3Client



Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

INFO:root:Connecting to self managed Elasticsearch
INFO:root:Elasticsearch host = localhost


[Back to top](#title)

--------

## Synthetically generate evaluation dataset <a class="anchor" id="ragas"></a>

Generating a synthetic test set with RAGAS is detailed [HERE](https://docs.ragas.io/en/stable/getstarted/testset_generation.html). It is perhaps not as SOTA as DeepEval (validate!), but it creates `input` AND `expected_output` for us.

We are therefore not generating input questions based on our own chunking strategy; however, we are using the same files.

In [None]:
# Takes about 4 minutes for 4 docs. Consider Langchain `unstructured`
# Load every file found under the raw folder of this dataset version as Langchain documents
loader = DirectoryLoader(V_RAW)
documents = loader.load()

#### Save Langchain documents for future use

In [None]:
def save_docs_to_jsonl(documents: t.Iterable[Document], file_path: str) -> None:
    """Write Langchain documents to a JSON Lines file, one document per line.

    Args:
        documents: Documents to serialise; each is converted with ``.dict()``.
        file_path: Destination file, opened in write mode (existing content is replaced).
    """
    with jsonlines.open(file_path, mode="w") as writer:
        writer.write_all(doc.dict() for doc in documents)


def load_docs_from_jsonl(file_path) -> t.Iterable[Document]:
    """Read a JSON Lines file back into a list of Langchain documents.

    Args:
        file_path: File to read; every line is expected to hold the keyword
            arguments of one ``Document``.

    Returns:
        A list with one ``Document`` per line, in file order.
    """
    with jsonlines.open(file_path, mode="r") as reader:
        return [Document(**record) for record in reader]

In [None]:
# Persist the loaded documents so they can be restored with load_docs_from_jsonl
save_docs_to_jsonl(documents, V_CHUNKS / "documents.jsonl")

-----------

In [None]:
# RAGAS generator with openai models
# RAGAS generator with openai models
# Two different LLMs are used: one as the 'generator' and one as the 'critic'.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo") # to match core-api
critic_llm = ChatOpenAI(model="gpt-4o") # cheaper model with similar performance
# Default OpenAI embeddings, passed to the testset generator below
embeddings = OpenAIEmbeddings()

# Positional order matters here: generator LLM, then critic LLM, then embeddings
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

In [None]:
# generate testset
# Generate 10 test items from the loaded documents; `distributions` gives the
# share of each RAGAS evolution type in the generated set.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=10,
    distributions={
        simple: 0.4,
        reasoning: 0.3,
        multi_context: 0.3,
    },
)

#### Save RAGAS generated testset <a class="anchor" id="save"></a>

As pickle

In [None]:
# Keep the RAGAS testset object itself, alongside the CSV export written later
with open(f'{V_SYNTHETIC}/ragas_testset.pkl', 'wb') as pickle_file:
    pickle.dump(testset, pickle_file)

Convert dataframe into a DeepEval compatible CSV & save

In [None]:
testset_df = testset.to_pandas()

# Map the RAGAS column names onto the names DeepEval expects
new_column_names = {
    'question': 'input',
    'contexts': 'context',
    'ground_truth': 'expected_output',
    # Add more column names here
}

# DeepEval raises a Pydantic validation error on NaN or non-string values, so:
#   1. drop the RAGAS bookkeeping columns first, so NaN in them does not discard rows,
#   2. drop rows with NaN BEFORE casting - after astype(str) a NaN is the literal
#      string "nan" and dropna() would no longer remove anything,
#   3. cast every remaining column to str,
#   4. add the empty 'actual_output' column the DeepEval dataset format requires.
testset_df_renamed = (
    testset_df
    .rename(columns=new_column_names)
    .drop(['evolution_type', 'metadata', 'episode_done'], axis=1)
    .dropna()
    .astype(str)
    .assign(actual_output='')
)

# save as CSV
testset_df_renamed.to_csv(f'{V_SYNTHETIC}/ragas_synthetic_data.csv', index=False)

#### (Optional) View top 5 rows of synthetically generated data

In [None]:
# Preview the first rows of the DeepEval-formatted dataset
testset_df_renamed.head()

[Back to top](#title)

-----------------------

## Pre-embed the documents for other users <a class="anchor" id="embed"></a>

Embeddings take a while. Here we show how to compute and save them for other users.

For now we use the chunking strategy from `worker/`, and embed with any models we choose.

Ensure the necessary services are running with `make eval_backend`, but all we really need is MinIO.

In [8]:
def save_chunks_to_jsonl(chunks: t.Iterable[Chunk], file_path: Path) -> None:
    """Write chunks to a JSON Lines file, one chunk per line.

    Args:
        chunks: Chunks to serialise; each is converted with ``model_dump_json()``.
        file_path: Destination file, opened in write mode (existing content is replaced).

    NOTE(review): each record handed to the writer is the string returned by
    ``model_dump_json()``, not a dict - presumably readers have to decode every
    line a second time; confirm against the code that loads this file.
    """
    with jsonlines.open(file_path, mode="w") as writer:
        writer.write_all(chunk.model_dump_json() for chunk in chunks)

def embed_file(
    file_path: Path,
    data_version: str,
    bucket_name: str,
    model: SentenceTransformerDB,
    user_uuid: UUID = USER_UUID,
    s3_client: S3Client = S3_CLIENT
) -> list[Chunk]:
    """Upload one raw file, chunk it and attach an embedding to every chunk.

    Args:
        file_path: Local file to process.
        data_version: Dataset version, used as the prefix of the object key.
        bucket_name: Bucket the file is uploaded to.
        model: Embedding model whose ``embed_sentences`` is called on the chunk texts.
        user_uuid: User recorded as creator of the ``File``.
        s3_client: Client used for the upload and handed to ``chunk_file``.

    Returns:
        New ``Chunk`` objects copying the fields of the chunks returned by
        ``chunk_file``, each with its ``embedding`` set.

    Raises:
        ValueError: If the number of embeddings differs from the number of chunks
            (``zip(..., strict=True)``).
    """
    object_key = f"{data_version}/{file_path.name}"
    file = File(key=object_key, bucket=bucket_name, creator_user_uuid=user_uuid)

    # The file must be in the bucket before chunk_file is called with the same client
    with file_path.open("rb") as raw_file:
        s3_client.upload_fileobj(raw_file, bucket_name, object_key)

    chunks = chunk_file(file=file, s3_client=s3_client)

    # One embedding request for all chunk texts of this file
    chunk_texts = [chunk.text for chunk in chunks]
    embedding_response = model.embed_sentences(chunk_texts)
    vectors = [item.embedding for item in embedding_response.data]

    # Rebuild every chunk with its embedding attached
    return [
        Chunk(
            uuid=chunk.uuid,
            created_datetime=chunk.created_datetime,
            creator_user_uuid=chunk.creator_user_uuid,
            parent_file_uuid=chunk.parent_file_uuid,
            index=chunk.index,
            text=chunk.text,
            metadata=chunk.metadata,
            embedding=vector,
        )
        for chunk, vector in zip(chunks, vectors, strict=True)
    ]

In [None]:
# Build the embedding model once and reuse it for every file instead of
# re-instantiating it on each loop iteration.
# The original cell passed EMBEDDING_MODEL, a name that is never defined in this
# notebook (NameError on a fresh kernel). The model name is read from MODEL the
# same way the INDEX definition reads it.
# NOTE(review): confirm MODEL.embedding_model_name is the value SentenceTransformerDB expects.
embedding_model = SentenceTransformerDB(model_name=MODEL.embedding_model_name)

all_embedded_files = []

for file_path in V_RAW.glob("*.*"):
    file_chunks_embedded = embed_file(
        file_path=file_path,
        data_version=DATA_VERSION,
        bucket_name=ENV.bucket_name,
        model=embedding_model,
    )
    all_embedded_files += file_chunks_embedded

# NOTE(review): the embedded chunks are only held in memory here; save_chunks_to_jsonl
# is defined above but never called in this notebook - confirm where they should be saved.

[Back to top](#title)

-----------------------

## Troubleshooting <a class="anchor" id="troubleshooting"></a>

#### Langchain DirectoryLoader Error

If you run into a poppler path error even though poppler is installed and can be accessed from your virtual environment (check by running `pdfinfo -v`), close the notebook and restart the Jupyter server from a terminal where the path is correctly set (by running `code notebooks/evaluation/evaluation_dataset_generation.ipynb`)

#### RAGAS synthetically generated evaluation data

We have found that some rows of the evaluation data synthetically generated with the RAGAS framework include NaN and/or non-str values. This results in an error for DeepEval metrics, as these data fail Pydantic validation.

To avoid this, ensure you convert the RAGAS synthetically generated evaluation data to type str and remove any rows containing NaN

#### DeepEval framework

At the moment, this notebook only loads the evaluation dataset into DeepEval from a CSV. There is a JSON import option that we are not using.

[Back to top](#title)

-------