In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import pandas as pd
from getpass import getpass
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import Testset
from ragas.llms import LangchainLLMWrapper
from docu_bot.utils import create_chatopenai_model, create_openai_embeddings
from docu_bot.document_loaders.git_document_loader import GitDocumentLoader
from docu_bot.document_loaders.utils import LoadedRepositoriesAndFiles
from docu_bot.datasets.generate_synthetic_data_ragas import (
    generate_dataset,
    create_generator,
)
from docu_bot.evaluation.evaluate import Evaluator
from docu_bot.retrievals.document_retrival import DocumentRetrieval
from docu_bot.retrievals.empty_retrieval import EmptyRetrieval
from docu_bot.retrievals.context_query_alteration_retrieval import ContextQueryAlterationDocumentRetrieval
from docu_bot.retrievals.query_alteration_retrieval import QueryAlterationDocumentRetrieval
from docu_bot.retrievals.generative_retrieval import GenerativeDocumentRetrieval
from docu_bot.retrievals.rerank_retrieval import RerankDocumentRetrieval
from docu_bot.retrievals.ner_retireval import NerRetrieval
from docu_bot.retrievals.theme_retrieval import ThemeRetrieval

from docu_bot.stores.docstore import DocumentStore
from docu_bot.stores.utils import create_vector_store_from_document_loader, LoadedVectorStores

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_type = "gpt-4o-mini"
embedding_model_type="text-embedding-3-small"
api_key = getpass("Enter your OpenAI API key: ")

## Generate Sythetic Datase


In [6]:
llm_model = LangchainLLMWrapper(create_chatopenai_model(model_type=model_type, api_key=api_key))
embeddings_model = LangchainEmbeddingsWrapper(create_openai_embeddings(model_type=embedding_model_type, api_key=api_key))
generator = create_generator(llm_model, embeddings_model)
document_loader = GitDocumentLoader(
    repo_path="https://code.it4i.cz/sccs/docs.it4i.cz.git", branch="master", loaded_repositories_and_files=LoadedRepositoriesAndFiles()
)

In [None]:
synthetic_data_list = []
for i in range(10):
    print(f"Generating synthetic data {i}")
    synthetic_data = generate_dataset(generator, document_loader.load(), dataset_size=100)
    synthetic_data_list.append(synthetic_data.to_pandas())
    print(f"Sleeping for 10 seconds to prevent Token Limit Error" )
    time.sleep(10)
synthetic_data = pd.concat(synthetic_data_list)

In [9]:
synthetic_data

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What is IT4Innovations and how do I install it...,[# IT4Inoovations Documentation\n\nThis projec...,IT4Innovations is a project that contains user...,single_hop_specifc_query_synthesizer
1,Can you explain what EOSC is and how it relate...,[# What Is DICE Project?\n\nDICE (Data Infrast...,The EOSC (European Open Science Cloud) project...,single_hop_specifc_query_synthesizer
2,Can you explain the significance of IT4I in th...,[# Migration to e-INFRA CZ\n\n## Introduction\...,IT4Innovations is a crucial part of e-INFRA CZ...,single_hop_specifc_query_synthesizer
3,What is the default shell available on IT4Inno...,[# Environment and Modules\n\n## Shells on Clu...,The default shell available on IT4Innovations ...,single_hop_specifc_query_synthesizer
4,Why is CentOS recommended for new developers w...,[# Documentation\n\nWelcome to the IT4Innovati...,CentOS is recommended for new developers becau...,single_hop_specifc_query_synthesizer
...,...,...,...,...
45,What is IT4Innovashuns and what does it do?,[IT4Innovations national supercomputing center...,IT4Innovations is a national supercomputing ce...,single_hop_specifc_query_synthesizer
46,Wht is DGX-A100?,"[Savings GPU\n....\nSpetko, Vysocky, Jansik, R...",The DGX-A100 is referenced in the context of a...,single_hop_specifc_query_synthesizer
47,How can I contact support at it4i?,[# Satisfaction and Feedback\n\nIT4Innovations...,"For acute, pressing issues and immediate conta...",single_hop_specifc_query_synthesizer
48,Can you explain how to use HyperQueue on Karol...,[# HyperQueue\n\nHyperQueue lets you build a c...,"To use HyperQueue on Karolina, you first need ...",single_hop_specifc_query_synthesizer


In [None]:
synthetic_data.to_feather(
    os.path.join(os.path.abspath(''), "..", "datasets", "it4i_large_synthetic_data.feather")
)

### Use Sythetic Data to test the model

In [3]:
RESULT_PATH = os.path.join(os.path.abspath(''), "..", "results")
DATASET_NAME = "it4i_synthetic_data.feather"

In [9]:
model_type = "aya-expanse:latest"
embedding_model_type = "text-embedding-3-small"
api_key = getpass("Enter your Metacentrum API key: ")
evaluator_llm_model_type = "gpt-4o-mini"
evaluator_embeddings_model_type = "text-embedding-3-small"
open_ai_api_key = getpass("Enter your OpenAI API key: ")

In [6]:
synthetic_data = Testset.from_pandas(pd.read_feather(os.path.join(os.path.abspath(''), "..", "datasets", DATASET_NAME)))
document_loader = GitDocumentLoader(
    repo_path="https://code.it4i.cz/sccs/docs.it4i.cz.git", branch="master", loaded_repositories_and_files=LoadedRepositoriesAndFiles()
)
docstore =DocumentStore()
cached_vector_store = LoadedVectorStores(embedding_model=embedding_model_type, api_key=open_ai_api_key)
vector_store =create_vector_store_from_document_loader(
    document_loader, docstore, cached_vector_store, embedding_model= embedding_model_type, embedding_api_key=open_ai_api_key,
)

In [10]:
evaluator = Evaluator(
    evaluator_llm=create_chatopenai_model(model_type=evaluator_llm_model_type, api_key=open_ai_api_key),
    evaluator_embedding_model=create_openai_embeddings(model_type=evaluator_embeddings_model_type, api_key=open_ai_api_key),
)

In [None]:
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=create_chatopenai_model(model_type=model_type, api_key=api_key),
    document_retriever=DocumentRetrieval(
        vectorstore=vector_store, 
        docstore=docstore, 
        search_kwargs={"min_score": 0, "k": 5}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+document_retrieval.feather"))
eval_results_df.head(1)

100%|██████████| 25/25 [07:37<00:00, 18.30s/it]
Evaluating: 100%|██████████| 200/200 [01:51<00:00,  1.79it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,How do I build a docker image for dCache if I ...,"[dCache\n======\n\n<img src=""dCache.png"" heigh...",[Building dCache\n===============\n\nRequireme...,The provided context does not offer instructio...,Building a container image is disabled by defa...,0.21,0.590909,0.0,0.489097,0.0,0.0,0.0,0.0,21.705567


In [None]:
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=create_chatopenai_model(model_type=model_type, api_key=api_key),
    document_retriever=EmptyRetrieval(
        docstore=docstore,
        search_kwargs={}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+empty_retrieval.feather"))
eval_results_df.head(1)

100%|██████████| 25/25 [05:48<00:00, 13.93s/it]
Evaluating: 100%|██████████| 200/200 [01:16<00:00,  2.61it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,How do I build a docker image for dCache if I ...,[],[Building dCache\n===============\n\nRequireme...,Building a Docker image for dCache as a newcom...,Building a container image is disabled by defa...,0.0,1.0,0.0,0.434494,0.0,0.0,0.0,0.0,37.683092


In [None]:
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)

eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=GenerativeDocumentRetrieval(
        llm=llm,
        docstore=docstore,
        search_kwargs={"k": 1}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+generative_retrieval.feather"))
eval_results_df.head(1)

100%|██████████| 25/25 [32:30<00:00, 78.03s/it]
Evaluating: 100%|██████████| 200/200 [01:25<00:00,  2.33it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,How do I build a docker image for dCache if I ...,[## Building a Docker Image for dCache: A Begi...,[Building dCache\n===============\n\nRequireme...,To build a Docker image for dCache if you are ...,Building a container image is disabled by defa...,0.0,0.588235,0.0,0.422668,0.0,0.0,0.0,0.0,110.583865


In [None]:
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)

eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=QueryAlterationDocumentRetrieval(
        llm=llm,
        vectorstore=vector_store, 
        docstore=docstore, 
        search_kwargs={"min_score": 0, "k": 5, "num_custom_queires": 2}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+query_alt_retrieval.feather"))
eval_results_df.head(1)

100%|██████████| 25/25 [11:45<00:00, 28.21s/it]
Evaluating: 100%|██████████| 200/200 [01:44<00:00,  1.91it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,How do I build a docker image for dCache if I ...,[Chapter 2. Installing dCache\n===============...,[Building dCache\n===============\n\nRequireme...,"I'm sorry, but the provided text does not cont...",Building a container image is disabled by defa...,0.17,0.444444,0.0,0.464938,0.0,0.0,0.0,0.0,15.227065


In [None]:
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)

eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=ContextQueryAlterationDocumentRetrieval(
        vectorstore=vector_store, 
        docstore=docstore, 
        llm=llm,
        search_kwargs={"min_score": 0, "k": 5}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+context_query_alt_retrieval.feather"))
eval_results_df.head(1)

100%|██████████| 25/25 [31:58<00:00, 76.73s/it]
Evaluating: 100%|██████████| 200/200 [02:30<00:00,  1.33it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,How do I build a docker image for dCache if I ...,"[dCache\n======\n\n<img src=""dCache.png"" heigh...",[Building dCache\n===============\n\nRequireme...,"Based on the provided context, there is no dir...",Building a container image is disabled by defa...,0.07,0.090909,0.0,0.435423,0.0,0.0,0.0,0.0,105.922973


In [None]:
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)

eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=RerankDocumentRetrieval(
        vectorstore=vector_store, 
        docstore=docstore, 
        llm=llm,
        search_kwargs={"min_score": 0, "k": 5}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+rerank_retrieval.feather"))
eval_results_df.head(1)

100%|██████████| 25/25 [21:52<00:00, 52.48s/it]
Evaluating: 100%|██████████| 200/200 [01:41<00:00,  1.97it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,How do I build a docker image for dCache if I ...,"[dCache\n======\n\n<img src=""dCache.png"" heigh...",[Building dCache\n===============\n\nRequireme...,"Based on the provided context, there is no spe...",Building a container image is disabled by defa...,0.2,0.35,0.0,0.471682,0.0,0.0,0.0,0.0,69.632867


In [None]:
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)

eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=NerRetrieval(
        vectorstore=vector_store, 
        docstore=docstore, 
        llm=llm,
        search_kwargs={"min_score": 0, "k": 5}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+ner_retrieval.feather"))
eval_results_df.head(1)

In [None]:
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)

eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=ThemeRetrieval(
        vectorstore=vector_store, 
        docstore=docstore, 
        llm=llm,
        search_kwargs={"min_score": 0, "k": 5}
    )
)
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
eval_results_df.to_feather(os.path.join(RESULT_PATH, f"{DATASET_NAME.split('.')[0]}+{model_type.replace(':', '_')}+theme_retrieval.feather"))
eval_results_df.head(1)