## Use Synthetic Data to test the model

In [11]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import os
import pandas as pd
from glob import glob
from getpass import getpass
from ragas.testset import Testset
from docu_bot.utils import create_chatopenai_model, create_openai_embeddings
from docu_bot.document_loaders.git_document_loader import GitDocumentLoader
from docu_bot.document_loaders.utils import LoadedRepositoriesAndFiles

from docu_bot.evaluation.evaluate import Evaluator
from docu_bot.retrievals.document_retrival import DocumentRetrieval
from docu_bot.retrievals.empty_retrieval import EmptyRetrieval
from docu_bot.retrievals.context_query_alteration_retrieval import ContextQueryAlterationDocumentRetrieval
from docu_bot.retrievals.query_alteration_retrieval import QueryAlterationDocumentRetrieval
from docu_bot.retrievals.generative_retrieval import GenerativeDocumentRetrieval
from docu_bot.retrievals.rerank_retrieval import RerankDocumentRetrieval
from docu_bot.retrievals.ner_retireval import NerRetrieval
from docu_bot.retrievals.theme_retrieval import ThemeRetrieval
from docu_bot.retrievals.keyphrase_retrieval import KeyphraseRetrieval

from docu_bot.stores.docstore import DocumentStore
from docu_bot.stores.utils import create_vector_store_from_document_loader, LoadedVectorStores

#### Available Datasets

In [13]:
# List the Feather datasets found in ../datasets (relative to the current
# working directory). glob returns matches in arbitrary, filesystem-dependent
# order, so the names are sorted to keep the listing stable across machines.
datasets_dir = os.path.join(os.path.abspath(''), '..', 'datasets')
sorted(os.path.basename(path) for path in glob(os.path.join(datasets_dir, '*.feather')))

['dcache_synthetic_data.feather',
 'dirac_pilot_synthetic_data.feather',
 'dirac_synthetic_data.feather',
 'it4i_large_synthetic_data.feather',
 'it4i_synthetic_data.feather',
 'synthetic_data.feather']

#### Evaluate 

In [14]:
# Directory that receives one Feather file per evaluated configuration
# (../results relative to the current working directory).
RESULT_PATH = os.path.join(os.path.abspath(""), os.pardir, "results")

# File name (inside ../datasets) of the synthetic test set to evaluate.
DATASET_NAME = "dirac_pilot_synthetic_data.feather"

In [15]:
# Model under test: its name, and the embedding model used to build the vector store.
model_type, embedding_model_type = "llama3.3:latest", "text-embedding-3-small"
# Key passed to the RAG LLM in the evaluation cells; read interactively so it never lands in the notebook.
api_key = getpass(prompt="Enter your Metacentrum API key: ")

# Models used by the evaluator itself (LLM judge + embeddings).
evaluator_llm_model_type, evaluator_embeddings_model_type = "gpt-4o-mini", "text-embedding-3-small"
# Key used for the evaluator models and for the vector-store embeddings.
open_ai_api_key = getpass(prompt="Enter your OpenAI API key: ")

In [16]:
# Load the synthetic test set selected by DATASET_NAME from ../datasets.
dataset_path = os.path.join(os.path.abspath(""), os.pardir, "datasets", DATASET_NAME)
synthetic_data = Testset.from_pandas(pd.read_feather(dataset_path))

# Loader for the documents that the retrievers will search.
# NOTE(review): the repository URL is hard-coded while DATASET_NAME is
# configurable — confirm the two are kept in sync when switching datasets.
document_loader = GitDocumentLoader(
    repo_path="https://github.com/DIRACGrid/Pilot.git",
    branch="master",
    loaded_repositories_and_files=LoadedRepositoriesAndFiles(),
)

# Document store and vector store shared by every retrieval configuration below.
docstore = DocumentStore()
cached_vector_store = LoadedVectorStores(
    embedding_model=embedding_model_type,
    api_key=open_ai_api_key,
)
vector_store = create_vector_store_from_document_loader(
    document_loader,
    docstore,
    cached_vector_store,
    embedding_model=embedding_model_type,
    embedding_api_key=open_ai_api_key,
)

In [17]:
# Build the evaluator from an LLM and an embedding model, both created with the OpenAI key.
evaluator_llm = create_chatopenai_model(
    model_type=evaluator_llm_model_type,
    api_key=open_ai_api_key,
)
evaluator_embeddings = create_openai_embeddings(
    model_type=evaluator_embeddings_model_type,
    api_key=open_ai_api_key,
)
evaluator = Evaluator(
    evaluator_llm=evaluator_llm,
    evaluator_embedding_model=evaluator_embeddings,
)

In [18]:
# Configuration: plain DocumentRetrieval over the vector store (k=4, min_score=0).
rag_llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = DocumentRetrieval(
    vectorstore=vector_store,
    docstore=docstore,
    search_kwargs={"min_score": 0, "k": 4},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=rag_llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+document_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+document_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [01:16<00:00,  2.92s/it]
Evaluating: 100%|██████████| 208/208 [00:54<00:00,  3.84it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[# Dirac Pilots\n\nThe Dirac interware is a so...,[Contributing to DIRAC/Pilot\n================...,GitHub is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.67,1.0,1.0,0.800282,1.0,0.5,0.5,0.0,2.421719


In [19]:
# Configuration: EmptyRetrieval with no search arguments (the sample output
# below shows an empty retrieved_contexts list for this run).
rag_llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = EmptyRetrieval(
    docstore=docstore,
    search_kwargs={},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=rag_llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+empty_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+empty_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [00:30<00:00,  1.18s/it]
Evaluating: 100%|██████████| 208/208 [00:40<00:00,  5.12it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[],[Contributing to DIRAC/Pilot\n================...,I don't know. The provided text is empty and d...,Github is used for issue tracking for the DIRA...,0.0,1.0,0.0,0.532278,0.0,0.0,0.0,0.0,0.950726


In [20]:
# Configuration: GenerativeDocumentRetrieval; the same LLM instance is given
# to the retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = GenerativeDocumentRetrieval(
    llm=llm,
    vectorstore=vector_store,
    docstore=docstore,
    search_kwargs={"generate_k": 1, "k": 4, "min_score": 0},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+generative_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+generative_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [03:06<00:00,  7.19s/it]
Evaluating: 100%|██████████| 208/208 [00:52<00:00,  3.94it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[# Dirac Pilots\n\nThe Dirac interware is a so...,[Contributing to DIRAC/Pilot\n================...,Github is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.5,1.0,1.0,0.931195,1.0,0.5,0.5,0.0,5.883797


In [21]:
# Configuration: QueryAlterationDocumentRetrieval; the same LLM instance is
# given to the retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = QueryAlterationDocumentRetrieval(
    llm=llm,
    vectorstore=vector_store,
    docstore=docstore,
    # NOTE(review): the key is spelled "num_custom_queires" — confirm this
    # matches the name the retriever actually reads before changing it.
    search_kwargs={"min_score": 0, "k": 4, "num_custom_queires": 2},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+query_alt_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+query_alt_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [02:47<00:00,  6.45s/it]
Evaluating: 100%|██████████| 208/208 [00:50<00:00,  4.12it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[# Dirac Pilots\n\nThe Dirac interware is a so...,[Contributing to DIRAC/Pilot\n================...,GitHub is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.67,1.0,1.0,0.797259,1.0,0.5,0.5,0.0,5.166222


In [22]:
# Configuration: ContextQueryAlterationDocumentRetrieval; the same LLM
# instance is given to the retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = ContextQueryAlterationDocumentRetrieval(
    vectorstore=vector_store,
    docstore=docstore,
    llm=llm,
    search_kwargs={"min_score": 0, "k": 4},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+context_query_alt_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+context_query_alt_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [05:12<00:00, 12.03s/it]
Evaluating: 100%|██████████| 208/208 [00:54<00:00,  3.83it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[# Dirac Pilots\n\nThe Dirac interware is a so...,[Contributing to DIRAC/Pilot\n================...,GitHub is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.5,1.0,1.0,0.780534,1.0,0.5,0.5,0.0,11.129166


In [23]:
# Configuration: RerankDocumentRetrieval; the same LLM instance is given to
# the retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = RerankDocumentRetrieval(
    vectorstore=vector_store,
    docstore=docstore,
    llm=llm,
    search_kwargs={"min_score": 0, "k": 4},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+rerank_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+rerank_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [02:33<00:00,  5.91s/it]
Evaluating: 100%|██████████| 208/208 [00:43<00:00,  4.75it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[Contributing to DIRAC/Pilot\n================...,[Contributing to DIRAC/Pilot\n================...,GitHub is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.67,1.0,1.0,0.797263,1.0,1.0,1.0,0.0,4.07471


In [25]:
# Configuration: NerRetrieval; the same LLM instance is given to the
# retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = NerRetrieval(
    vectorstore=vector_store,
    docstore=docstore,
    llm=llm,
    search_kwargs={"min_score": 0, "k": 4},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+ner_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+ner_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

100%|██████████| 26/26 [02:55<00:00,  6.75s/it]
Evaluating: 100%|██████████| 208/208 [00:54<00:00,  3.80it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[# Dirac Pilots\n\nThe Dirac interware is a so...,[Contributing to DIRAC/Pilot\n================...,Github is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.67,1.0,1.0,0.934655,1.0,0.5,0.5,0.0,7.172383


In [26]:
# Configuration: ThemeRetrieval; the same LLM instance is given to the
# retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = ThemeRetrieval(
    vectorstore=vector_store,
    docstore=docstore,
    llm=llm,
    search_kwargs={"min_score": 0, "k": 4},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+theme_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+theme_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
100%|██████████| 26/26 [04:12<00:00,  9.71s/it]
Evaluating: 100%|██████████| 208/208 [00:55<00:00,  3.74it/s]


Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[Contributing to DIRAC/Pilot\n================...,[Contributing to DIRAC/Pilot\n================...,GitHub is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.67,1.0,1.0,0.800309,1.0,1.0,1.0,0.0,6.578254


In [27]:
# Configuration: KeyphraseRetrieval; the same LLM instance is given to the
# retriever and used as the answering model.
llm = create_chatopenai_model(model_type=model_type, api_key=api_key)
retriever = KeyphraseRetrieval(
    vectorstore=vector_store,
    docstore=docstore,
    llm=llm,
    search_kwargs={"min_score": 0, "k": 4},
)
eval_results, time_data = evaluator.evaluate_configuration(
    dataset=synthetic_data,
    rag_llm=llm,
    document_retriever=retriever,
)

# Attach the timing values returned alongside the results, then persist the
# frame as <dataset stem>+<model tag>+keyphrase_retrieval.feather.
eval_results_df = eval_results.to_pandas()
eval_results_df["time"] = time_data
dataset_stem = DATASET_NAME.partition(".")[0]
model_tag = model_type.replace(":", "_")
result_file = os.path.join(RESULT_PATH, f"{dataset_stem}+{model_tag}+keyphrase_retrieval.feather")
eval_results_df.to_feather(result_file)

# Preview the first evaluated sample.
eval_results_df.head(1)

For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
For troubleshooting

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,factual_correctness,faithfulness,context_recall,semantic_similarity,non_llm_context_recall,llm_context_precision_with_reference,non_llm_context_precision_with_reference,context_entity_recall,time
0,What is github used for in DIRAC/Pilot?,[# Dirac Pilots\n\nThe Dirac interware is a so...,[Contributing to DIRAC/Pilot\n================...,GitHub is used for issue tracking in the DIRAC...,Github is used for issue tracking for the DIRA...,0.5,1.0,1.0,0.780351,1.0,0.5,0.5,0.0,24.414648
