In [114]:
import os
import streamlit as st
import re
import pandas as pd
import jsonpickle
import numpy as np
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter, SemanticSplitterNodeParser, TokenTextSplitter
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage, Settings
from llama_index.core.schema import TextNode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.core.evaluation import generate_question_context_pairs, RetrieverEvaluator
from llama_index.core.prompts import BasePromptTemplate, PromptTemplate
from transformers import AutoTokenizer
import nest_asyncio

# Patch asyncio so that event loops can be nested; the evaluation cells below
# use top-level `await` (presumably inside the notebook's already-running
# loop - TODO confirm).
nest_asyncio.apply()

# Folder holding the saved retrieval-evaluation CSV files that the result
# cells below read and write.
results_folder = os.path.join("data_evaluation", "full_results")

In [97]:
def extract_score(text):
    """Extract the numeric score that follows ``My score =`` in a feedback text.

    The custom evaluation prompts of this notebook ask the judge model to
    report its verdict as ``My score = <number>``; this helper pulls that
    number out of the free-form answer.

    Args:
        text: Raw feedback string. ``None`` or an empty string is treated as
            "no score given".

    Returns:
        The first score found, as a float, or 0 when the text contains no
        ``My score = <number>`` statement (callers sum the scores, so a
        numeric fallback is required).
    """
    if not text:
        return 0

    # Capture the number on its own. Requiring a trailing punctuation mark or
    # newline (as before) made the regex backtrack and truncate
    # "My score = 0.5 (…)" to 0, and miss a score placed at the very end of
    # the text or followed directly by a word.
    pattern = r"My score =\s*(\d+(?:\.\d+)?)"

    match = re.search(pattern, text)
    if match is None:
        return 0
    return float(match.group(1))

In [115]:
def compute_correctness_relevancy_answer_relevancy_faithfulness_results(eval_results_dict: dict):
    """Aggregate response-evaluation results into one row of mean scores per retriever.

    Args:
        eval_results_dict: Maps a retriever/parser name to a dict holding the
            lists of evaluation results under the keys ``"faithfulness"``,
            ``"relevancy"``, ``"correctness"`` and ``"answer_relevancy"``, plus
            the number of indexed nodes under ``"nodes_nbr"``.

    Returns:
        A DataFrame with one row per retriever and the columns
        ``retriever_name``, ``faithfulness``, ``relevancy``, ``correctness``,
        ``answer_relevancy``, ``nodes_number`` and ``mean_score``. It has no
        rows when ``eval_results_dict`` is empty.
    """
    # Correctness results are graded on a 5-point scale; dividing by the
    # maximum brings them to the same 0-1 range as the other metrics.
    max_correctness_points = 5
    columns = ["retriever_name", "faithfulness", "relevancy", "correctness",
               "answer_relevancy", "nodes_number", "mean_score"]

    # Collect plain records and build the frame once: concatenating inside the
    # loop is quadratic and leaves duplicate index labels (0, 0, 0, ...).
    rows = []
    for name, eval_results in eval_results_dict.items():
        faithfulness_results = eval_results['faithfulness']
        relevancy_results = eval_results['relevancy']
        correctness_results = eval_results['correctness']
        answer_relevancy_results = eval_results['answer_relevancy']

        # Faithfulness and answer relevancy use custom prompts, so their score
        # is parsed out of the feedback text instead of read from `.score`.
        faithfulness_score = sum(extract_score(result.feedback) for result in faithfulness_results) / len(faithfulness_results)
        relevancy_score = sum(result.score for result in relevancy_results) / len(relevancy_results)
        correctness_score = (sum(result.score for result in correctness_results) / len(correctness_results)) / max_correctness_points
        answer_relevancy_score = sum(extract_score(result.feedback) for result in answer_relevancy_results) / len(answer_relevancy_results)
        mean_score = np.mean([faithfulness_score, relevancy_score, correctness_score, answer_relevancy_score])

        rows.append({
            "retriever_name": name,
            "faithfulness": faithfulness_score,
            "relevancy": relevancy_score,
            "correctness": correctness_score,
            "answer_relevancy": answer_relevancy_score,
            "nodes_number": eval_results["nodes_nbr"],
            "mean_score": mean_score,
        })

    return pd.DataFrame(rows, columns=columns)

In [3]:
def compute_hit_hrr_results(eval_results_dict: dict):
    """Average the hit rate and MRR of each retriever into one summary row.

    Args:
        eval_results_dict: Maps a retriever/parser name to an iterable of
            evaluation results; each result exposes a ``metric_vals_dict``
            mapping that contains the keys ``"hit_rate"`` and ``"mrr"``.

    Returns:
        A DataFrame with one row per retriever and the columns
        ``retriever_name``, ``hit_rate`` and ``mrr`` (means over all results
        of that retriever). It has no rows when ``eval_results_dict`` is empty.
    """
    # Collect plain records and build the frame once: concatenating inside the
    # loop is quadratic and leaves duplicate index labels (0, 0, 0, ...).
    rows = []
    for name, eval_results in eval_results_dict.items():
        metrics_df = pd.DataFrame(
            [eval_result.metric_vals_dict for eval_result in eval_results]
        )
        rows.append({
            "retriever_name": name,
            "hit_rate": metrics_df["hit_rate"].mean(),
            "mrr": metrics_df["mrr"].mean(),
        })

    return pd.DataFrame(rows, columns=["retriever_name", "hit_rate", "mrr"])

In [22]:
# Setup for the retrieval evaluation: load the documents, configure the local
# LLM and the embedding model, and register the node parsers to compare.
# The question/context pairs themselves are generated in the evaluation cell.
input_folder = "./data_evaluation/batch_1"
documents = SimpleDirectoryReader(input_dir=input_folder, recursive=True).load_data()
print(f"\n\nNumber of documents : {len(documents)}\n\n")

llm = LlamaCPP(
    # URL of the GGUF model file; it is downloaded automatically when needed
    # model_url='https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf',
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',  # Q6_K was used too but quite slow
    # Optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.0,  # Model needs to be factual and deterministic
    max_new_tokens=512,
    # Context size
    context_window=8192, # Max is ~32k (the model metadata reports a context length of 32768)
    # Kwargs to pass to __call__()
    generate_kwargs={},
    # Set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 10},
    # Transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True
)

# Node parsers to compare, keyed by the name used in the result tables
parsers = {}

# Embedding model, used both by the semantic splitter and for indexing
embed_model = HuggingFaceEmbedding(
model_name="BAAI/bge-small-en-v1.5",
embed_batch_size=128,
normalize=True)

# Register the local LLM and the embedding model as the llama_index defaults
Settings.llm = llm
Settings.embed_model = embed_model

# Semantic splitter (uses the embedding model to choose where to split)
semantic_splitter = SemanticSplitterNodeParser(
buffer_size=1, 
breakpoint_percentile_threshold=95, 
embed_model=embed_model)
parsers["semantic_splitter"] = semantic_splitter

# Token splitter 512
token_splitter_512 = TokenTextSplitter(chunk_size=512, chunk_overlap=50, separator="\n\n")  # Don't put tokenizer from mistral model as it does not tokenize anything, resulting in a single chunk per document
parsers["token_splitter_512"] = token_splitter_512

# Token splitter 1024
token_splitter_1024 = TokenTextSplitter(chunk_size=1024, chunk_overlap=102, separator="\n\n")  # Don't put tokenizer from mistral model as it does not tokenize anything, resulting in a single chunk per document
parsers["token_splitter_1024"] = token_splitter_1024


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
l



Number of documents : 3




AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'general.quantization_version': '2', 'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.add_bos_token': 'true', 'tokenizer.ggml.padding_token_id': '0', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'tokenizer.ggml.bos_toke

In [94]:
# Retrieval evaluation: for every registered parser, split the documents,
# generate question/context pairs with the local LLM, index the nodes and
# score the retriever. Results are collected per parser name.
eval_results_dict = {}
for parser_name, parser in parsers.items():
    print(parser_name, "\n")

    # Split the documents into nodes with the parser under test
    nodes = parser.get_nodes_from_documents(documents)

    # Generate the evaluation questions from the nodes (2 per node)
    qa_dataset = generate_question_context_pairs(
        nodes,
        llm=llm,
        num_questions_per_chunk=2
    )

    # Index the nodes and retrieve the 3 most similar ones per query
    vector_index = VectorStoreIndex(nodes)
    retriever = vector_index.as_retriever(similarity_top_k=3)

    # Score the retriever with MRR and hit rate
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever)

    # Evaluate (top-level await: this only works directly in a notebook cell)
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)  # Can't put this line in a function otherwise it raises an error
    eval_results_dict[parser_name] = eval_results

semantic_splitter 



  0%|          | 0/101 [00:00<?, ?it/s]
llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =      12.02 ms /   154 runs   (    0.08 ms per token, 12810.91 tokens per second)
llama_print_timings: prompt eval time =   18053.94 ms /   756 tokens (   23.88 ms per token,    41.87 tokens per second)
llama_print_timings:        eval time =   11712.03 ms /   153 runs   (   76.55 ms per token,    13.06 tokens per second)
llama_print_timings:       total time =   29966.20 ms /   909 tokens
  1%|          | 1/101 [00:29<49:57, 29.97s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       4.81 ms /    62 runs   (    0.08 ms per token, 12889.81 tokens per second)
llama_print_timings: prompt eval time =    2715.31 ms /   117 tokens (   23.21 ms per token,    43.09 tokens per second)
llama_print_timings:        eval time =    4339.40 ms /    61 runs   (   71.14 ms per token,    1

token_splitter_512 



  0%|          | 0/261 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       5.46 ms /    65 runs   (    0.08 ms per token, 11909.12 tokens per second)
llama_print_timings: prompt eval time =   10631.16 ms /   254 tokens (   41.85 ms per token,    23.89 tokens per second)
llama_print_timings:        eval time =    5145.43 ms /    64 runs   (   80.40 ms per token,    12.44 tokens per second)
llama_print_timings:       total time =   15882.94 ms /   318 tokens
  0%|          | 1/261 [00:15<1:08:51, 15.89s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       4.19 ms /    52 runs   (    0.08 ms per token, 12419.39 tokens per second)
llama_print_timings: prompt eval time =    7575.13 ms /   539 tokens (   14.05 ms per token,    71.15 tokens per second)
llama_print_timings:        eval time =    3721.38 ms /    51 

token_splitter_1024 



  0%|          | 0/105 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =      14.24 ms /   177 runs   (    0.08 ms per token, 12428.90 tokens per second)
llama_print_timings: prompt eval time =   20554.30 ms /   857 tokens (   23.98 ms per token,    41.69 tokens per second)
llama_print_timings:        eval time =   15493.65 ms /   176 runs   (   88.03 ms per token,    11.36 tokens per second)
llama_print_timings:       total time =   36469.64 ms /  1033 tokens
  1%|          | 1/105 [00:36<1:03:13, 36.48s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       5.12 ms /    67 runs   (    0.08 ms per token, 13096.17 tokens per second)
llama_print_timings: prompt eval time =    9079.79 ms /   565 tokens (   16.07 ms per token,    62.23 tokens per second)
llama_print_timings:        eval time =    6003.13 ms /    66 

In [86]:
# Display the saved retrieval results of the 2-document batch
# (the file is ";"-separated).
batch_1_results = os.path.join(results_folder, "results_2_docs_Semantic_Token1024_Token512.csv")

df = pd.read_csv(batch_1_results, sep=";")
df

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,semantic_splitter,1.0,0.935185
1,token_splitter_512,1.0,0.892857
2,token_splitter_1024,1.0,0.875


The Sentence Splitter's accuracy was well below the others, which is why it is not used from here on (cf. results_2_docs_Sentence_Semantic_Token.csv).

On two documents (7 chunks for SemanticSplitter, 5 for Token512 and 2 for Token1024): the semantic splitter has the highest score, and increasing the chunk size of the token splitter seems to lower the hit rate (HR) and the MRR.

Next step: evaluate on more documents/chunks to confirm.

In [90]:
# Display the saved retrieval results of the 10-document batch
# (the file is ";"-separated).
batch_2_results = os.path.join(results_folder, "results_10_docs_Semantic_Token1024_Token512.csv")

df = pd.read_csv(batch_2_results, sep=";")
df

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,semantic_splitter,0.801724,0.698276
1,token_splitter_512,0.717143,0.607143
2,token_splitter_1024,0.721519,0.622363


On 10 documents (49 chunks for Semantic, 121 for Token512 and 52 for Token1024): the semantic splitter is still better than the token splitters, but Token1024 is now slightly better than Token512.

Timer: 10.51 for Semantic, 20.40 for Token512, 14.57 for Token1024 ==> meaning that, as the database volume increases, the Semantic Splitter is going to become slower and slower compared to the TokenSplitter, BUT its results are much better, so it is a trade-off.

In [95]:
# Aggregate the retrieval results of the 23-document run, save them as a
# ";"-separated CSV, then reload the file to display what was written.
batch_3_results = os.path.join(results_folder, "results_23_docs_Semantic_Token1024_Token512.csv")

df = compute_hit_hrr_results(eval_results_dict)
df.to_csv(batch_3_results, index=False, sep=";")

# NOTE(review): the recorded output below shows an "Unnamed: 0" column and
# "Retriever Name"/"Hit Rate"/"MRR" headers, which this code does not produce
# (index=False, snake_case columns) - the output looks stale; re-run to confirm.
df = pd.read_csv(batch_3_results, sep=";")
df

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,semantic_splitter,0.783133,0.684739
1,token_splitter_512,0.766615,0.649923
2,token_splitter_1024,0.763359,0.685751


Finally, on 23 documents (101 chunks for Semantic, 261 for Token512 and 105 for Token1024): the semantic splitter is still better than the token splitters, but the token splitters have increased their hit rate and MRR.

Timer: 26.36 for Semantic, 46.23 for Token512 and 24.17 for Token1024 ==> meaning that, as the database volume increases, the Semantic Splitter is going to become slower and slower compared to the TokenSplitter, and its results tend to decrease.

In [97]:
# Rebuild nodes, index and retriever with the 512-token splitter only, to
# inspect the resulting chunks in the next cell.
nodes = token_splitter_512.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes)
retriever = vector_index.as_retriever(similarity_top_k=3)

In [103]:
# Length in characters (not tokens) of the second node's text
len(nodes[1].text)

1820

# Full evaluation

In [95]:
import os
import pandas as pd
import jsonpickle
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import math
import re
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SemanticSplitterNodeParser, TokenTextSplitter
from llama_index.core import VectorStoreIndex, load_index_from_storage, Settings
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.prompts import PromptTemplate
from llama_index.core.evaluation import generate_question_context_pairs, RetrieverEvaluator, FaithfulnessEvaluator, RelevancyEvaluator, AnswerRelevancyEvaluator, BatchEvalRunner, CorrectnessEvaluator
import nest_asyncio
from llama_index.llms.mistralai import MistralAI

# Patch asyncio so that event loops can be nested; the evaluation cells below
# use top-level `await` (presumably inside the notebook's already-running
# loop - TODO confirm).
nest_asyncio.apply()

In [69]:
def get_query_engine(sentence_index, similarity_top_k=3, rerank_top_n=2,
                     rerank_model="BAAI/bge-reranker-base", device="mps"):
    """Build a query engine whose retrieved nodes are reranked before answering.

    Args:
        sentence_index: Index exposing ``as_query_engine`` (a
            ``VectorStoreIndex`` in this notebook).
        similarity_top_k: Number of nodes fetched from the index per query.
        rerank_top_n: Number of nodes kept after reranking.
        rerank_model: Name of the reranker model handed to
            ``SentenceTransformerRerank``.
        device: Device the reranker runs on. Defaults to ``"mps"`` (Apple
            Silicon) as before; pass e.g. ``"cpu"`` or ``"cuda"`` on other
            machines.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n, model=rerank_model, device=device
    )
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[reranker]
    )

In [121]:
# Setup for the full evaluation: select the batch, load its documents,
# configure the local LLM and the embedding model, and register the node
# parsers to compare.
batch = "batch_2"
input_folder = f"./data_evaluation/{batch}/files/"
batch_folder = os.path.join("data_evaluation", batch)
# Note: this reassigns `results_folder`, which an earlier cell pointed at
# data_evaluation/full_results.
results_folder = os.path.join(batch_folder, "results")
documents = SimpleDirectoryReader(input_dir=input_folder, recursive=True).load_data()
print(f"\n\nNumber of documents : {len(documents)}\n\n")

llm = LlamaCPP(
    # URL of the GGUF model file; it is downloaded automatically when needed
    # model_url='https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf',
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',  # Q6_K was used too but quite slow
    # Optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.0,  # Model needs to be factual and deterministic
    max_new_tokens=512,  # Change to put 1024 ?
    # Context size
    context_window=8192, # Max is ~32k (the model metadata reports a context length of 32768)
    # Kwargs to pass to __call__()
    generate_kwargs={},
    # Set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 10},
    # Transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True
)

# Node parsers to compare, keyed by the name used in the result tables
parsers = {}

# Embedding model, used both by the semantic splitter and for indexing
embed_model = HuggingFaceEmbedding(
model_name="BAAI/bge-small-en-v1.5",
embed_batch_size=128,
normalize=True)

# Register the local LLM and the embedding model as the llama_index defaults
# service_context_llm_base = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
Settings.llm = llm
Settings.embed_model = embed_model

# Semantic splitter (uses the embedding model to choose where to split)
semantic_splitter = SemanticSplitterNodeParser(
buffer_size=1, 
breakpoint_percentile_threshold=95, 
embed_model=embed_model)
parsers["semantic_splitter"] = semantic_splitter

# Token splitter 512
token_splitter_512 = TokenTextSplitter(chunk_size=512, chunk_overlap=50, separator="\n\n")  # Don't put tokenizer from mistral model as it does not tokenize anything, resulting in a single chunk per document
parsers["token_splitter_512"] = token_splitter_512

# Token splitter 1024
token_splitter_1024 = TokenTextSplitter(chunk_size=1024, chunk_overlap=102, separator="\n\n")  # Don't put tokenizer from mistral model as it does not tokenize anything, resulting in a single chunk per document
parsers["token_splitter_1024"] = token_splitter_1024

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336


llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   8:              llama.attention.head_count_kv u32              = 8
llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
llama_model_loader: - kv  10:                       llama.rope.freq_base f32              = 1000000.000000
llama_model_loader: - kv  11:                          general.file_type u32              = 15
llama_model_loader: - kv  12:                       tokenizer.ggml.model str              = llama
llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  15:                  t



Number of documents : 10




llama_kv_cache_init:        CPU KV buffer size =   704.00 MiB
llama_kv_cache_init:      Metal KV buffer size =   320.00 MiB
llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:      Metal compute buffer size =   560.00 MiB
llama_new_context_with_model:        CPU compute buffer size =   560.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 3
AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'general.quantization_version': '2', 'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation role

In [71]:
# Read the Mistral API key without echoing it: `input()` would print the
# secret into the notebook output, where it gets saved with the file.
# The MISTRAL_API_KEY environment variable is used when it is set.
from getpass import getpass

api_key = os.environ.get("MISTRAL_API_KEY") or getpass("Put your API key here")

# Load the Mixtral 8x7b model (used to generate questions and to judge answers)
llm_mixtral = MistralAI(api_key=api_key,
                        model="open-mixtral-8x7b",
                        temperature=0.0,
                        max_tokens=1024)

# Embedding model (same configuration as in the setup cell)
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=128,
    normalize=True)

In [73]:
# Full evaluation: for every registered parser, generate questions with
# Mixtral, score the retriever (hit rate / MRR), then answer the questions
# with the local LLM and let Mixtral judge the answers (faithfulness,
# relevancy, answer relevancy, correctness). Aggregated and raw results are
# written to `results_folder` once all parsers are done.
# NOTE(review): the recorded output of this cell ends with
# "ValueError: could not convert string to float", presumably raised while a
# judge response was parsed - TODO confirm which evaluator. Because results
# are only saved after the loop, such an error loses every parser's results.
eval_results_dict = {}  # retrieval results (hit rate / MRR) per parser
eval_results_dict_2 = {}  # response-evaluation results per parser
questions_per_chunk = 1
output_name = "MISQ4KM_bge_small_re2"  # suffix of the retriever names and stem of the output files

for parser_name, parser in parsers.items():
    print(f"\n{parser_name}\n")

    # Split the documents into nodes with the parser under test
    nodes = parser.get_nodes_from_documents(documents)

    # Use Mixtral as the default LLM while the questions are generated
    Settings.llm = llm_mixtral
    Settings.embed_model = embed_model

    qa_dataset = generate_question_context_pairs(
        nodes,
        llm=llm_mixtral,
        num_questions_per_chunk=questions_per_chunk
    )

    # Switch back to the local LLM, which is the model under evaluation
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Index the nodes and retrieve the 3 most similar ones per query
    vector_index = VectorStoreIndex(nodes)
    retriever = vector_index.as_retriever(similarity_top_k=3)

    # Create retriever evaluator
    # It evaluates a retriever using a set of metrics. Here: hit rate (is the correct context among the retrieved ones?)
    # and MRR (how well is the correct context positioned among the retrieved contexts?)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(["mrr", "hit_rate"], retriever=retriever)

    # Evaluate (top-level await: this only works directly in a notebook cell)
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)  # Can't put this line in a function otherwise it raises an error
    eval_results_dict[parser_name] = eval_results

    # Create query engine
    query_engine = get_query_engine(sentence_index=vector_index, similarity_top_k=3, rerank_top_n=2)

    # Get the queries generated by Mixtral
    queries = list(qa_dataset.queries.values())

    # Compute faithfulness evaluation
    # Evaluates whether a response is faithful to the contexts
    # (i.e. whether the response is supported by the contexts or hallucinated.)
    # The default prompt had to be replaced for the faithfulness score: the judge did not follow its
    # instructions, so about 90% of the scores were None (or the answer talked about apples) because
    # of the default pre-prompt.
    # NOTE(review): this template and the evaluators below do not depend on the loop variables, so
    # they could be built once before the loop.
    faithfulness_custom_template = PromptTemplate(
    """In this task, you will act as a faithfulness evaluator for a language model's responses. Your job is to determine whether the model's response is faithful to the given context, i.e. 
    whether the response is supported by the context or if it is hallucinated.

    Here are the criteria for your evaluation:

    * If the response is not supported by the context, you should give it a score of 0.
    * If the response is fully supported by the context, you should give it a score of 1.
    * If the response is partially supported by the context, you may give it a score between 0 and 1, where a score closer to 1 indicates greater faithfulness to the context.

    Here are two examples to help you understand the task:

    Example 1:
    Context: "The capital of France is Paris."
    Model response: "The capital of France is Rome."
    Faithfulness score: 0 (The response is not supported by the context, as the capital of France is Paris, not Rome.)

    Example 2:
    Context: "The capital of France is Paris. The Eiffel Tower is a famous landmark in Paris."
    Model response: "The Eiffel Tower is a famous landmark in the capital of France."
    Faithfulness score: 1 (The response is fully supported by the context, as the Eiffel Tower is indeed a famous landmark in Paris, which is the capital of France.)

    Based on given context:
    \n"{context_str}"\n

    And the model answer :
    \n"{query_str}"\n

    Evaluate the faithfulness of the following model response. Put your score as "My score = \n""")
    faithfulness_mixtral = FaithfulnessEvaluator(llm=llm_mixtral, eval_template=faithfulness_custom_template)

    # Compute relevancy evaluation
    # Evaluates the relevancy of retrieved contexts and response to a query.
    # This evaluator considers the query string, retrieved contexts, and response string.
    relevancy_mixtral = RelevancyEvaluator(llm=llm_mixtral)

    # Compute answer relevancy evaluation
    # Evaluates the relevancy of response to a query.
    # This evaluator considers the query string and response string.
    # Focuses on assessing how pertinent the generated answer is to the given prompt
    # The default prompt had to be replaced for the answer relevancy score as well: the judge did not
    # follow its instructions, so about 90% of the scores were None.
    answer_relevancy_custom_template = PromptTemplate(
    """Your goal is to evaluate the answer relevancy of an other model to a question. You have to score the model's answer between 0 and 1. 1 meaning the 
    model perfectly answered the question, 0 meaning it does not at all answer the question. For example, to the question "What is the capital of France?" 
    The answer "The capital of France is Paris." will have an answer relevancy score of 1, whereas the answer "France is a country in Europe with many famous cities like Paris and Lyon." 
    will have an answer relevancy score of 0.5 and "The capital of Spain is Madrid." will have an answer relevancy score of 0.\n\n

    Based on this query :
    \n"{query}"\n

    And the model answer :
    \n"{response}"\n

    How would you rate the model answer relevancy to the question ? Put your score as "My score = \n"""
    )
    answer_relevancy_mixtral = AnswerRelevancyEvaluator(llm=llm_mixtral, eval_template=answer_relevancy_custom_template)

    # Compute correctness evaluation
    # Evaluate the relevance and correctness of a generated answer against a reference answer.
    correctness_mixtral = CorrectnessEvaluator(llm=llm_mixtral)

    # Initiate the BatchEvalRunner that runs the four evaluators (faithfulness, relevancy,
    # answer relevancy, correctness) with 8 workers.
    runner = BatchEvalRunner({"faithfulness": faithfulness_mixtral, "relevancy": relevancy_mixtral,
                              "answer_relevancy": answer_relevancy_mixtral, "correctness": correctness_mixtral},
                              workers=8)

    # Compute evaluation (top-level await, see above)
    eval_results_2 = await runner.aevaluate_queries(query_engine, queries=queries)
    eval_results_dict_2[parser_name] = eval_results_2

    # Add number of nodes (stored next to the evaluator results, read back as "nodes_nbr")
    eval_results_dict_2[parser_name]["nodes_nbr"] = len(nodes)

# Format metrics and save them: one row per parser, retrieval and response metrics joined on the name
df_hrr_mrr = compute_hit_hrr_results(eval_results_dict)
df_other_metrics = compute_correctness_relevancy_answer_relevancy_faithfulness_results(eval_results_dict_2)
df_results = pd.merge(df_hrr_mrr, df_other_metrics, on="retriever_name")
df_results["retriever_name"] = df_results["retriever_name"].apply(lambda x: x + "_" + output_name)
df_results.to_csv(os.path.join(results_folder, output_name + ".csv"), index=False)

# Save full results (raw result objects serialized with jsonpickle)
with open(os.path.join(results_folder, output_name + "_hrr_mrr_full_results.txt"), "w") as f:
    f.write(jsonpickle.encode(eval_results_dict))

with open(os.path.join(results_folder, output_name + "_other_metrics_full_results.txt"), "w") as f:
    f.write(jsonpickle.encode(eval_results_dict_2))


semantic_splitter



100%|██████████| 49/49 [00:40<00:00,  1.22it/s]

llama_print_timings:        load time =   14048.92 ms
llama_print_timings:      sample time =      14.97 ms /   146 runs   (    0.10 ms per token,  9754.79 tokens per second)
llama_print_timings: prompt eval time =   14047.73 ms /   505 tokens (   27.82 ms per token,    35.95 tokens per second)
llama_print_timings:        eval time =   13843.06 ms /   145 runs   (   95.47 ms per token,    10.47 tokens per second)
llama_print_timings:       total time =   28089.91 ms /   650 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =   14048.92 ms
llama_print_timings:      sample time =      30.65 ms /   340 runs   (    0.09 ms per token, 11094.43 tokens per second)
llama_print_timings: prompt eval time =   22957.39 ms /  1340 tokens (   17.13 ms per token,    58.37 tokens per second)
llama_print_timings:        eval time =   26499.09 ms /   339 runs   (   78.17 ms per token,    12.79 tokens per second)
llama_print_ti


token_splitter_512



100%|██████████| 122/122 [01:28<00:00,  1.38it/s]
Llama.generate: prefix-match hit

llama_print_timings:        load time =   14048.92 ms
llama_print_timings:      sample time =      31.33 ms /   273 runs   (    0.11 ms per token,  8713.97 tokens per second)
llama_print_timings: prompt eval time =   39878.79 ms /   780 tokens (   51.13 ms per token,    19.56 tokens per second)
llama_print_timings:        eval time =   51463.24 ms /   272 runs   (  189.20 ms per token,     5.29 tokens per second)
llama_print_timings:       total time =   92134.24 ms /  1052 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =   14048.92 ms
llama_print_timings:      sample time =      42.64 ms /   439 runs   (    0.10 ms per token, 10296.46 tokens per second)
llama_print_timings: prompt eval time =   12590.05 ms /   498 tokens (   25.28 ms per token,    39.56 tokens per second)
llama_print_timings:        eval time =  105837.68 ms /   438 runs   (  241.64 ms per token,     4.1


token_splitter_1024



100%|██████████| 52/52 [00:41<00:00,  1.26it/s]
Llama.generate: prefix-match hit

llama_print_timings:        load time =   14048.92 ms
llama_print_timings:      sample time =       9.41 ms /    90 runs   (    0.10 ms per token,  9567.34 tokens per second)
llama_print_timings: prompt eval time =   17785.35 ms /   635 tokens (   28.01 ms per token,    35.70 tokens per second)
llama_print_timings:        eval time =    8332.64 ms /    89 runs   (   93.63 ms per token,    10.68 tokens per second)
llama_print_timings:       total time =   26304.46 ms /   724 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =   14048.92 ms
llama_print_timings:      sample time =      38.70 ms /   433 runs   (    0.09 ms per token, 11189.79 tokens per second)
llama_print_timings: prompt eval time =   32842.42 ms /  1984 tokens (   16.55 ms per token,    60.41 tokens per second)
llama_print_timings:        eval time =   44582.05 ms /   432 runs   (  103.20 ms per token,     9.69 

ValueError: could not convert string to float: "0. The model's response is not related to the context provided, which discusses a scenario in a fantasy world with various magical items, enemies, and choices for the character. The model's response focuses on the combination of two specific magical items and their potential uses, but this information is not supported by the context."

In [10]:
# TODO:
# - Save the raw results
# - Convert them to a dataframe
# - Plot graphs
# - Test various embeddings, rerankers, models and parsers
# - Add a timing test: run 10 queries and measure the time spent

## Results of full evaluation

In [128]:
def plot_results(folder_path: str, output_file: str):
    """Plot every metric of the result CSV files of a folder as bar charts.

    All ``.csv`` files found in ``folder_path`` are concatenated, the metric
    columns are converted to percentages, and one bar chart per metric is
    drawn in a 2-column grid of subplots. The figure is saved as HTML.

    Args:
        folder_path: Folder containing the result CSV files. Each file must
            provide a ``retriever_name`` column and the metric columns
            ``hit_rate``, ``mrr``, ``faithfulness``, ``relevancy``,
            ``correctness``, ``answer_relevancy`` and ``mean_score``.
        output_file: Path of the HTML file to write.

    Raises:
        ValueError: If ``folder_path`` contains no CSV file.
    """
    # Sort the file names so that the bar order does not depend on the
    # arbitrary order returned by os.listdir
    csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))
    if not csv_files:
        raise ValueError(f"No CSV result files found in: {folder_path}")

    # Combine dataframes
    combined_df = pd.concat(
        [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files],
        ignore_index=True,
    )

    # Shorten the retriever names so that they fit under the bars
    combined_df["retriever_name"] = (
        combined_df["retriever_name"]
        .str.replace("semantic_splitter", "ssp", regex=False)
        .str.replace("token_splitter", "tsp", regex=False)
    )

    # Convert metrics to percentage
    metric_columns = ['hit_rate', 'mrr', 'faithfulness', 'relevancy', 'correctness', 'answer_relevancy', 'mean_score']
    combined_df[metric_columns] *= 100

    # Create subplots for each metric in a 2-column layout
    n_cols = 2
    fig = make_subplots(
        rows=math.ceil(len(metric_columns) / n_cols),
        cols=n_cols,
        subplot_titles=[col.capitalize() for col in metric_columns],
    )

    palette = px.colors.qualitative.Plotly  # Plotly qualitative color palette
    for idx, metric in enumerate(metric_columns):
        # Subplot titles are assigned row by row, left to right, so the trace
        # of metric `idx` must use the same row-major position. (The previous
        # formula put odd metrics in column 2 and even ones in column 1,
        # i.e. under the wrong title.)
        row_idx, col_idx = divmod(idx, n_cols)
        fig.add_trace(
            go.Bar(
                x=combined_df['retriever_name'],
                y=combined_df[metric],
                text=combined_df[metric].round(2),  # Display values rounded to 2 decimal places as text on bars
                textposition='auto',  # Automatically place text on bars
                marker_color=palette[idx % len(palette)],
                showlegend=False,  # Hide legend for individual plots
            ),
            row=row_idx + 1, col=col_idx + 1
        )

    # Update layout for the entire figure
    fig.update_layout(
        height=1500,  # Adjust height as needed
        width=1000,  # Adjust width as needed
        title='Comparison of Metrics by Retriever Name',
        xaxis_title='Retriever Name',
        yaxis_title='Percentage',
        yaxis_tickformat='.0f',  # Format y-axis ticks as integer
    )

    # Save the figure as an HTML file
    fig.write_html(output_file)

    print(f"Figure saved as HTML: {output_file}")

In [129]:
# Plot every result CSV of the current batch and save the figure next to them
plot_results(folder_path=results_folder, output_file=os.path.join(results_folder, "test.html"))

Figure saved as HTML: data_evaluation/batch_2/results/test.html


In [47]:
# from mistralai.client import MistralClient
# from mistralai.models.chat_completion import ChatMessage

# api_key = input("Enter API key")
# model = "open-mixtral-8x7b"

# client = MistralClient(api_key=api_key)

# chat_response = client.chat(
#     model=model,
#     messages=[ChatMessage(role="user", content="""I want to evaluate an LLM's faithfulness. A faithfulness evaluator is defined as such : "Evaluates whether a response is faithful to the 
#                           contexts (i.e. whether the response is supported by the contexts or hallucinated.). This evaluator only considers the response string and the list of context 
#                           strings. The score has to be between 0 and 1, 0 meaning the answer is not supported by context and so the model hallucinated, 1 meaning the answer is fully supported
#                           by the given context.
#                           Based on given information, write me a preprompt, with at least two examples, to give to my model ?""")]
# )

# print(chat_response.choices[0].message.content)