In [104]:
import os
import streamlit as st
import glob
import base64
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter, SemanticSplitterNodeParser, TokenTextSplitter
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage, Settings
from llama_index.core.schema import TextNode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.core.evaluation import generate_question_context_pairs, RetrieverEvaluator
from transformers import AutoTokenizer
import nest_asyncio

# To allow nested event loops
nest_asyncio.apply()

# Define constants
# Folder that later cells join with the result CSV file names when loading/saving results
results_folder = os.path.join("data_evaluation", "full_results")

In [92]:
def compute_results(eval_results_dict: dict) -> pd.DataFrame:
    """Summarise retrieval evaluation results as one row per retriever.

    Args:
        eval_results_dict: Maps a retriever name to an iterable of evaluation
            results. Each result must expose a ``metric_vals_dict`` attribute
            holding the ``"hit_rate"`` and ``"mrr"`` values.

    Returns:
        A DataFrame with the columns ``"Retriever Name"``, ``"Hit Rate"`` and
        ``"MRR"``, where the two metric columns are the means over that
        retriever's results. Rows keep the insertion order of
        ``eval_results_dict`` and carry a unique 0..n-1 index. An empty input
        yields an empty frame that still has the three columns; a retriever
        with no results gets NaN for both metrics.

    Raises:
        KeyError: If a retriever has results but they lack the ``"hit_rate"``
            or ``"mrr"`` metric.
    """
    columns = ["Retriever Name", "Hit Rate", "MRR"]
    rows = []
    for name, eval_results in eval_results_dict.items():
        metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]

        if metric_dicts:
            metrics = pd.DataFrame(metric_dicts)
            hit_rate = metrics["hit_rate"].mean()
            mrr = metrics["mrr"].mean()
        else:
            # Nothing was evaluated for this retriever: report NaN instead of crashing
            hit_rate = mrr = float("nan")

        rows.append({"Retriever Name": name, "Hit Rate": hit_rate, "MRR": mrr})

    # Build the frame once (no repeated concat) so the index is unique
    return pd.DataFrame(rows, columns=columns)

In [96]:
# Load the corpus from which question/context pairs are crafted; those pairs are used
# to assess both the Retrieval and the Response quality of the RAG system.
input_folder = "./data_evaluation/batch_1"
documents = SimpleDirectoryReader(
    input_dir=input_folder,
    recursive=True,
).load_data()
print(f"\n\nNumber of documents : {len(documents)}\n\n")

# Local LLM run through llama.cpp.
llm = LlamaCPP(
    # A model URL is downloaded automatically. Alternative that was considered:
    # model_url='https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf',
    # Q6_K was used too but was quite slow.
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    # A path to a pre-downloaded model can be given here instead of model_url.
    model_path=None,
    # Zero temperature: the model needs to be factual and deterministic.
    temperature=0.0,
    max_new_tokens=512,
    # Context size (max is ~32k).
    context_window=8192,
    # Kwargs to pass to __call__().
    generate_kwargs={},
    # Set n_gpu_layers to at least 1 to use the GPU.
    model_kwargs={"n_gpu_layers": 10},
    # Transform inputs into Llama2 format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

# Embedding model, shared by the global Settings and the semantic splitter.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=128,
    normalize=True,
)

Settings.llm = llm
Settings.embed_model = embed_model

# Semantic splitter
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

# Token splitters (512 and 1024 tokens).
# Don't pass the tokenizer from the Mistral model: it does not tokenize anything,
# resulting in a single chunk per document.
token_splitter_512 = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separator="\n\n",
)
token_splitter_1024 = TokenTextSplitter(
    chunk_size=1024,
    chunk_overlap=102,
    separator="\n\n",
)

# Registry of the parsers to compare, keyed by the name used in the result tables.
parsers = {
    "semantic_splitter": semantic_splitter,
    "token_splitter_512": token_splitter_512,
    "token_splitter_1024": token_splitter_1024,
}


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
l



Number of documents : 2




llama_kv_cache_init:        CPU KV buffer size =   704.00 MiB
llama_kv_cache_init:      Metal KV buffer size =   320.00 MiB
llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:      Metal compute buffer size =   560.00 MiB
llama_new_context_with_model:        CPU compute buffer size =   560.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 3
AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'general.quantization_version': '2', 'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation role

In [94]:
# Run the retrieval evaluation once per registered parser and collect the raw
# results by parser name; `compute_results` later aggregates this dict.
eval_results_dict = {}
for parser_name, parser in parsers.items():
    print(parser_name, "\n")

    # Chunk the loaded documents with the current parser
    nodes = parser.get_nodes_from_documents(documents)

    # Generate question/context pairs from the chunks (2 questions per chunk) with the local llm
    qa_dataset = generate_question_context_pairs(
        nodes,
        llm=llm,
        num_questions_per_chunk=2
    )

    # Index the same chunks and retrieve the 3 most similar ones per question
    vector_index = VectorStoreIndex(nodes)
    retriever = vector_index.as_retriever(similarity_top_k=3)

    # Score the retriever with MRR and hit rate
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever)

    # Evaluate
    # NOTE(review): presumably the error mentioned below is about `await` in a plain (non-async) function — confirm
    eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)  # Can't put this line in a function otherwise it raises an error
    eval_results_dict[parser_name] = eval_results

semantic_splitter 



  0%|          | 0/101 [00:00<?, ?it/s]
llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =      12.02 ms /   154 runs   (    0.08 ms per token, 12810.91 tokens per second)
llama_print_timings: prompt eval time =   18053.94 ms /   756 tokens (   23.88 ms per token,    41.87 tokens per second)
llama_print_timings:        eval time =   11712.03 ms /   153 runs   (   76.55 ms per token,    13.06 tokens per second)
llama_print_timings:       total time =   29966.20 ms /   909 tokens
  1%|          | 1/101 [00:29<49:57, 29.97s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       4.81 ms /    62 runs   (    0.08 ms per token, 12889.81 tokens per second)
llama_print_timings: prompt eval time =    2715.31 ms /   117 tokens (   23.21 ms per token,    43.09 tokens per second)
llama_print_timings:        eval time =    4339.40 ms /    61 runs   (   71.14 ms per token,    1

token_splitter_512 



  0%|          | 0/261 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       5.46 ms /    65 runs   (    0.08 ms per token, 11909.12 tokens per second)
llama_print_timings: prompt eval time =   10631.16 ms /   254 tokens (   41.85 ms per token,    23.89 tokens per second)
llama_print_timings:        eval time =    5145.43 ms /    64 runs   (   80.40 ms per token,    12.44 tokens per second)
llama_print_timings:       total time =   15882.94 ms /   318 tokens
  0%|          | 1/261 [00:15<1:08:51, 15.89s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       4.19 ms /    52 runs   (    0.08 ms per token, 12419.39 tokens per second)
llama_print_timings: prompt eval time =    7575.13 ms /   539 tokens (   14.05 ms per token,    71.15 tokens per second)
llama_print_timings:        eval time =    3721.38 ms /    51 

token_splitter_1024 



  0%|          | 0/105 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =      14.24 ms /   177 runs   (    0.08 ms per token, 12428.90 tokens per second)
llama_print_timings: prompt eval time =   20554.30 ms /   857 tokens (   23.98 ms per token,    41.69 tokens per second)
llama_print_timings:        eval time =   15493.65 ms /   176 runs   (   88.03 ms per token,    11.36 tokens per second)
llama_print_timings:       total time =   36469.64 ms /  1033 tokens
  1%|          | 1/105 [00:36<1:03:13, 36.48s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =   13122.86 ms
llama_print_timings:      sample time =       5.12 ms /    67 runs   (    0.08 ms per token, 13096.17 tokens per second)
llama_print_timings: prompt eval time =    9079.79 ms /   565 tokens (   16.07 ms per token,    62.23 tokens per second)
llama_print_timings:        eval time =    6003.13 ms /    66 

In [86]:
# Saved results of the 2-document run (';'-separated CSV)
batch_1_results = os.path.join(
    results_folder,
    "results_2_docs_Semantic_Token1024_Token512.csv",
)

df = pd.read_csv(
    batch_1_results,
    sep=";",
)
df

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,semantic_splitter,1.0,0.935185
1,token_splitter_512,1.0,0.892857
2,token_splitter_1024,1.0,0.875


The Sentence Splitter's accuracy was well below the others, which is why it is not used here (cf. results_2_docs_Sentence_Semantic_Token.csv).

On two documents (7 chunks for SemanticSplitter, 5 for Token512 and 2 for Token1024): Semantic has the highest MRR, and increasing the chunk size of the token splitter seems to lower MRR (Hit Rate is 1.0 for all three splitters).

Next step: evaluate on more documents/chunks to confirm.

In [90]:
# Saved results of the 10-document run (';'-separated CSV)
batch_2_results = os.path.join(
    results_folder,
    "results_10_docs_Semantic_Token1024_Token512.csv",
)

df = pd.read_csv(
    batch_2_results,
    sep=";",
)
df

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,semantic_splitter,0.801724,0.698276
1,token_splitter_512,0.717143,0.607143
2,token_splitter_1024,0.721519,0.622363


On 10 documents (49 chunks for Semantic, 121 for Token512 and 52 for Token1024): the Semantic splitter is still better than the Token splitters, but Token1024 is now slightly better than Token512.

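Timer: 10.51 for Semantic, 20.40 for Token512, 14.57 for Token1024 (units not recorded) ==> The working assumption is that, as the database volume increases, the Semantic Splitter will become progressively slower than the TokenSplitter, BUT its results are much better, so it is a trade-off. Note that in this run the Semantic timing is the lowest of the three, so the slowdown still needs to be confirmed.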

In [95]:
batch_3_results = os.path.join(results_folder, "results_23_docs_Semantic_Token1024_Token512.csv")

# DataFrame.to_csv does not create missing directories, so make sure the output folder exists
os.makedirs(results_folder, exist_ok=True)

# Aggregate the raw evaluation results (one row per parser) and persist them
df = compute_results(eval_results_dict)
df.to_csv(batch_3_results, index=False, sep=";")

# Read the file back so the displayed table is what was actually written to disk
df = pd.read_csv(batch_3_results, sep=";")
df

Unnamed: 0,Retriever Name,Hit Rate,MRR
0,semantic_splitter,0.783133,0.684739
1,token_splitter_512,0.766615,0.649923
2,token_splitter_1024,0.763359,0.685751


Finally, on 23 documents (101 chunks for Semantic, 261 for Token512 and 105 for Token1024): the Semantic splitter still has the best Hit Rate, but the Token splitters have increased their Hit Rate and MRR compared with the 10-document run, and Token1024 now matches Semantic on MRR.

Timer: 26.36 for Semantic, 46.23 for Token512 and 24.17 for Token1024 ==> With increased database volume, the Semantic Splitter is becoming slower relative to the TokenSplitter (it is now slower than Token1024), and its results tend to decrease.

In [97]:
# Rebuild the chunks, index and top-3 retriever with the 512-token splitter only
nodes = token_splitter_512.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes)
retriever = vector_index.as_retriever(similarity_top_k=3)

In [103]:
# Length, in characters, of the text of the second chunk
len(nodes[1].text)

1820

# Full evaluation

In [125]:
import os
import streamlit as st
import glob
import base64
import pandas as pd
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter, SemanticSplitterNodeParser, TokenTextSplitter
from llama_index.core import Document, VectorStoreIndex, StorageContext, load_index_from_storage, Settings
from llama_index.core.schema import TextNode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.core.evaluation import generate_question_context_pairs, RetrieverEvaluator, FaithfulnessEvaluator, RelevancyEvaluator, AnswerRelevancyEvaluator
from transformers import AutoTokenizer
import nest_asyncio
from llama_index.llms.mistralai import MistralAI

# To allow nested event loops
nest_asyncio.apply()

# Define constants
# Folder for the evaluation result files (same value as the `results_folder` defined at the top of the notebook)
results_folder = os.path.join("data_evaluation", "full_results")

In [126]:
# Load the corpus from which question/context pairs are crafted; those pairs are used
# to assess both the Retrieval and the Response quality of the RAG system.
input_folder = "./data_evaluation/batch_1"
documents = SimpleDirectoryReader(
    input_dir=input_folder,
    recursive=True,
).load_data()
print(f"\n\nNumber of documents : {len(documents)}\n\n")

# Local LLM run through llama.cpp.
llm = LlamaCPP(
    # A model URL is downloaded automatically. Alternative that was considered:
    # model_url='https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf',
    # Q6_K was used too but was quite slow.
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    # A path to a pre-downloaded model can be given here instead of model_url.
    model_path=None,
    # Zero temperature: the model needs to be factual and deterministic.
    temperature=0.0,
    max_new_tokens=512,
    # Context size (max is ~32k).
    context_window=8192,
    # Kwargs to pass to __call__().
    generate_kwargs={},
    # Set n_gpu_layers to at least 1 to use the GPU.
    model_kwargs={"n_gpu_layers": 10},
    # Transform inputs into Llama2 format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

# Embedding model, shared by the global Settings and the semantic splitter.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=128,
    normalize=True,
)

Settings.llm = llm
Settings.embed_model = embed_model

# Semantic splitter
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

# Token splitters (512 and 1024 tokens).
# Don't pass the tokenizer from the Mistral model: it does not tokenize anything,
# resulting in a single chunk per document.
token_splitter_512 = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separator="\n\n",
)
token_splitter_1024 = TokenTextSplitter(
    chunk_size=1024,
    chunk_overlap=102,
    separator="\n\n",
)

# Registry of the parsers to compare, keyed by parser name.
parsers = {
    "semantic_splitter": semantic_splitter,
    "token_splitter_512": token_splitter_512,
    "token_splitter_1024": token_splitter_1024,
}


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /Users/Calu/Library/Caches/llama_index/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
l



Number of documents : 2




llama_kv_cache_init:      Metal KV buffer size =   320.00 MiB
llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
llama_new_context_with_model:        CPU  output buffer size =     0.12 MiB
llama_new_context_with_model:      Metal compute buffer size =   560.00 MiB
llama_new_context_with_model:        CPU compute buffer size =   560.01 MiB
llama_new_context_with_model: graph nodes  = 1030
llama_new_context_with_model: graph splits = 3
AVX = 0 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | SSSE3 = 0 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'general.quantization_version': '2', 'tokenizer.chat_template': "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endi

In [127]:
# Display the current chunk list (the full repr of every node is printed).
# NOTE(review): `nodes` is not assigned by this section's setup cell; it appears to rely on an earlier cell having run — confirm.
nodes

[TextNode(id_='2e0b8065-bc9c-4f24-8404-f9471eccfe02', embedding=None, metadata={'file_path': '/Users/Calu/Desktop/Code/Python/LLM/3.0_BG3_Chatbot_LLM_RAG_App/chatbot/data_evaluation/batch_1/Arnell_Hallowleaf.txt', 'file_name': 'Arnell_Hallowleaf.txt', 'file_type': 'text/plain', 'file_size': 2846, 'creation_date': '2024-05-03', 'last_modified_date': '2024-05-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f7284ba3-b5cd-4a37-bc72-5787e2839307', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/Calu/Desktop/Code/Python/LLM/3.0_BG3_Chatbot_LLM_RAG_App/chatbot/data_evaluation/batch_1/Arnell_Hallowleaf.txt', 'file_name': 'Arnell_Hallowleaf.txt', 'file_type': 'text/plain', 'file_size': 284

In [118]:
# Setup your API KEY here
# getpass keeps the key out of the notebook output, unlike input(), which echoes what is typed.
from getpass import getpass

api_key = getpass("Put your API key here")

# Load Mixtral 8x7b model
# The model is selected with `model`; `endpoint` is the API base URL, not the model name.
# NOTE(review): the AttributeError recorded under this cell ('MistralClient' object has no
# attribute 'complete') looks like it comes from an earlier run in which `llm_mixtral` was a raw
# Mistral client rather than the llama_index `MistralAI` wrapper — re-run to confirm.
llm_mixtral = MistralAI(api_key=api_key, model="open-mixtral-8x7b")

# Embedding model used for indexing and retrieval
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=128,
    normalize=True,
)

Settings.llm = llm_mixtral
Settings.embed_model = embed_model

# Create context question pairs (1 question per chunk)
# NOTE(review): `nodes` must already exist in the kernel; it is not created in this cell — confirm which cell provides it.
qa_dataset = generate_question_context_pairs(
    nodes,
    llm=llm_mixtral,
    num_questions_per_chunk=1,
)

  0%|          | 0/5 [00:00<?, ?it/s]


AttributeError: 'MistralClient' object has no attribute 'complete'

In [None]:
# Load the corpus from which question/context pairs are crafted; those pairs are used
# to assess both the Retrieval and the Response quality of the RAG system.
input_folder = "./data_evaluation/batch_1"
documents = SimpleDirectoryReader(
    input_dir=input_folder,
    recursive=True,
).load_data()
print(f"\n\nNumber of documents : {len(documents)}\n\n")

# Local LLM run through llama.cpp.
llm = LlamaCPP(
    # A model URL is downloaded automatically. Alternative that was considered:
    # model_url='https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf',
    # Q6_K was used too but was quite slow.
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    # A path to a pre-downloaded model can be given here instead of model_url.
    model_path=None,
    # Zero temperature: the model needs to be factual and deterministic.
    temperature=0.0,
    max_new_tokens=512,
    # Context size (max is ~32k).
    context_window=8192,
    # Kwargs to pass to __call__().
    generate_kwargs={},
    # Set n_gpu_layers to at least 1 to use the GPU.
    model_kwargs={"n_gpu_layers": 10},
    # Transform inputs into Llama2 format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

# Embedding model, shared by the global Settings and the semantic splitter.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=128,
    normalize=True,
)

Settings.llm = llm
Settings.embed_model = embed_model

# Semantic splitter
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=embed_model,
)

# Token splitters (512 and 1024 tokens).
# Don't pass the tokenizer from the Mistral model: it does not tokenize anything,
# resulting in a single chunk per document.
token_splitter_512 = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    separator="\n\n",
)
token_splitter_1024 = TokenTextSplitter(
    chunk_size=1024,
    chunk_overlap=102,
    separator="\n\n",
)

# Registry of the parsers to compare, keyed by parser name.
parsers = {
    "semantic_splitter": semantic_splitter,
    "token_splitter_512": token_splitter_512,
    "token_splitter_1024": token_splitter_1024,
}
