# Determining ideal chunk size

In [1]:
import nest_asyncio

nest_asyncio.apply()

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings
)
from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.openai import OpenAI

import openai
import time
import getpass

# Prompt for the key interactively so it is never written into the notebook source.
openai_key = getpass.getpass("Enter API key for OpenAI: ")
# Disabled alternative: expose the key through the environment instead
# (would additionally require `import os`).
# os.environ["OPENAI_API_KEY"] = openai_key
openai.api_key = openai_key

In [2]:
# Load the sample documents from the local ./data_samples/ directory
reader = SimpleDirectoryReader("./data_samples/")
documents = reader.load_data()


In [13]:
# Question generation
# Build a DatasetGenerator over all loaded documents and ask it for questions.
# No `num` limit is passed here, unlike the later cell that requests 20.
data_generator = DatasetGenerator.from_documents(documents)
eval_questions = data_generator.generate_questions_from_nodes()

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [15]:
# Evaluators
# GPT-4o at temperature 0 is used to grade the responses.
gpt4o = OpenAI(temperature=0, model="gpt-4o")

# Settings.llm is the global default LLM (it replaces the older ServiceContext),
# so components created later without an explicit llm will also pick up GPT-4o.
Settings.llm = gpt4o

# Bind the evaluators to GPT-4o explicitly so they do not depend on the
# global Settings.llm having been assigned before this point.
faithfulness_gpt4o = FaithfulnessEvaluator(llm=gpt4o)
relevancy_gpt4o = RelevancyEvaluator(llm=gpt4o)

In [3]:
def evaluate_response_time_and_accuracy(chunk_size, eval_questions, eval_documents):
    """
    Measure average response time, faithfulness, and relevancy for one chunk size.

    A new vector index is built over ``eval_documents`` after assigning
    ``chunk_size`` to the global ``Settings.chunk_size``. Each question is then
    answered by the index's default query engine and the answer is graded by
    the module-level ``faithfulness_gpt4o`` and ``relevancy_gpt4o`` evaluators.
    This function does not select the answering LLM itself; the query engine
    uses whatever default is configured outside this function.

    Note: ``Settings.chunk_size`` is left at ``chunk_size`` after the call.

    Parameters:
    chunk_size (int): Value assigned to ``Settings.chunk_size`` before indexing.
    eval_questions (iterable): Questions passed one by one to the query engine.
    eval_documents (list): Documents passed to ``VectorStoreIndex.from_documents``.

    Returns:
    tuple: (average response time in seconds, fraction of responses passing
    the faithfulness check, fraction passing the relevancy check).

    Raises:
    ValueError: If ``eval_questions`` is empty.
    """
    # Materialize once so any iterable works and the count is known up front.
    questions = list(eval_questions)
    if not questions:
        # Fail before building the index: averaging over zero questions is
        # undefined and indexing would be wasted work.
        raise ValueError("eval_questions must contain at least one question")
    num_questions = len(questions)

    # create vector index with the requested chunk size
    Settings.chunk_size = chunk_size
    vector_index = VectorStoreIndex.from_documents(
        eval_documents,
    )
    # build query engine
    query_engine = vector_index.as_query_engine()

    total_response_time = 0.0
    total_faithfulness = 0
    total_relevancy = 0

    # Iterate over each question to compute metrics.
    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),
    # a plain loop is used here so that only the query call itself is timed.
    for question in questions:
        # perf_counter is monotonic, so the interval is immune to clock adjustments.
        start_time = time.perf_counter()
        response_vector = query_engine.query(question)
        elapsed_time = time.perf_counter() - start_time

        faithfulness_result = faithfulness_gpt4o.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_gpt4o.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        # `passing` is added as a number, so the totals count passing responses.
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

In [11]:
# Limit the experiment to the first 20 loaded documents and generate
# 20 evaluation questions from that same subset.
eval_documents = documents[:20]
data_generator = DatasetGenerator.from_documents(eval_documents)
eval_questions = data_generator.generate_questions_from_nodes(num=20)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [17]:
# Display the generated evaluation questions as the cell output.
eval_questions

['What was the goal of enhancing the clinical usefulness of DSM-5?',
 'Why are reliable diagnoses essential in the field of mental health?',
 'How has the understanding of mental disorders and their treatments evolved over time?',
 'What progress has been made in areas such as cognitive neuroscience, brain imaging, epidemiology, and genetics in the last two decades?',
 'Why is it important for DSM to evolve in the context of other clinical research initiatives?',
 'What arguments support the idea that boundaries between disorder categories are more fluid than previously recognized?',
 'How does DSM-5 aim to improve the validity of a diagnosis?',
 'Why is clinical training and experience necessary to use DSM for determining a diagnosis?',
 'What information is included in the diagnostic criteria of DSM-5?',
 'How does DSM-5 aim to provide a clear and concise description of each mental disorder?',
 'What was the purpose of the predecessor of DSM published in 1844?',
 'How has the underst

In [16]:
chunk_sizes = [128, 256, 512, 1024, 2048]

# Run the same evaluation once per chunk size and print one summary line each.
for chunk_size in chunk_sizes:
    avg_response_time, avg_faithfulness, avg_relevancy = (
        evaluate_response_time_and_accuracy(
            chunk_size,
            eval_questions,
            eval_documents,
        )
    )
    print(
        f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}"
    )

Chunk size 128 - Average Response time: 1.74s, Average Faithfulness: 1.00, Average Relevancy: 1.00
Chunk size 256 - Average Response time: 2.14s, Average Faithfulness: 1.00, Average Relevancy: 1.00
Chunk size 512 - Average Response time: 2.54s, Average Faithfulness: 0.95, Average Relevancy: 0.95
Chunk size 1024 - Average Response time: 4.56s, Average Faithfulness: 0.90, Average Relevancy: 0.90
Chunk size 2048 - Average Response time: 4.29s, Average Faithfulness: 0.90, Average Relevancy: 0.90
