### Import libraries and environment variables

In [1]:
import nest_asyncio
import random

nest_asyncio.apply()
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.prompts import PromptTemplate

from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

import openai
import time
import os
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

### Read Docs

In [2]:
data_dir = "../data"
documents = SimpleDirectoryReader(data_dir).load_data()

In [3]:
len(documents)

37

### Create evaluation questions and pick k out of them

In [4]:
num_eval_questions = 25

eval_documents = documents[0:20] # 문서중 20개만 뽑기 
data_generator = DatasetGenerator.from_documents(eval_documents) # 질문을 뽑아낼 수 있는 문서 객체 만듦
eval_questions = data_generator.generate_questions_from_nodes() # 질문 목록 만들어냄 
k_eval_questions = random.sample(eval_questions, num_eval_questions) # 25개만 뽑아냄

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


### Define metrics evaluators and modify llama_index faithfullness evaluator prompt to rely on the context 

In [5]:

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

In [9]:
# We will use GPT-4 for evaluating the responses
gpt4 = OpenAI(temperature=0, model="gpt-4o")
service_context_gpt4 = gpt4
# 답변이 문맥에 얼마나 충실한지? 
faithfulness_gpt4 = FaithfulnessEvaluator()

# context_str과 query_str이 주어졌을 때, 정보가 문맥에 의해 명확하게 뒷받침되는지 평가하는 프롬프트 
faithfulness_new_prompt_template = PromptTemplate(""" Please tell if a given piece of information is directly supported by the context.
    You need to answer with either YES or NO.
    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

    Information: Apple pie is generally double-crusted.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: YES

    Information: Apple pies taste bad.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: NO

    Information: Paris is the capital of France.
    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
    Answer: NO

    Information: {query_str}
    Context: {context_str}
    Answer:

    """)

# 충실성 평가할 새로운 프롬프트 템플릿을 새로 설정함 
faithfulness_gpt4.update_prompts({"your_prompt_key": faithfulness_new_prompt_template}) 
# 질문이 응답과 얼마나 관련이 높은지? 
relevancy_gpt4 = RelevancyEvaluator()

### Function to evaluate metrics for each chunk size

In [10]:
# 청크 사이즈, 평가할 질문 목록을 받아서 평균 응답 시간, 충실성, 관련성을 계산함
def evaluate_response_time_and_accuracy(chunk_size, eval_questions):

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    llm = OpenAI(model="gpt-3.5-turbo")

    Settings.llm = llm
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_size // 5 

    # 인덱스 생성
    vector_index = VectorStoreIndex.from_documents(eval_documents)
    # 쿼리 엔진 생성
    query_engine = vector_index.as_query_engine(similarity_top_k=5)
    num_questions = len(eval_questions)

    # 각 질문에 대해 평가
    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time
        
        faithfulness_result = faithfulness_gpt4.evaluate_response(
            response=response_vector
        ).passing
        
        relevancy_result = relevancy_gpt4.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy


### Test different chunk sizes 

In [11]:
chunk_sizes = [128, 256]

for chunk_size in chunk_sizes:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size, k_eval_questions)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

Chunk size 128 - Average Response time: 1.97s, Average Faithfulness: 0.64, Average Relevancy: 1.00
Chunk size 256 - Average Response time: 1.35s, Average Faithfulness: 0.80, Average Relevancy: 1.00


## 결론 
1. 문서를 어떤 방식으로 청킹해야 하는지 , 어떤 청크 사이즈가 더 적절한지 판단하기 위한 방법
2. 청크 사이즈에 따라 달라지는 평균 응답 시간, 충실성, 관련성을 파악하고 최적의 청크 사이즈를 찾는 방법