### Import libraries and environment variables

In [42]:
import nest_asyncio
import random

nest_asyncio.apply()
from dotenv import load_dotenv
# from openai import OpenAI
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core.prompts import PromptTemplate

from llama_index.core.evaluation import (
    DatasetGenerator,
    FaithfulnessEvaluator,
    RelevancyEvaluator
)
from llama_index.llms.openai import OpenAI

import openai
import time
import os
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

### Read Docs

In [43]:
data_dir = "../data"
documents = SimpleDirectoryReader(data_dir).load_data()

In [44]:
len(documents)
# documents

36

### Create evaluation questions and pick k out of them

In [45]:
num_eval_questions = 25

eval_documents = documents[0:20] # 문서중 20개만 뽑기 
data_generator = DatasetGenerator.from_documents(eval_documents) # 질문을 뽑아낼 수 있는 문서 객체 만듦
eval_questions = data_generator.generate_questions_from_nodes() # 질문 목록 만들어냄 
k_eval_questions = random.sample(eval_questions, num_eval_questions) # 25개만 뽑아냄

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


### Define metrics evaluators and modify llama_index faithfullness evaluator prompt to rely on the context 

In [46]:

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

In [47]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

# Define and configure Settings directly
Settings.llm = OpenAI(temperature=0, model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

# Set up faithfulness evaluation with a custom prompt
faithfulness_new_prompt_template = PromptTemplate("""Please tell if a given piece of information is directly supported by the context.
    You need to answer with either YES or NO.
    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

    Information: Apple pie is generally double-crusted.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: YES

    Information: Apple pies taste bad.
    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
    Answer: NO

    Information: Paris is the capital of France.
    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
    Answer: NO

    Information: {query_str}
    Context: {context_str}
    Answer:
""")

# Initialize faithfulness evaluator with the custom prompt template
faithfulness_gpt4 = FaithfulnessEvaluator()
faithfulness_gpt4.update_prompts({"your_prompt_key": faithfulness_new_prompt_template})

# Initialize relevancy evaluator
relevancy_gpt4 = RelevancyEvaluator()


In [48]:
# # We will use GPT-4 for evaluating the responses
# gpt4 = OpenAI(temperature=0, model="gpt-4o")

# # Define service context for GPT-4 for evaluation
# service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4)

# # 답변이 문맥에 얼마나 충실한지? 
# faithfulness_gpt4 = FaithfulnessEvaluator(service_context=service_context_gpt4)

# # context_str과 query_str이 주어졌을 때, 정보가 문맥에 의해 명확하게 뒷받침되는지 평가하는 프롬프트 
# faithfulness_new_prompt_template = PromptTemplate(""" Please tell if a given piece of information is directly supported by the context.
#     You need to answer with either YES or NO.
#     Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.

#     Information: Apple pie is generally double-crusted.
#     Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
#     Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
#     It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
#     Answer: YES

#     Information: Apple pies taste bad.
#     Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.
#     Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.
#     It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).
#     Answer: NO

#     Information: Paris is the capital of France.
#     Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.
#     Answer: NO

#     Information: {query_str}
#     Context: {context_str}
#     Answer:

#     """)

# # 충실성 평가할 새로운 프롬프트 템플릿을 새로 설정함 
# faithfulness_gpt4.update_prompts({"your_prompt_key": faithfulness_new_prompt_template}) 
# # 질문이 응답과 얼마나 관련이 높은지? 
# relevancy_gpt4 = RelevancyEvaluator(service_context=service_context_gpt4)

### Function to evaluate metrics for each chunk size

In [49]:
import openai
import time

# 평가 함수 정의
def evaluate_response_time_and_accuracy(chunk_size, eval_questions):
    """
    주어진 청크 크기에 대해 GPT-3.5-turbo로 생성된 응답의 평균 응답 시간, 충실성, 관련성을 평가합니다.
    
    Parameters:
    chunk_size (int): 데이터 청크 크기
    
    Returns:
    tuple: 평균 응답 시간, 충실성, 관련성 점수를 포함하는 튜플
    """

    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # gpt-3.5-turbo 모델을 직접 사용하여 응답 생성
    def query_gpt_35(query):
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": query}]
        )
        return response.choices[0].message['content']

    # 벡터 인덱스 설정 (ServiceContext 사용 없이)
    # 여기에 vector_index와 query_engine을 설정하는 다른 방법을 넣어야 할 수 있습니다.
    # vector_index = VectorStoreIndex.from_documents(eval_documents, chunk_size=chunk_size, chunk_overlap=chunk_size//5)
    # query_engine = vector_index.as_query_engine(similarity_top_k=5)

    num_questions = len(eval_questions)

    # 각 질문에 대해 응답 생성 및 평가
    for question in eval_questions:
        start_time = time.time()
        
        # gpt-3.5-turbo로 응답 생성
        response_vector = query_gpt_35(question)
        
        elapsed_time = time.time() - start_time
        
        # GPT-4로 충실성 평가
        faithfulness_result = faithfulness_gpt4.evaluate_response(response=response_vector).passing
        
        # GPT-4로 관련성 평가
        relevancy_result = relevancy_gpt4.evaluate_response(query=question, response=response_vector).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    # 평균값 계산
    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy


### Test different chunk sizes 

In [52]:
chunk_sizes = [128, 256]

for chunk_size in chunk_sizes:
  avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size, k_eval_questions)
  print(f"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}")

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [51]:
! pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.52.1
    Uninstalling openai-1.52.1:
      Successfully uninstalled openai-1.52.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ragas 0.2.2 requires openai>1, but you have openai 0.28.0 which is incompatible.
llama-index-agent-openai 0.3.4 requires openai>=1.14.0, but you have openai 0.28.0 which is incompatible.
llama-index-legacy 0.9.48.post3 requires openai>=1.1.0, but you have openai 0.28.0 which is incompatible.
langchain-openai 0.2.3 requires openai<2.0.0,>=1.52.0, but you have openai 0.28.0 which is incompatible.
llama-index-embeddings-openai 0.2.5 requires openai>=1.1.0, but you have openai 0.28.0 which