## Fin-Prophet: An RAG QA System for Finance

1. Import Libraries

In [None]:
# Mount Google Drive at /content/drive so later cells can read the data folder stored under MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install llama_index openai



In [None]:
! pip install ipykernel tiktoken cohere pypdf sentence-transformers torchvision torchaudio torchtext deeplake[all]



In [None]:
! pip install llama-index-vector-stores-deeplake llama-index-llms-cohere llama-index-embeddings-cohere

Collecting llama-index-vector-stores-deeplake
  Using cached llama_index_vector_stores_deeplake-0.1.2-py3-none-any.whl (4.3 kB)
Collecting llama-index-llms-cohere
  Using cached llama_index_llms_cohere-0.1.6-py3-none-any.whl (5.0 kB)
Collecting llama-index-embeddings-cohere
  Using cached llama_index_embeddings_cohere-0.1.8-py3-none-any.whl (3.8 kB)
Installing collected packages: llama-index-vector-stores-deeplake, llama-index-llms-cohere, llama-index-embeddings-cohere
Successfully installed llama-index-embeddings-cohere-0.1.8 llama-index-llms-cohere-0.1.6 llama-index-vector-stores-deeplake-0.1.2


In [None]:
# Global
import os
import getpass
import textwrap
import time
import locale
import json
import re
import torch
from time import time
import json

# Llama Index
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex, ServiceContext, StorageContext
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SimpleNodeParser, SentenceWindowNodeParser
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.evaluation import generate_question_context_pairs, CorrectnessEvaluator, RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner, RetrieverEvaluator
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# OpenAI
import openai
from openai import OpenAI

# Warnings
import warnings
warnings.filterwarnings("ignore")

2. Get API Keys

In [None]:
# `is_available` has to be called: the bare function object is always truthy,
# so the original check reported "GPU available" even on a CPU-only runtime.
if torch.cuda.is_available():
  print('GPU available')
else:
  print('Please set GPU via Edit -> Notebook Settings.')

GPU available


In [None]:
# Never store a real key in the notebook: reuse a key already present in the
# environment, otherwise prompt for it without echoing it into the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

3. Data Ingestion and Preprocessing

3.1. Get Data

In [None]:
# Folder on the mounted Drive that holds the source files
data_folder = "/content/drive/MyDrive/rag_data"

reader = SimpleDirectoryReader(input_dir=data_folder)

# Each item yielded by iter_data() is itself an iterable of documents;
# flatten them into one list.
documents = [doc for docs in reader.iter_data() for doc in docs]

In [None]:
# Sanity check of the load: container type and size, then element type and a small sample
for summary in (type(documents), len(documents)):
    print(summary, "\n")
print(type(documents[0]))
print(documents[0:4])

<class 'list'> 

504 

<class 'llama_index.core.schema.Document'>
[Document(id_='5e250469-3fa4-420e-b2dd-293c18906fc2', embedding=None, metadata={'page_label': '1', 'file_name': 'Introduction-to-Financial-Analysis-1702314047._print.pdf', 'file_path': '/content/drive/MyDrive/rag_data/Introduction-to-Financial-Analysis-1702314047._print.pdf', 'file_type': 'application/pdf', 'file_size': 8646753, 'creation_date': '2024-04-20', 'last_modified_date': '2024-04-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Introduction to Financial\nAnalysis', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), Document(id_='e17934c6-45ad-4978-87af-6b515e6dbd8d', embedding=Non

3.2. Text Cleaning

In [None]:
# Normalise whitespace: every run of whitespace (spaces, tabs, newlines) becomes one space
def clean_text(text):
    """Return `text` with each run of whitespace characters replaced by a single space.

    Leading and trailing whitespace is collapsed to one space, not stripped.
    """
    return re.sub(r'\s+', ' ', text)

# Apply clean_text to the text of every document; the result is a list of plain strings
cleaned_documents = list(map(clean_text, (doc.text for doc in documents)))

In [None]:
# Sanity check of the cleaning step: container type and size, then element type and a small sample
for summary in (type(cleaned_documents), len(cleaned_documents)):
    print(summary, "\n")
print(type(cleaned_documents[0]))
print(cleaned_documents[0:4])

<class 'list'> 

504 

<class 'str'>
['Introduction to Financial Analysis', 'Introduction to Financial Analysis Kenneth S.Bigel OPEN TOUR O NEW YORK, NY', 'Introduction to Financial Analysis Copyright © 2022 by Kenneth S. Bigel is licensed under a Creative Commons Attribution 4.0 International License , except wher e otherwise noted. Cover image: New Y ork City (28) by Jesús Quiles is licensed under CC-BY 2.0 Idea icon made by Freepik from www .flaticon.com', '']


3.3. Merging Documents

In [None]:
# Merge all cleaned texts into one Document, separating the original pieces with a blank line
document = Document(text="\n\n".join(cleaned_documents))

# Preview the type of the merged object and the first 500 characters of its text
text_content = document.get_text()
print(type(document))
print(text_content[:500])

<class 'llama_index.core.schema.Document'>
Introduction to Financial Analysis

Introduction to Financial Analysis Kenneth S.Bigel OPEN TOUR O NEW YORK, NY

Introduction to Financial Analysis Copyright © 2022 by Kenneth S. Bigel is licensed under a Creative Commons Attribution 4.0 International License , except wher e otherwise noted. Cover image: New Y ork City (28) by Jesús Quiles is licensed under CC-BY 2.0 Idea icon made by Freepik from www .flaticon.com



Contents About the Author xvi Author's Acknowledgements xix Open T ouro Acknow


4. Basic RAG Pipeline

4.1. Indexing Documents

In [None]:
!pip install llama-index-embeddings-huggingface

Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.2.0-py3-none-any.whl (7.1 kB)
Installing collected packages: llama-index-embeddings-huggingface
Successfully installed llama-index-embeddings-huggingface-0.2.0


In [None]:
# The import cell's later `from openai import OpenAI` rebinds the name `OpenAI`,
# so the LlamaIndex LLM wrapper is re-imported here before it is used.
from llama_index.llms.openai import OpenAI

# LLM that answers the queries
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Bundle the LLM with a locally run embedding model (BAAI/bge-small-en-v1.5)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
)

# Build a vector index over the single merged document
index = VectorStoreIndex.from_documents([document], service_context=service_context)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

4.2. Basic Query Retrieval

In [None]:
# Query engine with default settings on top of the basic index
query_engine = index.as_query_engine()

# Ask a single question and print the generated answer
response = query_engine.query("What is the basic accounting equation as described in \"Introduction to Financial Analysis\"?")
print(str(response))

The basic accounting equation, as described in "Introduction to Financial Analysis," is Assets = Liabilities + Equity.


5. LlamaIndex: Deep Lake RAG Pipeline

5.1. Nodes

5.1.1. Sentence Splitter

In [None]:
# Build a SentenceSplitter that cuts the text into chunks of size 512
# NOTE(review): confirm that SentenceSplitter treats `paragraph_separator` as a regular
# expression; if it is used as a literal separator string this pattern will never match.
sentence_node_parser = SentenceSplitter.from_defaults(
    chunk_size=512,
    paragraph_separator=r"\n(?:●|-|\s{2,}|\.\s|？|！)\n",
    include_metadata=True,         # keep document metadata on every node
    include_prev_next_rel=True,    # link each node to its previous/next neighbour
)

# Split the merged document into nodes
sentence_nodes = sentence_node_parser.get_nodes_from_documents([document])

# Report the number of cleaned texts and the number of nodes produced from the merged document
print(f"Number of Documents: {len(cleaned_documents)}")
print(f"Number of nodes: {len(sentence_nodes)} with the current chunk size of {sentence_node_parser.chunk_size}")

Number of Documents: 504
Number of nodes: 330 with the current chunk size of 512


5.1.2. Sentence Window Node Parser

In [None]:
# Sentence splitter handed to the sentence-window parser
def custom_sentence_splitter(text):
    """Split `text` at every match of the boundary pattern and return the pieces as a list.

    A boundary is a newline, then one of: a bullet (●), a hyphen, two or more
    whitespace characters, a period followed by whitespace, or a full-width
    question/exclamation mark, then another newline.
    """
    boundary = r"\n(?:●|-|\s{2,}|\.\s|？|！)\n"
    return re.split(boundary, text)

# Sentence-window parser that uses the custom splitter and a window size of 3
window_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    sentence_splitter=custom_sentence_splitter,
    include_metadata=True,
    include_prev_next_rel=True,
)

# Parse the merged document into window nodes
window_nodes = window_parser.get_nodes_from_documents([document])

# Show the text of every node, then the "original_text" and "window" metadata of the first node
print([x.text for x in window_nodes])
print(window_nodes[0].metadata["original_text"])
print(window_nodes[0].metadata["window"])

Introduction to Financial Analysis

Introduction to Financial Analysis Kenneth S.Bigel OPEN TOUR O NEW YORK, NY

Introduction to Financial Analysis Copyright © 2022 by Kenneth S. Bigel is licensed under a Creative Commons Attribution 4.0 International License , except wher e otherwise noted. Cover image: New Y ork City (28) by Jesús Quiles is licensed under CC-BY 2.0 Idea icon made by Freepik from www .flaticon.com
Introduction to Financial Analysis

Introduction to Financial Analysis Kenneth S.Bigel OPEN TOUR O NEW YORK, NY

Introduction to Financial Analysis Copyright © 2022 by Kenneth S. Bigel is licensed under a Creative Commons Attribution 4.0 International License , except wher e otherwise noted. Cover image: New Y ork City (28) by Jesús Quiles is licensed under CC-BY 2.0 Idea icon made by Freepik from www .flaticon.com Contents About the Author xvi Author's Acknowledgements xix Open T ouro Acknowledgements xxii Preface xxiv Part I. Financial Statements and Ratio Analysis, and F 

5.2. Indexing

In [None]:
# Define a function to chunk the original text
def chunk_text(text, max_tokens_per_chunk):
    """Greedily pack the whitespace-separated words of `text` into chunks.

    Despite the parameter name, the budget is measured in characters: each word
    contributes ``len(word)`` and the joining spaces are not counted. A word is
    added to the current chunk while the running total stays strictly below
    ``max_tokens_per_chunk``; otherwise the current chunk is closed and the word
    starts a new one. A single word that is at or above the budget becomes a
    chunk of its own.

    Args:
        text: The string to split.
        max_tokens_per_chunk: Size budget per chunk (in characters, see above).

    Returns:
        A list of chunk strings with words joined by single spaces. Never
        contains empty strings; empty or whitespace-only input yields ``[]``.
    """
    chunked_text = []
    current_chunk = []
    current_size = 0

    for word in text.split():
        word_size = len(word)
        if current_size + word_size < max_tokens_per_chunk:
            current_chunk.append(word)
            current_size += word_size
            continue

        # Close the running chunk only if it holds something; without this guard an
        # oversized first word caused an empty-string chunk to be emitted.
        if current_chunk:
            chunked_text.append(" ".join(current_chunk))
        current_chunk = [word]
        current_size = word_size

    if current_chunk:
        chunked_text.append(" ".join(current_chunk))

    return chunked_text

In [None]:
# Initialize an OpenAI language model (LLM) for question answering
llm = OpenAI(model="gpt-3.5-turbo-1106")

# Local path of the Deep Lake dataset
dataset_path = "/content/nodes/deep_lake_db"

# Deep Lake vector store; any existing dataset at the path is overwritten
vector_store = DeepLakeVectorStore(
    dataset_path=dataset_path,
    overwrite=True,
    exec_option="compute_engine"
)

# OpenAI embedding model with the library's default settings
embed_model = OpenAIEmbedding()

# Service context bundling the embedding model and the LLM
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Storage context wrapping the Deep Lake vector store
# NOTE(review): `storage_context` is never passed to VectorStoreIndex below, so the index
# does not appear to use the Deep Lake store at all — confirm whether that is intended
# (and whether exec_option="compute_engine" is valid for a local dataset) before wiring it in.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Size budget per chunk; chunk_text measures this in characters, not model tokens
max_tokens_per_chunk = 8192

# Split the merged document text into chunks
chunked_document = chunk_text(document.get_text(), max_tokens_per_chunk)

# Rebuild the window nodes chunk by chunk
window_nodes = []

# The loop variable is deliberately not called `index`: that name holds the
# VectorStoreIndex built by the basic pipeline and must not be overwritten with an int.
for chunk_id, chunk in enumerate(chunked_document):
    # One Document per chunk, identified by its position
    doc = Document(text=chunk, id_=str(chunk_id))
    # Parse the chunk into sentence-window nodes and collect them
    window_nodes.extend(window_parser.get_nodes_from_documents([doc]))

# Index the window nodes using the service context
vector_index = VectorStoreIndex(
    window_nodes,
    service_context=service_context)



5.3. Post Processing

In [None]:
# Post-processor keyed on the "window" metadata entry that the sentence-window nodes carry
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")


5.4. Reranking

In [None]:
# Reranker based on the BAAI/bge-reranker-base model, configured with top_n=3
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=3)

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

5.5. Query Engine

In [None]:
# Query engine over the window-node index: retrieve 10 candidates, then run the
# metadata-replacement and rerank post-processors in that order
query_engine = vector_index.as_query_engine(
    node_postprocessors=[postproc, rerank],
    similarity_top_k=10,
)

# Start the timer
now = time()

# Ask about the basic accounting equation
response = query_engine.query(
    "What is the basic accounting equation as described in \"Introduction to Financial Analysis\"?",
)

# Report how long the query took
print(f"Elapsed: {round(time() - now, 2)}s")

Elapsed: 2.37s


In [None]:
# Show the answer and the nodes that support it
if not (response and response.source_nodes):
    print("No results found.")
else:
    # Generated answer
    print(response)

    # Score and node id of each source node, in ranked order
    print("Top-K Results:")
    for rank, node_with_score in enumerate(response.source_nodes):
        print(f"{rank + 1}. Score: {node_with_score.score} - Node ID: {node_with_score.node.id_}")

# Elapsed time since `now` was set in the query cell, so it also includes the gap between the two cells
print(f"Elapsed: {round(time() - now, 2)}s")

The basic accounting equation, as described in "Introduction to Financial Analysis," is Assets equals Liabilities plus Equity, or A = L + E. This equation is fundamental to understanding the financial position of a company and is depicted on the Balance Sheet.
Top-K Results:
1. Score: 0.9965841770172119 - Node ID: f977bc2c-15f8-4e6c-887f-1ad4a42fc02a
2. Score: 0.9675183892250061 - Node ID: ebece71f-ef41-401e-b6cc-c3315c980adf
3. Score: 0.8595015406608582 - Node ID: 49d99b8a-d820-4f42-8a17-f1475b890c5d
Elapsed: 2.38s


6. Model Evaluation

6.1. Generate Questions

In [None]:
from llama_index.core.schema import TextNode  # Node class used when building the evaluation dataset

# Create the OpenAI SDK client through the `openai` module explicitly. The bare name
# `OpenAI` is also bound to LlamaIndex's LLM wrapper (re-imported in the basic indexing
# cell), which does not expose the SDK's `chat.completions` API; with the old bare
# `except:` that mistake was hidden and every call returned the fallback string.
client = openai.OpenAI()

# Function to generate the evaluation dataset
def generate_question(text):
    """Ask the chat model for a question that can be answered from `text`.

    Args:
        text: Context passage sent as the user message.

    Returns:
        The generated question, or the string "No question generated" if the
        request fails for any reason (the error is printed, not raised).
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-1106",
            messages=[
                {"role": "system", "content": "You are a world class expert for generating questions based on provided context. \
                        You make sure the question can be answered by the text."},
                {
                    "role": "user",
                    "content": text,
                },
            ],
        )
        return response.choices[0].message.content
    except Exception as exc:
        # Best effort: keep the notebook running, but make the failure visible
        print(f"Question generation failed: {exc}")
        return "No question generated"

In [None]:
def chunk_nodes(window_nodes, max_tokens_per_chunk):
    """Greedily merge consecutive node texts into larger chunk strings.

    The size of a node is its number of whitespace-separated words. Nodes are
    appended to the current chunk until adding the next one would push the word
    count above ``max_tokens_per_chunk``; that node then starts a new chunk. A
    single node larger than the budget becomes a chunk of its own.

    Args:
        window_nodes: Iterable of objects exposing a ``text`` attribute.
        max_tokens_per_chunk: Word budget per chunk.

    Returns:
        A list of strings, each the space-joined texts of one group of nodes.
        No chunk is built from an empty group; an empty input yields ``[]``.
    """
    chunks = []
    current_chunk = []
    current_token_count = 0

    for node in window_nodes:
        node_token_count = len(node.text.split())  # size = whitespace-separated words

        if current_token_count + node_token_count > max_tokens_per_chunk:
            # Flush only a non-empty group; without this guard an oversized first
            # node caused an empty-string chunk to be emitted.
            if current_chunk:
                chunks.append(" ".join(n.text for n in current_chunk))
            current_chunk = [node]
            current_token_count = node_token_count
        else:
            current_chunk.append(node)
            current_token_count += node_token_count

    if current_chunk:  # Add the last chunk if it exists
        chunks.append(" ".join(n.text for n in current_chunk))

    return chunks

In [None]:
# Word budget per merged chunk (chunk_nodes counts whitespace-separated words)
max_tokens_per_chunk = 8192

# Merge the window nodes into larger chunk strings
chunked_nodes = chunk_nodes(window_nodes, max_tokens_per_chunk)

# Wrap every chunk string in a TextNode
chunked_node_objects = [TextNode(text=chunk_str) for chunk_str in chunked_nodes]

# Generate one question per chunk with the LLM
qc_dataset = generate_question_context_pairs(
    chunked_node_objects,
    num_questions_per_chunk=1,
    llm=llm,
)

# Persist the dataset as JSON for later use
qc_dataset.save_json("qc_dataset.json")

100%|██████████| 10/10 [01:02<00:00,  6.20s/it]


In [None]:
# Reload the saved question/context dataset from disk
qc_dataset = EmbeddingQAFinetuneDataset.from_json("qc_dataset.json")

6.2. Evaluation

The metrics that we will use are:

**Relevancy** evaluates whether the retrieved context and answer are relevant to the query.

**Faithfulness** evaluates the integrity of the answer: whether it faithfully represents the information in the retrieved context (that is, whether the response from the query engine is supported by the source nodes) or, in other words, whether there is a hallucination.

In [None]:
# Evaluation with top k 6 (it might exceed the tokens amount)

i = 6

query_engine = vector_index.as_query_engine(
    similarity_top_k=i,                   # Retrieve the top 10 most similar results
    node_postprocessors=[postproc, rerank] # Apply post-processing techniques (postproc and rerank) to the retrieved nodes
)
# While we use GPT3.5-Turbo to answer questions
llm2 = OpenAI(model="gpt-3.5-turbo-16k", max_tokens=256)

service_context_gpt = ServiceContext.from_defaults(llm=llm2)


faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt)
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt)

# Run evaluation
queries = list(qc_dataset.queries.values())
batch_eval_queries = queries[:10]

runner = BatchEvalRunner(
{"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
workers=8,
)
eval_results = await runner.aevaluate_queries(
    query_engine, queries=batch_eval_queries
)
faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
print(f"top_{i} faithfulness_score: {faithfulness_score}")

relevancy_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['relevancy'])
print(f"top_{i} relevancy_score: {relevancy_score}")



top_6 faithfulness_score: 1.0
top_6 relevancy_score: 1.0


In [None]:
# Display the generated evaluation questions stored in the dataset
qc_dataset.queries.values()

dict_values(['Explain the difference between the roles of the controller and treasurer in a corporation, and how they report to the vice president of finance or chief financial officer (CFO). Provide examples of their functions within the organization.', "Explain the differences between the FIFO and LIFO inventory costing methods, and discuss the potential impact of each method on a company's financial statements and tax liabilities.", "Explain the concept of liquidity and its importance in financial analysis. Provide examples of liquid and non-liquid assets and their impact on a company's financial health.", 'As a financial analyst, how would you gather information from different departments within a corporation in order to project future financial outcomes? Provide examples of the types of data you would collect from each department.', 'Explain the concept of free cash flow and its significance in corporate valuation and investment decision-making. Provide examples to illustrate your

In [None]:
# Evaluation with top k 2 (it might exceed the tokens amount)

i=2

query_engine = vector_index.as_query_engine(
    similarity_top_k=i,                   # Retrieve the top 10 most similar results
    node_postprocessors=[postproc, rerank] # Apply post-processing techniques (postproc and rerank) to the retrieved nodes
)

# While we use GPT3.5-Turbo to answer questions
llm2 = OpenAI(model="gpt-3.5-turbo-16k", max_tokens=256)

service_context_gpt = ServiceContext.from_defaults(llm=llm2)

faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt)
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt)

# Run evaluation
queries = list(qc_dataset.queries.values())
batch_eval_queries = queries[:5]

# Split queries into chunks
chunk_size = 3  # Define the size of each chunk
chunks = [batch_eval_queries[i:i + chunk_size] for i in range(0, len(batch_eval_queries), chunk_size)]

# Initialize variables to accumulate scores and counts
total_faithfulness_score = 0
total_relevancy_score = 0
num_chunks = len(chunks)


# Run evaluation for each chunk
for chunk in chunks:
    # Instantiate the runner for each chunk
    runner = BatchEvalRunner(
        {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
        workers=8,
    )
    eval_results = await runner.aevaluate_queries(query_engine, queries=chunk)

    # Calculate metrics for the current chunk
    faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
    relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])

    # Accumulate scores for averaging
    total_faithfulness_score += faithfulness_score
    total_relevancy_score += relevancy_score

    # Print the scores for the current chunk
    print(f"top_{i} faithfulness_score: {faithfulness_score}")
    print(f"top_{i} relevancy_score: {relevancy_score}")


# Calculate averages
average_faithfulness_score = total_faithfulness_score / num_chunks
average_relevancy_score = total_relevancy_score / num_chunks

# Print the averages
print(f"Average Faithfulness Score: {average_faithfulness_score}")
print(f"Average Relevancy Score: {average_relevancy_score}")

top_2 faithfulness_score: 0.6666666666666666
top_2 relevancy_score: 1.0




top_2 faithfulness_score: 0.5
top_2 relevancy_score: 1.0
Average Faithfulness Score: 0.5833333333333333
Average Relevancy Score: 1.0


In [None]:
# Display the query batches built by the chunked evaluation cell above
chunks

[['Explain the difference between the roles of the controller and treasurer in a corporation, and how they report to the vice president of finance or chief financial officer (CFO). Provide examples of their functions within the organization.',
  "Explain the differences between the FIFO and LIFO inventory costing methods, and discuss the potential impact of each method on a company's financial statements and tax liabilities.",
  "Explain the concept of liquidity and its importance in financial analysis. Provide examples of liquid and non-liquid assets and their impact on a company's financial health."],
 ['As a financial analyst, how would you gather information from different departments within a corporation in order to project future financial outcomes? Provide examples of the types of data you would collect from each department.',
  'Explain the concept of free cash flow and its significance in corporate valuation and investment decision-making. Provide examples to illustrate your e

In [None]:
# Evaluation with top k 4 (it might exceed the tokens amount)

i=4

query_engine = vector_index.as_query_engine(
    similarity_top_k=i,                   # Retrieve the top 10 most similar results
    node_postprocessors=[postproc, rerank] # Apply post-processing techniques (postproc and rerank) to the retrieved nodes
)
# While we use GPT3.5-Turbo to answer questions
llm2 = OpenAI(model="gpt-3.5-turbo-16k", max_tokens=256)

service_context_gpt = ServiceContext.from_defaults(llm=llm2)

faithfulness_evaluator = FaithfulnessEvaluator(service_context=service_context_gpt)
relevancy_evaluator = RelevancyEvaluator(service_context=service_context_gpt)

# Run evaluation
queries = list(qc_dataset.queries.values())
batch_eval_queries = queries[:10]

# Split queries into chunks
chunk_size = 3  # Define the size of each chunk
chunks = [batch_eval_queries[i:i + chunk_size] for i in range(0, len(batch_eval_queries), chunk_size)]

# Initialize variables to accumulate scores and counts
total_faithfulness_score = 0
total_relevancy_score = 0
num_chunks = len(chunks)


# Run evaluation for each chunk
for chunk in chunks:
    # Instantiate the runner for each chunk
    runner = BatchEvalRunner(
        {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
        workers=8,
    )
    eval_results = await runner.aevaluate_queries(query_engine, queries=chunk)

    # Calculate metrics for the current chunk
    faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
    relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])

    # Accumulate scores for averaging
    total_faithfulness_score += faithfulness_score
    total_relevancy_score += relevancy_score

    # Print the scores for the current chunk
    print(f"top_{i} faithfulness_score: {faithfulness_score}")
    print(f"top_{i} relevancy_score: {relevancy_score}")

# Calculate averages
average_faithfulness_score = total_faithfulness_score / num_chunks
average_relevancy_score = total_relevancy_score / num_chunks

# Print the averages
print(f"Average Faithfulness Score: {average_faithfulness_score}")
print(f"Average Relevancy Score: {average_relevancy_score}")



top_4 faithfulness_score: 1.0
top_4 relevancy_score: 1.0




top_4 faithfulness_score: 1.0
top_4 relevancy_score: 1.0




top_4 faithfulness_score: 1.0
top_4 relevancy_score: 1.0
top_4 faithfulness_score: 1.0
top_4 relevancy_score: 1.0
Average Faithfulness Score: 1.0
Average Relevancy Score: 1.0
