In [1]:
# Patch asyncio (via nest_asyncio) so code in later cells can attach to the
# event loop that is already running instead of failing on a nested loop.
import nest_asyncio

nest_asyncio.apply()

In [2]:
import logging
import sys
import pandas as pd

from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.llms.openai import OpenAI
from llama_index.llms.ollama import Ollama
from llama_index.core.embeddings import resolve_embed_model
import openai
from llama_index.core.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Response

import streamlit as st

# The OpenAI key is read from Streamlit's secrets store (`openai_key` entry).
# NOTE(review): make sure the secrets.toml that provides it is never committed.
openai.api_key = st.secrets.openai_key

# `basicConfig` already installs a stdout handler on the root logger. Do not add
# a second `StreamHandler(sys.stdout)` on top of it: with two handlers every log
# record is printed twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

2024-03-02 08:06:37.541 INFO    streamlit.runtime.secrets: Secrets found in multiple locations: C:\Users\joyde\.streamlit\secrets.toml, D:\documents\github\webinars\creating_gpt_chatbots_for_enterprise_usecases\.streamlit\secrets.toml. When multiple secret.toml files exist, local secrets will take precedence over global secrets.


In [3]:
# Load every file under ./data (recursing into sub-directories) as `Document`
# objects; `documents` is the input for both indexes built in this notebook.
reader = SimpleDirectoryReader(input_dir="./data", recursive=True)
documents = reader.load_data()

## Retriever evaluation

In [4]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.node_parser import SentenceSplitter

In [5]:
# OpenAI chat model used as the "judge"; it is re-created with the same
# arguments in the LLM-evaluation section further down.
judge_llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
# Embedding model resolved from the "local:BAAI/bge-small-en-v1.5" spec.
embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
# Index built directly from the loaded documents with that embedding model.
# NOTE(review): the QA dataset loaded later was generated (see the commented-out
# cell) from the SentenceSplitter `nodes`, while this index creates its own
# nodes from `documents`. An ID mismatch between the two would explain the 0.0
# hit_rate/mrr reported below — TODO confirm and, if so, index the same nodes
# (with stable IDs) that the dataset was generated from.
vector_index = VectorStoreIndex.from_documents(documents=documents, embed_model=embed_model)
# Retriever that returns the 2 most similar nodes per query.
retriever = vector_index.as_retriever(similarity_top_k=2)

In [6]:
# Split the documents into nodes using a chunk size of 512 and print how many
# nodes were produced. These nodes are what the (commented-out) dataset
# generation cell samples from.
node_parser = SentenceSplitter(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)
print(f'{len(nodes)}')

42


In [7]:
# Smoke-test the retriever with one hand-written question before running the
# dataset-based evaluation.
retrieved_nodes = retriever.retrieve("Who is responsible for completing the Confirmation Appraisal Form?")

# Render each retrieved node (ID, similarity, text) with source_length=1000.
for node in retrieved_nodes:
    display_source_node(node, source_length=1000)

**Node ID:** c5107950-2a6d-46e6-8bfb-468871b05aa8<br>**Similarity:** 0.7106077511023658<br>**Text:** Human Resource Policy Manual Version 1.0   Karvy Financial Services Ltd  
 
This document is a proprietary information of KFSL  and should not be reproduced or altered without requisite p ermissions.  
 
       
Confidential   Page 26 of 28  
 
 
 
 
 
 
 
CONFIRMATION APPRAISAL FORM  
 
Employee Name:  
Employee Number:  Date of Joining:  
Department:  Location:  
Immediate Supervisor:  Due Date for Confirmation:  
 
Comments on employee review:  
Please give your assessment of the employee’s performance du ring the probation period  
(You are requested to keep in mind that the employee is new to the organization, and focus on 
whether He/She has demonstrated an ability to understand all aspects of the function he/she is 
performing, as well as the basic skills a nd behaviors required to perform the role effectively)  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Recommendation  
(Please tick your recommendation)  
 
Recommended for confirmation  Recommended for extension of probation f...<br>

**Node ID:** 7d551334-eb8f-4d96-a10d-60ef7b9b2fe1<br>**Similarity:** 0.6860221231511127<br>**Text:** Human Resource Policy Manual Version 1.0   Karvy Financial Services Ltd  
 
This document is a proprietary information of KFSL  and should not be reproduced or altered without requisite p ermissions.  
 
       
Confidential   Page 11 of 28  
Step 2. (D-20) Within 10 days of receipt of the confirmation appraisal form the supervisor should 
have a formal discussion with the appraisee . This discu ssion should revolve around the 
appraisee’s performance on KRA for the specific period, any lim itations he/she has in executing  
his/her duties etc  
 
Step 3. Post the personal discussion the supervisor and the appraisee should arrive at a 
consensus on the pe rformance during the last five months. Incase they are not able to arrive at a 
consensus, the matter has to be referred to the skip level supervisor and HR. The decision of the 
skip level supervisor and HR shall be final.  
 
Step 4. (D-15) Based on discussio n the supervisor needs to inform HR either on confirmation of 
services...<br>

In [8]:
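# One-off generation of the retrieval eval dataset, kept for reference only:
# the saved "pg_eval_dataset.json" is loaded in a later cell instead.
# import random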

# sample_nodes = random.sample(nodes, 5)

# qa_dataset = generate_question_context_pairs(
#     sample_nodes, llm=judge_llm, num_questions_per_chunk=2
# )

# print(f'number of queries={len(qa_dataset.queries)}')

# qa_dataset.save_json("pg_eval_dataset.json")

In [9]:
# qa_dataset.save_json("pg_eval_dataset.json")

In [12]:
# Load the previously saved question/context pairs instead of regenerating them.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("pg_eval_dataset.json")

In [13]:
from llama_index.core.evaluation import RetrieverEvaluator

# Metric names the retriever is scored on; `display_retrieval_results` reads
# exactly these two keys.
metrics = ["mrr", "hit_rate"]

# Evaluator bound to the top-2 retriever created earlier.
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever
)

In [14]:
# Try the evaluator on a single query before scoring the whole dataset.
# Index 8 is presumably an arbitrary pick; it requires the dataset to hold at
# least 9 queries.
sample_id, sample_query = list(qa_dataset.queries.items())[8]
# Expected entries for this query, as stored in the dataset's `relevant_docs`.
sample_expected = qa_dataset.relevant_docs[sample_id]

eval_result = retriever_evaluator.evaluate(sample_query, sample_expected)
print(eval_result)

Query: What is the process for employees to avail their mandatory leave, and why is it recommended to submit a tentative leave schedule at the beginning of the year?
Metrics: {'mrr': 0.0, 'hit_rate': 0.0}



In [15]:
# Evaluate every query in the dataset asynchronously; the per-query results are
# summarised by `display_retrieval_results` below.
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [16]:
import pandas as pd


def display_retrieval_results(name, eval_results):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: Label stored in the "retrievers" column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping that contains the keys "hit_rate" and "mrr".

    Returns:
        A pandas DataFrame with the columns "retrievers", "hit_rate" and
        "mrr", holding ``name`` and the mean of each metric over all results.
    """
    # One row per evaluated query, one column per metric.
    per_query_metrics = pd.DataFrame(
        [result.metric_vals_dict for result in eval_results]
    )

    summary = {
        "retrievers": [name],
        "hit_rate": [per_query_metrics["hit_rate"].mean()],
        "mrr": [per_query_metrics["mrr"].mean()],
    }
    return pd.DataFrame(summary)

In [17]:
# Mean hit rate and MRR over the whole dataset for the top-2 retriever.
display_retrieval_results("top-2 eval", eval_results)

Unnamed: 0,retrievers,hit_rate,mrr
0,top-2 eval,0.0,0.0


## LLM evaluation

In [18]:
# data_generator = DatasetGenerator.from_documents(documents)
# eval_questions = data_generator.generate_questions_from_nodes(50)

In [25]:
import json
# File that caches the generated evaluation questions.
eval_questions_filename = 'eval_questions.json'

# Writing the cache is disabled; it only makes sense right after the
# (commented-out) question-generation cell has been run.
# with open(eval_questions_filename, "w") as fp:
#     json.dump(eval_questions, fp)

# Load the cached questions instead of regenerating them.
with open(eval_questions_filename, "r") as fp:
    eval_questions = json.load(fp)

In [26]:
# Sanity-check the loaded questions: total count and the first two entries.
print(len(eval_questions))
print(eval_questions[:2])

50
['What is the file type of the document "HR_Policy_Manual_KFSLnew.pdf"?', 'When was the Human Resource Policy Manual Version 1.0 created?']


In [27]:
# The OpenAI model judges the answers; the answers themselves are produced by
# the Ollama model "phi" (request_timeout=300).
judge_llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
rag_llm = Ollama(model="phi", request_timeout=300)

# NOTE(review): this repeats the embedding-model and index construction from the
# retriever-evaluation section, so the documents are indexed a second time.
embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
vector_index = VectorStoreIndex.from_documents(documents=documents, embed_model=embed_model)
# Query engine that answers over that index using the Ollama model.
query_engine = vector_index.as_query_engine(llm=rag_llm)

# Both evaluators are driven by the OpenAI judge model.
relevancy_evaluator = RelevancyEvaluator(llm=judge_llm)
faithfulness_evaluator = FaithfulnessEvaluator(llm=judge_llm)

In [28]:
import time
from tqdm import tqdm

def evaluate_generation(
    eval_questions,
    query_engine,
    relevancy_evaluator,
    faithfulness_evaluator,
    sleep_seconds=30,
    source_length=1000,
):
    """Answer each question with the query engine and judge the answer.

    Args:
        eval_questions: Iterable of question strings.
        query_engine: Object whose ``query(question)`` returns a response that
            exposes ``source_nodes`` and renders its answer through ``str()``.
        relevancy_evaluator: Object whose
            ``evaluate_response(query=..., response=...)`` result has a
            ``passing`` attribute.
        faithfulness_evaluator: Object whose ``evaluate_response(response=...)``
            result has a ``passing`` attribute.
        sleep_seconds: Pause applied before every query. Defaults to 30, the
            value that used to be hard-coded; pass 0 (or None) to disable.
        source_length: Maximum number of characters of the first source node
            kept in the "Source" column. Defaults to 1000.

    Returns:
        A pandas DataFrame with one row per question and the columns "Query",
        "Response", "Source", "Relevancy" and "Faithfulness". The columns are
        present even when ``eval_questions`` is empty.
    """
    columns = ["Query", "Response", "Source", "Relevancy", "Faithfulness"]
    evals = []
    for eval_q in tqdm(eval_questions):
        # Throttle between questions (presumably to stay under API rate
        # limits — confirm before lowering the default).
        if sleep_seconds and sleep_seconds > 0:
            time.sleep(sleep_seconds)

        response_vector = query_engine.query(eval_q)
        relevancy_result = relevancy_evaluator.evaluate_response(
            query=eval_q, response=response_vector
        )
        faithfulness_result = faithfulness_evaluator.evaluate_response(
            response=response_vector
        )

        # A response may come back without any source node; record an empty
        # source for it instead of raising IndexError and losing the whole run.
        source_nodes = response_vector.source_nodes
        if source_nodes:
            source = source_nodes[0].node.get_content()[:source_length] + "..."
        else:
            source = ""

        evals.append(
            {
                "Query": eval_q,
                "Response": str(response_vector),
                "Source": source,
                "Relevancy": relevancy_result.passing,
                "Faithfulness": faithfulness_result.passing,
            }
        )
    return pd.DataFrame(evals, columns=columns)

In [29]:
# Only the first 10 questions are evaluated; evaluate_generation pauses before
# every query, so this cell takes several minutes.
llm_eval_results = evaluate_generation(eval_questions[:10], query_engine, relevancy_evaluator, faithfulness_evaluator)

  0%|                                                                                                                               | 0/10 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 10%|███████████▉                                                                                                           | 1/10 [00:52<07:50, 52.23s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 20%|███████████████████████▊                                                                                               | 2/10 [01:54<07:46, 58.33s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 30%|███████████████████████████████████▍                                                                                  | 3/10 [05:06<13:55, 119.42s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 40%|███████████████████████████████████████████████▏                                                                      | 4/10 [06:26<10:21, 103.65s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 50%|███████████████████████████████████████████████████████████▌                                                           | 5/10 [07:34<07:33, 90.77s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 60%|███████████████████████████████████████████████████████████████████████▍                                               | 6/10 [08:38<05:26, 81.70s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 70%|███████████████████████████████████████████████████████████████████████████████████▎                                   | 7/10 [10:00<04:05, 81.97s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 80%|███████████████████████████████████████████████████████████████████████████████████████████████▏                       | 8/10 [11:12<02:37, 78.64s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████            | 9/10 [12:21<01:15, 75.63s/it]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [13:34<00:00, 81.48s/it]


In [30]:
import pandas as pd


def display_llm_results(name, eval_results):
    """Summarise the pass rate of each LLM evaluation metric.

    Args:
        name: Label for the run. Currently unused; kept so existing calls keep
            working.
        eval_results: DataFrame with "Relevancy" and "Faithfulness" columns of
            pass/fail flags, as built by ``evaluate_generation``.

    Returns:
        A pandas DataFrame with the columns "name" and "val": one row per
        metric, where "val" is the fraction of rows whose flag is True. For an
        empty ``eval_results`` the fraction is NaN.
    """
    metric_cols = ['Relevancy', 'Faithfulness']
    rows = []
    for metric_col in metric_cols:
        # Count explicit True values only. A missing verdict (None/NaN) is
        # treated as "not passing" rather than breaking a boolean mask, and
        # `.mean()` yields NaN on an empty frame instead of dividing by zero.
        passed = eval_results[metric_col].eq(True)
        rows.append([metric_col, passed.mean()])

    return pd.DataFrame(rows, columns=['name', 'val'])

# Pass rates over the questions evaluated above.
display_llm_results('llm eval results', llm_eval_results)

Unnamed: 0,name,val
0,Relevancy,0.7
1,Faithfulness,0.9
