## Install the requirements

In [1]:
!pip install pandas llama-index \
    llama-index-embeddings-huggingface \
    llama-index-llms-ollama \
    python-dotenv \
    matplotlib




[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from dotenv import load_dotenv

# Load environment variables from a .env file before any client is created.
# NOTE(review): presumably this supplies the API key for the OpenAI models used
# below — confirm against the local .env file.
load_dotenv()

True

## Set up the RAG pipeline for evaluation

In [3]:
from llama_index.core import SimpleDirectoryReader

# Read everything under ./data, descending into sub-directories, into a list
# of document objects that the node parser consumes later.
reader = SimpleDirectoryReader(input_dir='./data', recursive=True)
documents = reader.load_data()

In [5]:
# Sanity check: how many document objects the reader produced.
print(len(documents))

28


In [6]:
from llama_index.core.embeddings import resolve_embed_model

# Keep the model spec string separate from the resolved model object so that
# `embed_model` always refers to one kind of thing.
EMBED_MODEL_NAME = "local:BAAI/bge-small-en-v1.5"
embed_model = resolve_embed_model(EMBED_MODEL_NAME)
# Show the resolved embedding object and its settings.
embed_model

HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x0000015392858550>, tokenizer_name='BAAI/bge-small-en-v1.5', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [7]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import VectorStoreIndex

# Split the loaded documents into nodes using a chunk size of 512, then build
# a vector index over those nodes with the embedding model resolved above.
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes, embed_model=embed_model)
vector_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x15393914d30>

In [8]:
# Retriever over the vector index, configured with similarity_top_k=2; this is
# the object scored by the retrieval evaluation further down.
retriever = vector_index.as_retriever(similarity_top_k=2)
retriever

<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever at 0x15394eadf60>

In [11]:
from llama_index.core.response.notebook_utils import display_source_node

# Manual smoke test: run one hand-written question through the retriever and
# render each returned node (text display capped via source_length=1000).
query = 'Who is responsible for completing the Confirmation Appraisal Form?'
retrieved_nodes = retriever.retrieve(query)

for node in retrieved_nodes:
    display_source_node(node, source_length=1000)
    # Blank line between rendered nodes.
    print()

**Node ID:** 0a3e8b3b-3c4d-49be-8f15-6dd5bd369490<br>**Similarity:** 0.6993850229044759<br>**Text:** Human Resource Policy Manual Version 1.0   Karvy Financial Services Ltd  
 
This document is a proprietary information of KFSL  and should not be reproduced or altered without requisite p ermissions.  
 
       
Confidential   Page 26 of 28  
 
 
 
 
 
 
 
CONFIRMATION APPRAISAL FORM  
 
Employee Name:  
Employee Number:  Date of Joining:  
Department:  Location:  
Immediate Supervisor:  Due Date for Confirmation:  
 
Comments on employee review:  
Please give your assessment of the employee’s performance du ring the probation period  
(You are requested to keep in mind that the employee is new to the organization, and focus on 
whether He/She has demonstrated an ability to understand all aspects of the function he/she is 
performing, as well as the basic skills a nd behaviors required to perform the role effectively)  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Recommendation  
(Please tick your recommendation)  
 
Recommended for confirmation  Recommended for extension of probation f...<br>




**Node ID:** 4204fa1f-4b19-4de9-822c-abe7f0bdf195<br>**Similarity:** 0.6679656776365127<br>**Text:** Human Resource Policy Manual Version 1.0   Karvy Financial Services Ltd  
 
This document is a proprietary information of KFSL  and should not be reproduced or altered without requisite p ermissions.  
 
       
Confidential   Page 11 of 28  
Step 2. (D-20) Within 10 days of receipt of the confirmation appraisal form the supervisor should 
have a formal discussion with the appraisee . This discu ssion should revolve around the 
appraisee’s performance on KRA for the specific period, any lim itations he/she has in executing  
his/her duties etc  
 
Step 3. Post the personal discussion the supervisor and the appraisee should arrive at a 
consensus on the pe rformance during the last five months. Incase they are not able to arrive at a 
consensus, the matter has to be referred to the skip level supervisor and HR. The decision of the 
skip level supervisor and HR shall be final.  
 
Step 4. (D-15) Based on discussio n the supervisor needs to inform HR either on confirmation of 
services...<br>




In [12]:
# Number of nodes produced by the node parser (one question batch per node below).
len(nodes)

42

## Set up question–context pairs for retrieval evaluation

In [13]:
from llama_index.llms.openai import OpenAI

# LLM passed to generate_question_context_pairs to write the evaluation questions.
llm = OpenAI(model='gpt-3.5-turbo')

In [14]:
from llama_index.core.evaluation import generate_question_context_pairs

# Generate questions from the nodes with the LLM, requesting two questions per
# chunk. This is slow: the recorded run took about 13 minutes for 42 nodes.
qa_dataset = generate_question_context_pairs(
    nodes, llm=llm,
    num_questions_per_chunk=2
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [13:15<00:00, 18.95s/it]


In [19]:
# Peek at the first two generated questions.
list(qa_dataset.queries.values())[:2]

['How does the Human Resource Policy Manual play a role in the operations of Karvy Financial Services Ltd?',
 'What are the potential implications for employees who do not adhere to the policies outlined in the Human Resource Policy Manual at Karvy Financial Services Ltd?']

In [20]:
# Persist the generated dataset so it can be reloaded without regenerating it.
qa_dataset.save_json('hr_retrieval_dataset.json')

In [21]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Reload the dataset from the JSON file written above; from here on
# `qa_dataset` refers to the reloaded copy.
qa_dataset = EmbeddingQAFinetuneDataset.from_json('hr_retrieval_dataset.json')

In [22]:
# Confirm the reloaded dataset starts with the same two questions as before saving.
list(qa_dataset.queries.values())[:2]

['How does the Human Resource Policy Manual play a role in the operations of Karvy Financial Services Ltd?',
 'What are the potential implications for employees who do not adhere to the policies outlined in the Human Resource Policy Manual at Karvy Financial Services Ltd?']

## Calculate the retrieval metrics

In [23]:
from llama_index.core.evaluation import RetrieverEvaluator

# Metric names requested from the evaluator; display_retrieval_results reads
# exactly these two keys from each result.
metrics = ['mrr', 'hit_rate']

# Evaluator bound to the top-2 retriever built earlier.
retrieval_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever
)

In [24]:
# Top-level await (supported in notebook cells): evaluate the retriever on the
# question/context dataset. The result is iterated per item by
# display_retrieval_results.
eval_results = await retrieval_evaluator.aevaluate_dataset(qa_dataset)

In [26]:
import pandas as pd

def display_retrieval_results(name, eval_results):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: Label stored in the ``retrievers`` column.
        eval_results: Iterable of objects exposing a ``metric_vals_dict``
            mapping that contains ``'hit_rate'`` and ``'mrr'`` entries.

    Returns:
        A DataFrame with columns ``retrievers``, ``hit_rate`` and ``mrr``,
        where the two metric columns hold the mean over all results.
    """
    # One row per evaluated query, one column per metric.
    per_query = pd.DataFrame(
        [result.metric_vals_dict for result in eval_results]
    )
    summary = {
        'retrievers': [name],
        'hit_rate': [per_query['hit_rate'].mean()],
        'mrr': [per_query['mrr'].mean()],
    }
    return pd.DataFrame(summary)

In [27]:
# Mean hit rate and MRR for the top-2 retriever.
display_retrieval_results('top 2 eval', eval_results)

Unnamed: 0,retrievers,hit_rate,mrr
0,top 2 eval,0.833333,0.72619


## LLM evaluation

In [28]:
import random

# Generation is evaluated on a random subset of the generated questions.
EVAL_SAMPLE_SIZE = 20
EVAL_SAMPLE_SEED = 42

all_questions = list(qa_dataset.queries.values())
# A seeded, local RNG makes the subset reproducible across kernel restarts
# without touching the global random state; min() avoids the ValueError that
# random.sample raises when fewer questions exist than requested.
eval_questions = random.Random(EVAL_SAMPLE_SEED).sample(
    all_questions, min(EVAL_SAMPLE_SIZE, len(all_questions))
)
print(len(eval_questions))

20


In [29]:
# Display the sampled questions that will be sent to the query engine.
eval_questions

['How does the company handle personal calls made by employees and what is the process for deducting personal call charges from the bill amount?',
 'What are the potential implications for employees who do not adhere to the policies outlined in the Human Resource Policy Manual at Karvy Financial Services Ltd?',
 'Explain the entitlement and guidelines for Privilege Leave as outlined in the Leave Policy of KFSL.',
 'Describe the process that an employee at Karvy Financial Services Ltd must follow in order to obtain reimbursement for travel and hotel/guest house stay expenses after a transfer has been completed.',
 'Explain the roles and responsibilities of the Business Head and CEO as outlined in the document, and discuss the potential consequences of unauthorized reproduction or alteration of the manual.',
 "What are the steps involved in the pre-employment screening process at KFSL, and who is responsible for approving the recommended candidate's compensation and grade?",
 'What are t

In [30]:
import nest_asyncio

# NOTE(review): presumably needed so the synchronous query/evaluator calls
# below can run inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [44]:
from llama_index.llms.ollama import Ollama
from llama_index.core.evaluation import (
    FaithfulnessEvaluator, RelevancyEvaluator
)
# Two separate models: `judge_llm` grades the answers, `rag_llm` (served
# through Ollama) produces them.
judge_llm = OpenAI(model='gpt-3.5-turbo', temperature=0.1)
rag_llm = Ollama(model='phi', request_timeout=300)

# Reuse the `vector_index` already built from `nodes` and `embed_model`;
# rebuilding it here would only re-embed every chunk for an identical index.
query_engine = vector_index.as_query_engine(llm=rag_llm, similarity_top_k=2)

relevancy_evaluator = RelevancyEvaluator(llm=judge_llm)
faithfulness_evaluator = FaithfulnessEvaluator(llm=judge_llm)

In [46]:
import time
from tqdm import tqdm

def evaluate_generation(eval_questions, query_engine,
                        relevancy_evaluator, faithfulness_evaluator,
                        sleep_seconds=30):
    """Answer each question with the query engine and grade the answer.

    Args:
        eval_questions: Iterable of question strings.
        query_engine: Object whose ``query(question)`` returns a response
            exposing ``source_nodes``.
        relevancy_evaluator: Evaluator called as
            ``evaluate_response(query=..., response=...)``.
        faithfulness_evaluator: Evaluator called as
            ``evaluate_response(response=...)``.
        sleep_seconds: Pause before each question. Defaults to 30, the value
            that was previously hard-coded (presumably to stay under API rate
            limits — confirm).

    Returns:
        A DataFrame with columns ``query``, ``response``, ``source``,
        ``relevancy`` and ``faithfulness``; one row per question whose query
        succeeded. Questions whose query raised are reported and skipped.
    """
    result_columns = ['query', 'response', 'source', 'relevancy', 'faithfulness']
    evals = []
    for eval_q in tqdm(eval_questions):
        time.sleep(sleep_seconds)
        try:
            response_vector = query_engine.query(eval_q)
        except Exception as exc:
            # Catch Exception only, so that interrupting the kernel still stops
            # this long loop, and report the skip instead of hiding it.
            tqdm.write(f'Skipping question (query failed: {exc!r}): {eval_q}')
            continue
        relevancy_result = relevancy_evaluator.evaluate_response(query=eval_q, response=response_vector)
        faithfulness_result = faithfulness_evaluator.evaluate_response(response=response_vector)
        # Guard against a response with no retrieved nodes instead of raising
        # IndexError and losing every result gathered so far.
        source_nodes = response_vector.source_nodes
        if source_nodes:
            source = source_nodes[0].node.get_content()[:1000] + '...'
        else:
            source = ''
        evals.append({
            'query': eval_q,
            'response': str(response_vector),
            'source': source,
            'relevancy': relevancy_result.passing,
            'faithfulness': faithfulness_result.passing,
        })
    # Explicit columns keep the frame's schema stable even when no row was produced.
    eval_df = pd.DataFrame(evals, columns=result_columns)
    return eval_df

# Run the generation evaluation over the sampled questions. The recorded run
# took about an hour for 20 questions.
generation_eval_results = evaluate_generation(
    eval_questions, query_engine,
    relevancy_evaluator, faithfulness_evaluator)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [1:01:03<00:00, 183.19s/it]


In [47]:
# Per-question answers, first source chunk, and pass/fail verdicts.
generation_eval_results

Unnamed: 0,query,response,source,relevancy,faithfulness
0,How does the company handle personal calls mad...,The company has a policy in place to ensure t...,The employees need to highlight the ir persona...,True,True
1,What are the potential implications for employ...,"As per the provided context information, empl...",Human Resource Policy Manual Version 1.0 Kar...,True,True
2,Explain the entitlement and guidelines for Pri...,The leave policy at KFSL provides all employe...,Human Resource Policy Manual Version 1.0 Kar...,True,True
3,Describe the process that an employee at Karvy...,The process starts with the employee obtainin...,Human Resource Policy Manual Version 1.0 Kar...,True,True
4,Explain the roles and responsibilities of the ...,The role of the Business Head is to oversee t...,Human Resource Policy Manual Version 1.0 Kar...,True,True
5,What are the steps involved in the pre-employm...,The steps involved in the pre-employment scre...,Human Resource Policy Manual Version 1.0 Kar...,True,True
6,What are the key documents required for hiring...,The key documents required for hiring tempora...,Human Resource Policy Manual Version 1.0 Kar...,False,False
7,What are the prescribed monthly limits for mob...,The prescribed monthly limit for Mobile Phone...,Human Resource Policy Manual Version 1.0 Kar...,False,False
8,In the Reference Check Form provided by Karvy ...,The Reference Check Form requests information...,Human Resource Policy Manual Version 1.0 Kar...,True,True
9,What is the role of the HR Head in settling di...,The HR Head has the final authority to decide...,Human Resource Policy Manual Version 1.0 Kar...,True,True


In [48]:
def display_llm_results(name, eval_results):
    """Compute the pass rate of each LLM-judged metric.

    Args:
        name: Label for the run. It is not used in the output; it is kept so
            the signature stays unchanged for existing callers.
        eval_results: DataFrame with ``relevancy`` and ``faithfulness``
            columns holding pass/fail verdicts.

    Returns:
        A DataFrame with columns ``name`` (metric name) and ``val`` (fraction
        of rows whose verdict is True). ``val`` is NaN when ``eval_results``
        has no rows.
    """
    metric_cols = ['relevancy', 'faithfulness']
    total = len(eval_results)
    rows = []
    for metric_col in metric_cols:
        # .eq(True) counts only explicit passes, so a missing verdict (None)
        # counts as a failure instead of breaking boolean-mask indexing.
        passed = int(eval_results[metric_col].eq(True).sum())
        # Avoid ZeroDivisionError when there are no rows to score.
        rate = passed / total if total else float('nan')
        rows.append([metric_col, rate])
    return pd.DataFrame(rows, columns=['name', 'val'])

# Fraction of evaluated answers that passed each LLM-judged metric.
display_llm_results('llm eval resutls', generation_eval_results)

Unnamed: 0,name,val
0,relevancy,0.777778
1,faithfulness,0.833333
