## Evaluation of the Traditional RAG Approach

In [1]:
from database.GraphDB import KnowledgeGraphDB
from langchain_community.embeddings.sentence_transformer import (
    HuggingFaceEmbeddings,
)
from agent.llm.llm_utils import *
from agent import BedRockLLMs, CoreLLMs, Gemini

2024-06-15 00:12:23.129738: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-15 00:12:23.129775: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-15 00:12:23.130623: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import os

# Neo4j connection settings are read from the environment so that no credential
# is stored in the notebook. NEO4J_PASSWORD has to be exported before the kernel
# starts: the `.env` file is only loaded by a later cell (`load_dotenv()`).
kg = KnowledgeGraphDB(
    uri=os.getenv("NEO4J_URI", "bolt://localhost:8687"),
    user=os.getenv("NEO4J_USER", "neo4j"),
    # Deliberately no default: a missing password raises KeyError instead of
    # silently falling back to a secret committed with the notebook.
    password=os.environ["NEO4J_PASSWORD"],
)


In [3]:
def document_to_plan_text(data):
    """Render retrieved chunks as plain text, one section per source document ID.

    Args:
        data: Iterable of sequences whose first element exposes
            ``metadata['id']`` and ``page_content`` (for example
            ``(document, score)`` pairs). Only element 0 is read.

    Returns:
        A string containing, for each distinct ID in first-seen order, a
        ``**ID <id>**`` header line followed by that ID's chunk texts joined
        with single spaces and then a blank line. Empty input gives ``''``.
    """
    chunks_by_id = {}
    for entry in data:
        document = entry[0]
        chunks_by_id.setdefault(document.metadata['id'], []).append(document.page_content)

    sections = [
        f'**ID {doc_id}**\n' + ' '.join(chunks) + '\n\n'
        for doc_id, chunks in chunks_by_id.items()
    ]
    return ''.join(sections)

In [4]:
def RAG(questions, top_k):
    """Retrieve summary chunks for a query and format them as plain text.

    Asks ``kg.summary_db`` for ``max(20, int(top_k * 2))`` scored results and
    hands them to ``document_to_plan_text``.

    Args:
        questions: Query passed to the similarity search.
        top_k: Number of candidates wanted; only used to size the retrieval.

    Returns:
        The plain-text context produced by ``document_to_plan_text``.
    """
    fetch_count = max(20, int(top_k*2))
    scored_docs = kg.summary_db.similarity_search_with_relevance_scores(questions, fetch_count)
    return document_to_plan_text(scored_docs)

In [5]:
from dotenv import load_dotenv
load_dotenv()
import os

In [6]:
# Credentials, region and model identifiers come from environment variables;
# each name is bound to None when its variable is not set.
access_key, secret_key, region_name = (
    os.environ.get(var_name) for var_name in ('ACCESS_KEY', 'SECRET_KEY', 'REGION_NAME')
)
# secret_token = os.getenv('SECRET_TOKEN')
model_name_cv = os.environ.get('MODEL_NAME_CV')
model_name_jd = os.environ.get('MODEL_NAME_ROUTING')
model_name = os.environ.get('MODEL_NAME')

# Keyword arguments bundle for an LLM client (see the commented-out
# `BedRockLLMs(**llm_jd_extraction_args)` call in the next cell).
llm_jd_extraction_args = dict(
    model_name=model_name_jd,
    access_key=access_key,
    secret_key=secret_key,
    # secret_token=secret_token,
    region_name=region_name,
)

In [7]:
# Chat model under evaluation. To benchmark a different backend, enable one of
# the commented-out alternatives instead of the CoreLLMs line.
# llm = BedRockLLMs(**llm_jd_extraction_args)
# llm = Gemini()
llm_model_name = "microsoft/Phi-3-small-8k-instruct"
llm = CoreLLMs(model_name=llm_model_name)

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- tokenization_phi3_small.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- configuration_phi3_small.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- positional_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:
- triton_flash_blocksparse_attn.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-small-8k-instruct:

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
from FlagEmbedding import FlagReranker
import numpy as np

# Model identifier handed to FlagReranker; the resulting `reranker` is the
# object whose `compute_score` is called by `rerank` to score [query, doc] pairs.
reranker_model = 'BAAI/bge-reranker-v2-m3'
reranker = FlagReranker(reranker_model)

In [9]:
def create_query(query, docs):
    """Pair a query with every candidate document.

    Args:
        query: Query text; when a list is given, only its first element is used.
        docs: A single document string or a sequence of documents.

    Returns:
        A list of ``[query, doc]`` two-element lists, one per document, in the
        same order as ``docs``.
    """
    query_text = query[0] if isinstance(query, list) else query
    candidates = [docs] if isinstance(docs, str) else docs
    return [[query_text, candidate] for candidate in candidates]


def rerank(query, docs, k):
    """Return the documents the reranker scores highest for ``query``.

    Args:
        query: Query text (or a list whose first element is the query).
        docs: A single document string or a sequence of document strings.
        k: Maximum number of documents to return.

    Returns:
        Up to ``k`` documents as a list, highest score first. The list is
        shorter than ``k`` when fewer documents are supplied, and empty when
        there are no documents or ``k`` is not positive.
    """
    # Mirror create_query: a bare string is a single document, not a sequence.
    if isinstance(docs, str):
        docs = [docs]
    docs = list(docs)

    # np.argpartition raises ValueError when k exceeds the array length, so the
    # request is clamped to the number of available documents.
    k = min(int(k), len(docs))
    if k <= 0:
        return []

    pairs = create_query(query, docs)
    # NOTE(review): compute_score may return a bare float for a single pair in
    # some FlagEmbedding versions; atleast_1d keeps the indexing below valid.
    scores = np.atleast_1d(np.asarray(reranker.compute_score(pairs), dtype=float))

    # Select the k best in linear time, then order only those k by score.
    top_k_indices = np.argpartition(scores, -k)[-k:]
    top_k_indices = top_k_indices[np.argsort(-scores[top_k_indices])]
    return [docs[i] for i in top_k_indices]

def RAG_rerank(questions, top_k):
    """Retrieve summary chunks, keep the ones the reranker prefers, format as text.

    Retrieves ``max(int(top_k * 3), 30)`` chunks from ``kg.summary_db``, asks
    ``rerank`` for the best ``max(top_k * 2, 20)`` of them (never more than
    were retrieved), and renders the survivors with ``document_to_plan_text``.

    Args:
        questions: Query used for both the similarity search and the reranker.
        top_k: Number of candidates wanted; only used to size the two stages.

    Returns:
        The plain-text context; ``document_to_plan_text([])`` when the search
        returns nothing.
    """
    candidates = kg.summary_db.similarity_search(questions,max(int(top_k*3), 30))
    if not candidates:
        # Nothing to rerank; skip the reranker call entirely.
        return document_to_plan_text([])

    # Never ask the reranker for more documents than were retrieved.
    keep_count = min(max(top_k*2,20), len(candidates))
    kept_texts = set(rerank(questions, [doc.page_content for doc in candidates], keep_count))

    # The reranker only filters: survivors stay in retrieval order. The second
    # tuple slot is a placeholder because document_to_plan_text reads element 0.
    top_doc = [(doc, None) for doc in candidates if doc.page_content in kept_texts]
    return document_to_plan_text(top_doc)

In [10]:
def _coerce_candidate_ids(raw_ids):
    """Convert model-supplied candidate IDs to a list of ints, dropping non-numeric values."""
    if raw_ids is None:
        return []
    if isinstance(raw_ids, (str, int)):
        raw_ids = [raw_ids]
    ids = []
    for value in raw_ids:
        try:
            ids.append(int(value))
        except (TypeError, ValueError):
            continue
    return ids


def evaluate_pipeline(llm, question, top_k):
    """Ask the LLM to pick candidates for a job description and return their CV file names.

    Builds the context with ``RAG_rerank``, sends a system + user message pair
    to ``llm``, reads ``candidate_id`` from the first JSON object in the reply
    and looks up the ``file`` property of the matching ``Application`` nodes.

    Args:
        llm: Callable that takes a list of ``{"role", "content"}`` messages and
            returns the text response.
        question: Job description / query text.
        top_k: Number of candidates wanted; forwarded to ``RAG_rerank``.

    Returns:
        A list of file base names (the part after the last ``/``), in the order
        the database returns them. The query has no ORDER BY, so this is not
        guaranteed to follow the model's ranking. Empty when the reply has no
        usable integer ID.

    Raises:
        Whatever the LLM call, ``get_json_from_text_response`` or ``kg.query``
        raise; nothing is caught here.
    """
    context = RAG_rerank(question, top_k)

    message = f"""  
    You are given a job description and a list of candidates. Your task is to pick the candidate that suitable with the job description.
    You must only choose the top candidate that fit with the number of candidates required.
    **Job Description**
    {question}
    
    **Candidates**
    {context}
    
    Return the chosen candidate in JSON format:
    ```json
    [
        {{
            "candidate_id": [1, 2, 3, 4, 5, 6]
        }}
    ]
    ```
    The order of the list is the ranking of the candidates.
    """
    messages = [
        {
            "role": "system","content":"You are a helpful assistant for HR department. The current recruitment date is May 2024"
        },
        {
            "role": "user",
            "content": message
        }
    ]

    response = llm(messages)
    # The reply is untrusted text that ends up inside a Cypher statement, so only
    # values that convert cleanly to int are allowed through.
    ids = _coerce_candidate_ids(get_json_from_text_response(response)[0]["candidate_id"])
    if not ids:
        return []

    query = f"""
    MATCH (c:Application)
    WHERE id(c) IN {ids}
    RETURN c.file as file
    """
    files = kg.query(query)
    return [f['file'].split('/')[-1] for f in files]
    

In [11]:
# Single ad-hoc query used to try the pipeline once before the benchmark loop.
questions = "Find 2 candidate having around 2-4 years of experience as Back-end Development"

In [12]:
# One-off run with top_k=10; RAG_rerank sizes its retrieval and rerank steps from this value.
res = evaluate_pipeline(llm, questions, 10)

  x = [xi.to_sparse_csr() for xi in x]


In [13]:
# Show the file names returned for the ad-hoc query.
print(res)

['TranVanDanTruong-CV-BackEndDeveloper.pdf', 'CV TRAN TUAN KIET - NodeJs-TopCV.vn (2).pdf', 'DANG_TRUONG_SON.pdf', 'NguyenTienPhat_CV_Backend_Developer.pdf', 'HUYNH-TRUNG-NGHIA-CV-EN-2024.pdf', 'Nguyen Xuan Giang CV.pdf']


In [14]:
import json
# Benchmark set; the evaluation loop reads each item's 'prompt' and 'answer' keys.
# NOTE: this rebinds `questions`, which previously held a single query string.
# The context manager closes the file handle that `open(...).read()` used to leak.
with open('data/benchmark/qa_cv.json', 'r', encoding='utf-8') as benchmark_file:
    questions = json.load(benchmark_file)


In [15]:
from tqdm import tqdm

# Run the pipeline on every benchmark question and collect one record per
# question with the keys 'prompt', 'answer' (expected files) and 'predict'.
answers = []
for question in tqdm(questions):
    prompt = question['prompt']
    answer = {
        'prompt': prompt,
        'answer': question['answer'],
    }
    try:
        # The number of expected answers is used as top_k for this question.
        res = evaluate_pipeline(llm, prompt, len(question['answer']))
        answer['predict'] = res
    except Exception as exc:
        # Best effort: a failed question is scored as "no predictions", but the
        # failure is reported instead of being silently swallowed. Catching
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        tqdm.write(f"evaluate_pipeline failed for {prompt!r}: {exc!r}")
        answer['predict'] = []
    answers.append(answer)
    # Checkpoint after every question so partial results survive an interruption.
    with open('qa_cv_result_rerank.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(answers))
    

 23%|██▎       | 9/39 [00:24<01:08,  2.27s/it]--- Logging error ---
Traceback (most recent call last):
  File "/home/quanghung20gg/anaconda3/lib/python3.11/logging/__init__.py", line 1110, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "/home/quanghung20gg/anaconda3/lib/python3.11/logging/__init__.py", line 953, in format
    return fmt.format(record)
           ^^^^^^^^^^^^^^^^^^
  File "/home/quanghung20gg/anaconda3/lib/python3.11/logging/__init__.py", line 687, in format
    record.message = record.getMessage()
                     ^^^^^^^^^^^^^^^^^^^
  File "/home/quanghung20gg/anaconda3/lib/python3.11/logging/__init__.py", line 377, in getMessage
    msg = msg % self.args
          ~~~~^~~~~~~~~~~
TypeError: not all arguments converted during string formatting
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/quanghung20gg/anaconda3/lib/python3.11/site-packages/ipykernel_la

No JSON response found in text response


100%|██████████| 39/39 [01:41<00:00,  2.60s/it]


In [16]:
# Dump every collected record (prompt, expected files, predicted files) for inspection.
print(answers)

[{'prompt': 'Find 12 Marketers in Ha Noi', 'answer': ['Portfolio - Nguyen Tien Dat.pdf', 'Nguyễn-Hoàng-Việt-Content.pdf', 'CV_TRẦN THỊ HẢI YẾN - Hải Yến Trần.pdf', 'CV123.pdf', 'Intern Digital MKT_Nguyen Thi Mai Huong.pdf', 'LuongTienHuy-resume.pdf', 'CV Thiên Khôi - Duyên Nguyễn.pdf', 'CV thương mại điện tử - Nguyễn Mai Phương - Nguyen Mai Phuong.pdf', 'CV_PHẠM-VIỆT-HƯNG_-NHÂN-VIÊN-CONTENT-WRITER - Hưng Phạm Việt.pdf', 'CV-Le Nhat Linh.pdf', 'Digital Marketing Resume-Nguyen Thanh Tuyen.pdf', 'NGUYEN-THI-THANH-HANG-CV - Marketing Executive.pdf'], 'predict': ['Nguyen-Thi-Van-Anh (1).pdf']}, {'prompt': 'Find 5 Marketers in Ho Chi Minh City', 'answer': ['CV - Nguyễn Thị Kiều Oanh .pdf', 'TTS MARKETING - BÙI VĨNH HUY (1).pdf', 'CV Content Creator.pdf', 'Phi Ha Nhi_Brand Marketing Exe_Strategic Planner.pdf', 'CV Marketing Executive - Le Thi Thuy Trang.pdf'], 'predict': ['Nguyen-Thi-Van-Anh (1).pdf', 'Phi Ha Nhi_Brand Marketing Exe_Strategic Planner.pdf', 'CV-Le Nhat Linh.pdf', 'CV123.pdf', 

In [17]:
# Questions with more than this many expected answers go into the "long" bucket.
LONG_ANSWER_THRESHOLD = 7
# Added to the precision denominator so a question with no predictions scores 0
# instead of dividing by zero. Kept at its original value so results stay
# comparable with earlier runs.
PRECISION_EPS = 1e-5

# Running sums of per-question recall/precision: overall, and split into the
# "long" and "short" buckets. They are turned into averages in the next cell.
recall = 0
precision = 0
long_recall = 0
long_precision = 0
len_long = 0
short_recall = 0
short_precision = 0
len_short = 0

for answer in answers:
    num_ans = len(answer['answer'])
    ans = set(answer['answer'])
    if len(ans) == 0:
        print("Err",answer)
    # Every predicted file that appears in the expected set counts as a hit.
    correct = sum(1 for p in answer['predict'] if p in ans)

    # A question with no expected answers contributes 0 recall rather than
    # raising ZeroDivisionError right after the "Err" message above.
    item_recall = correct / num_ans if num_ans else 0.0
    item_precision = correct / (len(answer['predict'])+PRECISION_EPS)

    recall += item_recall
    precision += item_precision
    if num_ans > LONG_ANSWER_THRESHOLD:
        long_recall += item_recall
        long_precision += item_precision
        len_long += 1
    else:
        short_recall += item_recall
        short_precision += item_precision
        len_short += 1

In [18]:
def _safe_div(total, count):
    """Return ``total / count``, or 0.0 when ``count`` is zero."""
    return total / count if count else 0.0


def _f1(r, p):
    """Harmonic mean of recall ``r`` and precision ``p`` (0.0 when both are zero)."""
    return _safe_div(2 * r * p, r + p)


# Turn the running sums into per-question averages. An empty bucket (or an
# empty `answers` list) yields 0.0 instead of ZeroDivisionError.
# NOTE: the sums are rescaled in place, so this cell must run exactly once after
# the accumulation cell; re-running it alone divides the averages again.
recall = _safe_div(recall, len(answers))
precision = _safe_div(precision, len(answers))
long_recall = _safe_div(long_recall, len_long)
long_precision = _safe_div(long_precision, len_long)
short_recall = _safe_div(short_recall, len_short)
short_precision = _safe_div(short_precision, len_short)

# F1 is computed from the averaged recall and precision of each group.
f1 = _f1(recall, precision)
short_f1 = _f1(short_recall, short_precision)
long_f1 = _f1(long_recall, long_precision)

In [19]:
# Number of benchmark questions counted in the "long" bucket.
len_long

8

In [20]:
# Assemble the report (overall, short bucket, long bucket) and print it in one
# call; each group is followed by a separator line.
separator = "====================================="
report_lines = [
    f"Recall: {recall}",
    f"Precision: {precision}",
    f"F1: {f1}",
    separator,
    f"Short Recall: {short_recall}",
    f"Short Precision: {short_precision}",
    f"Short F1: {short_f1}",
    separator,
    f"Long Recall: {long_recall}",
    f"Long Precision: {long_precision}",
    f"Long F1: {long_f1}",
    separator,
]
print("\n".join(report_lines))

Recall: 0.2608948458948459
Precision: 0.3576906083455737
F1: 0.30171946495878155
Short Recall: 0.27096774193548384
Short Precision: 0.31666502841687894
Short F1: 0.2920395594976964
Long Recall: 0.22186237373737372
Long Precision: 0.5166647305692655
Long F1: 0.3104245270946311


## Haiku

Recall: 0.3510887260887261 \
Precision: 0.3828438125961463 \
F1: 0.36627929508666834

=====================================

Short Recall: 0.3890937019969279 \
Short Precision: 0.38102795277133 \
Short F1: 0.3850185896998829

=====================================

Long Recall: 0.20381944444444444 \
Long Precision: 0.3898802694173093 \
Long F1: 0.2676948566998643

=====================================

## Llama 3 8b

Recall: 0.3098719798719799 \
Precision: 0.3294436310402031 \
F1: 0.3193582276553054

=====================================

Short Recall: 0.2930875576036866 \
Short Precision: 0.3172799484740911 \
Short F1: 0.3047043109239347

=====================================

Long Recall: 0.37491161616161617 \
Long Precision: 0.3765779009838872 \
Long F1: 0.3757429112381944

=====================================

## Phi-3-small
Recall: 0.2710502460502461 \
Precision: 0.36965641133703153 \
F1: 0.31276547571875735

=====================================

Short Recall: 0.2629032258064516 \
Short Precision: 0.3086005324490866 \
Short F1: 0.2839249061602026

=====================================

Long Recall: 0.3026199494949495 \
Long Precision: 0.6062479420278181 \
Long F1: 0.40371702710388807

=====================================

## Mistral 7B

Recall: 0.2850027750027751 \
Precision: 0.4032058488291777 \
F1: 0.33395334447806935

=====================================

Short Recall: 0.2936251920122888 \
Short Precision: 0.4151281094206125 \
Short F1: 0.34396191338197385

=====================================

Long Recall: 0.2515909090909091 \
Long Precision: 0.35700708903736783 \
Long F1: 0.2951693510627643

=====================================

## Llama 3 70b aws

_No results recorded for this model yet._

## Gemini
Recall: 0.36257224257224263 \
Precision: 0.360434545384675 \
F1: 0.3615002337389273

=====================================

Short Recall: 0.33348694316436256 \
Short Precision: 0.3307977101147777 \
Short F1: 0.33213688320928825

=====================================

Long Recall: 0.4752777777777778 \
Long Precision: 0.4752772820555267 \
Long F1: 0.475277529916523

=====================================