In [1]:
from database.GraphDB import KnowledgeGraphDB
from langchain_community.embeddings.sentence_transformer import (
    HuggingFaceEmbeddings,
)
from agent.llm.llm_utils import *
from agent import BedRockLLMs, CoreLLMs

2024-06-09 00:37:06.059569: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 00:37:06.059683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 00:37:06.113772: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
kg = KnowledgeGraphDB(uri="bolt://localhost:8687", user="neo4j", password="quanghung2004")


In [3]:
def document_to_plan_text(data):
    id_text = dict()
    for doc in data:
        id = doc[0].metadata['id']
        text = doc[0].page_content
        if id not in id_text:
            id_text[id] = []
        id_text[id].append(text)
    
    plain_text = ''
    for k, v in id_text.items():
        plain_text += f'**ID {k}**\n'
        plain_text += ' '.join(v) + '\n\n'
    
    return plain_text

In [4]:
def RAG(questions):
    data = kg.summary_db.similarity_search_with_relevance_scores(questions,20)
    return document_to_plan_text(data)

In [5]:
from dotenv import load_dotenv
load_dotenv()
import os

In [6]:
access_key = os.getenv('ACCESS_KEY')
secret_key = os.getenv('SECRET_KEY') 
# secret_token = os.getenv('SECRET_TOKEN')
model_name_cv = os.getenv('MODEL_NAME_CV')
model_name_jd = os.getenv('MODEL_NAME_ROUTING')
model_name = os.getenv('MODEL_NAME')
region_name = os.getenv('REGION_NAME')
llm_jd_extraction_args = {
    "model_name": model_name_jd,
    "access_key": access_key,
    "secret_key": secret_key,
    # "secret_token": secret_token,
    "region_name": region_name
}

In [7]:
llm = BedRockLLMs(**llm_jd_extraction_args)
# llm = CoreLLMs()

In [8]:
def evaluate_pipeline(llm, question):
    context = RAG(question)
    message = f"""  
    You are given a job description and a list of candidates. Your task is to pick the candidate that suitable with the job description.
    
    **Job Description**
    {question}
    
    **Candidates**
    {context}
    
    Return the chosen candidate in JSON format:
    ```json
    [
        {{
            "candidate_id": [1, 2, 3, 4, 5, 6]
        }}
    ]
    ```
    The order of the list is the ranking of the candidates.
    """
    messages = [
        {
            "role": "system","content":"You are a helpful assistant for HR department. The current recruitment date is May 2024"
        },
        {
            "role": "user",
            "content": message
        }
    ]
    
    response = llm(messages)
    ids = get_json_from_text_response(response)[0]["candidate_id"]
    
    query = f"""
    MATCH (c:Application)
    WHERE id(c) IN {ids}
    RETURN c.file as file
    """
    files = kg.query(query)
    files = [f['file'].split('/')[-1] for f in files]
    return files
    

In [9]:
questions = "Find 2 candidate having around 2-4 years of experience as Back-end Development"

In [10]:
res = evaluate_pipeline(llm, questions)



In [11]:
print(res)

['TranVanDanTruong-CV-BackEndDeveloper.pdf', 'CV TRAN TUAN KIET - NodeJs-TopCV.vn (2).pdf', 'Intern-Fresher-Backend-NguyenVanTuan-0835666356.pdf', 'Cv-PhanCaoVu-DA_DE.pdf', 'NguyenTienPhat_CV_Backend_Developer.pdf', 'NodeFlair_Resume_BackendNodejs.pdf']


In [12]:
import json
questions = json.loads(open('qa_cv.json').read())


In [13]:
answers = []
i = 0
from tqdm import tqdm
for question in tqdm(questions):
    prompt = question['prompt']
    answer = dict()
    answer['prompt'] = prompt
    answer['answer'] = question['answer']
    try:
        res = evaluate_pipeline(llm, prompt)
        answer['predict'] = res
    except:
        answer['predict'] = []
    answers.append(answer)
    with open('qa_cv_result.json', 'w') as f:
        f.write(json.dumps(answers))
    i += 1
    

100%|██████████| 39/39 [03:10<00:00,  4.89s/it]


In [14]:
answers

[{'prompt': 'Find 12 Marketers in Ha Noi',
  'answer': ['Portfolio - Nguyen Tien Dat.pdf',
   'Nguyễn-Hoàng-Việt-Content.pdf',
   'CV_TRẦN THỊ HẢI YẾN - Hải Yến Trần.pdf',
   'CV123.pdf',
   'Intern Digital MKT_Nguyen Thi Mai Huong.pdf',
   'LuongTienHuy-resume.pdf',
   'CV Thiên Khôi - Duyên Nguyễn.pdf',
   'CV thương mại điện tử - Nguyễn Mai Phương - Nguyen Mai Phuong.pdf',
   'CV_PHẠM-VIỆT-HƯNG_-NHÂN-VIÊN-CONTENT-WRITER - Hưng Phạm Việt.pdf',
   'CV-Le Nhat Linh.pdf',
   'Digital Marketing Resume-Nguyen Thanh Tuyen.pdf',
   'NGUYEN-THI-THANH-HANG-CV - Marketing Executive.pdf'],
  'predict': ['CV-Le Nhat Linh.pdf']},
 {'prompt': 'Find 5 Marketers in Ho Chi Minh City',
  'answer': ['CV - Nguyễn Thị Kiều Oanh .pdf',
   'TTS MARKETING - BÙI VĨNH HUY (1).pdf',
   'CV Content Creator.pdf',
   'Phi Ha Nhi_Brand Marketing Exe_Strategic Planner.pdf',
   'CV Marketing Executive - Le Thi Thuy Trang.pdf'],
  'predict': ['NGUYEN-THI-THANH-HANG-CV - Marketing Executive.pdf',
   'CV Marketing Exec

In [16]:
print(answers)

[{'prompt': 'Find 12 Marketers in Ha Noi', 'answer': ['Portfolio - Nguyen Tien Dat.pdf', 'Nguyễn-Hoàng-Việt-Content.pdf', 'CV_TRẦN THỊ HẢI YẾN - Hải Yến Trần.pdf', 'CV123.pdf', 'Intern Digital MKT_Nguyen Thi Mai Huong.pdf', 'LuongTienHuy-resume.pdf', 'CV Thiên Khôi - Duyên Nguyễn.pdf', 'CV thương mại điện tử - Nguyễn Mai Phương - Nguyen Mai Phuong.pdf', 'CV_PHẠM-VIỆT-HƯNG_-NHÂN-VIÊN-CONTENT-WRITER - Hưng Phạm Việt.pdf', 'CV-Le Nhat Linh.pdf', 'Digital Marketing Resume-Nguyen Thanh Tuyen.pdf', 'NGUYEN-THI-THANH-HANG-CV - Marketing Executive.pdf'], 'predict': ['CV-Le Nhat Linh.pdf']}, {'prompt': 'Find 5 Marketers in Ho Chi Minh City', 'answer': ['CV - Nguyễn Thị Kiều Oanh .pdf', 'TTS MARKETING - BÙI VĨNH HUY (1).pdf', 'CV Content Creator.pdf', 'Phi Ha Nhi_Brand Marketing Exe_Strategic Planner.pdf', 'CV Marketing Executive - Le Thi Thuy Trang.pdf'], 'predict': ['NGUYEN-THI-THANH-HANG-CV - Marketing Executive.pdf', 'CV Marketing Executive - Le Thi Thuy Trang.pdf', 'Portfolio - Nguyen Tien 

In [17]:
recall = 0
precision = 0
long_recall = 0
long_precision = 0
len_long = 0
short_recall = 0
short_precision = 0
len_short = 0

for answer in answers:
    num_ans = len(answer['answer'])
    correct = 0
    ans = set(answer['answer'])
    for p in answer['predict']:
        if p in ans:
            correct += 1
    recall += correct / num_ans
    precision += correct / (len(answer['predict'])+1e-5)
    if num_ans > 7:
        long_recall += correct / num_ans
        long_precision += correct / (len(answer['predict'])+1e-5)
        len_long+= 1
    else:
        short_recall += correct / num_ans
        short_precision += correct / (len(answer['predict'])+1e-5)
        len_short += 1

In [18]:
recall /= len(answers)
precision /= len(answers)
long_recall /= len_long
long_precision /= len_long
short_recall /= len_short
short_precision /= len_short
f1 = 2 * recall * precision / (recall + precision)
short_f1 = 2 * short_recall * short_precision / (short_recall + short_precision)
long_f1 = 2 * long_recall * long_precision / (long_recall + long_precision)

In [19]:
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1: {f1}")
print("=====================================")

print(f"Short Recall: {short_recall}")
print(f"Short Precision: {short_precision}")
print(f"Short F1: {short_f1}")
print("=====================================")

print(f"Long Recall: {long_recall}")
print(f"Long Precision: {long_precision}")
print(f"Long F1: {long_f1}")
print("=====================================")

Recall: 0.3048590298590299
Precision: 0.3628191507913521
F1: 0.3313233756322104
Short Recall: 0.3108294930875576
Short Precision: 0.3123643954364484
Short F1: 0.31159505405956034
Long Recall: 0.28172348484848486
Long Precision: 0.5583313277916035
Long F1: 0.374487581045318


## Haiku

Recall: 0.3048590298590299 \
Precision: 0.3628191507913521 \
F1: 0.3313233756322104 

=====================================

Short Recall: 0.3108294930875576 \
Short Precision: 0.3123643954364484 \
Short F1: 0.31159505405956034 

=====================================

Long Recall: 0.28172348484848486 \
Long Precision: 0.5583313277916035 \
Long F1: 0.374487581045318 

=====================================

## Llama 3 8b

## Llama 3 70b aws