In [None]:
#!pip install pypdf
#!pip install pdf2image
#brew install poppler
#!pip install pytesseract

In [7]:
import cv2
from glob import glob
from langchain_openai import OpenAIEmbeddings
import numpy as np
import pandas as pd
from pdf2image import convert_from_path
import pickle
from pypdf import PdfReader 
import pytesseract
import re
import os
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
JD_PATH = "jd_data/*"
RESUME_PATH = "resume_data/*/*"
OPENAI_API_KEY = "API-KEY"
EMBEDDING_MODEL_NAME = "text-embedding-3-large"
OUTPUT_PATH = "output/"

JD_EMBEDDINGS_FILENAME = "jd_embeddings_large.pkl"
RESUME_EMBEDDINGS_FILENAME = "resume_embeddings_large.pkl"

In [9]:
# Refer Readme File for additional details, Change the file paths accordingly
poppler_path = r'D:\Apps\Release-24.07.0-0\poppler-24.07.0\Library\bin'
# Update the path to the location where Tesseract is installed
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [10]:
class DirectoryReader:
    def __init__(self, path_to_jds, path_to_resumes):
        self.path_to_jds = path_to_jds
        self.path_to_resumes = path_to_resumes
        self.jd_data = {}
        self.resume_data = {}
        
    def read_jd_files(self):
        file_list = glob(self.path_to_jds, recursive = True)
        for file in file_list:
            with open(file, "r", encoding="utf-8") as f:
                data = f.read()
                data = data.strip().lower()
                job_name = file.split("/")[-1].replace(".txt", "")
                # print(file, job_name)
                self.jd_data[job_name] = data
        return self.jd_data
    
    def read_resume_files(self):
        file_list = glob(self.path_to_resumes, recursive = True)
        for file in file_list:
            print(file)
            reader = PdfReader(file)
            data = ""
            for page in reader.pages:
                data = data + page.extract_text() + "\n"
            data = data.strip().lower()
            
            
            # job_title = file.split("/")[1].replace(" ", "_").lower()
            # resume_name = file.split("/")[2].replace("-", "_").lower().replace(".pdf", "")

            file_parts = os.path.normpath(file).split(os.sep)

            # The job title would be the name of the directory just before the file name
            job_title = file_parts[-2].replace(" ", "_").lower()
            
            # The resume name would be the file name without the extension
            resume_name = os.path.basename(file_parts[-1]).replace("-", "_").lower().replace(".pdf", "")    

            if len(data) > 1:
                self.resume_data[job_title + "_" + resume_name] = data
            else: #to solve for incorrect startxref pointer(3), since they are images in pdf
                pages = convert_from_path(file, poppler_path=poppler_path)
                extracted_text = []
                for page in pages:
                    # Step 1: Preprocess the image (deskew)
                    preprocessed_image = self.deskew(np.array(page))
                    # Step 2: Extract text using OCR
                    text = self.extract_text_from_image(preprocessed_image)
                    extracted_text.append(text)
                self.resume_data[job_title + "_" + resume_name] = "\n".join(extracted_text).strip().lower()
        return self.resume_data
    
    def deskew(self, image):
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        gray = cv2.bitwise_not(gray)
        coords = np.column_stack(np.where(gray > 0))
        angle = cv2.minAreaRect(coords)[-1]

        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
        return rotated
    
    def extract_text_from_image(self, image):
        text = pytesseract.image_to_string(image)
        return text

In [11]:
dir_reader = DirectoryReader(JD_PATH, RESUME_PATH)

In [12]:
jd_data = dir_reader.read_jd_files()
resume_data = dir_reader.read_resume_files()

incorrect startxref pointer(3)


resume_data\Big Data Analyst\resume-1.pdf
resume_data\Big Data Analyst\resume-2.pdf
resume_data\Big Data Analyst\resume-3.pdf
resume_data\Business Intelligence Analyst\resume-10.pdf
resume_data\Business Intelligence Analyst\resume-11.pdf
resume_data\Business Intelligence Analyst\resume-12.pdf
resume_data\Business Intelligence Analyst\resume-13.pdf
resume_data\Business Intelligence Analyst\resume-4.pdf


incorrect startxref pointer(3)


resume_data\Business Intelligence Analyst\resume-5.pdf
resume_data\Business Intelligence Analyst\resume-6.pdf


incorrect startxref pointer(3)


resume_data\Business Intelligence Analyst\resume-7.pdf
resume_data\Business Intelligence Analyst\resume-8.pdf
resume_data\Business Intelligence Analyst\resume-9.pdf
resume_data\Data Analyst\resume-14.pdf
resume_data\Data Analyst\resume-15.pdf
resume_data\Data Analyst\resume-16.pdf
resume_data\Data Analyst\resume-17.pdf
resume_data\Data Analyst\resume-18.pdf
resume_data\Data Analyst\resume-19.pdf
resume_data\Data Analyst\resume-20.pdf
resume_data\Data Analyst\resume-21.pdf
resume_data\Data Analyst\resume-22.pdf
resume_data\Data Analyst\resume-23.pdf
resume_data\Data Analyst\resume-24.pdf
resume_data\Data Analyst\resume-25.pdf
resume_data\Data Analyst\resume-26.pdf
resume_data\Data Analyst\resume-27.pdf
resume_data\Data Analyst\resume-28.pdf
resume_data\Data Architect\resume-29.pdf


incorrect startxref pointer(3)


resume_data\Data Architect\resume-30.pdf


incorrect startxref pointer(3)


resume_data\Data Architect\resume-31.pdf
resume_data\Data Architect\resume-32.pdf
resume_data\Data Architect\resume-33.pdf
resume_data\Data Architect\resume-34.pdf
resume_data\Data Architect\resume-35.pdf
resume_data\Data Architect\resume-36.pdf
resume_data\Data Architect\resume-37.pdf
resume_data\Data Architect\resume-38.pdf
resume_data\Data Architect\resume-39.pdf
resume_data\Data Engineer\resume-40.pdf
resume_data\Data Engineer\resume-41.pdf
resume_data\Data Engineer\resume-42.pdf
resume_data\Data Engineer\resume-43.pdf
resume_data\Data Engineer\resume-44.pdf
resume_data\Data Engineer\resume-45.pdf
resume_data\Data Engineer\resume-46.pdf
resume_data\Data Engineer\resume-47.pdf
resume_data\Data Engineer\resume-48.pdf
resume_data\Data Engineer\resume-49.pdf
resume_data\Data Engineer\resume-50.pdf
resume_data\Data Engineer\resume-51.pdf
resume_data\Data Engineer\resume-52.pdf
resume_data\Data Engineer\resume-53.pdf
resume_data\Data Scientist\resume-54.pdf
resume_data\Data Scientist\res

incorrect startxref pointer(3)


resume_data\Data Scientist\resume-72.pdf
resume_data\Machine Learning Engineer\resume-73.pdf


incorrect startxref pointer(3)


resume_data\Machine Learning Engineer\resume-74.pdf


incorrect startxref pointer(3)


resume_data\Machine Learning Engineer\resume-75.pdf
resume_data\Machine Learning Engineer\resume-76.pdf
resume_data\Machine Learning Engineer\resume-77.pdf
resume_data\Machine Learning Engineer\resume-78.pdf
resume_data\Machine Learning Engineer\resume-79.pdf
resume_data\Machine Learning Engineer\resume-80.pdf
resume_data\Machine Learning Engineer\resume-81.pdf
resume_data\Machine Learning Engineer\resume-82.pdf
resume_data\Machine Learning Engineer\resume-83.pdf
resume_data\MLOps Engineer\resume-84.pdf
resume_data\MLOps Engineer\resume-85.pdf
resume_data\MLOps Engineer\resume-86.pdf
resume_data\MLOps Engineer\resume-87.pdf
resume_data\MLOps Engineer\resume-88.pdf


In [13]:
print("Number of JDs -> ", len(jd_data))
print("Number of Resumes -> ", len(resume_data))

Number of JDs ->  32
Number of Resumes ->  88


In [14]:
class EmbeddingModel:
    def __init__(self):
        self.embedding_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model=EMBEDDING_MODEL_NAME)
        
    def get_embeddings(self, data_dict):
        output_dict = {}
        keys = list(data_dict.keys())
        values = list(data_dict.values())
        embeddings = self.embedding_model.embed_documents(values)
        for i in range(0, len(keys)):
            output_dict[keys[i]] = embeddings[i]
        return output_dict
        
    def save_embeddings(self, embedding, file_name):
        with open(OUTPUT_PATH + file_name, 'wb') as handle:
            pickle.dump(embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    def read_embeddings(self, file_name):
        with open(OUTPUT_PATH + file_name, 'rb') as handle:
            output_dict = pickle.load(handle)
        return output_dict

In [15]:
embedding_model = EmbeddingModel()

In [16]:
#Generate and Save JD Embeddings
jd_embeddings = embedding_model.get_embeddings(jd_data)
embedding_model.save_embeddings(jd_embeddings, JD_EMBEDDINGS_FILENAME)

In [17]:
#Generate and Save Resume Embeddings
resume_embeddings = embedding_model.get_embeddings(resume_data)
embedding_model.save_embeddings(resume_embeddings, RESUME_EMBEDDINGS_FILENAME)

### Read Embeddings

In [18]:
jd_embeddings = embedding_model.read_embeddings(JD_EMBEDDINGS_FILENAME)
resume_embeddings = embedding_model.read_embeddings(RESUME_EMBEDDINGS_FILENAME)

### Calculate cosine similarity

In [19]:
resume_jd_combi_to_match = {"data_engineer": "de", "data_analyst": "dataanalyst", 
                            "big_data_analyst": "bigdataanalyst", 
                           "mlops_engineer": "mlops", "data_scientist": "ds", "data_architect": "da",
                           "machine_learning_engineer": "mle", "business_intelligence_analyst": "bianalyst"}

jd_pattern = re.compile(r'\d+_[a-z]+$')
resume_pattern = re.compile(r'_resume_\d+$')

In [28]:
SIMILARITY_DICT = {}
for key1 in jd_embeddings.keys():

    for key2 in resume_embeddings.keys():
  
        cleaned_jd_category = jd_pattern.sub('', key1).replace('jd_data\\', '').replace('jd_data/', '')
        print("JD", cleaned_jd_category)
        cleaned_resume_category = resume_pattern.sub('', key2)
        print("Res", cleaned_resume_category)
        if resume_jd_combi_to_match[cleaned_resume_category] == cleaned_jd_category:
            sim_score = cosine_similarity(np.array(jd_embeddings[key1]).reshape(1, -1), 
                                          np.array(resume_embeddings[key2]).reshape(1, -1))[0][0]
            print(sim_score)
            if key2 not in SIMILARITY_DICT.keys():
                SIMILARITY_DICT[key2] = {}
            SIMILARITY_DICT[key2][key1] = {"score": sim_score}
        else:
            continue

JD bianalyst
Res big_data_analyst
JD bianalyst
Res big_data_analyst
JD bianalyst
Res big_data_analyst
JD bianalyst
Res business_intelligence_analyst
0.5060060482967327
JD bianalyst
Res business_intelligence_analyst
0.5384419551283793
JD bianalyst
Res business_intelligence_analyst
0.49280038579932084
JD bianalyst
Res business_intelligence_analyst
0.4152638537615121
JD bianalyst
Res business_intelligence_analyst
0.4456743295315385
JD bianalyst
Res business_intelligence_analyst
0.433608582893469
JD bianalyst
Res business_intelligence_analyst
0.5355024951759682
JD bianalyst
Res business_intelligence_analyst
0.5115681493032047
JD bianalyst
Res business_intelligence_analyst
0.5070309571995366
JD bianalyst
Res business_intelligence_analyst
0.44577086621022755
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analyst
JD bianalyst
Res data_analy

In [30]:
def get_top_matching_job(resume_name):
    score_list = [[key, SIMILARITY_DICT[resume_name][key]['score']] 
                  for key in SIMILARITY_DICT[resume_name].keys()]
    score_list = sorted(score_list, key=lambda x: x[1], reverse=True)
    return score_list[0]

In [31]:
get_top_matching_job("big_data_analyst_resume_1")

['jd_data\\bigdataanalyst4_quess', 0.5563235144955632]

In [32]:
output = []
for key in SIMILARITY_DICT.keys():
    top_matching_job = get_top_matching_job(key)
    output.append([key, top_matching_job[0], int(round(top_matching_job[1] * 100.0))])
    print("Resume Name: ", key, "\nJD Name: ", top_matching_job[0], 
          "\nMatching Score: ", int(round(top_matching_job[1] * 100.0)))
    print("----------")

Resume Name:  business_intelligence_analyst_resume_10 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  54
----------
Resume Name:  business_intelligence_analyst_resume_11 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  55
----------
Resume Name:  business_intelligence_analyst_resume_12 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  56
----------
Resume Name:  business_intelligence_analyst_resume_13 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  50
----------
Resume Name:  business_intelligence_analyst_resume_4 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  52
----------
Resume Name:  business_intelligence_analyst_resume_5 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  51
----------
Resume Name:  business_intelligence_analyst_resume_6 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  63
----------
Resume Name:  business_intelligence_analyst_resume_7 
JD Name:  jd_data\bianalyst3_nicbl 
Matching Score:  57
----------
Resume Name:  business_intel

In [33]:
match_df = pd.DataFrame(output, columns=["resume_name", "jd_name", "matching_score"])

In [34]:
match_df.head(100)

Unnamed: 0,resume_name,jd_name,matching_score
0,business_intelligence_analyst_resume_10,jd_data\bianalyst3_nicbl,54
1,business_intelligence_analyst_resume_11,jd_data\bianalyst3_nicbl,55
2,business_intelligence_analyst_resume_12,jd_data\bianalyst3_nicbl,56
3,business_intelligence_analyst_resume_13,jd_data\bianalyst3_nicbl,50
4,business_intelligence_analyst_resume_4,jd_data\bianalyst3_nicbl,52
...,...,...,...
83,mlops_engineer_resume_84,jd_data\mlops3_exela,47
84,mlops_engineer_resume_85,jd_data\mlops4_exl,60
85,mlops_engineer_resume_86,jd_data\mlops4_exl,54
86,mlops_engineer_resume_87,jd_data\mlops1_mindtree,51
