### Avoids scroll-in-the-scroll in the entire Notebook

In [1]:
from IPython.display import Javascript
def resize_colab_cell():
    display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'))
get_ipython().events.register('pre_run_cell', resize_colab_cell)

# Preprocessing **`Job Description`**

### Importing dataset from Hugging Face

In [2]:
!pip install datasets

<IPython.core.display.Javascript object>

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.1-py3-none-a

In [3]:
from datasets import load_dataset

dataset = load_dataset("jacob-hugging-face/job-descriptions")
print(dataset)

<IPython.core.display.Javascript object>

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.77M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
        num_rows: 853
    })
})


### Extracting *`Company Name`*

In [4]:
def JD_extract_company(job_num):
    return dataset["train"][job_num]["company_name"]

<IPython.core.display.Javascript object>

### Extracting *`Position`*

In [5]:
def JD_extract_position(job_num) :
    return dataset["train"][job_num]["position_title"]

<IPython.core.display.Javascript object>

### Extracting *`Required Skills`*

In [6]:
def JD_extract_required_skills(job_num):
    model_response = eval(dataset["train"][job_num]["model_response"])
    return model_response["Required Skills"] if "Required Skills" in model_response else "N/A"

<IPython.core.display.Javascript object>

### Extracting *`Educational Requirements`*

In [7]:
def JD_extract_educational_requirements(job_num):
    model_response = eval(dataset["train"][job_num]["model_response"])
    return model_response["Educational Requirements"] if "Educational Requirements" in model_response else "N/A"

<IPython.core.display.Javascript object>

# Preprocessing **`Resume Data`**
> needs attention (to upload `.zip` file)

### Extracting text from resume PDF

In [8]:
!pip install PyPDF2

<IPython.core.display.Javascript object>

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/232.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [9]:
import PyPDF2


def extract_resume_text(filePath):
    reader = PyPDF2.PdfReader(open(filePath, "rb"))

    text = ""
    for i in range(len(reader.pages)): text += reader.pages[i].extract_text()
    return text

<IPython.core.display.Javascript object>

### Extracting *`Category (Job Role)`*

In [10]:
def extract_category(text):
     return text.strip().split("\n")[0].strip() if text.strip() else None

<IPython.core.display.Javascript object>

### Extracting *`Education`*

In [11]:
import re


def extract_education(text):
    education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master']
    education = []

    for word in education_keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(word))
        match = re.search(pattern, text)
        if match: education.append(match.group())

    return education

<IPython.core.display.Javascript object>

### Extracting *`Skills`*

In [12]:
!pip install spacy
!python -m spacy info
!python -m pip freeze | grep spacy
!python -m spacy download en_core_web_sm

<IPython.core.display.Javascript object>

2023-09-16 11:13:54.625299: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[1m

spaCy version    3.6.1                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-5.15.109+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.6.0)        

en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
2023-09-16 11:14:08.474446: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in pe

In [13]:
import spacy
nlp = spacy.load("en_core_web_sm")


def extract_skills(text):
    skills = []
    doc = nlp(text)

    for word in doc:
        if "NN" in word.tag_: skills.append(word.text)

    return list(set(skills))

<IPython.core.display.Javascript object>

### Uploading Resume PDFs as `Zip` and Extracting
> needs attention (to upload `.zip` file)

In [14]:
from google.colab import files
import zipfile
import io


resume_folder = files.upload()

for folder in resume_folder.keys():
    if folder.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(resume_folder[folder]), "r") as zip_ref: zip_ref.extractall("/content/")
        path = ("/content/" + folder).replace(".zip", "")
    else: print("Enter a ZIP file containing resume PDFs")


print("Path: ", path)

<IPython.core.display.Javascript object>

Saving small_tempResume.zip to small_tempResume.zip
Path:  /content/small_tempResume


# Getting **`Embeddings`** and **`Cosine Similarity`** sores

### Function to get Embeddings from Tokenized array

In [15]:
!pip install transformers

<IPython.core.display.Javascript object>

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.2


In [16]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


def get_embeddings(tokenized_array):
    tokenized_array_input = " ".join(tokenized_array)
    tokenized_array_encoding = tokenizer(tokenized_array_input, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        tokenized_array_embedding = model(**tokenized_array_encoding).last_hidden_state.mean(dim=1).numpy()

    return tokenized_array_embedding

<IPython.core.display.Javascript object>

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

### Function to get *`Cosine Similarity`*

In [20]:
def get_cosine_similarity(JD_req_skills_embeddings, JD_req_edu_embeddings, CV_skills_embeddings, CV_edu_embeddings):
    skills_score = cosine_similarity(JD_req_skills_embeddings, CV_skills_embeddings)
    edu_score = cosine_similarity(JD_req_edu_embeddings, CV_edu_embeddings)
    return (skills_score + edu_score) / 2.0

<IPython.core.display.Javascript object>

### Function to get Cosine Similarity for Positions

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf_vectorizer = TfidfVectorizer()

<IPython.core.display.Javascript object>

In [22]:
def get_cosine_sim_for_category(position, category):

    tfidf_matrix = tfidf_vectorizer.fit_transform([position, category])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return cosine_sim[0][0]

<IPython.core.display.Javascript object>

# Driver Function

In [41]:
import PyPDF2, glob, os


def main():

    # considering first 10 jobs from the dataset
    jobs = 10
    pdf_files = glob.glob(os.path.join(path, "*.pdf"))

    # outer loop (jobs)
    for job_num in range(jobs):
        CV_and_score = []

        JD_req_skills_embeddings = get_embeddings(JD_extract_required_skills(job_num))
        JD_req_edu_embeddings = get_embeddings(JD_extract_educational_requirements(job_num))
        JD_position = JD_extract_position(job_num)


        # inner loop (resume/CVs)
        for cv in pdf_files:
            thisPDF_text = extract_resume_text(cv)

            CV_skills_embeddings = get_embeddings(extract_skills(thisPDF_text))
            CV_education_embeddings = get_embeddings(extract_education(thisPDF_text))
            CV_category = extract_category(thisPDF_text)

            score = (get_cosine_similarity(JD_req_skills_embeddings, JD_req_edu_embeddings, CV_skills_embeddings, CV_education_embeddings) + get_cosine_sim_for_category(JD_position, CV_category)) / 2.0
            CV_and_score.append((score[0][0], os.path.basename(cv)))
        # inner loop end



        CV_and_score.sort(key=lambda x: x[0], reverse=True)
        top_CVs = [filename for _, filename in CV_and_score[:5]]
        top_scores = [score for score, _ in CV_and_score[:5]]

        print(f"{JD_extract_company(job_num)} ({JD_position}): \nTop 5 CVs: {top_CVs} \nCorresponding Scores: {top_scores}\n")
    #outer loop end

<IPython.core.display.Javascript object>

In [42]:
if __name__ == "__main__": main()

<IPython.core.display.Javascript object>

Google (Sales Specialist): 
Top 5 CVs: ['3547447.pdf', '11919526.pdf', '11963737.pdf', '11981094.pdf', '11995013.pdf'] 
Corresponding Scores: [0.2826975, 0.2372889, 0.20863354, 0.17126028, 0.1555791]

Apple (Apple Solutions Consultant): 
Top 5 CVs: ['11919526.pdf', '11963737.pdf', '3547447.pdf', '11981094.pdf', '11995013.pdf'] 
Corresponding Scores: [0.31987262, 0.3091702, 0.29276848, 0.29208955, 0.27843538]

Netflix (Licensing Coordinator - Consumer Products): 
Top 5 CVs: ['11919526.pdf', '11963737.pdf', '3547447.pdf', '11981094.pdf', '11995013.pdf'] 
Corresponding Scores: [0.31913945, 0.30961245, 0.2928903, 0.2918102, 0.27712846]

Robert Half (Web Designer): 
Top 5 CVs: ['11919526.pdf', '11958994.pdf', '11963737.pdf', '11981094.pdf', '3547447.pdf'] 
Corresponding Scores: [0.48231077, 0.44088, 0.30813766, 0.2905259, 0.28965634]

TrackFive (Web Developer): 
Top 5 CVs: ['11919526.pdf', '11963737.pdf', '3547447.pdf', '11981094.pdf', '11995013.pdf'] 
Corresponding Scores: [0.31874403, 0.3