### Avoids scroll-in-the-scroll in the entire Notebook

In [1]:
from IPython.display import Javascript
def resize_colab_cell():
    display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'))
get_ipython().events.register('pre_run_cell', resize_colab_cell)

# Functions for **`Embeddings`** and **`Cosine Similarity`** sores

### Function to get Embeddings from Tokenized array

In [2]:
!pip install transformers

<IPython.core.display.Javascript object>

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.0 MB/s[0m eta [36m0:00:0

In [3]:
import torch
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


def get_embeddings(tokenized_array):
    tokenized_array_input = " ".join(tokenized_array)
    tokenized_array_encoding = tokenizer(tokenized_array_input, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        tokenized_array_embedding = model(**tokenized_array_encoding).last_hidden_state.mean(dim=1).numpy()

    return tokenized_array_embedding

<IPython.core.display.Javascript object>

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

### Function to get *`Cosine Similarity`*

In [4]:
def get_cosine_similarity(JD_req_skills_embeddings, JD_req_edu_embeddings, CV_skills_embeddings, CV_edu_embeddings):
    skills_score = cosine_similarity(JD_req_skills_embeddings, CV_skills_embeddings)
    edu_score = cosine_similarity(JD_req_edu_embeddings, CV_edu_embeddings)
    return (skills_score + edu_score) / 2.0

<IPython.core.display.Javascript object>

### Function to get Cosine Similarity for Positions

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
tfidf_vectorizer = TfidfVectorizer()

<IPython.core.display.Javascript object>

In [6]:
def get_cosine_sim_for_category(position, category):

    tfidf_matrix = tfidf_vectorizer.fit_transform([position, category])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return cosine_sim[0][0]

<IPython.core.display.Javascript object>

# Preprocessing **`Job Description`**

### Importing dataset from Hugging Face

In [7]:
!pip install datasets

<IPython.core.display.Javascript object>

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

In [8]:
from datasets import load_dataset

dataset = load_dataset("jacob-hugging-face/job-descriptions")
print(dataset)

<IPython.core.display.Javascript object>

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.77M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
        num_rows: 853
    })
})


### Extracting *`Company Name`*

In [9]:
def JD_extract_company(job_num):
    return dataset["train"][job_num]["company_name"]

<IPython.core.display.Javascript object>

### Extracting *`Position`*

In [10]:
def JD_extract_position(job_num) :
    return dataset["train"][job_num]["position_title"]

<IPython.core.display.Javascript object>

### Extracting *`Required Skills`*

In [11]:
def JD_extract_required_skills(job_num):
    model_response = eval(dataset["train"][job_num]["model_response"])
    return model_response["Required Skills"] if "Required Skills" in model_response else "N/A"

<IPython.core.display.Javascript object>

### Extracting *`Educational Requirements`*

In [12]:
def JD_extract_educational_requirements(job_num):
    model_response = eval(dataset["train"][job_num]["model_response"])
    return model_response["Educational Requirements"] if "Educational Requirements" in model_response else "N/A"

<IPython.core.display.Javascript object>

# Preprocessing **`Resume Data`**
> needs attention (to mount google drive & `.zip` file upload)

### Extracting text from resume PDF

In [13]:
!pip install PyPDF2

<IPython.core.display.Javascript object>

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [14]:
import PyPDF2


def extract_resume_text(filePath):
    reader = PyPDF2.PdfReader(open(filePath, "rb"))

    text = ""
    for i in range(len(reader.pages)): text += reader.pages[i].extract_text()
    return text

<IPython.core.display.Javascript object>

### Extracting *`Category (Job Role)`*

In [15]:
def extract_category(text):
     return text.strip().split("\n")[0].strip() if text.strip() else None

<IPython.core.display.Javascript object>

### Extracting *`Education`*

In [16]:
import re


def extract_education(text):
    education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master']
    education = []

    for word in education_keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(word))
        match = re.search(pattern, text)
        if match: education.append(match.group())

    return education

<IPython.core.display.Javascript object>

### Extracting *`Skills`*

In [17]:
!pip install spacy
!python -m spacy info
!python -m pip freeze | grep spacy
!python -m spacy download en_core_web_sm

<IPython.core.display.Javascript object>

[1m

spaCy version    3.6.1                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-5.15.109+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.6.0)        

en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
import spacy
nlp = spacy.load("en_core_web_sm")


def extract_skills(text):
    skills = []
    doc = nlp(text)

    for word in doc:
        if "NN" in word.tag_: skills.append(word.text)

    return list(set(skills))

<IPython.core.display.Javascript object>

### Uploading Resume PDFs as `Zip` and Extracting
> needs attention (to upload `.zip` file)

In [19]:
from google.colab import drive
drive.mount('/content/drive')

<IPython.core.display.Javascript object>

Mounted at /content/drive


In [20]:
import zipfile, os


# location/path of zip file in colab runtime storage
resume_folder = "/content/drive/MyDrive/Resume-to-Job-Matcher/dataset/resume_PDF_dataset.zip"

if zipfile.is_zipfile(resume_folder):
    with zipfile.ZipFile(resume_folder, 'r') as zip_ref:
        zip_ref.extractall()
        path = os.getcwd() + "/" + os.path.splitext(os.path.basename(resume_folder))[0]
else:
    print("# ERROR: Please upload a valid ZIP file with resume PDFs.")

<IPython.core.display.Javascript object>

# Getting Embeddings for JD & CV
> processing time may differ based on the dataset size

### For progress bar

In [21]:
!pip install tqdm
from tqdm import tqdm

<IPython.core.display.Javascript object>



### For *`Job Description`*

In [50]:
JD_req_skills_embeddings_list = []
JD_req_edu_embeddings_list = []
JD_position_list = []

jobs = 10
for JD_num in tqdm(range(jobs), desc="Processing Jobs"):
    JD_req_skills_embeddings_list.append(get_embeddings(JD_extract_required_skills(JD_num)))
    JD_req_edu_embeddings_list.append(get_embeddings(JD_extract_educational_requirements(JD_num)))
    JD_position_list.append(JD_extract_position(JD_num))

<IPython.core.display.Javascript object>

Processing Jobs: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


### For *`Resume/CVs`*

In [60]:
import glob, os


CV_skills_embeddings_list = []
CV_education_embeddings_list = []
CV_category_list = []

pdf_files = glob.glob(os.path.join(path, "*.pdf"))

# for cv in pdf_files:
for cv in tqdm(pdf_files, desc="Processing resume/CVs"):
    thisPDF_text = extract_resume_text(cv)

    CV_skills_embeddings_list.append(get_embeddings(extract_skills(thisPDF_text)))
    CV_education_embeddings_list.append(get_embeddings(extract_education(thisPDF_text)))
    CV_category_list.append(extract_category(thisPDF_text))

<IPython.core.display.Javascript object>

Processing resume/CVs: 100%|██████████| 2484/2484 [50:14<00:00,  1.21s/it]


# Driver Function
> processing time may differ based on the dataset size

In [66]:
import PyPDF2, glob, os


def main():
    # outer loop (jobs)
    for JD_num in tqdm(range(jobs), desc="Progress"):

        try:
            CV_and_score = []


            # inner loop (resume/CVs)
            for CV_num, cv in enumerate(pdf_files):
                try:
                    score = (get_cosine_similarity(JD_req_skills_embeddings_list[JD_num], JD_req_edu_embeddings_list[JD_num], CV_skills_embeddings_list[CV_num], CV_education_embeddings_list[CV_num]) + get_cosine_sim_for_category(JD_position_list[JD_num], CV_category_list[CV_num])) / 2.0
                    CV_and_score.append((score[0][0], os.path.basename(cv)))
                except:
                    print(f"\n# WARNING: Problematic Resume: {os.path.basename(cv)}\n")
            # inner loop end


            CV_and_score.sort(key=lambda x: x[0], reverse=True)
            top_scores = [score for score, _ in CV_and_score[:5]]
            top_CVs = [filename for _, filename in CV_and_score[:5]]

            print(f"\nCompany: {JD_extract_company(JD_num)} ({JD_position_list[JD_num]}): \nTop 5 CVs: {top_CVs} \nCorresponding Scores: {top_scores}%\n")
            # i have kept the corresponding scores as it was, you can customise it accordingly (for example, score*100 to convert into %)

        except:
            print(f"\n# WARNING: Problematic Job Description: {JD_extract_company(JD_num)} ({JD_position_list[JD_num]})\n")

        print("\n>>>>>>>\n")
        # outer loop end

<IPython.core.display.Javascript object>

In [67]:
if __name__ == "__main__": main()

<IPython.core.display.Javascript object>

Progress:   0%|          | 0/10 [00:00<?, ?it/s]





Progress:  10%|█         | 1/10 [00:10<01:36, 10.72s/it]


Company: Google (Sales Specialist): 
Top 5 CVs: ['15581242.pdf', '29184740.pdf', '40987524.pdf', '15765660.pdf', '18062906.pdf'] 
Corresponding Scores: [0.7049648, 0.50413, 0.49955475, 0.49910304, 0.49852768]%


>>>>>>>





Progress:  20%|██        | 2/10 [00:19<01:16,  9.59s/it]


Company: Apple (Apple Solutions Consultant): 
Top 5 CVs: ['22259768.pdf', '92246939.pdf', '15433732.pdf', '88907739.pdf', '20176584.pdf'] 
Corresponding Scores: [0.55596614, 0.5497733, 0.5464427, 0.54528093, 0.5450497]%


>>>>>>>





Progress:  30%|███       | 3/10 [00:29<01:09,  9.98s/it]


Company: Netflix (Licensing Coordinator - Consumer Products): 
Top 5 CVs: ['27549075.pdf', '20993320.pdf', '39081840.pdf', '11624880.pdf', '67501448.pdf'] 
Corresponding Scores: [0.46959135, 0.43199837, 0.43017942, 0.42925933, 0.4282253]%


>>>>>>>





Progress:  40%|████      | 4/10 [00:39<01:00, 10.00s/it]


Company: Robert Half (Web Designer): 
Top 5 CVs: ['29524570.pdf', '21283733.pdf', '37058472.pdf', '76010167.pdf', '68240723.pdf'] 
Corresponding Scores: [0.7958744, 0.71623665, 0.5971394, 0.5934597, 0.584285]%


>>>>>>>





Progress:  50%|█████     | 5/10 [00:49<00:48,  9.71s/it]


Company: TrackFive (Web Developer): 
Top 5 CVs: ['12763627.pdf', '62994611.pdf', '29524570.pdf', '28790806.pdf', '93828034.pdf'] 
Corresponding Scores: [0.60572416, 0.47849566, 0.46284539, 0.44951662, 0.44933534]%


>>>>>>>





Progress:  60%|██████    | 6/10 [00:59<00:40, 10.02s/it]


Company: DesignUps (Frontend Web Developer): 
Top 5 CVs: ['12763627.pdf', '62994611.pdf', '29524570.pdf', '93828034.pdf', '28790806.pdf'] 
Corresponding Scores: [0.5239907, 0.44354293, 0.42927745, 0.42372882, 0.42169523]%


>>>>>>>





Progress:  70%|███████   | 7/10 [01:08<00:29,  9.74s/it]


Company: Equisolve, Inc. (Remote Website Designer): 
Top 5 CVs: ['35990852.pdf', '25949631.pdf', '37058472.pdf', '76010167.pdf', '68240723.pdf'] 
Corresponding Scores: [0.6518127, 0.53860545, 0.5345228, 0.52692485, 0.52200377]%


>>>>>>>





Progress:  80%|████████  | 8/10 [01:19<00:19,  9.88s/it]


Company: Zander Insurance Agency (Web Designer): 
Top 5 CVs: ['29524570.pdf', '21283733.pdf', '37058472.pdf', '76010167.pdf', '68240723.pdf'] 
Corresponding Scores: [0.7956472, 0.7175295, 0.5972743, 0.58931637, 0.58465517]%


>>>>>>>





Progress:  90%|█████████ | 9/10 [01:29<00:10, 10.14s/it]


Company: Tuff (Web Designer): 
Top 5 CVs: ['29524570.pdf', '21283733.pdf', '37058472.pdf', '76010167.pdf', '68240723.pdf'] 
Corresponding Scores: [0.7969471, 0.71867, 0.59884655, 0.5913156, 0.58617866]%


>>>>>>>





Progress: 100%|██████████| 10/10 [01:38<00:00,  9.84s/it]


Company: General Dynamics Information Technology (SR. Web Designer): 
Top 5 CVs: ['29524570.pdf', '21283733.pdf', '37058472.pdf', '76010167.pdf', '39776400.pdf'] 
Corresponding Scores: [0.60812926, 0.5077069, 0.4405937, 0.4373555, 0.43093234]%


>>>>>>>




