In [3]:
from google.colab import files
# Open the Colab upload widget. The recorded output shows the chosen files
# (postings.csv, Resume.csv) being saved under their own names, which is
# where the read_csv calls further down look for them.
uploaded = files.upload()

Saving postings.csv to postings.csv
Saving Resume.csv to Resume.csv


In [4]:
import pandas as pd

# Read only the first 200 rows of each uploaded CSV (resumes first, then
# job postings) so the rest of the notebook works on a small subset.
resumes_df, jobs_df = (
    pd.read_csv(csv_name, nrows=200)
    for csv_name in ('Resume.csv', 'postings.csv')
)

In [5]:
# Show the first five resume rows as a sanity check of the load.
resumes_df.head(n=5)

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [6]:
# Print the column/dtype/null summary first and leave head() as the final
# expression of the cell. A notebook only renders the value of a cell's last
# expression, so calling head() before info() silently discarded the preview.
jobs_df.info()
jobs_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   job_id                      200 non-null    int64  
 1   company_name                169 non-null    object 
 2   title                       200 non-null    object 
 3   description                 200 non-null    object 
 4   max_salary                  75 non-null     float64
 5   pay_period                  87 non-null     object 
 6   location                    200 non-null    object 
 7   company_id                  169 non-null    float64
 8   views                       194 non-null    float64
 9   med_salary                  12 non-null     float64
 10  min_salary                  75 non-null     float64
 11  formatted_work_type         200 non-null    object 
 12  applies                     57 non-null     float64
 13  original_listed_time        200 non

In [7]:
# Discard every row that lacks one of the fields used later on:
# resume text + category for resumes, description + title for job postings.
resumes_df = resumes_df.dropna(axis='index', how='any', subset=['Resume_str', 'Category'])
jobs_df = jobs_df.dropna(axis='index', how='any', subset=['description', 'title'])

In [8]:
# Draw (up to) 200 rows from each frame with a fixed seed for reproducibility.
# Only 200 rows were read and dropna() may have removed some of them; asking
# sample() for more rows than exist raises ValueError, so the request is
# capped at the number of rows actually available.
jobs_df = jobs_df.sample(n=min(200, len(jobs_df)), random_state=42)
resumes_df = resumes_df.sample(n=min(200, len(resumes_df)), random_state=42)

In [9]:
#cleaning text columns
import re

def clean_text(text):
    """Remove HTML-style markup from ``text`` and return the result as a string.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted (``None`` becomes ``'None'``).

    Only spans that look like markup are removed:

    * HTML comments (``<!-- ... -->``), which may span several lines;
    * opening/closing tags and declarations, i.e. ``<`` optionally followed by
      ``/`` or ``!`` and then a letter, up to the next ``>``. The tag body may
      contain newlines but never another ``<`` or ``>``.

    Requiring a letter after ``<`` keeps ordinary comparisons such as
    ``"<50k or >40k"`` intact instead of deleting the text between the two
    signs. Each removed span is replaced by a single space so the words on
    either side of a tag are not fused together (``"a<br>b"`` -> ``"a b"``).
    Text that contains no markup is returned unchanged.
    """
    # DOTALL only matters for the comment branch, letting `.` cross newlines.
    return re.sub(r'<!--.*?-->|<[/!]?[A-Za-z][^<>]*>', ' ', str(text), flags=re.DOTALL)

# Run the cleaner over every free-text column that feeds the matchers.
resumes_df['Resume_str'] = resumes_df['Resume_str'].map(clean_text)
jobs_df['description'] = jobs_df['description'].map(clean_text)
jobs_df['title'] = jobs_df['title'].map(clean_text)

# One text field per job posting: "<title> <description>".
jobs_df['job_text'] = jobs_df['title'].str.cat(jobs_df['description'], sep=' ')

# Resume texts followed by job texts; the vectorizers are fitted on this
# combined series so both sides share one vocabulary.
combined_text = pd.concat(
    (resumes_df['Resume_str'], jobs_df['job_text']),
    axis=0,
)

Model 1: TF-IDF + Cosine Similarity

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer (English stop words removed, vocabulary capped
# at 5000 terms) and fit it on resumes and job texts together.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
).fit(combined_text)

# Project each side into the shared TF-IDF space.
resume_tfidf_vecs = tfidf_vectorizer.transform(resumes_df['Resume_str'])
job_tfidf_vecs = tfidf_vectorizer.transform(jobs_df['job_text'])

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity between every resume (rows) and every job (columns).
similarity_matrix_tfidf = cosine_similarity(resume_tfidf_vecs, job_tfidf_vecs)

# For each resume, record its 5 highest-scoring jobs, best match first.
matches_data_tfidf = []

for i, (resume_id, resume_category) in enumerate(
        zip(resumes_df['ID'], resumes_df['Category'])):
    # argsort on the negated row orders job positions by descending score.
    top_indices = np.argsort(-similarity_matrix_tfidf[i])[:5]  # Top 5 matches

    for job_idx in top_indices:
        # job_idx is a positional index into jobs_df, hence iloc.
        job = jobs_df.iloc[job_idx]
        match_score = similarity_matrix_tfidf[i, job_idx]

        matches_data_tfidf.append({
            'Resume_ID': resume_id,
            'Resume_Category': resume_category,
            'Job_ID': job['job_id'],
            'Job_Title': job['title'],
            'Job_Description': job['description'],
            'Match_Score': match_score
        })

matches_df_tfidf = pd.DataFrame(matches_data_tfidf)

# The generic names are kept as aliases so existing cells that read them keep
# working. The CountVectorizer cell further down rebinds these generic names,
# so use the *_tfidf names whenever the TF-IDF results are needed afterwards.
similarity_matrix = similarity_matrix_tfidf
matches_data = matches_data_tfidf
matches_df = matches_df_tfidf

In [12]:
# Show the first ten match rows (two resumes x five jobs each).
matches_df.head(n=10)

Unnamed: 0,Resume_ID,Resume_Category,Job_ID,Job_Title,Job_Description,Match_Score
0,28640735,HR,1448163866,Office Manager,Responsibilities: • Oversees and manages all a...,0.249139
1,28640735,HR,229924287,Administrative Assistant,The Administrative Assistant will organize and...,0.2332
2,28640735,HR,3277232283,HR & Administrative Assistant,HR & Administrative CoordinatorWork is Remote ...,0.192637
3,28640735,HR,95428182,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,0.182875
4,28640735,HR,3486250934,Chief Executive Officer,We are looking for an experienced Chief Execut...,0.160509
5,32977530,HR,2920450495,Service Coordinator,"QualificationsExperience:Data Entry, 4 years (...",0.180104
6,32977530,HR,3582587683,Accounts Analyst,Company DescriptionMcNaughton Bros. Inc. is a ...,0.179697
7,32977530,HR,2428973832,Staff Accountant,The ideal candidate will be responsible for ti...,0.177683
8,32977530,HR,95428182,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,0.176963
9,32977530,HR,3564676727,Corporate Controller,We are seeking a highly skilled and experience...,0.152896


Model 2: CountVectorizer + Cosine Similarity

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts (English stop words removed, vocabulary capped at
# 5000 terms), fitted on resumes and job texts together.
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
count_vectorizer.fit(combined_text)

resume_count_vecs = count_vectorizer.transform(resumes_df['Resume_str'])
job_count_vecs = count_vectorizer.transform(jobs_df['job_text'])

# Cosine similarity matrix (resume x job). Computed once and reused below;
# running the identical call a second time only doubled the work.
similarity_matrix_count = cosine_similarity(resume_count_vecs, job_count_vecs)

# Build matches dataframe (Top 5 jobs per resume, best match first)
matches_data_count = []

for i, (resume_id, resume_category) in enumerate(
        zip(resumes_df['ID'], resumes_df['Category'])):
    # argsort on the negated row orders job positions by descending score.
    top_indices = np.argsort(-similarity_matrix_count[i])[:5]

    for job_idx in top_indices:
        # job_idx is a positional index into jobs_df, hence iloc.
        job = jobs_df.iloc[job_idx]
        match_score = similarity_matrix_count[i, job_idx]

        matches_data_count.append({
            'Resume_ID': resume_id,
            'Resume_Category': resume_category,
            'Job_ID': job['job_id'],
            'Job_Title': job['title'],
            'Job_Description': job['description'],
            'Match_Score': match_score
        })

matches_df_count = pd.DataFrame(matches_data_count)

# This cell has always published its results under the generic names below,
# so they are still bound for backward compatibility. NOTE: this replaces
# whatever an earlier cell stored under the same names; prefer the *_count
# names when referring to the CountVectorizer results.
similarity_matrix = similarity_matrix_count
matches_data = matches_data_count
matches_df = matches_df_count

matches_df_count.head(10)

Unnamed: 0,Resume_ID,Resume_Category,Job_ID,Job_Title,Job_Description,Match_Score
0,28640735,HR,1448163866,Office Manager,Responsibilities: • Oversees and manages all a...,0.421269
1,28640735,HR,229924287,Administrative Assistant,The Administrative Assistant will organize and...,0.417615
2,28640735,HR,3277232283,HR & Administrative Assistant,HR & Administrative CoordinatorWork is Remote ...,0.338888
3,28640735,HR,3260455681,Operations Specialist,Weyerhaeuser is searching for a Operations Spe...,0.290013
4,28640735,HR,95428182,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,0.290003
5,32977530,HR,2920450495,Service Coordinator,"QualificationsExperience:Data Entry, 4 years (...",0.316318
6,32977530,HR,3582587683,Accounts Analyst,Company DescriptionMcNaughton Bros. Inc. is a ...,0.304209
7,32977530,HR,9615617,Inside Customer Service Associate,Glastender Inc. is a family-owned manufacturer...,0.29267
8,32977530,HR,3377655935,Senior Account Manager,"Are you a proactive, detail-oriented professio...",0.262732
9,32977530,HR,2428973832,Staff Accountant,The ideal candidate will be responsible for ti...,0.261206


Model 3: Sentence-BERT embeddings (all-MiniLM-L6-v2) + Cosine Similarity

In [15]:
!pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [16]:
from sentence_transformers import SentenceTransformer

In [17]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model. The recorded output
# below shows its files being fetched from the Hugging Face Hub; the HF_TOKEN
# warning is informational (authentication is optional for public models).
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
# Encode every resume and every job text into a dense embedding vector.
# NOTE(review): sentence-transformer models typically truncate inputs beyond
# their maximum sequence length, so long resumes may only be partially
# embedded - confirm against the model's documentation.
resume_vecs_bert = bert_model.encode(
    list(resumes_df['Resume_str']),
    show_progress_bar=True,
)
job_vecs_bert = bert_model.encode(
    list(jobs_df['job_text']),
    show_progress_bar=True,
)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [20]:
# Cosine similarity between every resume embedding (rows) and every job
# embedding (columns).
similarity_matrix_bert = cosine_similarity(resume_vecs_bert, job_vecs_bert)

# For each resume, keep its five best-scoring jobs, highest score first.
matches_data_bert = []

for i, (resume_id, resume_category) in enumerate(
        zip(resumes_df['ID'], resumes_df['Category'])):
    # Negating the row makes argsort return positions by descending score.
    top_indices = np.argsort(-similarity_matrix_bert[i])[:5]

    for job_idx in top_indices:
        job = jobs_df.iloc[job_idx]  # positional lookup of the matched job
        match_score = similarity_matrix_bert[i, job_idx]

        matches_data_bert.append(dict(
            Resume_ID=resume_id,
            Resume_Category=resume_category,
            Job_ID=job['job_id'],
            Job_Title=job['title'],
            Job_Description=job['description'],
            Match_Score=match_score,
        ))

matches_df_bert = pd.DataFrame(matches_data_bert)

matches_df_bert.head(n=10)

Unnamed: 0,Resume_ID,Resume_Category,Job_ID,Job_Title,Job_Description,Match_Score
0,28640735,HR,3386174836,Administrative Assistant Project Coordinator,We are seeking a Project Coordinator / Admin t...,0.60605
1,28640735,HR,1448163866,Office Manager,Responsibilities: • Oversees and manages all a...,0.58122
2,28640735,HR,3438976411,Accounting Specialist,We are Health and Rehab Solutions ! We are loo...,0.505324
3,28640735,HR,3277232283,HR & Administrative Assistant,HR & Administrative CoordinatorWork is Remote ...,0.49098
4,28640735,HR,229924287,Administrative Assistant,The Administrative Assistant will organize and...,0.486808
5,32977530,HR,1448163866,Office Manager,Responsibilities: • Oversees and manages all a...,0.625228
6,32977530,HR,3386174836,Administrative Assistant Project Coordinator,We are seeking a Project Coordinator / Admin t...,0.612224
7,32977530,HR,229924287,Administrative Assistant,The Administrative Assistant will organize and...,0.583158
8,32977530,HR,2428973832,Staff Accountant,The ideal candidate will be responsible for ti...,0.535065
9,32977530,HR,2372177891,Client Service Associate,"As a non-licensed Client Services Assistant, y...",0.506629


Model 4: spaCy document vectors (en_core_web_md) + Cosine Similarity

In [21]:
!pip install -U spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [24]:
# Load spaCy's medium English pipeline (en_core_web_md).
import spacy
nlp = spacy.load("en_core_web_md")


def get_spacy_vector(text):
    """Return the document-level vector spaCy computes for ``text``.

    The text is run through the loaded ``nlp`` pipeline and the resulting
    Doc's ``.vector`` attribute is returned.
    """
    doc = nlp(text)
    return doc.vector


# Attach one document vector per row to both frames.
resumes_df['spacy_vec'] = resumes_df['Resume_str'].apply(get_spacy_vector)
jobs_df['spacy_vec'] = jobs_df['job_text'].apply(get_spacy_vector)

In [25]:
# Stack the per-row vectors into 2-D arrays (one row per document).
resume_spacy_vecs = np.vstack(resumes_df['spacy_vec'].tolist())
job_spacy_vecs = np.vstack(jobs_df['spacy_vec'].tolist())

# Resume-by-job cosine similarity.
similarity_matrix_spacy = cosine_similarity(resume_spacy_vecs, job_spacy_vecs)

# Collect the five highest-scoring jobs for every resume, best first.
matches_data_spacy = []

for i, resume_id in enumerate(resumes_df['ID']):
    resume_category = resumes_df['Category'].iloc[i]
    # Negating the row makes argsort return positions by descending score.
    top_indices = np.argsort(-similarity_matrix_spacy[i])[:5]

    for job_idx in top_indices:
        job = jobs_df.iloc[job_idx]  # positional lookup of the matched job
        match_score = similarity_matrix_spacy[i, job_idx]

        record = {
            'Resume_ID': resume_id,
            'Resume_Category': resume_category,
            'Job_ID': job['job_id'],
            'Job_Title': job['title'],
            'Job_Description': job['description'],
            'Match_Score': match_score,
        }
        matches_data_spacy.append(record)

matches_df_spacy = pd.DataFrame(matches_data_spacy)
matches_df_spacy.head(n=10)

Unnamed: 0,Resume_ID,Resume_Category,Job_ID,Job_Title,Job_Description,Match_Score
0,28640735,HR,2914254129,Director of Operations,Director of OperationsAlliance for Strategic G...,0.990551
1,28640735,HR,56482768,Appalachian Highlands Women's Business Center,FULL JOB DESCRIPTION – PROGRAM DIRECTOR Appala...,0.990192
2,28640735,HR,95428182,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,0.990059
3,28640735,HR,3486250934,Chief Executive Officer,We are looking for an experienced Chief Execut...,0.990054
4,28640735,HR,3514683108,Associate Brand Manager,Job Title: Associate Brand Manager / Brand Man...,0.989684
5,32977530,HR,95428182,Administrative Coordinator,Job Title: Administrative CoordinatorOrganizat...,0.982036
6,32977530,HR,196406165,Loan Coordinator,"Morlen Capital Advisors, a boutique commercial...",0.980429
7,32977530,HR,103860943,Customer Service / Reservationist,Sentinel Limousine of East Providence RI is a ...,0.979973
8,32977530,HR,2372177891,Client Service Associate,"As a non-licensed Client Services Assistant, y...",0.979926
9,32977530,HR,2428973832,Staff Accountant,The ideal candidate will be responsible for ti...,0.979314
