In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Mount Google Drive so files stored under MyDrive can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
#Loading Datasets
# All four cleaned CSVs live in the same Drive folder; keep that folder in one
# place so the notebook can be pointed at another location with a single edit.
DATA_DIR = "/content/drive/MyDrive/IE7500 Project - Cleaned Data"

resumes_df = pd.read_csv(f"{DATA_DIR}/Cleaned_Resume.csv")
jobs_df = pd.read_csv(f"{DATA_DIR}/postings_enriched.csv")
job_skills_df = pd.read_csv(f"{DATA_DIR}/job_skills_cleaned.csv")
skills_df = pd.read_csv(f"{DATA_DIR}/skills_cleaned.csv")

In [5]:
#Data Merging - merging csvs to include Title + Description + Skill Names
#
# This cell is safe to re-run: a "skill_name" column left behind by a previous
# execution is dropped before each merge. Without that, a second run would make
# pandas emit "skill_name_x"/"skill_name_y" and the "skill_name" lookups below
# would raise KeyError.

# Attach the skill name to every (job_id, skill_abr) row.
job_skills_df = (
    job_skills_df
    .drop(columns=["skill_name"], errors="ignore")
    .merge(skills_df, on="skill_abr", how="left")
)

# Aggregate skills for each job into one space-separated string.
# Rows whose abbreviation found no match in skills_df carry NaN and are skipped.
skills_per_job = (
    job_skills_df
    .groupby("job_id")["skill_name"]
    .apply(lambda names: " ".join(names.dropna()))
    .reset_index()
)

# Merge into job postings; the left join keeps postings that list no skills.
jobs_df = (
    jobs_df
    .drop(columns=["skill_name"], errors="ignore")
    .merge(skills_per_job, on="job_id", how="left")
)

# Create final job_text field. Each part is filled with "" first because a
# single NaN would otherwise turn the whole concatenated string into NaN.
jobs_df["job_text"] = (
    jobs_df["Cleaned_Title"].fillna("") + " " +
    jobs_df["Cleaned_Description"].fillna("") + " " +
    jobs_df["skill_name"].fillna("")
)

jobs_df[["job_id", "job_text"]].head()

Unnamed: 0,job_id,job_text
0,921716,marketing coordinator job descriptiona leading...
1,1829192,mental health therapistcounselor at aspen ther...
2,10998357,assitant restaurant manager the national exemp...
3,23221523,senior elder law trusts and estates associate...
4,35982263,service technician looking for hvac service te...


In [6]:
# Load the sentence-embedding model that encodes both resumes and job postings.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
bert_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Prepare encoder inputs: missing values become empty strings, then each
# column is converted to a plain Python list.
resume_text_series = resumes_df["Cleaned_Resume"].fillna("")
job_text_series = jobs_df["job_text"].fillna("")

resumes_text_clean = resume_text_series.tolist()
jobs_text_clean = job_text_series.tolist()

In [8]:
# Compute embeddings for both corpora with identical encoder settings.
encode_options = dict(show_progress_bar=True, batch_size=64)

print("Encoding resumes...")
resume_embeddings = bert_model.encode(resumes_text_clean, **encode_options)

print("Encoding jobs...")
job_embeddings = bert_model.encode(jobs_text_clean, **encode_options)

Encoding resumes...


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Encoding jobs...


Batches:   0%|          | 0/1936 [00:00<?, ?it/s]

In [9]:
# Pairwise cosine similarity of every resume embedding against every job
# embedding; later cells index the result as [resume, job].
similarity_matrix = cosine_similarity(X=resume_embeddings, Y=job_embeddings)

In [10]:
# Mean top-5 similarity: average each resume's five highest job scores,
# then average those per-resume values into a single summary number.
# Rows are sorted one at a time so only one row copy exists at any moment.
top5_scores = [
    np.mean(np.sort(resume_scores)[-5:])
    for resume_scores in similarity_matrix
]

mean_top5_similarity = np.mean(top5_scores)

print("Mean top-5 similarity (BERT):", mean_top5_similarity)

Mean top-5 similarity (BERT): 0.66518426


In [12]:
# List the resume columns available to the match table built below.
print(list(resumes_df.columns))

['Resume_str', 'Category', 'Cleaned_Resume', 'Lower_Only', 'Skill_NGrams', 'Resume_Length']


In [15]:
#Listing Top 5 Jobs Per Resume
# Builds one record per (resume, ranked job) pair and collects them in matches_df.

# The matrix rows/columns are positional, so they must line up with the frames
# they are mapped back onto; fail loudly instead of reporting wrong matches.
n_resumes, n_jobs = similarity_matrix.shape
assert n_resumes == len(resumes_df), "similarity_matrix rows do not match resumes_df"
assert n_jobs == len(jobs_df), "similarity_matrix columns do not match jobs_df"

matches_data = []

# enumerate() supplies the positional row for similarity_matrix, while the
# index label is kept only as the reported Resume_ID. Using the label itself as
# a position is only correct when resumes_df has a default 0..n-1 index.
for row_pos, (resume_id, resume_category) in enumerate(resumes_df["Category"].items()):
    resume_scores = similarity_matrix[row_pos]

    # Find top 5 job positions for this resume (negated so the sort is descending).
    # NOTE(review): postings with identical text can occupy several of the five
    # slots (the preview shows the same "General Manager" score three times) -
    # consider de-duplicating jobs if distinct recommendations are wanted.
    top_indices = np.argsort(-resume_scores)[:5]

    for rank, job_idx in enumerate(top_indices, start=1):
        job_row = jobs_df.iloc[job_idx]

        matches_data.append({
            "Resume_ID": resume_id,
            "Resume_Category": resume_category,
            "Job_Rank": rank,
            # .get() yields None rather than raising when a column is absent.
            "Job_ID": job_row.get("job_id", None),
            "Job_Title": job_row.get("title", None),
            # Holds the combined job_text (title + description + skills).
            "Job_Description": job_row.get("job_text", None),
            "Match_Score": resume_scores[job_idx]
        })

matches_df = pd.DataFrame(matches_data)

In [17]:
# Show preview: the first 10 records of the match table. Records are appended
# five per resume in rank order, so this covers the first two resumes.
matches_df.head(10)

Unnamed: 0,Resume_ID,Resume_Category,Job_Rank,Job_ID,Job_Title,Job_Description,Match_Score
0,0,HR,1,3903831023,Supervisor-Field Service Administrators,supervisorfield service administrators welcome...,0.660273
1,0,HR,2,3900979674,Restaurant General Manager,restaurant general manager hiring restaurant g...,0.659035
2,0,HR,3,3906233315,General Manager,general manager we create communities where em...,0.656406
3,0,HR,4,3905306920,General Manager,general manager we create communities where em...,0.656406
4,0,HR,5,3905306919,General Manager,general manager we create communities where em...,0.656406
5,1,HR,1,3886452319,Operations Specialist,operations specialist weareoneok fortune 500 ...,0.64389
6,1,HR,2,3906259450,Quality Engineer,quality engineer job title senior quality engi...,0.632201
7,1,HR,3,3901957825,Customer Experience Specialist,customer experience specialist what are we loo...,0.631199
8,1,HR,4,3894564020,Claims Administrative Support Specialist,claims administrative support specialist join ...,0.628584
9,1,HR,5,3886879683,Data Coordinator,data coordinator positiondata analyst\njob typ...,0.628535
