In [20]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Mount Google Drive at /content/drive so the project CSVs stored under
# /content/drive/MyDrive can be read. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Loading Datasets
# The shared project folder is named once so that relocating the data only
# requires editing DATA_DIR instead of four separate path strings.
DATA_DIR = "/content/drive/MyDrive/IE7500 Project - Cleaned Data"

resumes_df = pd.read_csv(f"{DATA_DIR}/Cleaned_Resume.csv")
jobs_df = pd.read_csv(f"{DATA_DIR}/postings_enriched.csv")
job_skills_df = pd.read_csv(f"{DATA_DIR}/job_skills_cleaned.csv")
skills_df = pd.read_csv(f"{DATA_DIR}/skills_cleaned.csv")

In [23]:
# Preview the job-to-skill mapping: each row pairs a job_id with a skill_abr code.
job_skills_df.head()

Unnamed: 0,job_id,skill_abr
0,3884428798,mrkt
1,3884428798,pr
2,3884428798,wrt
3,3887473071,sale
4,3887465684,fin


In [24]:
# Preview the lookup table that expands each skill_abr code into a skill_name.
skills_df.head()

Unnamed: 0,skill_abr,skill_name
0,art,artcreative
1,dsgn,design
2,advr,advertising
3,prdm,product management
4,dist,distribution


In [25]:
# Preview the enriched job postings (raw fields plus the Cleaned_* text columns).
jobs_df.head()

Unnamed: 0,job_id,title,description,min_salary,max_salary,pay_period,skills_desc,remote_allowed,Cleaned_Title,Cleaned_Description,Job_Text,Cleaned_Skills,Remote,Normalized_Min_Salary,Normalized_Max_Salary,Top_Skills
0,921716,Marketing Coordinator,Job descriptionA leading real estate firm in N...,17.0,20.0,HOURLY,Requirements: \n\nWe are seeking a College or ...,,marketing coordinator,job descriptiona leading real estate firm in n...,marketing coordinator job descriptiona leading...,requirements \n\nwe are seeking a college or g...,0,17.0,20.0,"['mrkt', 'sale']"
1,1829192,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",30.0,50.0,HOURLY,,,mental health therapistcounselor,at aspen therapy and wellness we are committe...,mental health therapistcounselor at aspen ther...,,0,30.0,50.0,['hcpr']
2,10998357,Assitant Restaurant Manager,The National Exemplar is accepting application...,45000.0,65000.0,YEARLY,We are currently accepting resumes for FOH - A...,,assitant restaurant manager,the national exemplar is accepting application...,assitant restaurant manager the national exemp...,we are currently accepting resumes for foh as...,0,45000.0,65000.0,"['mgmt', 'mnfc']"
3,23221523,Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,140000.0,175000.0,YEARLY,This position requires a baseline understandin...,,senior elder law trusts and estates associate...,senior associate attorney elder law trusts a...,senior elder law trusts and estates associate...,this position requires a baseline understandin...,0,140000.0,175000.0,['othr']
4,35982263,Service Technician,Looking for HVAC service tech with experience ...,60000.0,80000.0,YEARLY,,,service technician,looking for hvac service tech with experience ...,service technician looking for hvac service te...,,0,60000.0,80000.0,['it']


# Data Merging
We will merge job skills into job postings to create a single textual representation for each job posting.

---


(Title + Description + Skill Names)



In [26]:
# Merge job_skills and skills dictionary so every (job_id, skill_abr) row
# carries its human-readable skill_name.
# Any "skill_name" column left over from a previous run is dropped first:
# otherwise re-running this cell makes the merge emit skill_name_x /
# skill_name_y and the groupby below fails with a KeyError.
job_skills_df = (
    job_skills_df
    .drop(columns=["skill_name"], errors="ignore")
    .merge(skills_df, on="skill_abr", how="left")
)

# Aggregate skills for each job into one space-separated string.
# Abbreviations without a match in skills_df yield NaN names and are skipped.
skills_per_job = (
    job_skills_df
    .groupby("job_id")["skill_name"]
    .apply(lambda names: " ".join(names.dropna()))
    .reset_index()
)

# Merge into job postings. The left join keeps postings that have no listed
# skills (their skill_name is NaN). Same re-run guard as above.
jobs_df = (
    jobs_df
    .drop(columns=["skill_name"], errors="ignore")
    .merge(skills_per_job, on="job_id", how="left")
)

# Create final job_text field: title + description + skill names.
# Missing parts are replaced by "" so string concatenation never produces NaN.
jobs_df["job_text"] = (
    jobs_df["Cleaned_Title"].fillna("") + " " +
    jobs_df["Cleaned_Description"].fillna("") + " " +
    jobs_df["skill_name"].fillna("")
)

jobs_df[["job_id", "job_text"]].head()

Unnamed: 0,job_id,job_text
0,921716,marketing coordinator job descriptiona leading...
1,1829192,mental health therapistcounselor at aspen ther...
2,10998357,assitant restaurant manager the national exemp...
3,23221523,senior elder law trusts and estates associate...
4,35982263,service technician looking for hvac service te...


In [27]:
# Hyper-parameter grid shared by both vectorizers:
# vocabulary size cap x n-gram range (unigrams only vs. unigrams + bigrams).
param_grid = dict(
    max_features=[5000, 10000],
    ngram_range=[(1, 1), (1, 2)],
)

In [28]:
def grid_search_vectorizer(
    resumes_text,
    jobs_text,
    param_grid,
    vectorizer_class
):
    """Score every hyper-parameter combination of a text vectorizer.

    For each combination in ``param_grid`` a vectorizer is fitted on the
    resumes and job texts together, both collections are transformed, and the
    resume-vs-job cosine similarity matrix is computed. The score of a
    combination is the mean, over all resumes, of the average of each
    resume's 5 highest job similarities (fewer when there are fewer than
    5 jobs).

    Args:
        resumes_text: pandas Series of resume texts; NaN is treated as "".
        jobs_text: pandas Series of job texts; NaN is treated as "".
        param_grid: dict mapping a vectorizer keyword argument to the iterable
            of values to try. Every key is forwarded to ``vectorizer_class``.
            An empty dict runs a single evaluation with default settings.
        vectorizer_class: class accepting keyword arguments and exposing
            ``fit`` / ``transform`` (e.g. ``TfidfVectorizer``).

    Returns:
        pandas DataFrame with one row per combination, holding
        ``mean_top5_similarity``, one column per grid key, and ``vectorizer``
        (the class name).
    """
    # NaN handling and the fitting corpus do not depend on the
    # hyper-parameters, so build them once instead of once per combination.
    resumes_text_clean = resumes_text.fillna("")
    jobs_text_clean = jobs_text.fillna("")
    combined_text = pd.concat([resumes_text_clean, jobs_text_clean])

    keys = list(param_grid)
    results = []

    # product() over zero value lists yields a single empty combination, so an
    # empty grid degrades to one run with defaults instead of raising.
    for combo in itertools.product(*(param_grid[key] for key in keys)):
        params = dict(zip(keys, combo))
        print(f"Running {vectorizer_class.__name__} with params: {params}")

        # Forward *all* grid keys; previously only max_features and
        # ngram_range reached the vectorizer and any other key was silently
        # ignored. English stop-word removal stays the default but can be
        # overridden from the grid.
        vectorizer = vectorizer_class(**{"stop_words": "english", **params})

        # Fit on both corpora so resumes and jobs share one vocabulary.
        vectorizer.fit(combined_text)

        resume_vecs = vectorizer.transform(resumes_text_clean)
        job_vecs = vectorizer.transform(jobs_text_clean)

        # Rows are resumes, columns are jobs.
        sim_matrix = cosine_similarity(resume_vecs, job_vecs)

        # Sort each row in place (the matrix is not needed afterwards), then
        # average the last five columns, i.e. each resume's best matches.
        sim_matrix.sort(axis=1)
        top5_scores = sim_matrix[:, -5:].mean(axis=1)

        results.append({
            "mean_top5_similarity": top5_scores.mean(),
            **params,
            "vectorizer": vectorizer_class.__name__,
        })

    return pd.DataFrame(results)

In [29]:
# TF-IDF sweep: evaluate every param_grid combination on resumes vs. job postings.
results_tfidf = grid_search_vectorizer(
    resumes_text=resumes_df["Cleaned_Resume"],
    jobs_text=jobs_df["job_text"],
    param_grid=param_grid,
    vectorizer_class=TfidfVectorizer,
)

Running TfidfVectorizer with params: {'max_features': 5000, 'ngram_range': (1, 1)}
Running TfidfVectorizer with params: {'max_features': 5000, 'ngram_range': (1, 2)}
Running TfidfVectorizer with params: {'max_features': 10000, 'ngram_range': (1, 1)}
Running TfidfVectorizer with params: {'max_features': 10000, 'ngram_range': (1, 2)}


In [30]:
# Raw term-count sweep over the same grid, for comparison with TF-IDF.
results_count = grid_search_vectorizer(
    resumes_text=resumes_df["Cleaned_Resume"],
    jobs_text=jobs_df["job_text"],
    param_grid=param_grid,
    vectorizer_class=CountVectorizer,
)

Running CountVectorizer with params: {'max_features': 5000, 'ngram_range': (1, 1)}
Running CountVectorizer with params: {'max_features': 5000, 'ngram_range': (1, 2)}
Running CountVectorizer with params: {'max_features': 10000, 'ngram_range': (1, 1)}
Running CountVectorizer with params: {'max_features': 10000, 'ngram_range': (1, 2)}


In [31]:
# Stack the TF-IDF and count results into one table. ignore_index avoids the
# duplicated 0-3 row labels a plain concat inherits from the two inputs.
final_results = pd.concat([results_tfidf, results_count], ignore_index=True)

# Rank configurations from highest to lowest score and relabel rows by rank,
# so label 0 is always the best configuration.
# NOTE(review): the score is computed without ground-truth matches, so a higher
# value is not by itself evidence of better recommendations — confirm.
final_results = (
    final_results
    .sort_values("mean_top5_similarity", ascending=False)
    .reset_index(drop=True)
)
final_results

Unnamed: 0,mean_top5_similarity,max_features,ngram_range,vectorizer
0,0.437074,5000,"(1, 1)",CountVectorizer
1,0.428885,5000,"(1, 2)",CountVectorizer
2,0.424855,10000,"(1, 1)",CountVectorizer
3,0.411273,10000,"(1, 2)",CountVectorizer
0,0.364816,5000,"(1, 1)",TfidfVectorizer
1,0.358758,5000,"(1, 2)",TfidfVectorizer
2,0.334416,10000,"(1, 1)",TfidfVectorizer
3,0.323585,10000,"(1, 2)",TfidfVectorizer


In [32]:
# final_results is sorted by descending score, so its first row is the winner.
best_row = final_results.iloc[0]
print("Best configuration found:", best_row, sep="\n")

Best configuration found:
mean_top5_similarity           0.437074
max_features                       5000
ngram_range                      (1, 1)
vectorizer              CountVectorizer
Name: 0, dtype: object


In [33]:
# Sentence-embedding baseline: embed resumes and job postings with a
# pretrained SentenceTransformer and score them with the same mean top-5
# cosine similarity used for the bag-of-words vectorizers.
# SentenceTransformer, cosine_similarity and numpy are provided by the
# notebook's import cell, so nothing is re-imported here.

# Load BERT model
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare text (replace NaNs!) as plain Python lists of strings
resumes_text_clean = resumes_df["Cleaned_Resume"].fillna("").tolist()
jobs_text_clean = jobs_df["job_text"].fillna("").tolist()

# NOTE(review): sentence-transformer models truncate inputs longer than their
# max_seq_length, so long resumes / descriptions are presumably only partly
# embedded — confirm via bert_model.max_seq_length.

# Compute embeddings
print("Encoding resumes...")
resume_embeddings = bert_model.encode(resumes_text_clean, show_progress_bar=True, batch_size=64)

print("Encoding jobs...")
job_embeddings = bert_model.encode(jobs_text_clean, show_progress_bar=True, batch_size=64)

# Compute cosine similarity (rows = resumes, columns = jobs)
similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings)

# Calculate mean top-5 similarity: average each resume's five highest job
# similarities, then average across resumes. Rows are sorted on a copy so
# similarity_matrix keeps its original job order.
top5_scores = [np.mean(np.sort(row)[-5:]) for row in similarity_matrix]

mean_top5_similarity = np.mean(top5_scores)

print("Mean top-5 similarity (BERT):", mean_top5_similarity)

# Save result in a dataframe with the same columns as the grid-search results;
# the vectorizer-only hyper-parameters do not apply here and are left as None.
bert_result_df = pd.DataFrame([{
    "mean_top5_similarity": mean_top5_similarity,
    "max_features": None,
    "ngram_range": None,
    "vectorizer": "BERT"
}])

Encoding resumes...


Batches:   0%|          | 0/39 [00:00<?, ?it/s]

Encoding jobs...


Batches:   0%|          | 0/1936 [00:00<?, ?it/s]

Mean top-5 similarity (BERT): 0.66518426
