# Imports and Setup

In [17]:
import pandas as pd
import numpy as np
from pathlib import Path

from sentence_transformers import SentenceTransformer, util
import spacy

from sklearn.metrics import jaccard_score
from sklearn.preprocessing import MultiLabelBinarizer

import tqdm
import transformers.utils.logging

In [18]:
# Paths
DATA_DIR = Path('../data')
LABELED_PATH = DATA_DIR / "postings_cleaned_w_skills.csv"
MODEL_DIR = Path('../models')
NER_MODEL_DIR = MODEL_DIR / "ner_skill_model"
EMBEDDING_MODEL_PATH = MODEL_DIR / "job_description_embeddings.npy"

# Load data and models

In [19]:
postings = pd.read_csv(LABELED_PATH)
postings.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,level_principal,level_vice_president,level_chief,level_intern,level_junior,original_listed_time_ts,listed_time_ts,closed_time_ts,time_open,extracted_skills
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,False,False,False,False,False,2024-04-17 23:45:08,2024-04-17 23:45:08,,,"['adobe', 'microsoft office']"
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,False,False,False,False,False,2024-04-11 17:51:27,2024-04-11 17:51:27,,,['communication']
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,False,False,False,False,False,2024-04-16 14:26:54,2024-04-16 14:26:54,,,[]
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,False,False,False,False,False,2024-04-12 04:23:32,2024-04-12 04:23:32,,,"['communication', 'problem-solving']"
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,False,False,False,False,False,2024-04-18 14:52:23,2024-04-18 14:52:23,,,[]


In [20]:
nlp = spacy.load(NER_MODEL_DIR)

In [21]:
transformers.utils.logging.disable_progress_bar()
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# Generate/load embeddings from job description text


In [5]:
#job_embeddings = model.encode(postings['description'], show_progress_bar=True, convert_to_numpy=True)

In [6]:
#np.save("models/job_description_embeddings.npy", embeddings)

In [22]:
job_embeddings = np.load(EMBEDDING_MODEL_PATH)

# Parse new text and compute similarity

In [23]:
def extract_skills(nlp, text):
    doc = nlp(text)
    skills = [ent.text for ent in doc.ents if ent.label_ == "SKILL"]
    return list(set(skills))


def compute_skills_similarity(user_skills, job_skills):
    # jaccard similarity
    mlb = MultiLabelBinarizer()
    all_skills = user_skills + [s for sublist in job_skills for s in sublist]
    mlb.fit([[s] for s in set(all_skills)])

    user_bin = mlb.transform([user_skills])
    jobs_bin = mlb.transform(job_skills)

    return np.array([
        jaccard_score(user_bin[0], jb, average="binary") for jb in jobs_bin
    ])

def compute_description_similarity(user_vec, job_vecs):
    # cosine similarity
    
    return util.cos_sim(user_vec, job_vecs)[0].cpu().numpy()

def hybrid_recommendations(df, user_vec, job_vecs, user_skills, weight_description=0.7, weight_skills=0.3):
    #ccombine cosine and Jaccard similarities

    description_similarity = compute_description_similarity(user_vec, job_vecs)
    skills_similarity = compute_skills_similarity(user_skills, df["extracted_skills"].tolist())

    score = weight_description * description_similarity + weight_skills * skills_similarity
    df = df[['company_name', 'title', 'location', 'extracted_skills']].copy()
    df["match_score"] = score
    df = df.sort_values(by="match_score", ascending=False).head(10)
    return df


# Test it out!

In [24]:
# DS Intern/Analyst
test_resume_1 = """
Work Experience
Marketing Science Company - Market Research Analyst Intern
April 2019 - March 2020 Pittsburgh, PA
● Received, cleaned, and prepped data from our client using SAS, SQL, and Excel to help data
scientists build marketing mix models that resulted in a lift in ROI of 6 basis points
● Developed a program in SAS that automated refinement of linear regression models for
specific segments of a customer base that saves 25 hours of labor each month
Projects
Fantasy Football Modeling
● Aggregated and prepped 5 years of NFL fantasy football projection data from 6 independent 
sources into a MySQL database
● Built a random forest model in SAS that combined the disparate sources into one
projection that outperformed the mean absolute error of the next best projection by 18%
Movie Recommendation Engine
● Aggregated data from imdb and rotten tomatoes and used k-nearest-neighbors in SAS to
build a better movie recommendation system
● Saves an average of 18 minutes on movie selection relative to prior methodology
"""

In [25]:
user_background_embedding = embedding_model.encode([test_resume_1], show_progress_bar=False, convert_to_numpy=True)
user_background_skill_lst = extract_skills(nlp, test_resume_1)

results = hybrid_recommendations(postings, user_background_embedding, job_embeddings, user_background_skill_lst)
results[['company_name', 'title', 'location', 'match_score']].head()

Unnamed: 0,company_name,title,location,match_score
68229,AAA Club Alliance,Membership Intelligence Analyst,"Wilmington, DE",0.443993
33636,Swish Analytics,NHL Data Scientist,United States,0.439339
6007,Blend,"Senior MMM Analyst, Data Science","Columbia, MD",0.436499
938,Quigley-Simpson,"Senior Analyst, Data & Analytics",Los Angeles Metropolitan Area,0.433112
64752,Mediahub Worldwide,"Senior Analyst, Data Science","Boston, MA",0.426827


In [26]:
# mid-level bioscientist
test_resume_2 = """
Core Competencies
Molecular & Cell Biology Techniques: PCR, qPCR, Western blot, flow cytometry, CRISPR/Cas9
Data Analysis: R, Python, GraphPad Prism
Laboratory Management: SOP development, training, compliance, inventory management
Scientific Communication: Manuscript writing, conference presentations, grant support
Project Leadership: Experimental planning, cross-functional team coordination, reproducibility

Professional Experience
Senior Research Associate – Molecular Biology
XYZ Biotech, Boston, MA | 2021 – Present
* Designed and executed experiments to validate novel gene targets for immunotherapy applications.
* Managed a small team of junior scientists and interns, providing training and supervision in molecular assays.
* Developed and optimized CRISPR-based screening workflows, reducing assay time by 30%.
* Authored and co-authored 3 publications in peer-reviewed journals and presented findings at national conferences.
* Maintained lab compliance with safety and quality standards, including GLP documentation.

Research Scientist II
ABC Pharmaceuticals, Cambridge, MA | 2018 – 2021
* Conducted mechanistic studies on signaling pathways in cancer cell lines, integrating data from RNA-seq and proteomics.
* Led collaborative projects with bioinformatics and clinical teams, contributing to drug target validation.
* Standardized cell-based assays across multiple research projects, improving reproducibility and throughput.
* Assisted in grant writing and reporting, supporting successful funding for multiple projects.

Postdoctoral Research Fellow
University of Massachusetts, Worcester, MA | 2016 – 2018
* Investigated molecular mechanisms of gene regulation in stem cells.
* Applied next-generation sequencing and epigenetic profiling to identify novel regulatory elements.
* Mentored undergraduate and graduate students in experimental techniques and data interpretation.
* Published 2 first-author papers in high-impact journals.
"""

In [27]:
user_background_embedding = embedding_model.encode([test_resume_2], show_progress_bar=False, convert_to_numpy=True)
user_background_skill_lst = extract_skills(nlp, test_resume_2)

results = hybrid_recommendations(postings, user_background_embedding, job_embeddings, user_background_skill_lst)
results[['company_name', 'title', 'location', 'match_score']].head()

Unnamed: 0,company_name,title,location,match_score
6413,ElevateBio,"Senior Scientist, Technology Development","Durham, NC",0.462519
2217,Stanford University,Life Science Research Professional 1,"Stanford, CA",0.460338
85893,Coriell Institute for Medical Research,Laboratory Technician,"Camden, NJ",0.459025
39767,Life Edit Therapeutics,"Senior Associate Scientist, Lead Development (...","Durham, NC",0.455495
65779,ElevateBio,"Scientist II, Lead Development","Durham, NC",0.451867
