In [2]:
import pandas as pd
import numpy as np
import json

import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English

import warnings 
warnings.filterwarnings('ignore')

In [3]:
jobs_df = pd.read_json("../data/linkedin_jobs_scraped.json", convert_dates=['posted_date','scraping_date'])
jobs_df.head()

Unnamed: 0,Job_ID,Job_txt,company,job-title,level,location,posted-time-ago,nb_candidats,scraping_date,posted_date,skills,match_score,missing_skills
8,4120976805,"Junior C++ Developer Luxoft Bulgaria Sofia, So...",Luxoft Bulgaria,Junior C++ Developer,Associate,"Sofia, Sofia City, Bulgaria",4 weeks ago,29.0,2025-02-06,2025-01-09,"[finance, business, continuous integration, li...",50.0,"finance,continuous integration,software,linux,..."
30,4120978886,"Junior/Regular C++ Developer Luxoft Sofia, Sof...",Luxoft,Junior/Regular C++ Developer,Mid-Senior level,"Sofia, Sofia City, Bulgaria",4 weeks ago,,2025-02-06,2025-01-09,"[finance, business, continuous integration, li...",46.2,"finance,kubernetes,continuous integration,lang..."
26,4144431991,"Back End Developer, Algorithms/Data Structures...",Owen Thomas | Pending B Corp™,"Back End Developer, Algorithms/Data Structures...",Mid-Senior level,European Union,,53.0,2025-02-06,NaT,"[blockchain, prometheus, go, mongodb, algorith...",43.8,"sentry,kubernetes,go,api,complex system,mongod..."
9,4132048865,Junior Developer - SAP Next Talent Program SAP...,SAP,Junior Developer - SAP Next Talent Program,Not Applicable,"Sofia, Sofia City, Bulgaria",2 weeks ago,56.0,2025-02-06,2025-01-23,"[spring boot, html, database, testing, design,...",43.5,"analytics,go,spring boot,database,html,javascr..."
7,3935983004,"Junior .NET Developer Accedia Sofia, Sofia Cit...",Accedia,Junior .NET Developer,Entry level,"Sofia, Sofia City, Bulgaria",9 months ago,196.0,2025-02-06,2024-05-12,"[sql, html, database, documentation, testing, ...",41.7,"database,software,html,documentation,.net,java..."


In [4]:
!pip install openai > /dev/null 2>&1

In [5]:
pip install transformers > /dev/null 2>&1

Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install torch > /dev/null 2>&1


##  BART Model

- Explicitly trained for classification tasks.

- BART does direct classification and provides scores for each skill independently. It’s optimized for this kind of multi-label classification task.

In [7]:
import pandas as pd
from transformers import pipeline

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Function to compute match score
def get_match_score(job_desc, candidate_skills):
    labels = candidate_skills.split(", ")  # Convert skills into a list
    result = classifier(job_desc, labels)
    
    # Get the highest score among the matched skills
    match_score = max(result["scores"]) * 100  # Convert to percentage
    return round(match_score, 2)  # Round to 2 decimal places



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps:0


In [8]:
skills_from_text = "flask,Scikit-learn,Apache Spark,analytics,support,databases,machine learning,data visualization,\
mysql,data analysis,tensorflow,database,data science,java,python,computer science,snowflake,engineering,information management,docker,linux"

skills_from_text = skills_from_text.lower()
skills_from_text

'flask,scikit-learn,apache spark,analytics,support,databases,machine learning,data visualization,mysql,data analysis,tensorflow,database,data science,java,python,computer science,snowflake,engineering,information management,docker,linux'

In [9]:
jobs_df["model_score"] = jobs_df["Job_txt"].apply(lambda x: get_match_score(x, skills_from_text))

In [10]:
jobs_df.head()

Unnamed: 0,Job_ID,Job_txt,company,job-title,level,location,posted-time-ago,nb_candidats,scraping_date,posted_date,skills,match_score,missing_skills,model_score
8,4120976805,"Junior C++ Developer Luxoft Bulgaria Sofia, So...",Luxoft Bulgaria,Junior C++ Developer,Associate,"Sofia, Sofia City, Bulgaria",4 weeks ago,29.0,2025-02-06,2025-01-09,"[finance, business, continuous integration, li...",50.0,"finance,continuous integration,software,linux,...",98.65
30,4120978886,"Junior/Regular C++ Developer Luxoft Sofia, Sof...",Luxoft,Junior/Regular C++ Developer,Mid-Senior level,"Sofia, Sofia City, Bulgaria",4 weeks ago,,2025-02-06,2025-01-09,"[finance, business, continuous integration, li...",46.2,"finance,kubernetes,continuous integration,lang...",98.76
26,4144431991,"Back End Developer, Algorithms/Data Structures...",Owen Thomas | Pending B Corp™,"Back End Developer, Algorithms/Data Structures...",Mid-Senior level,European Union,,53.0,2025-02-06,NaT,"[blockchain, prometheus, go, mongodb, algorith...",43.8,"sentry,kubernetes,go,api,complex system,mongod...",97.4
9,4132048865,Junior Developer - SAP Next Talent Program SAP...,SAP,Junior Developer - SAP Next Talent Program,Not Applicable,"Sofia, Sofia City, Bulgaria",2 weeks ago,56.0,2025-02-06,2025-01-23,"[spring boot, html, database, testing, design,...",43.5,"analytics,go,spring boot,database,html,javascr...",86.56
7,3935983004,"Junior .NET Developer Accedia Sofia, Sofia Cit...",Accedia,Junior .NET Developer,Entry level,"Sofia, Sofia City, Bulgaria",9 months ago,196.0,2025-02-06,2024-05-12,"[sql, html, database, documentation, testing, ...",41.7,"database,software,html,documentation,.net,java...",67.24


In [11]:
!pip install sentence-transformers > /dev/null 2>&1


##  MiniLM model

- A semantic similarity model, designed to measure how similar two pieces of text are

- MiniLM calculates overall similarity between two pieces of text, and doesn’t focus on individual labels, which makes it less suited for fine-grained classification tasks like skill-job matching.

In [12]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd

minilm_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Function to get similarity score using MiniLM
def get_minilm_similarity(job_desc, candidate_skills):
    skills_text = ", ".join(candidate_skills.split(", "))  # Convert list to text
    job_embedding = minilm_model.encode(job_desc, convert_to_tensor=True)
    skills_embedding = minilm_model.encode(skills_text, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(job_embedding, skills_embedding).item()
    return similarity * 100  # Convert to percentage


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
jobs_df["MiniLM_Score"] = jobs_df.apply(lambda row: get_minilm_similarity(row["Job_txt"], skills_from_text), axis=1)

In [14]:
jobs_df.head()

Unnamed: 0,Job_ID,Job_txt,company,job-title,level,location,posted-time-ago,nb_candidats,scraping_date,posted_date,skills,match_score,missing_skills,model_score,MiniLM_Score
8,4120976805,"Junior C++ Developer Luxoft Bulgaria Sofia, So...",Luxoft Bulgaria,Junior C++ Developer,Associate,"Sofia, Sofia City, Bulgaria",4 weeks ago,29.0,2025-02-06,2025-01-09,"[finance, business, continuous integration, li...",50.0,"finance,continuous integration,software,linux,...",98.65,31.552833
30,4120978886,"Junior/Regular C++ Developer Luxoft Sofia, Sof...",Luxoft,Junior/Regular C++ Developer,Mid-Senior level,"Sofia, Sofia City, Bulgaria",4 weeks ago,,2025-02-06,2025-01-09,"[finance, business, continuous integration, li...",46.2,"finance,kubernetes,continuous integration,lang...",98.76,27.527219
26,4144431991,"Back End Developer, Algorithms/Data Structures...",Owen Thomas | Pending B Corp™,"Back End Developer, Algorithms/Data Structures...",Mid-Senior level,European Union,,53.0,2025-02-06,NaT,"[blockchain, prometheus, go, mongodb, algorith...",43.8,"sentry,kubernetes,go,api,complex system,mongod...",97.4,19.979054
9,4132048865,Junior Developer - SAP Next Talent Program SAP...,SAP,Junior Developer - SAP Next Talent Program,Not Applicable,"Sofia, Sofia City, Bulgaria",2 weeks ago,56.0,2025-02-06,2025-01-23,"[spring boot, html, database, testing, design,...",43.5,"analytics,go,spring boot,database,html,javascr...",86.56,21.257222
7,3935983004,"Junior .NET Developer Accedia Sofia, Sofia Cit...",Accedia,Junior .NET Developer,Entry level,"Sofia, Sofia City, Bulgaria",9 months ago,196.0,2025-02-06,2024-05-12,"[sql, html, database, documentation, testing, ...",41.7,"database,software,html,documentation,.net,java...",67.24,21.083459
