In [7]:
import os
import re
import time

import h5py
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import sent_tokenize
from nltk import ngrams
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Transformers and related libraries
import transformers
from transformers import pipeline, AutoTokenizer, AutoModel

nltk.download('punkt')

2024-01-05 14:47:11.487111: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /Users/koechian/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Datasets

- The official ESCO dataset was used as a framework for skill labelling. This ensured consistency in skill labelling and removed any overlapping fields.
- A dataset that contained sentences that appear in job descriptions that were labellled with accompanying skills was also used.

In [22]:
# jobs_train_df = pd.read_csv('../datasets/indeed-dataset.csv')
# jobs_test_df = pd.read_csv('../datasets/tech_validation_annotations.csv')
# jobs_df = pd.concat([jobs_train_df, jobs_test_df])
# jobs_df = jobs_df[['sentence','label']]

jobs_df = pd.read_csv('../datasets/indeed-dataset.csv')
jobs_df.drop_duplicates(subset=['Job Description'], keep='first', inplace=True)


print(f'The Indeed Dataset has {jobs_df.shape[0]} rows and two cols')

The Indeed Dataset has 24714 rows and two cols


In [23]:
jobs_df.head(10)

Unnamed: 0,Job Title,Job Description,Job Type,Categories,Location,City,State,Country,Zip Code,Address,...,Employer Phone,Employer Logo,Companydescription,Employer Location,Employer City,Employer State,Employer Country,Employer Zip Code,Uniq Id,Crawl Timestamp
0,Shift Manager,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Mission Hills, CA 91345",Mission Hills,CA,United States,91345.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_squar...,Del Taco is an American quick service restaura...,"Mission Hills, CA 91345",Mission Hills,CA,United States,91345.0,511f9a53920f4641d701d51d3589349f,2019-08-24 09:13:18 +0000
1,Operations Support Manager,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Atlanta, GA 30342",Atlanta,GA,United States,30342.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,"Based in Atlanta, FOCUS Brands Inc. is an inno...",,,,United States,,4955daf0a3facbe2acb6c429ba394e6d,2019-09-19 08:16:55 +0000
2,Senior Product Manager - Data,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Chicago, IL",Chicago,IL,United States,,,...,,,Vibes Corp. reputation was built and establish...,,,,United States,,a0e0d12df1571962b785f17f43ceae12,2019-09-18 02:13:10 +0000
3,Part-Time Office Concierge,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Festus, MO",Festus,MO,United States,,,...,,,,,,,United States,,56e411fd731f76ac916bf4fb169250e9,2019-10-24 16:39:13 +0000
4,Print & Marketing Associate,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Cedar Rapids, IA 52404",Cedar Rapids,IA,United States,52404.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,"Staples is The Worklife Fulfillment Company, h...","Cedar Rapids, IA 52404",Cedar Rapids,IA,United States,52404.0,3fff5c0ad6981bf4bff6260bd5feab63,2019-08-24 22:29:10 +0000
5,Cyber IT Risk & Strategy Senior Consultant,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Washington, DC 20003",Washington,DC,United States,20003.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,Think a career in professional services is not...,,,,United States,,3fcf91a3e406f0727fe30ee09e7910bf,2019-10-18 01:09:20 +0000
6,"Sales Associate, Retail Part Time","<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Eastanollee, GA 30538",Eastanollee,GA,United States,30538.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,GNC has been a leading source of health and we...,,,,United States,,9e23f19b5e9502a49ba97fd2e5b78906,2019-09-25 23:49:18 +0000
7,Home Lending Branch Manager-Spokane,"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Spokane, WA 99201",Spokane,WA,United States,99201.0,,...,,https://d2q79iu7y748jz.cloudfront.net/s/_logo/...,Today we have over 300 locations across the We...,,,,United States,,f570dac5fa316794e7460d6307c0be86,2019-10-24 12:23:37 +0000
8,Property Manager in Training (MIT),"<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Durham, NC",Durham,NC,United States,,,...,,,,,,,United States,,219550967b49d887ac6574d63b001d1b,2019-09-23 09:52:15 +0000
9,"Compliance Specialist, Marketing, Advertising ...","<div id=""jobDescriptionText"" class=""jobsearch-...",,,"Chicago, IL",Chicago,IL,United States,,,...,,,,,,,United States,,de5578ef740fbf9c6a65201bc5877306,2019-10-22 22:30:03 +0000


### Load and Prep the Esco Skills

In [24]:
esco_skills = pd.read_csv('../datasets/skills_en.csv')

# Remove "(text)" occurences
esco_skills['label_cleaned'] = esco_skills['preferredLabel'].apply(lambda x: re.sub(r'\([^)]*\)', '', x).strip())

# Count words in skills after cleaning
esco_skills['word_cnt'] = esco_skills['label_cleaned'].apply(lambda x: len(str(x).split()))
esco_df = pd.DataFrame(esco_skills, columns=['label_cleaned', 'altLabels', 'word_cnt'])

In [10]:
class EscoDataset(Dataset):
    def __init__(self, df, skill_col, backbone):
        texts = df
        self.tokenizer = AutoTokenizer.from_pretrained(backbone)
        self.texts = texts[skill_col].values.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        res = self.tokenizer(
            self.texts[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=20
        )
        return {k:v[0] for k,v in res.items()}

    
class ClsPool(nn.Module):
    def forward(self, x):
        # batch * num_tokens * num_embedding
        return x[:, 0, :]    

    
class BertModel(nn.Module):
    def __init__(self, backbone):
        super().__init__()
        
        self.backbone_name = backbone
        self.backbone = AutoModel.from_pretrained(backbone)
        self.pool = ClsPool()
    
    def forward(self, x):
        x = self.backbone(**x)["last_hidden_state"]
        x = self.pool(x)
        
        return x

### Loading the BERT Model

In [32]:
backbone = 'jjzha/jobbert-base-cased'
emb_label = 'jobbert'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and Dataloader
ds = EscoDataset(esco_df, 'label_cleaned', backbone)
dl = DataLoader(ds, shuffle=False, batch_size=32)

# Build custom model
model = BertModel(backbone)
model.eval()
model.to(device)

# Get embeddings for each skill
embs = []
with torch.no_grad():
    for i, x in enumerate(dl):
        x = {k:v.to(device) for k, v in x.items()}
        out = model(x)
        embs.extend(out.detach().cpu())
# Add them to the DataFrame
esco_df[emb_label] = embs

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at jjzha/jobbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the mo

1. Get Sentences -> Parses a HTML JD into sentences
2. Compute Simmilarity -> Returns the simmilarity between ESCO skills and the provided vector and returns the one with the most simmilarity

In [33]:
def get_sentences(job):
    """
    Given a raw html job description, parse it into sentences
    by using nltk's sentence tokenization + new line splitting
    """
    soup = BeautifulSoup(job, 'html.parser')
    # Found some ads using unicode bullet points
    for p in soup.find_all('p'):
        p.string = p.get_text().replace("•", "")
    text = soup.get_text()
    st = sent_tokenize(text)
    sentences = []
    for sent in st:
        sentences.extend([x for x in sent.split('\n') if x !=''])
    return sentences

def compute_similarity(vec, emb_type):
    """
    Compute vector similarity for a given vec and all the ESCO skills embeddings.
    If more embeddings were created, the type is specified by the input parameter.
    Return the ESCO skill id with max similarity
    """
    esco_embs = esco_df[emb_type]
    sims = []
    # Compute cosine similarities
    for i, esco_vec in enumerate(esco_embs):
        sims.append((i, cosine_similarity(vec, esco_vec.reshape(1, -1))))
    # Return max similarity and esco skill index
    idx, sim = max(sims, key=lambda x: x[1])
    return idx, sim.item()


def compute_similarity_opt(emb_vec, emb_type):
    """
    Compute vector similarity for a given vec and all the ESCO skills embeddings
    by constructing a matrix from ESCO embeddings to process it faster.
    Return the ESCO skill id with max similarity
    """
    esco_embs = [x for x in esco_df[emb_type]]
    esco_vectors = torch.stack(esco_embs)
    # Normalize the stacked embeddings and the input vector
    norm_esco_vectors = torch.nn.functional.normalize(esco_vectors, p=2, dim=1)
    norm_emb_vec = torch.nn.functional.normalize(emb_vec.T, p=2, dim=0)
    # Compute cosine similarities
    cos_similarities = torch.matmul(norm_esco_vectors, norm_emb_vec)
    # Return max similarity and esco skill index
    sim, idx = torch.max(cos_similarities, dim=0)
    return idx.item(), sim.item()

def compute_similarity_mat(emb_mat, emb_type):
    esco_embs = [x for x in esco_df[emb_type]]
    esco_vectors = torch.stack(esco_embs)
    emb_vectors = torch.stack(emb_mat)
    # Normalize the stacked embeddings and the input vectors
    norm_esco_vectors = torch.nn.functional.normalize(esco_vectors, p=2, dim=1)
    norm_emb_vecs = torch.nn.functional.normalize(emb_vectors.T, p=2, dim=0)
    # Compute cosine similarities
    cos_similarities = torch.matmul(norm_esco_vectors, norm_emb_vecs)
    # Return max similarity and esco skill index
    max_similarities, max_indices = torch.max(cos_similarities, dim=0)
    return max_indices.numpy(), max_similarities.numpy()

In [38]:
def get_embedding(x):
    x = tokenizer(x, return_tensors='pt')
    x = {k:v.to(device) for k, v in x.items()}
    return model(x).detach().cpu()

def process_sentence(sent):
    emb = get_embedding(sent)
    return compute_similarity_opt(emb, emb_label)


tokenizer = AutoTokenizer.from_pretrained(backbone)
model = BertModel(backbone)
model.to(device)
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at jjzha/jobbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the mo

BertModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [47]:
# Used in performance optimization and output example
job_sample = jobs_df.iloc[30]['Job Description']
threshold = .5

In [49]:
sentences = get_sentences(job_sample)
print(sentences)

['Would you like to be part of a team focused on increasing adoption of Mobile Labs solutions by developing strategic accounts with the Fortune 1000?', 'Mobile Labs employees help our customers solve the chaos of developing and testing multiple apps across multiple platforms, operating systems and device types by providing a private device cloud, either on premise or hosted.', 'Our enterprise direct sales team is focused specifically on large companies.', 'Therefore, our sale people need to possess the skills and experience required to sell into complex processes and organizations.', 'In addition, Mobile Labs is helping customers all over the world achieve their strategic digital transformation goals.', 'Being able to align our solution to these outcomes is also critical to success.', 'A key to our success is our focus on helping our customers win.', 'It is core to our culture & sales methodology.', 'We need great people to help us grow and develop.', 'If the below description sounds l

In [50]:
sim_start_time = time.time()
res = []
sentences = get_sentences(job_sample)
for sent in sentences:
    idx, sim = process_sentence(sent)
    if sim > threshold:
        res.append((sent, esco_df.iloc[idx]['label_cleaned'], sim))

sim_end_time = time.time()
execution_time = sim_end_time - sim_start_time
print(f"Execution time: {execution_time:.4f} seconds")

for r in res:
    print('=========================')
    print(f"sentence: {r[0]}\nESCO skill:{r[1]}\nSimilarity:{r[2]:.4f}") 

Execution time: 2.7882 seconds
sentence: Would you like to be part of a team focused on increasing adoption of Mobile Labs solutions by developing strategic accounts with the Fortune 1000?
ESCO skill:work with e-services available to clients
Similarity:0.7192
sentence: Mobile Labs employees help our customers solve the chaos of developing and testing multiple apps across multiple platforms, operating systems and device types by providing a private device cloud, either on premise or hosted.
ESCO skill:implement a virtual private network
Similarity:0.7466
sentence: Our enterprise direct sales team is focused specifically on large companies.
ESCO skill:develop online sales business plan
Similarity:0.7476
sentence: Therefore, our sale people need to possess the skills and experience required to sell into complex processes and organizations.
ESCO skill:demonstrate intercultural competences in hospitality services
Similarity:0.6923
sentence: In addition, Mobile Labs is helping customers all 

In [51]:
sentences = get_sentences(job_sample)

sim_start_time = time.time()
sent_embs = []

for sent in sentences:
    x = tokenizer(sent, return_tensors='pt')
    x = {k:v.to(device) for k, v in x.items()}
    emb = model(x).detach().cpu()
    sent_embs.append(emb.squeeze())
idxs, sims = compute_similarity_mat(sent_embs, emb_label)
# Calculate job description processing time
sim_end_time = time.time()
execution_time = sim_end_time - sim_start_time
print(f"Execution time: {execution_time:.4f} seconds")

for r in res:
    print('=========================')
    print(f"sentence: {r[0]}\nESCO skill:{r[1]}\nSimilarity:{r[2]:.4f}") 

Execution time: 1.2839 seconds
sentence: Would you like to be part of a team focused on increasing adoption of Mobile Labs solutions by developing strategic accounts with the Fortune 1000?
ESCO skill:work with e-services available to clients
Similarity:0.7192
sentence: Mobile Labs employees help our customers solve the chaos of developing and testing multiple apps across multiple platforms, operating systems and device types by providing a private device cloud, either on premise or hosted.
ESCO skill:implement a virtual private network
Similarity:0.7466
sentence: Our enterprise direct sales team is focused specifically on large companies.
ESCO skill:develop online sales business plan
Similarity:0.7476
sentence: Therefore, our sale people need to possess the skills and experience required to sell into complex processes and organizations.
ESCO skill:demonstrate intercultural competences in hospitality services
Similarity:0.6923
sentence: In addition, Mobile Labs is helping customers all 

In [54]:
def get_classifiers(mtype):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    if mtype == "jobbert":
        token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first", device='cpu')
        token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first", device=device)
    elif mtype == "xlmr":        
        token_skill_classifier = pipeline(model="jjzha/escoxlmr_skill_extraction", aggregation_strategy="first", device=device)
        token_knowledge_classifier = pipeline(model="jjzha/escoxlmr_knowledge_extraction", aggregation_strategy="first", device=device)
    else:
        raise Exception("Unknown model name provided")
    return token_skill_classifier, token_knowledge_classifier


def extract_skills(job, token_skill_classifier, token_knowledge_classifier, out_treshold=.8, sim_threshold=.8):
    """
    Function that processes outputs from pre-trained, ready to use models
    that detect skills as a token classification task. There are two thresholds,
    out_threshold for filtering model outputs and sim_threshold for filtering
    based on vector similarity with ESCO skills
    """     
    sentences = get_sentences(job)
    pred_labels = []
    res = []
    skill_embs = []
    skill_texts = []
    for sent in sentences:
        skills = ner(sent, token_skill_classifier, token_knowledge_classifier)
        for entity in skills['entities']:
            text = entity['word']
            if entity['score'] > out_treshold:
                skill_embs.append(get_embedding(text).squeeze())
                skill_texts.append(text)
                
    idxs, sims = compute_similarity_mat(skill_embs, emb_label)
    for i in range(len(idxs)):
        if sims[i] > sim_threshold:
            pred_labels.append(idxs[i])
            res.append((skill_texts[i], esco_df.iloc[idxs[i]]['label_cleaned'], sims[i]))
    return pred_labels, res


def aggregate_span(results):
    new_results = []
    current_result = results[0]

    for result in results[1:]:
        if result["start"] == current_result["end"] + 1:
            current_result["word"] += " " + result["word"]
            current_result["end"] = result["end"]
        else:
            new_results.append(current_result)
            current_result = result

    new_results.append(current_result)

    return new_results


def ner(text, token_skill_classifier, token_knowledge_classifier):
    output_skills = token_skill_classifier(text)
    for result in output_skills:
        if result.get("entity_group"):
            result["entity"] = "Skill"
            del result["entity_group"]

    output_knowledge = token_knowledge_classifier(text)
    for result in output_knowledge:
        if result.get("entity_group"):
            result["entity"] = "Knowledge"
            del result["entity_group"]

    if len(output_skills) > 0:
        output_skills = aggregate_span(output_skills)
    if len(output_knowledge) > 0:
        output_knowledge = aggregate_span(output_knowledge)
    
    skills = []
    skills.extend(output_skills)
    skills.extend(output_knowledge)
    return {"text": text, "entities": skills}

In [55]:
tsc, tkc = get_classifiers("jobbert")

start_time = time.time()
_, res = extract_skills(job_sample, tsc, tkc)
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.4f} seconds")
for r in res:
    print('=========================')
    print(f"text: {r[0]}\nESCO skill:{r[1]}\nSimilarity:{r[2]:.4f}")

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

TypeError: '<' not supported between instances of 'str' and 'int'