In [None]:
# dependencies
import time
import os
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import asyncio
import nest_asyncio
import httpx
from openai import AsyncOpenAI
from google.colab import drive, userdata

# async fix for notebook: nest_asyncio allows nested use of the event loop,
# which this notebook relies on when awaiting coroutines from a cell
nest_asyncio.apply()

# file management: mount Google Drive (Colab) and point WORK_DIR at the project folder on it
drive.mount('/content/drive')
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'

# work dir shortcut function
def work_dir(*args):
    """Return the path obtained by joining WORK_DIR with the given path components."""
    parts = (WORK_DIR,) + args
    return os.path.join(*parts)

# set computation device: GPU when CUDA is available, otherwise CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hugging Face model id used for both the tokenizer and the base encoder
BASE_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
# checkpoint file from which the embedder and predictor state dicts are loaded further down
MODEL_STATE_PATH = work_dir('experiments', 'untuned.pth')

Mounted at /content/drive


In [None]:
# load the English ESCO skills table for reference
esco_en = (
    pd.read_csv(work_dir('ESCO', 'ESCO dataset - v1.1.2 - classification - en - csv', 'skills_en.csv'))
    # convenience: shorter name for the label column
    .rename(columns={'preferredLabel': 'label'})
    # fixed row order with a fresh 0..n-1 index
    .sort_values('conceptUri')
    .reset_index(drop=True)
    # keep only the columns used in this notebook
    .loc[:, ['conceptUri', 'label', 'description']]
)

# check
esco_en

Unnamed: 0,conceptUri,label,description
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,Assign and manage staff tasks in areas such as...
1,http://data.europa.eu/esco/skill/00064735-8fad...,supervise correctional procedures,Supervise the operations of a correctional fac...
2,http://data.europa.eu/esco/skill/000709ed-2be5...,apply anti-oppressive practices,"Identify oppression in societies, economies, c..."
3,http://data.europa.eu/esco/skill/0007bdc2-dd15...,control compliance of railway vehicles regulat...,"Inspect rolling stock, components and systems ..."
4,http://data.europa.eu/esco/skill/00090cc1-1f27...,identify available services,Identify the different services available for ...
...,...,...,...
13891,http://data.europa.eu/esco/skill/ffef5eb3-a15e...,remediate healthcare user's occupational perfo...,"Remediate or restore the cognitive, sensorimot..."
13892,http://data.europa.eu/esco/skill/fff0b074-5a76...,install transport equipment lighting,Install lighting elements in transport equipme...
13893,http://data.europa.eu/esco/skill/fff0e2cd-d0bd...,natural language processing,The technologies which enable ICT devices to u...
13894,http://data.europa.eu/esco/skill/fff5bc45-b506...,coordinate construction activities,Coordinate the activities of several construct...


In [None]:
# load raw sentences from crawled Danish job ads (ignore null)
# NOTE(review): no null filtering happens in this cell — presumably nulls were already
# removed when real_sentences.csv was produced; confirm before relying on it.
real_sentences = pd.read_csv(work_dir('Data', 'real_sentences.csv'))

# check: table shape and the first three rows
print(real_sentences.shape)
real_sentences.head(3)

(9220716, 3)


Unnamed: 0,id,sentence,frequency
0,1,Advokat til afdeling for fast ejendom og entre...,1
1,2,Til et spændende og udfordrende job hos Haugaa...,1
2,3,Vores afdeling for fast ejendom og entreprise ...,1


# Tokenizer

In [None]:
# initialize the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# shortcut
def tokenize(sentences):
    """Tokenize sentences into PyTorch tensors, padded/truncated to 256 tokens each."""
    options = dict(padding='max_length', truncation=True, max_length=256, return_tensors='pt')
    return tokenizer(sentences, **options)

# Base Model

In [None]:
# initialize base model: load the pretrained BASE_MODEL encoder and move it to DEVICE
base_model = AutoModel.from_pretrained(BASE_MODEL).to(DEVICE)

# Embedder

In [None]:
# define embedder
class SkillEmbedder(nn.Module):
    """Sentence embedder: base model, attention-masked mean pooling, then dropout."""

    def __init__(self, base_model, dropout_rate=0.0):
        """Store the base model and create the dropout layer.

        Args:
            base_model: module called as ``base_model(input_ids, attention_mask)``
                whose result exposes ``last_hidden_state``.
            dropout_rate: probability passed to ``nn.Dropout`` (default 0.0).
        """
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask):
        """Embed a batch by averaging the hidden states of the unmasked tokens.

        Args:
            input_ids: batch of token ids, passed through to the base model.
            attention_mask: batch mask with 1 for real tokens and 0 for padding.

        Returns:
            One pooled vector per batch row, after dropout.
        """
        hidden = self.base_model(input_ids, attention_mask).last_hidden_state

        # zero out padded positions so they do not contribute to the sum
        summed = (hidden * attention_mask.unsqueeze(-1)).sum(dim=1)

        # clamp the token count so a fully masked row gives zeros instead of NaN (0 / 0);
        # rows with at least one real token are unaffected
        token_count = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)

        return self.dropout(summed / token_count)

# init embedder around the shared base model (default dropout_rate=0.0) and move it to DEVICE
embedder = SkillEmbedder(base_model=base_model).to(DEVICE)

# Predictor

In [None]:
# define predictor
class SkillPredictor(nn.Module):
    """Predict is_skill and skill_id scores from sentence embeddings.

    is_skill is read from the last ``is_skill_dim`` embedding dimensions;
    skill_id is the cosine similarity between the embedding and the stored
    proxy embeddings.
    """

    # accepted values for the `include` argument of forward ('all' is an alias of 'both')
    _INCLUDE_OPTIONS = ('both', 'all', 'is_skill', 'skill_id')

    def __init__(self, embedder, n_proxies):
        """Store the embedder and allocate the proxy embeddings.

        Args:
            embedder: module exposing ``base_model.config.hidden_size``.
            n_proxies: number of proxy rows to allocate.
        """
        super().__init__()

        self.embedder = embedder
        self.is_skill_dim = 8

        # proxies: (n_proxies, 1, hidden_size), half precision, zero-initialised and not
        # trainable; real values are expected to arrive via load_state_dict
        self.embeddings = nn.Parameter(torch.zeros(n_proxies,
                                                   1,
                                                   self.embedder.base_model.config.hidden_size,
                                                   dtype=torch.half),
                                       requires_grad=False)

    def forward(self, embeddings, include='both', logits=False):
        """Score a batch of embeddings.

        Args:
            embeddings: batch of sentence embeddings, one row per sentence.
            include: 'is_skill', 'skill_id', or 'both' / 'all' for a tuple of both.
            logits: if True return raw scores; otherwise sigmoid (is_skill) and
                softmax over proxies (skill_id) are applied.

        Returns:
            The requested tensor, or ``(is_skill, skill_id)`` for 'both' / 'all'.

        Raises:
            ValueError: if ``include`` is not one of the accepted values.
        """
        # fail early with a clear message instead of an UnboundLocalError at the final return
        if include not in self._INCLUDE_OPTIONS:
            raise ValueError(f'include must be one of {self._INCLUDE_OPTIONS}, got {include!r}')

        is_skill = None
        if include != 'skill_id':
            # mean of the trailing is_skill_dim dimensions acts as the is_skill logit
            is_skill = embeddings[:, -self.is_skill_dim:].mean(dim=-1)
            if not logits:
                is_skill = torch.sigmoid(is_skill)
            if include == 'is_skill':
                return is_skill

        # broadcast every embedding against every proxy; the size-1 proxy axis is reduced with max
        sims = F.cosine_similarity(embeddings.unsqueeze(1).unsqueeze(1),
                                   self.embeddings,
                                   dim=-1).max(dim=-1)[0]
        skill_id = sims if logits else F.softmax(sims, dim=-1)
        if include == 'skill_id':
            return skill_id

        return is_skill, skill_id

# init predictor with one proxy row per ESCO skill (rows of esco_en) and move it to DEVICE
predictor = SkillPredictor(embedder=embedder, n_proxies=len(esco_en)).to(DEVICE)

In [None]:
# load weights and update proxies
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects, so only
# load checkpoints from a trusted source.
state_dicts = torch.load(MODEL_STATE_PATH, weights_only=False, map_location=DEVICE)
embedder.load_state_dict(state_dicts['embedder_state_dict'])
# the predictor state includes its `embeddings` parameter, i.e. the proxies
predictor.load_state_dict(state_dicts['predictor_state_dict'])

<All keys matched successfully>

In [None]:
# Prompt sent to the LLM for every sentence. The annotation loop substitutes the
# placeholders [sentence] and [skills]. The reply is expected to start with 0, 1 or -1;
# after a 1, the following lines should be the labels of the matching suggestions.
prompt_template = """
The following sentence was scraped, parsed and extracted from a webpage presenting a real Danish job ad:

sentence = [sentence]

You are tasked with correctly classifying this sentence by following the below instructions carefully. The purpose is to extract and categorize skills, related to the advertised position, that are requested of the applicant, either implicitly or explicitly. You should consider skills in a broad sense, including knowledge, experience, education, and competences.

First, the sentence could be leftover parsing noise from the webpage unrelated to the job ad, such as uncleaned button text, a reference to another position, an incorrectly partitioned sentence, a heading from a sidebar, or similar. If this is the case, output 0 and nothing else, and end your task here.

Second, the sentence could be part of the job ad description, but unrelated to the skills (broad sense) requested of the applicant. It could for instance be related to the application process or the company history. If this is the case, output 0 and nothing else, and end your task here.

Third, the sentence could be ambiguous and indeterminate, making it impossible to decide whether a given skill (broad sense) is requested of the applicant or mentioned in a different context. If this is the case, output -1 and nothing else, and end your task here.

Fourth, the sentence could in fact be requesting a skill (broad sense) of the applicant, either implicitly by describing the job title, work areas, activities, tasks, or responsibilities, or explicitly by mentioning skills (broad sense) directly. If this is the case, output 1 followed by new line, and continue your task.

Fifth, in the following you are given 25 suggestions for skills (broad sense), denoted by their label and description, that could potentially match the skills (broad sense) requested of the applicant in the given sentence. If this is the case, output the label for each match, if any, followed by new line. Output nothing else. Your task is done. Thank you.

[skills]
""".strip()
# display the final template for a visual check
prompt_template

'The following sentence was scraped, parsed and extracted from a webpage presenting a real Danish job ad:\n\nsentence = [sentence]\n\nYou are tasked with correctly classifying this sentence by following the below instructions carefully. The purpose is to extract and categorize skills related to the advertised position that are requested of the applicant, either implicitly or explicitly. You should consider skills in a broad sense, including knowledge, experience, education, and competences.\n\nFirst, the sentence could be leftover parsing noise from the webpage unrelated to the job ad, such as uncleaned button text, a reference to another position, an incorrectly partitioned sentence, a heading from a sidebar, or similar. If this is the case, output 0 and nothing else, and end your task here.\n\nSecond, the sentence could be part of the job ad description, but unrelated to the skills (broad sense) requested of the applicant. It could for instance be related to the application process o

In [None]:
# openai client for async: the API key is read from the Colab secret named 'OpenAI',
# and requests go through an explicit httpx.AsyncClient
client = AsyncOpenAI(api_key=userdata.get('OpenAI'), http_client=httpx.AsyncClient())

In [None]:
# prompt to completion (api response)
async def get_completion(prompt, model='gpt-4o-mini'):
    """Send one user prompt to the chat completions API and return the raw response.

    Args:
        prompt: text sent as the single user message.
        model: name of the chat model to query; defaults to 'gpt-4o-mini'.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    response = await client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response

# prompts to completions (api responses)
async def get_completions(prompts):
    """Request a completion for every prompt concurrently; results keep the prompt order."""
    pending = []
    for prompt in prompts:
        pending.append(get_completion(prompt))
    responses = await asyncio.gather(*pending)
    return responses

In [None]:
# loop through n at a time (n should reasonably be 100, but could be 1 for testing)
n = 100

with torch.no_grad():
    for idx in range(0, len(real_sentences), n):

        # continue if continuing
        filename = os.path.join(WORK_DIR, 'Annotated_data', 'sentences_{}.csv'.format(idx))
        if os.path.exists(filename):
            continue

        # get sentences
        sentences = real_sentences.iloc[idx:idx+n]['sentence'].tolist()

        # predict
        embedder.eval()
        tokens = tokenize(sentences).to(DEVICE)
        embeddings = embedder(**tokens)
        is_skill, skill_id = predictor(embeddings)

        # get label and description for top predictions
        top = [esco_en.loc[esco_en.index.isin(i.tolist()), ['label','description']].values for i in skill_id.topk(25, dim=-1)[1].cpu().numpy()]

        # generate prompts
        prompts = [prompt_template.replace('[sentence]', sentence) \
                                   .replace('[skills]', '\n'.join([f'label = {label}, description = {description}' for j, (label, description) in enumerate(top[i])]))
                   for i, sentence in enumerate(sentences)]

        # go go GPT
        completions = await get_completions(prompts)
        content = [[s.strip() for s in c.choices[0].message.content.strip().split('\n')] for c in completions]
        is_skill = [s if (s := int(re.sub(r'.+?(-?\d+).+', r'\1', c[0].strip()))) in [1, 0] else -1 for c in content]
        concept_uri = [list({p.values[0] for s in c[1:] if len(p := esco_en.loc[esco_en['label'] == s.strip(), 'conceptUri'])}) if len(c) > 1 else [] for c in content]
        df = pd.DataFrame({
            'conceptUri': concept_uri,
            'sentence': sentences,
            'is_skill': is_skill
        }).explode('conceptUri')
        df.to_csv(filename, index=False)

# Postprocessing

In [None]:
# get real sentences: read the annotated batch files in ascending batch order
# (os.listdir returns names in arbitrary order, so sort to keep the row order reproducible)
annotated_dir = work_dir('Annotated_data')
batch_files = [s for s in os.listdir(annotated_dir) if re.match(r'^sentences\_[0-9]+\.csv$', s)]
batch_files.sort(key=lambda s: int(re.search(r'[0-9]+', s).group()))
reals = pd.concat([pd.read_csv(os.path.join(annotated_dir, s)) for s in batch_files])

# filter real sentences conservatively in relation to llm instructions:
# skills need is_skill == 1 AND a concept, nonskills need is_skill == 0 AND no concept;
# everything else (including is_skill == -1) is dropped
skills = reals[reals['conceptUri'].notna() & (reals['conceptUri'] != '') & (reals['is_skill'] == 1)]
nonskills = reals[(reals['conceptUri'].isna() | (reals['conceptUri'] == '')) & (reals['is_skill'] == 0)]

# recombine reals (and set na to '' for nonskills)
reals = pd.concat([skills, nonskills.assign(conceptUri='')], ignore_index=True)

# check
reals

Unnamed: 0,conceptUri,sentence,is_skill
0,http://data.europa.eu/esco/skill/e46f8bd4-bc32...,Du skal deltage i uddannelsen af de værnepligt...,1
1,http://data.europa.eu/esco/skill/d8a03465-8cc2...,Du skal deltage i uddannelsen af de værnepligt...,1
2,http://data.europa.eu/esco/skill/1f1d2ff8-c4c1...,Det er dig som selvstændigt kan føre og lede d...,1
3,http://data.europa.eu/esco/skill/5be1c5fb-3833...,Det er dig som selvstændigt kan føre og lede d...,1
4,http://data.europa.eu/esco/skill/0fc0cb61-f44d...,Det er dig som selvstændigt kan føre og lede d...,1
...,...,...,...
1044848,,"Udover god frokostordning, pension og IT-udsty...",0
1044849,,"Hvis vi må være dine nye kolleger, så ser jeg ...",0
1044850,,"Har du spørgsmål til jobbet inden da, kan du r...",0
1044851,,Sana.aiche@cph.dk,0


In [None]:
# regex patterns for url, email and phone number
url_pattern = r'\b(https?\:\/\/|www\.)\S+(?![\'\",])'
email_pattern = r'\b([a-z0-9\.\-\_]+)?@[a-z0-9\.\-\_]+\b'
phone_pattern = r'(?<!\w)(\((?:\+|00)[0-9]{2}\))?(?: ?[0-9]{2,} ?){4,}(?!\w)'

# replace every match, together with any adjacent non-word characters, by its placeholder
cleaned = reals['sentence']
for pattern, placeholder in ((url_pattern, '<URL>'), (email_pattern, '<EMAIL>'), (phone_pattern, '<PHONE>')):
    cleaned = cleaned.str.replace(r'\W*' + pattern + r'\W*', placeholder, regex=True, flags=re.IGNORECASE)

# collapse whitespace runs to a single space and trim both ends
reals['cleaned'] = cleaned.str.replace(r'\s+', ' ', regex=True, flags=re.IGNORECASE).str.strip()

# check
reals[['conceptUri', 'cleaned', 'is_skill']]

Unnamed: 0,conceptUri,cleaned,is_skill
0,http://data.europa.eu/esco/skill/e46f8bd4-bc32...,Du skal deltage i uddannelsen af de værnepligt...,1
1,http://data.europa.eu/esco/skill/d8a03465-8cc2...,Du skal deltage i uddannelsen af de værnepligt...,1
2,http://data.europa.eu/esco/skill/1f1d2ff8-c4c1...,Det er dig som selvstændigt kan føre og lede d...,1
3,http://data.europa.eu/esco/skill/5be1c5fb-3833...,Det er dig som selvstændigt kan føre og lede d...,1
4,http://data.europa.eu/esco/skill/0fc0cb61-f44d...,Det er dig som selvstændigt kan føre og lede d...,1
...,...,...,...
1044848,,"Udover god frokostordning, pension og IT-udsty...",0
1044849,,"Hvis vi må være dine nye kolleger, så ser jeg ...",0
1044850,,"Har du spørgsmål til jobbet inden da, kan du r...",0
1044851,,<EMAIL>,0


In [None]:
# number of rows whose cleaned text contains <URL>, <EMAIL>, <PHONE>
tuple(reals['cleaned'].str.contains(tag).sum() for tag in ('<URL>', '<EMAIL>', '<PHONE>'))

(6434, 11856, 22028)

In [None]:
# create stratified splits
# fixed seed so the same rows land in the same split on every run
SPLIT_SEED = 42

reals['split'] = 'train'

# test: sample 1% of the rows of every conceptUri group, then move every row that shares a
# sentence with a sampled row, so no sentence is split between train and test
test_idx = (reals[reals['split'] == 'train']
            .groupby('conceptUri')
            .sample(frac=0.01, random_state=SPLIT_SEED)
            .index)
reals.loc[test_idx, 'split'] = 'test'
reals.loc[reals['sentence'].isin(reals.loc[reals['split'] == 'test', 'sentence']), 'split'] = 'test'

# val: same procedure on what is still train (a different seed keeps the two draws independent)
val_idx = (reals[reals['split'] == 'train']
           .groupby('conceptUri')
           .sample(frac=0.01, random_state=SPLIT_SEED + 1)
           .index)
reals.loc[val_idx, 'split'] = 'val'
reals.loc[reals['sentence'].isin(reals.loc[reals['split'] == 'val', 'sentence']), 'split'] = 'val'

# check number of labels per split
reals['split'].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,992104
test,27148
val,25601


In [None]:
# average number of rows (labels) per distinct sentence in each split
by_split = reals.groupby('split')
by_split['conceptUri'].size() / by_split['sentence'].nunique()

Unnamed: 0_level_0,0
split,Unnamed: 1_level_1
test,2.81239
train,1.916128
val,2.738368


In [None]:
# proportion of sentences with no label per split
# (one row per sentence, so sentences with several labels are not counted several times;
#  no_label is True when conceptUri is the empty string)
(
    reals.drop_duplicates('sentence')
    .assign(no_label=lambda d: d['conceptUri'] == '')
    .groupby('split')['no_label']
    .mean()
)

Unnamed: 0_level_0,has_label
split,Unnamed: 1_level_1
test,0.088846
train,0.238301
val,0.093278


In [None]:
# save as one set: all splits go into a single CSV, distinguished by the 'split' column
reals.to_csv(work_dir('Data', 'reals.csv'), index=False)