<a href="https://colab.research.google.com/github/jakobmwang/skillextraction/blob/main/auto_annotation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# dependencies
import time
import os
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import asyncio
import nest_asyncio
import httpx
from openai import AsyncOpenAI
from google.colab import drive, userdata

# async fix for notebook: the kernel already runs an event loop, so patch asyncio
# to allow nested use of it
nest_asyncio.apply()

# file management: mount Google Drive at /content/drive
drive.mount('/content/drive')
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'


def work_dir(*args):
    """Return a path below WORK_DIR assembled from the given path components."""
    parts = (WORK_DIR,) + args
    return os.path.join(*parts)

# set computation device (CUDA GPU when available, otherwise CPU)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# pretrained model identifier used for both the tokenizer and the base model
BASE_MODEL = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
# checkpoint file from which the embedder and predictor state dicts are loaded
MODEL_STATE_PATH = work_dir('experiments', 'llm_data_model.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# ESCO skills table (English), used as the reference list of skills
esco_path = work_dir('ESCO', 'ESCO dataset - v1.1.2 - classification - en - csv', 'skills_en.csv')

# one pass: shorter label column name, rows ordered by URI with a fresh index,
# and only the three columns needed downstream
esco_en = (
    pd.read_csv(esco_path)
    .rename(columns={'preferredLabel': 'label'})
    .sort_values('conceptUri')
    .reset_index(drop=True)
    .loc[:, ['conceptUri', 'label', 'description']]
)

# check
esco_en

Unnamed: 0,conceptUri,label,description
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,Assign and manage staff tasks in areas such as...
1,http://data.europa.eu/esco/skill/00064735-8fad...,supervise correctional procedures,Supervise the operations of a correctional fac...
2,http://data.europa.eu/esco/skill/000709ed-2be5...,apply anti-oppressive practices,"Identify oppression in societies, economies, c..."
3,http://data.europa.eu/esco/skill/0007bdc2-dd15...,control compliance of railway vehicles regulat...,"Inspect rolling stock, components and systems ..."
4,http://data.europa.eu/esco/skill/00090cc1-1f27...,identify available services,Identify the different services available for ...
...,...,...,...
13891,http://data.europa.eu/esco/skill/ffef5eb3-a15e...,remediate healthcare user's occupational perfo...,"Remediate or restore the cognitive, sensorimot..."
13892,http://data.europa.eu/esco/skill/fff0b074-5a76...,install transport equipment lighting,Install lighting elements in transport equipme...
13893,http://data.europa.eu/esco/skill/fff0e2cd-d0bd...,natural language processing,The technologies which enable ICT devices to u...
13894,http://data.europa.eu/esco/skill/fff5bc45-b506...,coordinate construction activities,Coordinate the activities of several construct...


In [38]:
# load raw sentences from crawled Danish job ads (ignore null)
# NOTE(review): no null filtering is performed in this cell — confirm that the CSV
# contains no missing 'sentence' values (or that they are removed upstream), since
# the sentences are later passed straight to the tokenizer and the prompt template
real_sentences = pd.read_csv(work_dir('Data', 'real_sentences.csv'))

# check: row/column count and the first few rows
print(real_sentences.shape)
real_sentences.head(3)

(9220716, 3)


Unnamed: 0,id,sentence,frequency
0,1,Advokat til afdeling for fast ejendom og entre...,1
1,2,Til et spændende og udfordrende job hos Haugaa...,1
2,3,Vores afdeling for fast ejendom og entreprise ...,1


# Tokenizer

In [39]:
# tokenizer belonging to the base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


def tokenize(sentences):
    """Tokenize sentences into PyTorch tensors, padded and truncated to 256 tokens."""
    options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 256,
        'return_tensors': 'pt',
    }
    return tokenizer(sentences, **options)

# Base Model

In [40]:
# initialize base model: load the pretrained BASE_MODEL and move it to DEVICE
base_model = AutoModel.from_pretrained(BASE_MODEL).to(DEVICE)

# Embedder

In [41]:
# define embedder
class SkillEmbedder(nn.Module):
    """Sentence embedder: attention-masked mean pooling over the base model's token states.

    Args:
        base_model: module called as ``base_model(input_ids, attention_mask)`` whose
            result exposes ``last_hidden_state``.
        dropout_rate: dropout probability applied to the pooled embedding.
    """

    # initialize with base model and dropout rate
    def __init__(self, base_model, dropout_rate=0.0):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(dropout_rate)

    # embed using batch input_ids and attention_mask (including attention mean pooling!)
    def forward(self, input_ids, attention_mask):
        """Return one pooled embedding per input row.

        Token states are averaged over the sequence dimension (dim 1), counting only
        positions where ``attention_mask`` is non-zero, so padding does not dilute
        the mean. A row whose mask is entirely zero yields a zero vector instead of
        NaN.
        """
        token_states = self.base_model(input_ids, attention_mask).last_hidden_state

        # broadcastable mask in the same dtype as the hidden states
        mask = attention_mask.unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)

        # number of real tokens per row; clamp so an all-padding row cannot divide by zero
        counts = mask.sum(dim=1).clamp(min=1)

        return self.dropout(summed / counts)

# init embedder around the base model (dropout_rate left at its default) and move it to DEVICE
embedder = SkillEmbedder(base_model=base_model).to(DEVICE)

# Predictor

In [42]:
# define predictor
class SkillPredictor(nn.Module):
    """Predicts "is this a skill" and "which skill" from sentence embeddings.

    Args:
        embedder: embedder whose ``base_model.config.hidden_size`` fixes the proxy width.
        n_proxies: number of skills; one proxy slot of shape (1, hidden_size) each.

    The proxy embeddings are a frozen half-precision parameter initialised to zeros;
    real values are expected to come from a loaded state dict.
    """

    # values accepted for the `include` argument of forward()
    _INCLUDE_OPTIONS = ('both', 'all', 'is_skill', 'skill_id')

    # initialize embedder, proxy_loader and proxy embeddings
    def __init__(self, embedder, n_proxies):

        super().__init__()

        self.embedder = embedder
        self.is_skill_dim = 8
        self.embeddings = nn.Parameter(torch.zeros(n_proxies,
                                                   1,
                                                   self.embedder.base_model.config.hidden_size,
                                                   dtype=torch.half),
                                       requires_grad=False)

    # predict is_skill from n'th dimension(s) and skill_id from proxy embedding similarity
    def forward(self, embeddings, include='both', logits=False):
        """Score a batch of embeddings.

        Args:
            embeddings: 2-D tensor, one embedding per row.
            include: 'is_skill', 'skill_id', or 'both'/'all' for a tuple of the two.
            logits: when True return raw scores; otherwise apply sigmoid to is_skill
                and softmax (over skills) to skill_id.

        Returns:
            The is_skill tensor, the skill_id tensor, or ``(is_skill, skill_id)``.

        Raises:
            ValueError: if ``include`` is not one of the accepted options.
        """
        # fail loudly on a typo instead of hitting an unbound local further down
        if include not in self._INCLUDE_OPTIONS:
            raise ValueError(
                f'include must be one of {self._INCLUDE_OPTIONS}, got {include!r}'
            )

        if include != 'skill_id':
            # is_skill score = mean of the last `is_skill_dim` embedding dimensions
            is_skill = embeddings[:, -self.is_skill_dim:].mean(dim=-1)
            if not logits:
                is_skill = torch.sigmoid(is_skill)
            if include == 'is_skill':
                return is_skill

        # cosine similarity of every embedding against every proxy, then the best
        # proxy per skill (max over the per-skill proxy dimension)
        sims = F.cosine_similarity(embeddings.unsqueeze(1).unsqueeze(1),
                                   self.embeddings,
                                   dim=-1).max(dim=-1)[0]
        skill_id = sims if logits else F.softmax(sims, dim=-1)
        if include == 'skill_id':
            return skill_id

        return is_skill, skill_id

# init predictor with one proxy slot per row of the ESCO skills table, then move it to DEVICE
predictor = SkillPredictor(embedder=embedder, n_proxies=len(esco_en)).to(DEVICE)

In [43]:
# load weights and update proxies
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python objects,
# so only load checkpoints from a trusted source; consider weights_only=True if the
# file contains nothing but tensors — TODO confirm
# the checkpoint is read onto the CPU; load_state_dict then copies the values into
# the already-constructed modules
state_dicts = torch.load(MODEL_STATE_PATH, weights_only=False, map_location=torch.device('cpu'))
embedder.load_state_dict(state_dicts['embedder_state_dict'])
predictor.load_state_dict(state_dicts['predictor_state_dict'])

<All keys matched successfully>

In [44]:
# Prompt sent to the LLM for each sentence. It contains two placeholders,
# [sentence] and [skills], which are filled in with str.replace when prompts are
# generated. The model is asked to answer with 1 / 0 / -1 on the first line and,
# when the answer is 1, the labels of the requested candidate skills on the
# following lines.
prompt_template = """
This sentence was retrieved from a real Danish job ad:

sentence = [sentence]

FIRST, determine whether this sentence is expressing a skill request of the applicant, either implicitly or explicitly.
If you are ABSOLUTELY certain there is a skill request, then you output 1 followed by new line. Output nothing else.
If you are certain there is NOT a skill request, for instance if it is irrelevant information or leftover textual noise from webpage parsing, then you output 0 followed by new line. Output nothing else.
However, in many cases you CANNOT be certain, because you do not know the context, and then you output -1 followed by new line. Output nothing else.

SECOND, if and only if the output above was 1, determine whether one or more of the following skills, denoted below by their label and description, are requested in the sentence.
Do note that just because the output above was 1, it does not necessarily mean it is any of the below skills that are requested in the above sentence.
For any of the below skills that are requested, output their label followed by new line. Output nothing else.

[skills]
""".strip()
prompt_template

'This sentence was retrieved from a real Danish job ad:\n\nsentence = [sentence]\n\nFIRST, determine whether this sentence is expressing a skill request of the applicant, either implicitly or explicitly.\nIf you are ABSOLUTELY certain there is a skill request, then you output 1 followed by new line. Output nothing else.\nIf you are certain there is NOT a skill request, for instance if it is irrelevant information or leftover textual noise from webpage parsing, then you output 0 followed by new line. Output nothing else.\nHowever, in many cases you CANNOT be certain, because you do not know the context, and then you output -1 followed by new line. Output nothing else.\n\nSECOND, if and only if the output above was 1, determine whether one or more of the following skills, denoted below by their label and description, are requested in the sentence.\nDo note that just because the output above was 1, it does not necessarily mean it is any of the below skills that are requested in the above 

In [45]:
# openai client for async; the API key is fetched via Colab's userdata under the
# name 'OpenAI' rather than being written into the notebook
client = AsyncOpenAI(api_key=userdata.get('OpenAI'), http_client=httpx.AsyncClient())

In [46]:
# prompt to completion (api response)
async def get_completion(prompt, model='gpt-4o-mini'):
    """Send one prompt as a single user message and return the raw API response.

    Args:
        prompt: text placed in the user message.
        model: chat model name; defaults to the model this notebook was written for.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    return await client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )

# prompts to completions (api responses)
async def get_completions(prompts):
    """Request a completion for every prompt concurrently; results keep the prompt order."""
    pending = []
    for prompt in prompts:
        pending.append(get_completion(prompt))
    responses = await asyncio.gather(*pending)
    return responses

In [None]:
# loop through n at a time (n should reasonably be 100, but could be 1 for testing)
n = 100

with torch.no_grad():
    for idx in range(0, len(real_sentences), n):

        # continue if continuing
        filename = os.path.join(WORK_DIR, 'Annotated_data', 'sentences_{}.csv'.format(idx))
        if os.path.exists(filename):
            continue

        # get sentences
        sentences = real_sentences.iloc[idx:idx+n]['sentence'].tolist()

        # predict
        embedder.eval()
        tokens = tokenize(sentences)
        embeddings = embedder(**tokens)
        is_skill, skill_id = predictor(embeddings)

        # get label and description for top 10 predictions
        top10 = [esco_en.loc[esco_en.index.isin(i.tolist()), ['label','description']].values for i in skill_id.topk(10, dim=-1)[1].numpy()]

        # generate prompts
        prompts = [prompt_template.replace('[sentence]', sentence) \
                                   .replace('[skills]', '\n'.join([f'label = {label}, description = {description}' for j, (label, description) in enumerate(top10[i])]))
                   for i, sentence in enumerate(sentences)]

        # go go GPT
        completions = await get_completions(prompts)
        content = [[s.strip() for s in c.choices[0].message.content.strip().split('\n')] for c in completions]
        is_skill = [int(c[0].strip()) for c in content]
        concept_uri = [[p.values[0] for s in c[1:] if len(p := esco_en.loc[esco_en['label'] == s.strip(), 'conceptUri'])] if len(c) > 1 else [] for c in content]
        df = pd.DataFrame({
            'conceptUri': concept_uri,
            'sentence': sentences,
            'is_skill': is_skill
        }).explode('conceptUri')
        df.to_csv(filename, index=False)

In [None]:
# check: read back the file of the last batch index visited by the annotation loop
# (relies on `idx` still being defined from that loop)
pd.read_csv(os.path.join(WORK_DIR, 'Annotated_data', 'sentences_{}.csv'.format(idx)))