In [None]:
import os
import re
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from google.colab import drive, userdata

# file management
# Mount Google Drive into the Colab filesystem at /content/drive; WORK_DIR below
# points inside this mount, so every later read/write depends on it succeeding.
drive.mount('/content/drive')
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'


# work dir shortcut function
def work_dir(*args):
    """Return a path beneath WORK_DIR built from the given path components.

    With no components, WORK_DIR itself is returned.
    """
    components = (WORK_DIR,) + args
    return os.path.join(*components)

Mounted at /content/drive


In [None]:
# English synthetic sentences from the article, capped at the first 10 rows per skill
article_en = (
    pd.read_csv(work_dir('Data', 'dataset.csv'))
    .groupby('skill')
    .head(10)
)

# check: overall shape, then how many skills have each number of sentences
print(article_en.shape)
print(article_en['skill'].value_counts().value_counts())
article_en.head(3)

(138260, 2)
count
10    13826
Name: count, dtype: int64


Unnamed: 0,sentence,skill
0,the ideal candidate for this position should b...,advise customers on sewing patterns
1,we need an employee who is able to assist our ...,advise customers on sewing patterns
2,if you possess good communication skills and h...,advise customers on sewing patterns


In [None]:
# stitch up danish translations of dataset from article
# os.listdir() returns entries in arbitrary order, so the chunk files are sorted by
# their numeric suffix (file name as tie-breaker). Without this the concatenation
# order - and therefore which rows head(10) keeps per skill - can differ between
# runs or filesystems.
chunk_pattern = re.compile(r'^translated_sentences_([0-9]+)\.csv$')
chunk_files = sorted(
    (name for name in os.listdir(work_dir('Translated_data')) if chunk_pattern.match(name)),
    key=lambda name: (int(chunk_pattern.match(name).group(1)), name),
)
article_da = pd.concat([pd.read_csv(work_dir('Translated_data', name)) for name in chunk_files])

# keep at most the first 10 translated sentences per skill
article_da = article_da.groupby('skill').head(10)

# check: overall shape, then how many skills have each number of sentences
print(article_da.shape)
print(article_da['skill'].value_counts().value_counts())
article_da.head(3)

(138170, 2)
count
10    13798
9        18
1         5
7         1
6         1
5         1
3         1
2         1
Name: count, dtype: int64


Unnamed: 0,skill,sentence
0,procurement legislation,en omfattende forståelse af indkøbslovgivning ...
1,procurement legislation,kendskab og erfaring med indkøbslovgivning er ...
2,procurement legislation,ansøgere med erfaring inden for områder relate...


In [None]:
# replacement translations for the skills whose main translation came up short
leftovers_path = work_dir('Translated_data', 'leftover_translated_sentences_0.csv')
leftovers = pd.read_csv(leftovers_path)

# check: overall shape, then how many skills have each number of sentences
print(leftovers.shape)
print(leftovers['skill'].value_counts().value_counts())
leftovers.head(3)

(280, 2)
count
10    28
Name: count, dtype: int64


Unnamed: 0,skill,sentence
0,Wireshark,ønske om at ansætte en erfaren person med erfa...
1,Wireshark,kun kandidater med en grundig forståelse af Wi...
2,Wireshark,Som netværkssikkerhedsanalytiker vil du anvend...


In [None]:
# merge the leftover replacements into the main translations.
# leftovers are concatenated first: head(10) takes the first 10 rows of each skill
# in concatenation order, so for skills present in both frames the leftover rows
# are kept and the earlier translations fall away.
article_da = (
    pd.concat([leftovers, article_da], ignore_index=True)
    .groupby('skill')
    .head(10)
)

# check: overall shape, then how many skills have each number of sentences
print(article_da.shape)
print(article_da['skill'].value_counts().value_counts())
article_da.head(3)

(138260, 2)
count
10    13826
Name: count, dtype: int64


Unnamed: 0,skill,sentence
0,Wireshark,ønske om at ansætte en erfaren person med erfa...
1,Wireshark,kun kandidater med en grundig forståelse af Wi...
2,Wireshark,Som netværkssikkerhedsanalytiker vil du anvend...


In [None]:
# ESCO skill tables, English and Danish
esco_en = pd.read_csv(work_dir('ESCO', 'ESCO dataset - v1.1.2 - classification - en - csv', 'skills_en.csv'))
esco_da = pd.read_csv(work_dir('ESCO', 'ESCO dataset - v1.1.2 - classification - da - csv', 'skills_da.csv'))

# keep only the id, label and description of each language and join them on the
# concept URI; the shared column names get an _en / _da suffix
esco_columns = ['conceptUri', 'preferredLabel', 'description']
esco = esco_en[esco_columns].merge(
    esco_da[esco_columns],
    on='conceptUri',
    suffixes=('_en', '_da'),
)

# check: shape, then for each text column how many values occur once, twice, ...
print(esco.shape)
for text_column in ('preferredLabel_en', 'preferredLabel_da', 'description_en', 'description_da'):
    print(esco[text_column].value_counts().value_counts())
esco.head(3)

(13896, 5)
count
1    13896
Name: count, dtype: int64
count
1    13879
2        8
Name: count, dtype: int64
count
1    13890
2        3
Name: count, dtype: int64
count
1    13894
2        1
Name: count, dtype: int64


Unnamed: 0,conceptUri,preferredLabel_en,description_en,preferredLabel_da,description_da
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,Assign and manage staff tasks in areas such as...,lede musikalsk personale,Tildele og forvalte personaleopgaver på område...
1,http://data.europa.eu/esco/skill/00064735-8fad...,supervise correctional procedures,Supervise the operations of a correctional fac...,føre tilsyn med fængselsprocedurer,Føre tilsyn med driften af et fængsel eller an...
2,http://data.europa.eu/esco/skill/000709ed-2be5...,apply anti-oppressive practices,"Identify oppression in societies, economies, c...",anvende antioppressiv praksis,"Identificere undertrykkelse i samfund, økonomi..."


In [None]:
# get rid of nans and duplicates (potential duplicate proxies!!!)
# rows with any missing value are dropped first; then the first occurrence of each
# Danish label, English description and Danish description is kept, in that order
esco = esco.dropna()
for unique_column in ('preferredLabel_da', 'description_en', 'description_da'):
    esco = esco.drop_duplicates(subset=[unique_column])

# check: every text column should now only contain values that occur once
print(esco.shape)
for text_column in ('preferredLabel_en', 'preferredLabel_da', 'description_en', 'description_da'):
    print(esco[text_column].value_counts().value_counts())

(13884, 5)
count
1    13884
Name: count, dtype: int64
count
1    13884
Name: count, dtype: int64
count
1    13884
Name: count, dtype: int64
count
1    13884
Name: count, dtype: int64


In [None]:
# keep only the ESCO skills that have synthetic sentences (the rest is ignored for now)
has_sentences = esco['preferredLabel_en'].isin(article_en['skill'])
esco = esco[has_sentences]

# check
print(esco.shape)
esco.head(3)

(13813, 5)


Unnamed: 0,conceptUri,preferredLabel_en,description_en,preferredLabel_da,description_da
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,Assign and manage staff tasks in areas such as...,lede musikalsk personale,Tildele og forvalte personaleopgaver på område...
1,http://data.europa.eu/esco/skill/00064735-8fad...,supervise correctional procedures,Supervise the operations of a correctional fac...,føre tilsyn med fængselsprocedurer,Føre tilsyn med driften af et fængsel eller an...
2,http://data.europa.eu/esco/skill/000709ed-2be5...,apply anti-oppressive practices,"Identify oppression in societies, economies, c...",anvende antioppressiv praksis,"Identificere undertrykkelse i samfund, økonomi..."


In [None]:
# filter synthetic sentences for what we have esco! (less in danish?)
# both frames are matched against the English label: the previews above show that
# the Danish frame's 'skill' column still holds the English skill name
article_en = article_en[article_en['skill'].isin(esco['preferredLabel_en'])]
article_da = article_da[article_da['skill'].isin(esco['preferredLabel_en'])]

# check: shapes and per-skill sentence counts of both languages should agree
print(article_en.shape)
print(article_da.shape)
print(article_en['skill'].value_counts().value_counts())
print(article_da['skill'].value_counts().value_counts())
# only the last expression of a cell is rendered automatically, so the English
# preview needs an explicit display() call or it is silently discarded
display(article_en.head(3))
article_da.head(3)

(138130, 2)
(138130, 2)
count
10    13813
Name: count, dtype: int64
count
10    13813
Name: count, dtype: int64


Unnamed: 0,skill,sentence
0,Wireshark,ønske om at ansætte en erfaren person med erfa...
1,Wireshark,kun kandidater med en grundig forståelse af Wi...
2,Wireshark,Som netværkssikkerhedsanalytiker vil du anvend...


In [None]:
# map label to conceptUri
# (named label_to_uri rather than `map` so the builtin map() is not shadowed for
# the rest of the kernel session)
label_to_uri = dict(zip(esco['preferredLabel_en'], esco['conceptUri']))

# combine it all into one long frame of (conceptUri, sentence, group):
#   1 = English label, 2 = Danish label, 3 = English description,
#   4 = Danish description, 5 = English sentences, 6 = Danish sentences
esco_parts = [
    esco[['conceptUri', column]].rename(columns={column: 'sentence'}).assign(group=group)
    for group, column in enumerate(
        ['preferredLabel_en', 'preferredLabel_da', 'description_en', 'description_da'],
        start=1,
    )
]
article_parts = [
    frame[['sentence']].assign(conceptUri=frame['skill'].map(label_to_uri), group=group)
    for group, frame in [(5, article_en), (6, article_da)]
]
# ignore_index gives the combined frame a unique 0..n-1 index instead of repeating
# the index labels of the six source frames
skills = pd.concat(esco_parts + article_parts, ignore_index=True)

# value_counts() below skips NaN, so a skill missing from the mapping would go
# unnoticed without this explicit check
assert skills['conceptUri'].notna().all(), 'some sentences have a skill with no ESCO conceptUri'

# check: shape, then how many rows each conceptUri has
print(skills.shape)
print(skills['conceptUri'].value_counts().value_counts())
skills.sample(10)

(331512, 3)
count
24    13813
Name: count, dtype: int64


Unnamed: 0,conceptUri,sentence,group
50043,http://data.europa.eu/esco/skill/c27702fd-7ecc...,pharmaceutical company is searching for a pers...,5
67383,http://data.europa.eu/esco/skill/81665015-4963...,The role includes conducting routine maintenan...,5
4003,http://data.europa.eu/esco/skill/49921021-87b9...,yde behandling og hjælp til patienter efter ta...,2
2473,http://data.europa.eu/esco/skill/2cb23f25-95ff...,monitor fish health status,1
110292,http://data.europa.eu/esco/skill/012795fe-5f85...,Vi søger en erfaren elektrisk cykelreparatør t...,6
992,http://data.europa.eu/esco/skill/12adba20-07ba...,De forskellige årsager til tilstedeværelsen af...,4
71628,http://data.europa.eu/esco/skill/9a77486b-4cb2...,Kandidater skal have tilstrækkelig erfaring in...,6
92951,http://data.europa.eu/esco/skill/70b0bc44-6751...,expertise in configuring electronic access con...,5
120320,http://data.europa.eu/esco/skill/84bdef0f-b989...,ansøgeren skal have en baggrund inden for spor...,6
5017,http://data.europa.eu/esco/skill/468e27bb-d5fa...,Kandidaten bør have viden om forskelligt anæst...,6


In [None]:
# save!
# written as JSON Lines (one record per line). With orient='records' the index is
# never part of the output, so the former index=False argument was redundant; it is
# dropped because some older pandas releases reject that combination with a
# ValueError.
skills.to_json(work_dir('Data', 'skills.json'), orient='records', lines=True)

In [None]:
# sanity check: reload the saved JSON Lines file and repeat the summary that was
# printed before saving, so the two can be compared
skills_path = work_dir('Data', 'skills.json')
df = pd.read_json(skills_path, orient='records', lines=True)
print(df.shape)
print(df['conceptUri'].value_counts().value_counts())
df.head(3)

(331512, 3)
count
24    13813
Name: count, dtype: int64


Unnamed: 0,conceptUri,sentence,group
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,1
1,http://data.europa.eu/esco/skill/00064735-8fad...,supervise correctional procedures,1
2,http://data.europa.eu/esco/skill/000709ed-2be5...,apply anti-oppressive practices,1
