In [None]:
import os
import pandas as pd
from transformers import AutoTokenizer
from google.colab import drive, userdata

# file management
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project files stored on Drive can be read and written like local files.
drive.mount('/content/drive')
# Root folder of this project on the mounted Drive; work_dir() builds every
# data path relative to it.
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'

# shortcut for building paths inside the project folder
def work_dir(*args):
    """Return the path formed by joining WORK_DIR with the given components.

    Args:
        *args: Path components appended after WORK_DIR, in order. With no
            components, WORK_DIR itself is returned.

    Returns:
        The joined path as produced by ``os.path.join``.
    """
    path_parts = (WORK_DIR, *args)
    return os.path.join(*path_parts)

Mounted at /content/drive


In [None]:
# load the skills table that is used to filter the benchmark
skills_path = work_dir('Data', 'skills.json')
skills = (
    pd.read_json(skills_path, orient='records', lines=True)
    # keep only rows of group 1: one group is enough to get each skill's
    # label (the `sentence` column) together with its conceptUri
    .loc[lambda frame: frame['group'] == 1]
)

# check
print(skills.shape)
skills.head(3)

(13813, 3)


Unnamed: 0,conceptUri,sentence,group
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,1
1,http://data.europa.eu/esco/skill/00064735-8fad...,supervise correctional procedures,1
2,http://data.europa.eu/esco/skill/000709ed-2be5...,apply anti-oppressive practices,1


In [None]:
# load bench data and assign groups to differentiate
# The position of a file in this list determines its group number (1-based),
# so the order must not change: groups 1-5 identify the source split downstream.
bench_sources = [
    "hf://datasets/jensjorisdecorte/skill-extraction-tech/validation.csv",
    "hf://datasets/jensjorisdecorte/skill-extraction-tech/test.csv",
    "hf://datasets/jensjorisdecorte/skill-extraction-house/validation.csv",
    "hf://datasets/jensjorisdecorte/skill-extraction-house/test.csv",
    "hf://datasets/jensjorisdecorte/skill-extraction-techwolf/test.csv",
]
bench = pd.concat(
    [
        pd.read_csv(source).assign(group=group)
        for group, source in enumerate(bench_sources, start=1)
    ],
    # ignore_index=True already produces a fresh 0..n-1 index, so the extra
    # reset_index(drop=True) that used to follow was a no-op and is removed
    ignore_index=True,
)

#check
bench

Unnamed: 0,sentence,span,sub_span,label,group
0,javascript reactjs java,javascript,,JavaScript,1
1,javascript reactjs java,reactjs,,LABEL NOT PRESENT,1
2,javascript reactjs java,java,,Java (computer programming),1
3,javascript reactjs java,javascript,,JavaScript,1
4,javascript reactjs java,reactjs,,LABEL NOT PRESENT,1
...,...,...,...,...,...
3162,"* An ability to inspire, coach and develop you...",,,manage a team,5
3163,* Role model who consistently delivers amazing...,,,manage the customer experience,5
3164,* Role model who consistently delivers amazing...,,,show an exemplary leading role in an organisation,5
3165,* Passion for our products.,,,show determination,5


In [None]:
# filter bench by valid labels (those that exist in skills)
# label text -> conceptUri lookup built from the skills table
label_to_uri = dict(zip(skills['sentence'], skills['conceptUri']))
bench = (
    bench
    # labels that are absent from the lookup map to NaN
    .assign(conceptUri=lambda frame: frame['label'].map(label_to_uri))
    .loc[:, ['conceptUri', 'group', 'sentence']]
    # drop the rows whose label could not be resolved to a conceptUri
    .loc[lambda frame: frame['conceptUri'].notna()]
)

# check
bench

Unnamed: 0,conceptUri,group,sentence
0,http://data.europa.eu/esco/skill/3cd569a2-4f88...,1,javascript reactjs java
2,http://data.europa.eu/esco/skill/19a8293b-8e95...,1,javascript reactjs java
3,http://data.europa.eu/esco/skill/3cd569a2-4f88...,1,javascript reactjs java
5,http://data.europa.eu/esco/skill/19a8293b-8e95...,1,javascript reactjs java
6,http://data.europa.eu/esco/skill/adc6dc11-3376...,1,As a member of our <ORGANIZATION> <ORGANIZATIO...
...,...,...,...
3162,http://data.europa.eu/esco/skill/cb668e89-6ef5...,5,"* An ability to inspire, coach and develop you..."
3163,http://data.europa.eu/esco/skill/4b95c7bb-5672...,5,* Role model who consistently delivers amazing...
3164,http://data.europa.eu/esco/skill/5be1c5fb-3833...,5,* Role model who consistently delivers amazing...
3165,http://data.europa.eu/esco/skill/19a64e91-bfe3...,5,* Passion for our products.


In [None]:
# load translated bench, shift its group numbers, and filter
# the original benchmark occupies groups 1-5, so the translated groups are
# moved up by 5 to keep them distinct
group_shift = 5
trans_bench = (
    pd.read_json(work_dir('Data', 'translated_bench.json'), orient='records', lines=True)
    .loc[:, ['conceptUri', 'group', 'sentence']]
    .assign(group=lambda frame: frame['group'] + group_shift)
    # one row per conceptUri when a record carries a list of them
    .explode('conceptUri')
    .reset_index(drop=True)
)
# keep only rows whose conceptUri is present in skills and is not missing
uri_in_skills = trans_bench['conceptUri'].isin(skills['conceptUri'])
uri_present = trans_bench['conceptUri'].notna()
trans_bench = trans_bench.loc[uri_in_skills & uri_present]

# check
trans_bench

Unnamed: 0,conceptUri,group,sentence
0,http://data.europa.eu/esco/skill/60c78287-22eb...,6,Evne til at arbejde i store samarbejdsteams fo...
1,http://data.europa.eu/esco/skill/113b4428-0a31...,6,Evne til at arbejde i store samarbejdsteams fo...
2,http://data.europa.eu/esco/skill/f7e2eb04-3e50...,6,Avanceret viden om anvendelsesdata og infrastr...
3,http://data.europa.eu/esco/skill/4d85b881-e490...,6,Avanceret viden om anvendelsesdata og infrastr...
4,http://data.europa.eu/esco/skill/d0c6d77e-cb25...,6,Avanceret viden om anvendelsesdata og infrastr...
...,...,...,...
2072,http://data.europa.eu/esco/skill/7ff2c668-0e86...,10,Din rolle vil være at arbejde med både test- o...
2073,http://data.europa.eu/esco/skill/91abe492-18be...,10,Din rolle vil være at arbejde med både test- o...
2074,http://data.europa.eu/esco/skill/699e7c26-6502...,10,hjælp i deres udvikling.
2075,http://data.europa.eu/esco/skill/21c5790c-0930...,10,og mindst 2 års erfaring på mellemniveau i for...


In [None]:
# gather benches
# original rows first, translated rows second, renumbered with a fresh index
bench_parts = [bench, trans_bench]
total_bench = pd.concat(bench_parts, ignore_index=True)

# check
total_bench

Unnamed: 0,conceptUri,group,sentence
0,http://data.europa.eu/esco/skill/3cd569a2-4f88...,1,javascript reactjs java
1,http://data.europa.eu/esco/skill/19a8293b-8e95...,1,javascript reactjs java
2,http://data.europa.eu/esco/skill/3cd569a2-4f88...,1,javascript reactjs java
3,http://data.europa.eu/esco/skill/19a8293b-8e95...,1,javascript reactjs java
4,http://data.europa.eu/esco/skill/adc6dc11-3376...,1,As a member of our <ORGANIZATION> <ORGANIZATIO...
...,...,...,...
4177,http://data.europa.eu/esco/skill/7ff2c668-0e86...,10,Din rolle vil være at arbejde med både test- o...
4178,http://data.europa.eu/esco/skill/91abe492-18be...,10,Din rolle vil være at arbejde med både test- o...
4179,http://data.europa.eu/esco/skill/699e7c26-6502...,10,hjælp i deres udvikling.
4180,http://data.europa.eu/esco/skill/21c5790c-0930...,10,og mindst 2 års erfaring på mellemniveau i for...


In [None]:
# save to json (JSON Lines: one record per line)
# `index=False` is intentionally not passed: orient='records' never writes the
# index, and older pandas releases raise a ValueError for that combination.
# The written file is identical.
total_bench.to_json(work_dir('Data', 'bench.json'), orient='records', lines=True)

In [None]:
# sanity check: reload the saved file from disk and inspect it
bench_path = work_dir('Data', 'bench.json')
df = pd.read_json(bench_path, orient='records', lines=True)
# number of rows per group, ordered by group number
group_sizes = df['group'].value_counts().sort_index()
print(df.shape)
print(group_sizes)
df.head(3)

(4182, 3)
group
1     153
2     671
3     134
4     566
5     588
6     153
7     671
8     115
9     553
10    578
Name: count, dtype: int64


Unnamed: 0,conceptUri,group,sentence
0,http://data.europa.eu/esco/skill/3cd569a2-4f88...,1,javascript reactjs java
1,http://data.europa.eu/esco/skill/19a8293b-8e95...,1,javascript reactjs java
2,http://data.europa.eu/esco/skill/3cd569a2-4f88...,1,javascript reactjs java
