In [None]:
# dependencies
import os
import re
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from google.colab import drive, userdata

# file management: mount Google Drive into the Colab filesystem
# (Colab-only side effect; prints "Mounted at /content/drive" when done)
drive.mount('/content/drive')
# project root on the mounted drive; work_dir() joins relative parts onto it
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'

# work dir shortcut function
def work_dir(*args):
    """Join the given path components onto ``WORK_DIR`` and return the result.

    Called without arguments it returns ``WORK_DIR`` itself.
    """
    full_path = os.path.join(WORK_DIR, *args)
    return full_path

Mounted at /content/drive


In [None]:
# read the raw sentences crawled from Danish job ads
raw_sentences_path = work_dir('Data', 'raw_sentences.csv')
df = pd.read_csv(raw_sentences_path)

# rows holding a null in any column are ignored
df = df.dropna()

# check
print(df.shape)
df.head(3)

(9220716, 3)


Unnamed: 0,id,completion,frequency
0,1,Advokat til afdeling for fast ejendom og entre...,1
1,2,Til et spændende og udfordrende job hos Haugaa...,1
2,3,Vores afdeling for fast ejendom og entreprise ...,1


In [None]:
# inspect the sentences ordered from most to least frequent (use interactive table)
df.sort_values(by='frequency', ascending=False)

Unnamed: 0,id,completion,frequency
900,901,Indrykket:,178661
1303,1304,Ansøg,157853
1046945,1046946,Antal ledige stillinger:,131938
173,174,Skriv venligst i din ansøgning at du fandt job...,125247
3082008,3082009,QUICKNR,111371
...,...,...,...
3937590,3937591,Samtaler afholdes: 27. februar 2023 (hele dagen).,1
3937589,3937590,Stillingen løber fra 1. april 2023 til og med ...,1
3937586,3937587,Stillingen er med forventet tiltrædelse hurtig...,1
3937584,3937585,"Lyder ovenstående som et job, der kunne passe ...",1


In [None]:
# index labels of the identified exceptions: frequent sentences that must NOT be
# labelled as nonskills (presumably because they do describe skills)
frequent_exceptions = [3255, 3256, 126927, 21507, 708, 14070, 395445, 395444, 395446, 395441, 736,
                       7615, 17183, 193, 3002, 125646, 72683, 149853, 48619, 192642, 259520, 21541]

# the 2000 most frequent with few identified exceptions are nonskills
# NOTE: the exceptions are removed by label BEFORE sorting. `Index.isin` returns a
# plain numpy boolean array, so a mask built from the unsorted `df` and applied to
# the sorted frame is used positionally and would drop the wrong rows.
frequent_df = (
    df[~df.index.isin(frequent_exceptions)]
    .sort_values('frequency', ascending=False)
    .head(2000)
)

# rename completion to sentence and ignore other columns
frequent_df = frequent_df.rename(columns={'completion': 'sentence'})[['sentence']]

# check
print(frequent_df.shape)
frequent_df.head(3)

(2000, 1)


Unnamed: 0,sentence
900,Indrykket:
1303,Ansøg
1046945,Antal ledige stillinger:


In [None]:
# conditions to check for url, email, phone and date (latter more complex)
# all patterns are written in lower case and are meant to be used with re.IGNORECASE
has_url = r'\b(https?\:\/\/|www\.)\S+(?![\'\",])'
has_email = r'\b([a-z0-9\.\-\_]+)?@[a-z0-9\.\-\_]+\b'
has_phone = r'(?<!\w)(\((?:\+|00)[0-9]{2}\))?(?: ?[0-9]{2,} ?){4,}(?!\w)'

# define written date parts (Danish and English, abbreviated and full)
weekdays = ['man', 'mon', 'mandag', 'monday', 'tir', 'tue', 'tirsdag', 'tuesday', 'ons', 'wed', 'onsdag', 'wednesday',
            'tor', 'thu', 'torsdag', 'thursday', 'fre', 'fri', 'fredag', 'friday', 'lør', 'sat', 'lørdag', 'saturday',
            'søn', 'sun', 'søndag', 'sunday']
months = ['jan', 'januar', 'january', 'feb', 'februar', 'february', 'mar', 'marts', 'march', 'apr', 'april', 'maj', 'may', 'jun', 'juni', 'june',
          'jul', 'juli', 'july', 'aug', 'august', 'sep', 'september', 'okt', 'oct', 'oktober', 'october', 'nov', 'november', 'dec', 'december']

# define imploded date parts: one alternation group of whole words, longest first
weekday = r'(' + r'|'.join([r'\b%s\b' % w for w in sorted(weekdays, key=len, reverse=True)]) + r')'
month = r'(' + r'|'.join([r'\b%s\b' % m for m in sorted(months, key=len, reverse=True)]) + r')'

# define more date parts
dd = r'((3[0-1]|[1-2][0-9]|0?[1-9])(\.|st|nd|rd|th)?)'  # day of month with optional ordinal suffix
mo = r'(1[0-2]|0?[1-9])'                                # numeric month (currently unused; kept apart from `mm`)
yy = r'((19|20)?[0-9]{2})'                              # two or four digit year
hh = r'(2[0-3]|[0-1]?[0-9])'                            # hour
mm = r'([0-5][0-9])'                                    # minutes (reused for seconds in `time`)
the = r'((the|d|den|per|pr)\.?)'
at = r'(at|kl\.?)'
of = r'(of)'
ampm = '(am|pm)'
w_sep = r' ?,? ?'      # separator between written parts: optional spaces around an optional comma
d_sep = r'[\.\-\/]'    # separator for numeric dates (currently unused)
t_sep = r'[\:\.]'      # separator between time parts
q = r'?'               # appended after a part to make that part optional
time = r'(' + at + w_sep + hh + t_sep + q + mm + q + t_sep + q + mm + q + w_sep + ampm + q + r')'

# assemble date patterns
# 1: [weekday] [the] day [of] month [year] [time]
has_date_1 = weekday + q + w_sep + the + q + w_sep + dd + w_sep + of + q + w_sep + month + w_sep + yy + q + w_sep + time + q
# 2: [weekday] month [the] day year [time]
has_date_2 = weekday + q + w_sep + month + w_sep + the + q + w_sep + dd + w_sep + yy + w_sep + time + q

In [None]:
# flag every sentence in which a url, email, phone number or written date occurs
noise_patterns = (has_url, has_email, has_phone, has_date_1, has_date_2)
is_noisy = pd.Series(False, index=df.index)
for noise_pattern in noise_patterns:
    is_noisy = is_noisy | df['completion'].str.contains(noise_pattern, regex=True, flags=re.IGNORECASE)

# keep the flagged rows only, with completion renamed to sentence as the single column
filtered_df = df.loc[is_noisy].rename(columns={'completion': 'sentence'})[['sentence']].copy()

# check
print(filtered_df.shape)
filtered_df.head(3)

  df['completion'].str.contains(has_url, regex=True, flags=re.IGNORECASE)
  | df['completion'].str.contains(has_email, regex=True, flags=re.IGNORECASE)
  | df['completion'].str.contains(has_phone, regex=True, flags=re.IGNORECASE)
  | df['completion'].str.contains(has_date_1, regex=True, flags=re.IGNORECASE)
  | df['completion'].str.contains(has_date_2, regex=True, flags=re.IGNORECASE)


(1074115, 1)


Unnamed: 0,sentence
19,Ansøgningsfrist: fredag den 23. februar 2024 m...
22,"Advokat Rasmus Haugaard, rh@haugaardbraad.dk, ..."
23,"Hvis du vil vide mere om stillingen, og os som..."


In [None]:
# browse a fixed random sample of the filtered sentences (use interactive table)
filtered_df.sample(n=20000, random_state=7)

Unnamed: 0,sentence
8691617,Please submit your application and CV in Engli...
8112814,"En kort ansøgning med CV, eksamensprotokoludsk..."
6761322,"Ved opfølgende spørgsmål, er du velkommen til ..."
5644116,Så send en kortfattet ansøgning og dit CV - en...
320899,Praktiske oplysninger: Ansøgningsfrist: 12. ja...
...,...
8746908,Ansøgningsfristen er 10. oktober 2021. Der er ...
8241021,Hvis du ønsker at besøge os i afdelingen eller...
8843627,Stillingen er ledig til besættelse den 1. dece...
4751622,"Har du spørgsmål i øvrigt, er du velkommen til..."


In [None]:
# identified condition to check for high risk sentences
# (phrases that presumably introduce an actual skill, so such sentences are not safe nonskills)
has_risk = r'flair for|stærk i|strong in|erfaren i|experienced in|med erfaring i|with experience in'

# check filtered results: sentences that do NOT contain any high risk phrase
low_risk_sentences = filtered_df.loc[
    ~filtered_df['sentence'].str.contains(has_risk, regex=True, flags=re.IGNORECASE), 'sentence'
]

# show the count and a bounded preview only; dumping the complete list would write
# every remaining sentence (with the emails and phone numbers in them) into the output
print(len(low_risk_sentences))
low_risk_sentences.head(50).tolist()

['Ansøgningsfrist: fredag den 23. februar 2024 med tiltrædelse snarest muligt derefter.',
 'Advokat Rasmus Haugaard, rh@haugaardbraad.dk, mærket "advokat til fast ejendom og entreprise" i emnelinjen.',
 'Hvis du vil vide mere om stillingen, og os som virksomhed, er du velkommen til at kontakte: Rasmus Haugaard på telefon 30785132 - rh@haugaardbraad.dk eller Anders Braad på telefon 30685340 – ab@haugaardbraad.dk.',
 'For further details or information please contact Peter Højlund, mobile + 45 5175 5079 or peter.hojlund@dnv.com',
 'Send os din ansøgning på kolding-u@jobteam.dk, og vi vil kontakte dig snarest.',
 'Yderligere oplysninger om stillingen kan fås ved henvendelse til afdelingsleder Dorte Marie Jensen på mail dmij@aarhus.dk eller tlf. 8713 3944 samt afdelingsleder Anita Gandrup Ernst på mail angae@aarhus.dk eller tlf. 4185 6590.',
 'Ansøgning med relevante bilag skal være modtaget senest søndag den 18. februar 2024.',
 'Tiltrædelse: 1. april 2024.',
 'https://www.aarhus.dk/demok

In [None]:
# remove high risk sentences; the explicit .copy() makes the result an independent
# frame, so assigning to its column below does not raise SettingWithCopyWarning
is_risky = filtered_df['sentence'].str.contains(has_risk, regex=True, flags=re.IGNORECASE)
filtered_df = filtered_df[~is_risky].copy()

# raw clean url, email, phone and date from filtered; each match is replaced by a
# single space together with the non-word characters directly around it
cleaned = filtered_df['sentence']
for pattern in (has_url, has_email, has_phone, has_date_1, has_date_2):
    cleaned = cleaned.str.replace(r'\W*' + pattern + r'\W*', ' ', regex=True, flags=re.IGNORECASE)

# clean spaces
cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()

# seed the selection
np.random.seed(7)

# randomly select either original (10%) or cleaned (90%)
keep_original = np.random.rand(len(filtered_df)) > 0.9
filtered_df['sentence'] = np.where(keep_original, filtered_df['sentence'], cleaned)

# drop empty and duplicates, and drop if already in handpicked
filtered_df = filtered_df[filtered_df['sentence'].str.len() > 0]
filtered_df = filtered_df.drop_duplicates(subset='sentence')
filtered_df = filtered_df[~filtered_df['sentence'].isin(frequent_df['sentence'])]

# check
print(filtered_df.shape)
filtered_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['cleaned'] = filtered_df['sentence'].copy()


(726662, 1)


Unnamed: 0,sentence
19,Ansøgningsfrist med tiltrædelse snarest muligt...
22,"Advokat Rasmus Haugaard mærket ""advokat til fa..."
23,"Hvis du vil vide mere om stillingen, og os som..."


In [None]:
# browse a fixed random sample once more, now after cleaning (use interactive table)
filtered_df.sample(n=20000, random_state=7)

Unnamed: 0,sentence
9163516,Hvis du vil vide mere om stillingen er du velk...
414486,For further information about this position co...
6911221,Har du spørgsmål til jobbet eller Frie generel...
5974536,https://studerendeonline.dk/job/2068941//
1029034,"Ansøgningen skal stiles til Kirkeministeriet, ..."
...,...
8325443,Stillingen er et barselsvikariat på 80-100% me...
742955,Application deadline is 2 January 2024 at 23:5...
9120064,"Ved spørgsmål, kontakt Peter Orloff på tlf"
9177417,For yderligere information er du velkommen til...


In [None]:
# load synthetic nonskills
# the encoding is pinned so the Danish characters are read the same way on every
# platform instead of depending on the locale default (file assumed to be UTF-8)
with open(work_dir('Data', 'notskills.txt'), 'r', encoding='utf-8') as file:
    raw_lines = file.readlines()

# filter and split lines: only lines holding exactly one '=' are kept, each split
# into a stripped [left, right] pair
lines = [[part.strip() for part in raw_line.split('=')] for raw_line in raw_lines if raw_line.count('=') == 1]

# check
print(len(lines))
lines[:5]

1623


[['1. Hos SuperBiler A/S har vi specialiseret os i salg af brugte luksusbiler siden 2008',
  'At SuperCars A/S, we have specialized in selling used luxury cars since 2008'],
 ['2. TILMELD DIG VORES NYHEDSBREV OG FÅ DE SENESTE NYHEDER OM BRUGTE BILER DIREKTE I DIN INDBAKKE',
  'SIGN UP FOR OUR NEWSLETTER AND GET THE LATEST NEWS ABOUT USED CARS DIRECTLY IN YOUR INBOX'],
 ['3. Søg efter job', 'Search for jobs'],
 ['4. Vi elsker frisk kaffe og glade medarbejdere',
  'We love fresh coffee and happy employees'],
 ['5. Vores hovedkontor ligger i hjertet af Aarhus',
  'Our head office is located in the heart of Aarhus']]

In [None]:
# create dataframe of sentences, explode pairs into 1d
# (all left-hand sides first, followed by all right-hand sides)
synthetic_df = pd.DataFrame([pair[0] for pair in lines] + [pair[1] for pair in lines], columns=['sentence'])

# remove list prefixes, quotations and placeholders from gpt
sentences = synthetic_df['sentence']
sentences = sentences.str.replace(r'^\s*([\>\-\*]|\d+[\.\)])\s*', '', regex=True)
# the replacement must be a raw string: a plain '\1' is the control character
# \x01, which would overwrite every quoted sentence instead of unquoting it
sentences = sentences.str.replace(r'^"(.+)"$', r'\1', regex=True)
sentences = sentences.str.replace(r'\[.*?\]', '', regex=True)
# removing a placeholder can leave doubled or dangling spaces behind
sentences = sentences.str.replace(r'\s+', ' ', regex=True).str.strip()
synthetic_df['sentence'] = sentences

# remove duplicates and empty
synthetic_df = synthetic_df.drop_duplicates(subset='sentence')
synthetic_df = synthetic_df[synthetic_df['sentence'].str.len() > 0].dropna()

# check
print(synthetic_df.shape)
synthetic_df.head(3)

(3026, 1)


Unnamed: 0,sentence
0,Hos SuperBiler A/S har vi specialiseret os i s...
1,TILMELD DIG VORES NYHEDSBREV OG FÅ DE SENESTE ...
2,Søg efter job


In [None]:
# the filtered sentences fill up whatever is left of 100000 rows once the
# synthetic and frequent sentences are counted (the total is an arbitrary choice)
n_filtered = 100000 - len(synthetic_df) - len(frequent_df)
filtered_sample = filtered_df[['sentence']].sample(n_filtered, random_state=7)

# merge what we have, tagging every row with the group it came from
nonskill_groups = [
    synthetic_df[['sentence']].assign(group=1),
    frequent_df[['sentence']].assign(group=2),
    filtered_sample.assign(group=3),
]
nonskills = pd.concat(nonskill_groups, ignore_index=True)[['group', 'sentence']]

# check
print(nonskills.shape)
nonskills.head(3)

(100000, 2)


Unnamed: 0,group,sentence
0,1,Hos SuperBiler A/S har vi specialiseret os i s...
1,1,TILMELD DIG VORES NYHEDSBREV OG FÅ DE SENESTE ...
2,1,Søg efter job


In [None]:
# save as JSON lines, one {"group", "sentence"} record per line
# `index=False` is intentionally not passed: orient='records' never writes the
# index, and pandas versions before 2.1 raise a ValueError for that combination
nonskills[['group', 'sentence']].to_json(work_dir('Data', 'nonskills.json'), orient='records', lines=True)

In [None]:
# read the saved JSON lines file back in as a sanity check
nonskills_path = work_dir('Data', 'nonskills.json')
nonskills_reloaded = pd.read_json(nonskills_path, orient='records', lines=True)

# check
print(nonskills_reloaded.shape)
nonskills_reloaded.head(3)

(100000, 2)


Unnamed: 0,group,sentence
0,1,Hos SuperBiler A/S har vi specialiseret os i s...
1,1,TILMELD DIG VORES NYHEDSBREV OG FÅ DE SENESTE ...
2,1,Søg efter job
