# Installation and Importing

In [1]:
# dependencies
import os
import re
import gc
import numpy as np
import pandas as pd
from datetime import datetime
from google.colab import drive, userdata

# file management: attach Google Drive to the Colab filesystem, then point
# WORK_DIR at the project folder that lives inside the mounted drive
drive.mount("/content/drive")
WORK_DIR = os.path.join("/content/drive", "MyDrive", "Projects", "skillextraction")

# shortcut for building paths inside the project folder
def work_dir(*args):
    """Return WORK_DIR joined with the given path components.

    Called without arguments it returns WORK_DIR itself. Components are
    combined with os.path.join, so an absolute component replaces everything
    before it.
    """
    components = (WORK_DIR, *args)
    return os.path.join(*components)

Mounted at /content/drive


In [2]:
# load the database export: the four gzipped csv parts (jobads1..jobads4)
# are read one by one and stacked into a single frame with a fresh index
jobads = pd.concat(
    (
        pd.read_csv(work_dir(f'jobads{part}.csv.gz'), compression='gzip', low_memory=False)
        for part in range(1, 5)
    ),
    ignore_index=True,
)

# drop the unused columns and every ad whose content is missing
jobads = (
    jobads
    .drop(columns=['id', 'url', 'category'])
    .dropna(subset='content')
    .reset_index(drop=True)
)

# parse the publication day into a datetime column
jobads['date'] = pd.to_datetime(jobads['published'], format='%Y-%m-%d')
jobads

Unnamed: 0,source,real_url,title,company,location,content,published,crawled,date
0,jobindex,https://www.jobindex.dk/jobannonce/502456/advo...,Advokat til afdeling for fast ejendom og entre...,Haugaard Braad Advokatpartnerselskab,Aalborg,Advokat til afdeling for fast ejendom og entre...,2024-02-07,2024-02-09 11:56:34,2024-02-07
1,jobindex,https://jobs.dnv.com/job-search/business-assur...,Key Customer Project Manager (KCPM),Dnv Business Assurance Denmark A/S,Hellerup,Key Customer Project Manager (KCPM)\nLocation:...,2024-02-07,2024-02-09 11:56:35,2024-02-07
2,jobindex,https://jobteam.dk/afdeling/6000U/6000U-C43/jo...,Salgs- og eventkoordinator søges i Esbjerg,,Esbjerg,Jobsøgende Industri og Lager Kolding Industri ...,2024-02-07,2024-02-09 11:56:35,2024-02-07
3,jobindex,https://www.jobindex.dk/jobannonce/r12281807/j...,"Jobcenter Aarhus, Job og Virksomhedsservice, C...",Aarhus Kommune,Aarhus C,"Jobcenter Aarhus, Job og Virksomhedsservice, C...",2024-02-07,2024-02-09 11:56:36,2024-02-07
4,jobindex,https://www.jobindex.dk/jobannonce/r12282086/e...,Er du vores nye rengøringsassistent i Sakskøbing?,,Sakskøbing,Er du vores nye rengøringsassistent i Sakskøbi...,2024-02-07,2024-02-09 11:56:36,2024-02-07
...,...,...,...,...,...,...,...,...,...
3114254,jobindex,http://www.jobindex.dk/cgi/open.cgi?rm=jobnet&...,Rustfast klejnsmed,,Kolding,Lignende jobannoncer\n• Job i Sydjylland kateg...,2012-11-23,2024-12-21 08:21:06,2012-11-23
3114255,jobindex,http://www.jobindex.dk/cgi/open.cgi?rm=jobnet&...,Kok,,Holstebro,Lignende jobannoncer\n• Job i Region Midtjylla...,2012-11-23,2024-12-21 08:21:06,2012-11-23
3114256,jobindex,http://www.peoplexs.com/Peoplexs22/CandidatesP...,Akutjob: Salgsleder i Netto,Netto,Hirtshals,Akutjob: Salgsleder i Netto\nSådan er vi\nTage...,2012-11-23,2024-12-21 08:21:06,2012-11-23
3114257,jobindex,http://www.peoplexs.com/Peoplexs22/CandidatesP...,"Butiksassistent deltid, Netto - under 18 år",Netto,Vallensbæk Strand,"Butiksassistent deltid, Netto - under 18 år\nV...",2012-11-23,2024-12-21 08:21:06,2012-11-23


In [8]:
# extract domains excluding www.
# - the url is lowercased first: scheme and host are case-insensitive, so
#   "HTTP://WWW.Example.dk/..." must be counted together with "example.dk"
# - the host stops at the first "/", "?", "#" or ":" so that a query string,
#   fragment or port is never counted as part of the domain
#   (e.g. "https://example.dk?job=1" -> "example.dk")
# - urls that do not start with http(s):// yield NaN, which value_counts skips
# the result stays a one-column DataFrame (column 0)
domain = (
    jobads['real_url']
    .str.lower()
    .str.extract(r'^https?:\/\/(?:www\.)?([^\/?#:]+)')
)

# check
domain.value_counts()

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
jobindex.dk,1375220
candidate.hr-manager.net,251302
jobzonen.dk,130245
jobsincopenhagen.com,44432
sallinggroup.com,38775
...,...
friis-ren.dk,1
perform.dk,1
friluftsland.dk,1
frilundbiler.dk,1


In [4]:
# check sdu ad counts: ads per publication year for Syddansk Universitet,
# counting only rows that have no missing value in any column
(
    jobads
    .loc[lambda ads: ads['company'] == 'Syddansk Universitet']
    .dropna()['date']
    .dt.year
    .value_counts()
    .sort_index()
)

Unnamed: 0_level_0,count
date,Unnamed: 1_level_1
2012,56
2013,700
2014,636
2015,669
2016,526
2017,691
2018,846
2019,813
2020,766
2021,516


In [5]:
# isolate an sdu decade: Syddansk Universitet ads published 2014 through 2023,
# keeping only rows without any missing value
sdu = (
    jobads
    .loc[lambda ads: (ads['company'] == 'Syddansk Universitet')
                     & ads['date'].dt.year.between(2014, 2023)]
    .dropna()
    .reset_index(drop=True)
)

# check
sdu['date'].dt.year.value_counts().sort_index()

Unnamed: 0_level_0,count
date,Unnamed: 1_level_1
2014,636
2015,669
2016,526
2017,691
2018,846
2019,813
2020,766
2021,516
2022,676
2023,595
