In [13]:
# Notebook setup: third-party imports, local helper import and parallel-apply initialisation.
import pandas as pd
import re
from tqdm.auto import tqdm
# Registers tqdm's `progress_apply` on pandas objects; none of the cells shown here call it.
tqdm.pandas()

'''set path'''
import os
import sys
# Put ../src (resolved against the current working directory) first on the import path.
sys.path.insert(0, os.path.abspath(os.path.join('..', 'src')))

'''import helper functions'''
# NOTE(review): `clean` is presumably a module in ../src (hence the path tweak above) - confirm.
import clean as clean

'''multiprocessing'''
from pandarallel import pandarallel
# Enables `parallel_apply` (used by the cleaning loop further down) with a progress bar.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [14]:
# from Huggingface
#from datasets import load_dataset
#dataset = load_dataset("jonas/undp_jobs_raw")

In [15]:
# Load the raw job postings from the local copy of the dataset.
data = pd.read_csv(filepath_or_buffer="../data/undp_jobs.csv")

In [16]:
# from Zenodo
# df = pd.read_csv('https://zenodo.org/record/6589661/files/undp_jobs.csv?download=1')

In [17]:
# Flip the raw frame so that what was row 0 becomes a column, and call that column 'text'.
df = data.T.rename(columns={0: 'text'})

In [18]:
# Not an elegant solution, but a fast hack to get started: each field is taken
# to be whatever text sits between two known section headings of a posting.
# The headings are matched in lowercase only, so the text is presumably already
# lowercased upstream - TODO confirm.


def _find_between(text, patterns):
    """Return the ``re.findall`` hits of the first pattern that matches ``text``.

    Args:
        text: Posting text to search.
        patterns: Regular-expression strings, tried in the given order.

    Returns:
        The list produced by ``re.findall`` for the first pattern with at
        least one hit, or an empty list when none of the patterns match.
    """
    for pattern in patterns:
        hits = re.findall(pattern, text)
        if hits:
            return hits
    return []


# Output column -> candidate patterns (first one with a hit wins).
# Raw strings keep regex escapes such as \s and \[ away from Python's own
# string-escape handling (non-raw '\s' is an invalid escape sequence).
FIELD_PATTERNS = {
    # Was '\[s*' (missing backslash), which matched literal "s" characters
    # after the bracket instead of optional whitespace.
    'title': [r'\[\s*(.*?)\s*location'],
    'location': [r'location\s*(.*?)\s*application deadline'],
    # The parentheses around "midnight new york" are a regex group, not literal
    # brackets, so findall yields (prefix, 'midnight new york') tuples here.
    # Kept as-is: the year step below only searches the stringified value.
    'year': [r'application deadline\s*(.*?)\s*(midnight new york)'],
    'type_of_contract': [r'type of contract\s*(.*?)\s*post level'],
    'post_level': [r'post level\s*(.*?)\s*languages required'],
    # The heading that follows "languages required" varies between postings,
    # so several end markers are tried in turn.
    'languages_required': [
        r'languages required\s*(.*?)\s*starting date',
        r'languages required\s*(.*?)\s*duration of initial',
        r'languages required\s*(.*?)\s*expected duration',
        r'languages required\s*(.*?)\s*background',
    ],
    'starting_date': [r'expected to start\)\s*(.*?)\s*duration of initial'],
    'duration_contract': [r'initial contract\s*(.*?)\s*expected duration'],
    'duration_assignment': [r'duration of assignment\s*(.*?)\s*refer'],
    'background': [r'background\s*(.*?)\s*duties and responsibilities'],
    'duties_responsibilities': [r'duties and responsibilities\s*(.*?)\s*competencies'],
    'competencies': [r'competencies\s*(.*?)\s*required skills and experience'],
    'skills_experiences': [r'required skills and experience\s*(.*?)\s*refer a friend'],
}

# extract metadata (each cell holds the list returned by re.findall)
for field, patterns in FIELD_PATTERNS.items():
    df[field] = df['text'].apply(_find_between, args=(patterns,))

# Turn every cell (including the findall lists) into its string representation.
# This does not strip the list brackets; columns that are not cleaned later
# keep them.
df = df.astype(str)

# Collapse the deadline to a four-digit year by looking for a "-YY" suffix,
# newest first (2022 down to 2005). Rows without such a suffix are left as-is.
# NOTE(review): a four-digit date such as "-2022" contains "-20" and would be
# labelled 2020 - confirm deadlines always use two-digit years.
for two_digit_year in range(22, 4, -1):
    suffix = f'{two_digit_year:02d}'
    df.loc[df['year'].str.contains('-' + suffix, regex=False), 'year'] = '20' + suffix

In [19]:
# Process and clean the extracted free-text columns, one column at a time.
columns_to_clean = [
    'title',
    'location',
    'type_of_contract',
    'post_level',
    'languages_required',
    'background',
    'duties_responsibilities',
    'competencies',
    'skills_experiences',
]

# A spaCy-based variant (clean.spacy_clean: lemmatising/stemming on top of the
# basic cleaning, written to "<column>_clean_spacy") was tried but takes around
# 5h+ even with multiprocessing, so only the basic cleaner runs here and its
# result overwrites the column.
for column in columns_to_clean:
    print(column)  # show which column is currently being processed
    df[column] = df[column].astype(str).parallel_apply(clean.basic)
    


title


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

location


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

type_of_contract


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

post_level


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

languages_required


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

background


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

duties_responsibilities


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

competencies


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

skills_experiences


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12954), Label(value='0 / 12954')))…

In [20]:
# Persist the processed postings; the frame's index is written too (to_csv default).
df.to_csv(path_or_buf='../data/undp_jobs_processed.csv')

In [23]:
# Preview the first five rows (head's default row count).
df.head()

Unnamed: 0,text,title,location,year,type_of_contract,post_level,languages_required,starting_date,duration_contract,duration_assignment,background,duties_responsibilities,competencies,skills_experiences
0,{'content': ['hiv and sti clinical consultant ...,hiv and sti clinical consultant ic,suva fiji,2022,individual contract,international consultant,english,"[""01-jul-2022', '""]","["":1/07/2022-31/12/2023', '""]","["":171 days', '\\n\\n""]",the united nations development programme undp ...,project description and consultancy rationale ...,strong interpersonal and communication skillss...,educational qualificationsminimum master s deg...
1,{'content': ['internship- pacific digital econ...,internship pacific digital economy programme p...,honiara solomon islands,2022,internship,intern,english,"[""30-jun-2022', '""]","["":6 months', '""]","["":6 months', '\\n\\n""]",the united nations capital development fund un...,under the guidance and supervision of uncdf s ...,uncdfundp core competenciescommunicationdelive...,educationcandidate must be enrolled in a degre...
2,{'content': ['consultant international spécial...,consultant international spécialisé dans le co...,djibouti djibouti,2022,individual contract,international consultant,french,"[""02-sep-2022', '""]",[],[],avis de recrutement d un consultant individuel...,description du projet le projet d appui à la j...,livrables attenduslivrablesrésultatsdurée de r...,qualifications et experiences requisesi qualif...
3,{'content': ['consultant national pour l’élabo...,consultant national pour l élaboration du plan...,djibouti djibouti,2022,individual contract,national consultant,french,"[""14-jun-2022', '""]","["":3 mois', '""]","["":3 mois', '\\n\\n""]",avis de recrutement d un consultant individuel...,objectifs l objectif principal de la mission e...,qualificationle ou la consultant e doit posséd...,termes de paiementle consultant sera payé à la...
4,{'content': ['individual consultant - national...,individual consultant national project officer,riyadh saudi arabia,2022,individual contract,national consultant,arabic english expected duration of assignment...,[],[],"["":12 months', '\\n\\n""]",post titlenational project officerstarting dat...,scope of work ensure effective and efficient i...,competenciescorporate competenciesdemonstrates...,required skills and experience education maste...
