In [1]:
import glob, os, sys; sys.path.append('../src')
from datetime import datetime

from langdetect import detect
from tabulate import tabulate 
# data wrangling
import numpy as np
import pandas as pd
import re
import ast
import itertools
import wordninja
from dateutil import parser
import wordninja
import string
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
tqdm.pandas()

'''import helper functions'''
import clean as clean
import extract_attributes as ex

'''multiprocessing'''
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 11 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Load the raw job postings and remember how many rows we started with, so the
# later filtering steps can report how many postings were dropped.
df = pd.read_csv("../data/undp_jobs.csv")
original_size = len(df)

print("length of raw dataset:", original_size)

length of raw dataset: 98894


Here we only want to understand the structure of the job postings and figure out how to extract the fields cleanly. It seems that all of the postings use the same java structure on the website.

In [3]:
# Run every raw posting through the project helper `clean.basic` and keep the
# result in a separate column; all later extraction cells read 'cleaned_content'.
# NOTE(review): the exact cleaning rules are defined in the `clean` module, which is
# not visible here — presumably basic text normalisation; confirm before relying on it.
df['cleaned_content'] = df['content'].apply(clean.basic)

In [4]:
# One flag column per template section (background, duties and responsibilities,
# competencies, required skills and experience) recording whether
# ex.has_attribute reports that section for the posting.
template_structure = [
    'background',
    'duties and responsibilities',
    'competencies',
    'required skills and experience',
]
for section in template_structure:
    flag_column = 'has_' + section.replace(' ', '_')
    # args=(section,) forwards the section name as the second positional argument.
    df[flag_column] = df['cleaned_content'].apply(ex.has_attribute, args=(section,))

In [5]:
# job_id of every posting for which at least one template section flag is 0.
flag_columns = ['has_background',
                'has_duties_and_responsibilities',
                'has_competencies',
                'has_required_skills_and_experience']
missing_any_section = (df[flag_columns] == 0).any(axis=1)
df_no = df.loc[missing_any_section, 'job_id']

In [6]:
# Keep only the postings where none of the four section flags is 0, then discard
# the flag columns since they are no longer needed.
flag_columns = ['has_background',
                'has_duties_and_responsibilities',
                'has_competencies',
                'has_required_skills_and_experience']
follows_template = (df[flag_columns] != 0).all(axis=1)
df = df[follows_template]

df.drop(flag_columns, axis=1, inplace=True)

rows_kept = df.shape[0]
print('Rows following job posting template structure: ', rows_kept)
print('Number of projects that are not following the structure and are thus dropped:', original_size - rows_kept)

Rows following job posting template structure:  92051
Number of projects that are not following the structure and are thus dropped: 6843


How can I write good tests to make sure I do not exclude valid job postings here? Are they really all empty?

In [7]:
# Map each output column to the `ex` helper that fills it from the cleaned text.
# The first entry holds the meta attributes (application deadline, job title, ...);
# the remaining four are the sections of the posting template.
# The column name 'meta_atributes' (sic) is kept as-is because later cells read it.
section_extractors = {
    'meta_atributes': ex.get_meta_attributes,
    'background': ex.get_background,
    'duties_and_responsibilities': ex.get_duties_and_responsibilities,
    'competencies': ex.get_competencies,
    'required_skills_and_experience': ex.get_required_skills,
}
for target_column, extractor in section_extractors.items():
    df[target_column] = df['cleaned_content'].apply(extractor)

In [8]:

# Labels of the meta fields to pull out of 'meta_atributes'; each label becomes a
# column named after it, with spaces replaced by underscores.
meta_attributes = [
    'location', 'type of contract', 'starting date',
    'application deadline', 'post level', 'duration of initial contract',
    'languages required', 'expected duration of assignment',
]

# ex.extract_title receives the first label ('location') as its second argument.
first_label = meta_attributes[0]
df['title'] = df['meta_atributes'].apply(ex.extract_title, args=(first_label,))

for m_attr in meta_attributes:
    column_name = m_attr.replace(' ', '_')
    # Bind the current label as a default so the callable does not depend on the
    # loop variable after this iteration.
    df[column_name] = df['meta_atributes'].apply(
        lambda l, label=m_attr: ex.extract_meta_attribute(label, l, meta_attributes)
    )

In [41]:
def count_substring(df, column, substring):
    """Count the rows of ``df[column]`` whose text contains ``substring``.

    Matching is case-insensitive and literal: characters such as ``.``, ``(`` or
    ``+`` in ``substring`` are matched as-is rather than interpreted as a regular
    expression. Missing values never count as a match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to search.
    column : str
        Name of a string column in ``df``.
    substring : str
        Text to look for.

    Returns
    -------
    int
        Number of rows containing ``substring``.
    """
    needle = substring.lower()
    # regex=False -> plain substring search; na=False -> NaN rows are treated as
    # "no match" so the mask is purely boolean before summing.
    matches = df[column].str.lower().str.contains(needle, regex=False, na=False)
    return int(matches.sum())

# How many 'location' values mention "home" (e.g. home-based postings)?
search_string = "home"
count = count_substring(df=df, column='location', substring=search_string)

print(f"The string '{search_string}' appears {count} times in the 'location' column.")

The string 'home' appears 10877 times in the 'location' column.


In [37]:
# Display the `result_df` summary table.
# NOTE(review): `result_df` is not created by any cell visible in this part of the
# notebook, and this cell's execution count is out of order relative to its
# neighbours — confirm where it is built, otherwise a fresh Restart & Run All
# would stop here with a NameError.
result_df

Unnamed: 0,Metric,Value,location,count
0,Unique count,10997.0,,
1,,,new york united states of america,5834.0
2,,,kabul afghanistan,2568.0
3,,,jakarta indonesia,2143.0
4,,,dhaka bangladesh,2114.0
...,...,...,...,...
10993,,,shimlachambakullumandiunanalagarhparwanoo dist...,1.0
10994,,,homebased with travel to bangkok and asean reg...,1.0
10995,,,yerevan and project target regions armenia,1.0
10996,,,pereira risaralda armenia quindio o manizales ...,1.0


In [10]:
# Country reference workbook: one row per country with its ISO code and region.
# Country names are lower-cased before being used as lookup keys.
# NOTE(review): a cell holding the literal text "NA" may be parsed as a missing value
# by the default reader settings — verify that no ISO code is lost this way.
df_locations = pd.read_excel('../src/world-countries.xlsx')
df_locations['country'] = df_locations['country'].str.lower()
print(f'Shape: {df_locations.shape}')
display(df_locations.head())

# Lookups: country name -> ISO code, and ISO code -> region.
country2iso = dict(zip(df_locations['country'], df_locations['country_iso']))
country2region = dict(zip(df_locations['country_iso'], df_locations['region']))

Shape: (270, 3)


Unnamed: 0,country,country_iso,region
0,afghanistan,AF,Southern Asia
1,åland islands,AX,Northern Europe
2,albania,AL,Southern Europe
3,algeria,DZ,Northern Africa
4,american samoa,AS,Australia and Oceania


In [11]:
# Derive 'country' from the free-text 'location' via clean.clean_location and the
# country2iso lookup, then 'region' from 'country' via clean.get_region and the
# country2region lookup, printing value counts along the way.
print('Unique count before:', df['location'].nunique())
display(df['location'].value_counts(dropna=False).head(20))

# DataFrame.insert raises ValueError if the column already exists, which made this
# cell fail when executed a second time. Drop derived columns left over from a
# previous run first (a no-op on the first run) so the cell can be re-run safely.
df = df.drop(columns=['country', 'region'], errors='ignore')

df.insert(4, 'country', df['location'].apply(lambda x: clean.clean_location(x, country2iso)))
print('Unique count after :', df['country'].nunique())
display(df['country'].value_counts(dropna=False).head(20))

df.insert(5, 'region', df['country'].apply(lambda x: clean.get_region(x, country2region)))
display(df['region'].value_counts(dropna=False))

Unique count before: 10997


location
new york united states of america    5834
kabul afghanistan                    2568
jakarta indonesia                    2143
dhaka bangladesh                     2114
homebased                            1487
kyiv ukraine                         1473
home based                           1270
kinshasa congo dem republic          1050
phnom penh cambodia                  1038
dakar senegal                        1033
amman jordan                         1011
bangkok thailand                     1010
addis ababa ethiopia                  979
bogota colombia                       871
bamako mali                           846
tashkent uzbekistan                   834
beijing china                         820
nairobi kenya                         816
chisinau moldova                      810
portauprince haiti                    778
Name: count, dtype: int64

Unique count after : 178


country
Home-based    10946
US             6458
AF             3678
ID             3020
BD             2976
UA             2535
ML             1795
CO             1785
CG             1718
IN             1516
BA             1410
TR             1269
NE             1256
TH             1239
KH             1226
IQ             1163
JO             1148
SN             1125
GN             1120
ET             1107
Name: count, dtype: int64

region
Sub-Saharian Africa                18800
Home-based                         10946
Southern Asia                      10657
South-Eastern Asia                  9156
Latin America and the Carribean     8888
Western Asia                        8351
Northern America                    6459
Southern Europe                     5049
Eastern Europe                      4216
Northern Africa                     2861
Central Asia                        2267
Eastern Asia                        1167
Australia and Oceania               1123
Western Europe                      1067
Unspecified                          592
Northern Europe                      452
Name: count, dtype: int64