In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import os

In [2]:
# for reproducibility
random_state = 0

## Load Data

In [3]:
base_directory = "C:/Users/Gi Han/OneDrive - Singapore Management University/Documents/GitHub/job-posting-analysis"

Linkedin Data

In [4]:
directory = os.chdir(f"{base_directory}/data/raw/Linkedin")
files = os.listdir(".")
files

['cloud_links.csv',
 'cloud_posts_darren.csv',
 'cloud_posts_gihan.csv',
 'cloud_posts_joshua.csv',
 'cloud_posts_lingjia.csv',
 'cloud_posts_steph.csv']

In [5]:
relevant_csv = [x for x in files if 'cloud_posts' in x]
relevant_csv

['cloud_posts_darren.csv',
 'cloud_posts_gihan.csv',
 'cloud_posts_joshua.csv',
 'cloud_posts_lingjia.csv',
 'cloud_posts_steph.csv']

In [6]:
# Combine all csv
linkedin_df = pd.concat(map(pd.read_csv, relevant_csv), ignore_index=True, axis=0)
linkedin_df.head(3)

Unnamed: 0,jobTitle,jobLocation,applicantsCount,viewsCount,jobPosterProfileUrl,postedAt,appliesClosed,applyUrl,jobDescription,remoteAllowed,...,matchedSkills,jobFunctions,jobIndustries,appliesClosedAt,jobType,jobUrl,jobImageUrl,timestamp,query,error
0,Technical Content Writer (Infura),"Singapore, Singapore",11.0,84.0,,2022-03-11T23:32:47.000Z,False,https://consensys.net/open-roles/gh_jid?gh_jid...,About ConsenSys\r\n\r\nAt ConsenSys we have a ...,False,...,,"Marketing, Public Relations, Writing/Editing",Computer Software,2022-04-10T23:32:46.000Z,Full-time,https://www.linkedin.com/jobs/view/2952827007/,https://media-exp1.licdn.com/dms/image/C4E0BAQ...,2022-03-14T11:16:31.849Z,https://www.linkedin.com/jobs/view/2952827007/,
1,Software Engineering (DevOps) Intern,"Singapore, Singapore",16.0,145.0,,2022-01-20T04:08:28.000Z,False,https://razer.wd3.myworkdayjobs.com/en-US/Care...,Getting onboard Razer will place you on a glob...,False,...,,"Engineering, Information Technology",Consumer Electronics,2022-03-16T04:08:28.000Z,Full-time,https://www.linkedin.com/jobs/view/2918469480/,https://media-exp1.licdn.com/dms/image/C510BAQ...,2022-03-14T11:16:35.635Z,https://www.linkedin.com/jobs/view/2918469480/,
2,Expansion Associate,"Singapore, Singapore",9.0,120.0,https://www.linkedin.com/in/ACoAAB1CaW0BuleRsg...,2022-03-11T12:05:49.000Z,False,https://jobs.lever.co/Qashier/b8b66407-aff4-47...,Want to be part of the fastest growing startup...,False,...,,"Business Development, Consulting","Information Technology & Services, Financial S...",2022-04-10T12:05:49.000Z,Full-time,https://www.linkedin.com/jobs/view/2952512532/,https://media-exp1.licdn.com/dms/image/C560BAQ...,2022-03-14T11:16:39.354Z,https://www.linkedin.com/jobs/view/2952512532/,


In [7]:
linkedin_df.columns

Index(['jobTitle', 'jobLocation', 'applicantsCount', 'viewsCount',
       'jobPosterProfileUrl', 'postedAt', 'appliesClosed', 'applyUrl',
       'jobDescription', 'remoteAllowed', 'companyName', 'companyUrl',
       'matchedSkills', 'jobFunctions', 'jobIndustries', 'appliesClosedAt',
       'jobType', 'jobUrl', 'jobImageUrl', 'timestamp', 'query', 'error'],
      dtype='object')

MonsterSg

In [8]:
directory = os.chdir(f"{base_directory}/data/cleaned/")
files = os.listdir(".")
files

['.ipynb_checkpoints',
 'All_MonsterSg.csv',
 'Cloud Computing_MonsterSg.csv',
 'Cloud_MonsterSg.csv',
 'foo.png',
 'Wordcloud Visualisations (All Monster SG CSV).ipynb',
 'word_test.png']

In [9]:
monster_df = pd.read_csv('All_MonsterSg.csv') 
monster_df.head(3)

Unnamed: 0,job_title,company,salary,job_type,years_experience,tech_stack,job_description,industry,function,roles,date_posted,url
0,Senior Python Developer,Citi,,permanent,,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],['IT'],['Software Engineer/Programmer'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...
1,Senior Partner Solutions Architect (Microsoft),Amazon,,permanent,,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],['IT'],['Software Engineer/Programmer'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...
2,IT Technician,Ascend Com Pte. Ltd.,"['24000', '36000']",permanent,"['2', '5']","['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,['Other'],['IT'],['Other Software/Hardware/EDP'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...


NodeFlair

In [10]:
directory = os.chdir(f"{base_directory}/data/raw/nodeflair")
files = os.listdir(".")
files

['nodeflair_jobpostings.csv', 'nodeflair_links.csv']

In [11]:
nodeflair_df = pd.read_csv('nodeflair_jobpostings.csv') 
nodeflair_df.head(3)

Unnamed: 0.1,Unnamed: 0,URL,Date posted,data,Job Title,Company Name,Seniority,Salary,Job Type,Years of Experience,Tech Stack,Job Desc,cleaned_date
0,0,/jobs/53907,about 11 hours ago,"['ReactJS Developer (Full Stack)', 'Apar Techn...",ReactJS Developer (Full Stack),Apar Technologies,"['Mid', 'Junior']","$6,419 - $8,819 SGD / Monthly",Permanent,2-3 years,"['Docker', 'CloudFoundry', 'Spring', 'SonarQub...",We are looking for a candidate to fill in the ...,2022-03-10
1,1,/jobs/53898,about 11 hours ago,"['Manager, SRE', 'Rakuten Viki', ['Manager'], ...","Manager, SRE",Rakuten Viki,['Manager'],-,Permanent,Information not provided,"['Docker', 'API', 'PagerDuty', 'GKE', 'ELK', '...",The SRE team at Viki is responsible for buildi...,2022-03-10
2,2,/jobs/53894,about 11 hours ago,"['DevOps Engineer', 'GovTech', ['Junior'], '$5...",DevOps Engineer,GovTech,['Junior'],"$5,800 - $9,600 SGD / Monthly",Permanent,At least 2 years,"['Docker', 'DockerCompose', 'Fluentd', 'Clair'...",Our team in GovTech works on highly impactful ...,2022-03-10


In [49]:
df = pd.concat([linkedin_df['jobTitle'], monster_df['job_title'], nodeflair_df['Job Title']], ignore_index=True)
df

0             Technical Content Writer (Infura)
1          Software Engineering (DevOps) Intern
2                           Expansion Associate
3                               Cloud Architect
4           Cloud Infrastructure Engineer [MSS]
                          ...                  
12118                           DevOps Engineer
12119                           DevOps Engineer
12120                           DevOps Engineer
12121                         Software Engineer
12122    Sr. Site Reliability Engineer - Hadoop
Length: 12123, dtype: object

## Cleaning Data

In [50]:
# Drop Duplicates
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True, drop=True)
df

0                       Technical Content Writer (Infura)
1                    Software Engineering (DevOps) Intern
2                                     Expansion Associate
3                                         Cloud Architect
4                     Cloud Infrastructure Engineer [MSS]
                              ...                        
6945     Digital Solutions Consultant (Loyalty Solutions)
6946    Senior/ Infrastructure Engineer, Endpoint Secu...
6947    [INTERNSHIP] Agile Software Engineer Intern - ...
6948    Associate Software Engineer, Secured Infrastru...
6949               Sr. Site Reliability Engineer - Hadoop
Length: 6950, dtype: object

In [54]:
print(list(df))

['Technical Content Writer', 'Software Engineering  Intern', 'Expansion Associate', 'Cloud Architect', 'Cloud Infrastructure Engineer', 'Cloud Storage Engineer', 'Cloud Native Developer - Traffic Scheduling', 'Cloud Operations Engineer', 'Associate Solution Sales Director - Cloud', 'Partner Specialist', 'Infrastructure Engineer - remote', 'Internship : Product Management - Onboarding Experience', 'Cloud Native Developer - Elastic Machine Platform', 'Segment Marketing Manager', 'Cloud Native Developer - Computing Platform', 'Sales Specialist Hybrid Cloud', 'Software Engineer Intern', 'Cloud Native Developer - DevOps Platform', 'Technology Consulting Enterprise Applications & Cloud  Senior Associate', 'IT Cloud Security Engineer', 'Customer Engineering Manager Digital Native Customers Google Cloud', 'IT Cloud Architect', 'Data Engineer', 'Cloud & IT Transformation - Japanese Speaking - Senior Associate', 'Cloud Security Engineer', 'Cloud Systems Engineer', 'Software Engineer - Cloud Plat

In [52]:
# remove fields in brackets, numbers, hashatags
def clean_regex(x, company_list, country_list):
    x = re.sub('\\(.+?\\)', "", str(x))   # remove anything in ()
    x = re.sub('\\[.+?\\]', "", str(x))   # remove anything in []
    x = re.sub(r'\w*\d\w*','', str(x))    # remove number
    x = re.sub(r'[\S]+#[\S]+','',str(x))  # remove hashtags
    x = re.sub(r'#[\S]+','', str(x))      # remove hashtags 
    
    for word in x: 
        if word in company_list:
            x = x.split('|')[-1]
        else:
            x = re.sub('\|.*$',"", x)
            
    x = x.replace(',','')                 # remove commas
    x = x.replace('/','')                 # remove slash
    
    
    for word in x.split(): 
        if word in country_list:
            x = x.replace(word,'')   
            
    return x.strip()

In [53]:
country_list = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of the', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', "Korea, Democratic People's Republic of", 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, Republic of', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States of', 'Moldova, Republic of', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Réunion', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and the South Sandwich Islands', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'South Sudan', 'Svalbard and Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela, Bolivarian Republic of', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis and Futuna', 'Yemen', 'Zambia', 'Zimbabwe']
company_list = ['DigitalBCG', 'Essence' ]

df = df.apply(lambda x: clean_regex(str(x), company_list, country_list))
df

0                                Technical Content Writer
1                            Software Engineering  Intern
2                                     Expansion Associate
3                                         Cloud Architect
4                           Cloud Infrastructure Engineer
                              ...                        
6945                         Digital Solutions Consultant
6946     Senior Infrastructure Engineer Endpoint Security
6947    Agile Software Engineer Intern - Smart Buildin...
6948    Associate Software Engineer Secured Infrastruc...
6949               Sr. Site Reliability Engineer - Hadoop
Length: 6950, dtype: object

In [55]:
# Convert df to DataFrame
df = df.to_frame(name='job_title')

# Change to vector

In [56]:
vec = TfidfVectorizer(stop_words="english")
vec.fit(df['job_title'].values) # change to skills
features = vec.transform(df['job_title'].values)

In [57]:
vec.get_feature_names()

['abap',
 'abm',
 'academy',
 'acceleration',
 'accelerator',
 'access',
 'account',
 'accountant',
 'accounting',
 'accounts',
 'accurics',
 'ace',
 'acquisition',
 'acquisitions',
 'action',
 'activation',
 'active',
 'ad',
 'adm',
 'admin',
 'administration',
 'administrative',
 'administrator',
 'administrators',
 'adminitrator',
 'admiralty',
 'adobe',
 'adoption',
 'ads',
 'adtech',
 'advanced',
 'advancement',
 'advertising',
 'adviser',
 'advisor',
 'advisorsenior',
 'advisory',
 'advocacy',
 'advocate',
 'aem',
 'aerospace',
 'affairs',
 'affinity',
 'agency',
 'agent',
 'agent_fte',
 'agile',
 'agility',
 'ai',
 'aimachine',
 'aiml',
 'aio',
 'aiops',
 'airport',
 'aladdin',
 'alarm',
 'algorithm',
 'algorithms',
 'alliance',
 'alliances',
 'almachine',
 'amazon',
 'amda',
 'amea',
 'aml',
 'analist',
 'analog',
 'analysis',
 'analyst',
 'analystassistant',
 'analystbusiness',
 'analystconsultant',
 'analystspecialist',
 'analysttrade',
 'analyt',
 'analytic',
 'analytics',
 

## Creating model

In [58]:
cls = MiniBatchKMeans(n_clusters=10, random_state=random_state)
cls.fit(features)

MiniBatchKMeans(n_clusters=10, random_state=0)

In [59]:
# predict cluster labels for new dataset
cls.predict(features)

# to get cluster labels for the dataset used while
# training the model (used for models that does not
# support prediction on new dataset).
cls.labels_

array([8, 8, 8, ..., 7, 7, 7])

In [60]:
df['Cluster'] = cls.labels_

In [61]:
df

Unnamed: 0,job_title,Cluster
0,Technical Content Writer,8
1,Software Engineering Intern,8
2,Expansion Associate,8
3,Cloud Architect,9
4,Cloud Infrastructure Engineer,7
...,...,...
6945,Digital Solutions Consultant,6
6946,Senior Infrastructure Engineer Endpoint Security,2
6947,Agile Software Engineer Intern - Smart Buildin...,7
6948,Associate Software Engineer Secured Infrastruc...,7


## Evaluation
- best value is 1 and worst value is -1
- values near 0 indicates overlapping clusters

In [62]:
from sklearn.metrics import silhouette_score
silhouette_score(features, labels=cls.predict(features)) 

0.041258868984515

In [63]:
cluster_list = df.groupby('Cluster')['job_title'].apply(list)
cluster_list

Cluster
0    [HR Business Partner, Business Program Manager...
1    [Software Engineer Intern, Software Engineer -...
2    [Lead  Senior Cloud Engineer, Senior Cybersecu...
3    [Segment Marketing Manager, Technical Account ...
4    [Cloud Native Developer - Computing Platform, ...
5    [Data Analyst Data Management Governance & Dat...
6    [Customer Engineering Manager Digital Native C...
7    [Cloud Infrastructure Engineer, Cloud Storage ...
8    [Technical Content Writer, Software Engineerin...
9    [Cloud Architect, Associate Solution Sales Dir...
Name: job_title, dtype: object

In [64]:
for i in cluster_list.index:
    top5 = pd.Series(cluster_list[i]).value_counts().head(5)
    top5_words = list(top5.index)
    print(f'Cluster {i}:')
    print(top5_words)
    print()

Cluster 0:
['Business Analyst', 'Business Development Manager', 'Business Development Representative', 'Senior Business Analyst', 'Business Development Representatives  - Join our Talent Network!']

Cluster 1:
['Software Engineer', 'Senior Software Engineer', 'Software Engineer  Cloud Infrastructure', 'Software Engineer  - Cloud Infrastructure', '- Software Engineer']

Cluster 2:
['Senior DevOps Engineer', 'Senior System Engineer', 'Senior Site Reliability Engineer', 'Senior Engineer', 'Senior Systems Engineer']

Cluster 3:
['Project Manager', 'Product Manager', 'Manager', 'Sales Manager', 'IT Project Manager']

Cluster 4:
['Software Developer', 'Full Stack Developer', 'Java Developer', 'Senior Full Stack Developer', 'Application Developer']

Cluster 5:
['System Analyst', 'Senior System Analyst', 'Data Analyst', 'Security Analyst', 'SOC Advanced Cyber Defense Analyst']

Cluster 6:
['Digital Business Analyst', 'Digital Marketing Manager', 'Digital Consultant', 'Digital Marketing Special

## Visualisation

In [None]:
# reduce the features to 2D
pca = PCA(n_components=2, random_state=random_state)
reduced_features = pca.fit_transform(features.toarray())

# reduce the cluster centers to 2D
reduced_cluster_centers = pca.transform(cls.cluster_centers_)

In [None]:
plt.scatter(reduced_features[:,0], reduced_features[:,1], c=cls.predict(features))
plt.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:,1], marker='x', s=150, c='b')