### Imports and Reading most updated data

In [94]:
import pandas as pd
import numpy as np
import re

In [159]:
df = pd.read_csv("../data/cleaned/filtered_job_roles.csv", index_col=0)
df.head(1)

Unnamed: 0.1,Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,extracted_tech_stack,all_tech_stack,cleaned_job_desc,Cluster
0,0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{'JavaScript', 'SQL', 'REST', 'big data', 'Haz...",set(),job description job purpose we are look...,Software Developer


### Extracting Education Level and Field of Study from Job Postings

In [160]:
level_dict = {
    "PhD": ["phd", "doctor", "doctorate"],
    "Master's": ["masters degree", "master", "master's degree", " ms "],
    "Bachelor's": ["bachelor's degree", "bachelor degree", "bachelor's", "bachelor", "degree", "bsc", " bs ", "college", "university"],
    "Diploma": ["diploma"]
}

field_dict = {
    "Relevant Field": ["related", "relevant", "equivalent", "related field", "related discipline", "technical field", "related area"],
    "Computer Science": ["computer science", " cs ", "comp sci", "com sci", "computing"],
    "Mathematics": ["mathematics", "math"],
    "Information Technology": ["information technology", " it ", "technology", "information systems", "digital technologies"],
    "Computer/Software Engineering": ["computer engineering", "software engineering"],
    "Electrical and Electronics Engineering": ["electrical and electronics engineering", "bsee", "electrical engineering", "electronics engineering"],
    "Physics": ["physics"],
    "Business": ["business", "MBA"]
}

In [161]:
# Initialise empty rows
df['education_level'] = df.apply(lambda x: set(), axis=1)
df['education_field'] = df.apply(lambda x: set(), axis=1)

In [162]:
# Iterate through level dictionary and level variations
for level, level_list in level_dict.items():
    for level_variation in level_list:
        # Iterate through dataframe
        for i, row in df.iterrows():
            # If description is a string, and contains level_variation, append level
            if type(row['cleaned_job_desc']) == str and re.search(level_variation, row['cleaned_job_desc']) is not None:
                df.at[i, "education_level"].add(level)
                # Iterate through matches of level_variations
                for match in re.finditer(level_variation, row['cleaned_job_desc']):
                    # Iterate through field dictionary and field variations
                    for field, field_list in field_dict.items():
                        for field_variation in field_list:
                            # If field_variation within 100 characters of match, append field
                            if field_variation in row['cleaned_job_desc'][match.start():match.end()+100]:
                                df.at[i, "education_field"].add(field)
df

Unnamed: 0.1,Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,extracted_tech_stack,all_tech_stack,cleaned_job_desc,Cluster,education_level,education_field
0,0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{'JavaScript', 'SQL', 'REST', 'big data', 'Haz...",set(),job description job purpose we are look...,Software Developer,{},{}
1,1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{'Stack', 'Caching', 'Azure', 'Load Balencers'...",set(),job description job summary description d...,Solutions Architect,{Master's},{}
7,7,Cloud Engineer (Automation),Recruit Expert Pte. Ltd.,permanent,"['Puppet', 'Cloud Applications', 'Cloud Securi...","In-charge of the design, deployment, installat...",['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,48000.0,66000.0,2.0,5.0,"{'IT', 'upgrades', 'R1326740', 'Ansible', 'fix...","{' Cloud Computing', ' Cloud-based', ' Cloud S...",in charge of the design deployment installat...,Cloud Engineer,"{Master's, Bachelor's}","{Computer Science, Information Technology, Rel..."
8,8,Cloud Engineer (Automation),Recruit Expert Pte. Ltd.,permanent,"['Puppet', 'Cloud Applications', 'Cloud Securi...","In-charge of the design, deployment, installat...",['Other'],2022-03-14,https://www.monster.com.sg/seeker/job-details?...,monster,48000.0,60000.0,2.0,5.0,"{'IT', 'upgrades', 'R1326740', 'Ansible', 'fix...","{' Cloud Computing', ' Cloud-based', ' Cloud S...",in charge of the design deployment installat...,Cloud Engineer,"{Master's, Bachelor's}","{Computer Science, Information Technology, Rel..."
9,9,Senior Software and Automation Architect,Cisco International Limited Singapore Branch,permanent,"['Kubernetes', 'Cloud Computing', 'JIRA', 'Pyt...",What You'll Do\r\nCisco Customer Experience (C...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,108000.0,180000.0,10.0,15.0,"{'Industry Cloud', 'Artifactory', 'YANG', 'Cus...","{'Cisco', 'CircleCI', 'CI', 'Orchestration', '...",what you ll do cisco customer experience cx ...,Solutions Architect,{},{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11442,2894,Enterprise Solutions Engineer (Singapore),Confluent,Permanent,"['ETL', 'MessageQueue', 'essageQueue', 'Apache...",We’re looking for self-motivated team members ...,,2022-01-11,https://www.nodeflair.com//jobs/46158,nodeflair,,,,,"{'California', 'beliefs', 'Inc', 'disk', 'Apac...","{' essageQueue', ' Apache', 'California', 'bel...",we re looking for self motivated team members ...,Solutions Architect,{Bachelor's},{Computer Science}
11443,2896,DevOps Engineer,2C2P,Permanent,"['GitLab', 'HTTP', 'UDP', 'TCP', 'ShellScript'...",2C2P is looking for a .NET DevOps Engineer to ...,,2022-01-11,https://www.nodeflair.com//jobs/46156,nodeflair,7000.0,10000.0,,3.0,"{'TCP', 'Bamboo CI', 'qualifications &', 'HTTP...","{' ELK', ' Jenkins', 'TCP', 'Code', ' .NET', '...",is looking for a net devops engineer to join...,Devops Engineer,{},{}
11444,2897,DevOps Engineer,FINXFLO,Permanent,"['Strategy', 'ShellScript', 'CI', 'Shell', 'UN...",Alpha Stone Capital is looking for an amazing ...,,2022-01-11,https://www.nodeflair.com//jobs/46153,nodeflair,,,,,"{'Secure Infrastructure', 'Jenkins', 'IaC', 'T...","{' Jenkins', ' JavaScript', ' Terraform', 'Sec...",alpha stone capital is looking for an amazing ...,Devops Engineer,"{Master's, Bachelor's}","{Computer Science, Relevant Field}"
11445,2898,DevOps Engineer,Quilt.AI,Permanent,"['Next.js', 'Docker', 'Cloudflare', 'DockerSwa...",As part of a growing team consisting of ML exp...,,2022-01-11,https://www.nodeflair.com//jobs/46151,nodeflair,,,,3.0,"{'AWS CodePipeline', 'release', 'Jenkins', 'Ia...","{' Jenkins', ' Container', 'AWS CodePipeline',...",as part of a growing team consisting of ml exp...,Devops Engineer,{Bachelor's},"{Computer Science, Computer/Software Engineeri..."


In [163]:
df.drop(columns=["Unnamed: 0", "extracted_tech_stack"], inplace=True)
df.rename(columns={"Cluster":"cluster"}, inplace=True)
df.head()

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,all_tech_stack,cleaned_job_desc,cluster,education_level,education_field
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,set(),job description job purpose we are look...,Software Developer,{},{}
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,set(),job description job summary description d...,Solutions Architect,{Master's},{}
7,Cloud Engineer (Automation),Recruit Expert Pte. Ltd.,permanent,"['Puppet', 'Cloud Applications', 'Cloud Securi...","In-charge of the design, deployment, installat...",['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,48000.0,66000.0,2.0,5.0,"{' Cloud Computing', ' Cloud-based', ' Cloud S...",in charge of the design deployment installat...,Cloud Engineer,"{Master's, Bachelor's}","{Computer Science, Information Technology, Rel..."
8,Cloud Engineer (Automation),Recruit Expert Pte. Ltd.,permanent,"['Puppet', 'Cloud Applications', 'Cloud Securi...","In-charge of the design, deployment, installat...",['Other'],2022-03-14,https://www.monster.com.sg/seeker/job-details?...,monster,48000.0,60000.0,2.0,5.0,"{' Cloud Computing', ' Cloud-based', ' Cloud S...",in charge of the design deployment installat...,Cloud Engineer,"{Master's, Bachelor's}","{Computer Science, Information Technology, Rel..."
9,Senior Software and Automation Architect,Cisco International Limited Singapore Branch,permanent,"['Kubernetes', 'Cloud Computing', 'JIRA', 'Pyt...",What You'll Do\r\nCisco Customer Experience (C...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,108000.0,180000.0,10.0,15.0,"{'Cisco', 'CircleCI', 'CI', 'Orchestration', '...",what you ll do cisco customer experience cx ...,Solutions Architect,{},{}


### Extracting Cloud Service Provider

In [164]:
df['aws'] = 0
df['google_cloud'] = 0
df['azure'] = 0

df.head(1)

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,...,lower_year,upper_year,all_tech_stack,cleaned_job_desc,cluster,education_level,education_field,aws,google_cloud,azure
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,set(),job description job purpose we are look...,Software Developer,{},{},0,0,0


In [165]:
for i, row in df.iterrows():
    job_desc = str(row['cleaned_job_desc'])
    if "aws" in job_desc or "amazon web services" in job_desc:
        df.at[i,'aws'] = 1
    if "google cloud platform" in job_desc or "gcp" in job_desc:
        df.at[i,'google_cloud'] = 1
    if "azure" in job_desc:
        df.at[i,'azure'] = 1

In [166]:
df["aws"].value_counts()

1    2790
0    2615
Name: aws, dtype: int64

In [167]:
df["google_cloud"].value_counts()

0    4605
1     800
Name: google_cloud, dtype: int64

In [168]:
df["azure"].value_counts()

0    3764
1    1641
Name: azure, dtype: int64

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5405 entries, 0 to 11446
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         5405 non-null   object 
 1   company_name      5405 non-null   object 
 2   job_type          5402 non-null   object 
 3   tech_stack        4112 non-null   object 
 4   job_desc          5399 non-null   object 
 5   industry          3867 non-null   object 
 6   date_posted       5046 non-null   object 
 7   url               5405 non-null   object 
 8   source            5405 non-null   object 
 9   lower_salary      2874 non-null   float64
 10  upper_salary      2874 non-null   float64
 11  lower_year        3133 non-null   float64
 12  upper_year        4009 non-null   float64
 13  all_tech_stack    5405 non-null   object 
 14  cleaned_job_desc  5399 non-null   object 
 15  cluster           5405 non-null   object 
 16  education_level   5405 non-null   object 

### Assign Industry for each Job Posting

In [170]:
industries_scraped = pd.read_csv("../data/cleaned/Posts_Industry.csv", index_col=0)
industries_scraped.reset_index(inplace=True)
industries_scraped = industries_scraped[["company_name", "industry"]]
industries_scraped.drop_duplicates(inplace=True)
industries_scraped.value_counts()

company_name                    industry                                                     
 Hitachi Vantara                ['Database management company in Singapore']                     1
People Profilers Pte. Ltd.      ['Employment agency in Singapore']                               1
Pearl Care Singapore Pte. Ltd.  [None]                                                           1
Pegasystems Pte Ltd             ['Software company in Singapore']                                1
Pelago                          ['Leisure', ' Travel & Tourism']                                 1
                                                                                                ..
GREAT EASTERN                   ['Insurance', ' Financial Services', ' Banking', 'Insurance']    1
GOOD FOR FOOD PTE. LTD.         ['Event planner in Singapore']                                   1
GO-JEK                          ['Fund management company in Singapore']                         1
GIC            

In [172]:
def get_industry(text):
    """
    
    Args:
        text (str): string of industry based on scraping

    Returns:
        list: tokenized text
    """
    if text == 'NaN':
        return np.nan
    
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace("'", '')
    mid_working = text.strip().split(',')
    to_be_returned = mid_working[0]
    return to_be_returned

def clean_str(text):
    if text == 'NaN':
        return np.nan
    
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace("'", '')
    
    return text

def remove_locations(text): 
    text = text.replace('in Singapore', '')
    text = text.replace("in London", "")
    text = text.replace("in Pune", "")
    text = text.replace("in South Jakarta", "")
    text = text.replace("in Hong Kong", "")
#     text = text.replace("Bank", "Banking/Accounting/Financial Services")
#     text = text[0:-1]
    
    return text

industries_scraped['cleaned_industry'] = industries_scraped['industry'].apply(get_industry)
industries_scraped['cleaned_industry'] = industries_scraped['cleaned_industry'].apply(remove_locations)
industries_scraped['cleaned_industry'] = industries_scraped['cleaned_industry'].apply(clean_str)
industries_scraped['cleaned_industry'].value_counts()

None                                        507
Software company                            225
Corporate office                            145
Employment agency                            83
Information Technology & Services            70
                                           ... 
Industrial consultant                         1
Consumer Electronics/Durables/Appliances      1
Advertising/Entertainment/Media               1
Airlines/Aviation/Aerospace                   1
Management consulting services company        1
Name: cleaned_industry, Length: 290, dtype: int64

In [175]:
for i, row in industries_scraped.iterrows():
    company = row['company_name']
    df['industry'].loc[df['company_name'] == company] = str(row['cleaned_industry'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['industry'].loc[df['company_name'] == company] = str(row['cleaned_industry'])


In [176]:
df.head()

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,...,lower_year,upper_year,all_tech_stack,cleaned_job_desc,cluster,education_level,education_field,aws,google_cloud,azure
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,Banking/Accounting/Financial Services,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,set(),job description job purpose we are look...,Software Developer,{},{},1,1,0
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,Internet/E-commerce,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,set(),job description job summary description d...,Solutions Architect,{Master's},{},1,1,1
7,Cloud Engineer (Automation),Recruit Expert Pte. Ltd.,permanent,"['Puppet', 'Cloud Applications', 'Cloud Securi...","In-charge of the design, deployment, installat...",Employment agency,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,48000.0,...,2.0,5.0,"{' Cloud Computing', ' Cloud-based', ' Cloud S...",in charge of the design deployment installat...,Cloud Engineer,"{Master's, Bachelor's}","{Computer Science, Information Technology, Rel...",0,0,0
8,Cloud Engineer (Automation),Recruit Expert Pte. Ltd.,permanent,"['Puppet', 'Cloud Applications', 'Cloud Securi...","In-charge of the design, deployment, installat...",Employment agency,2022-03-14,https://www.monster.com.sg/seeker/job-details?...,monster,48000.0,...,2.0,5.0,"{' Cloud Computing', ' Cloud-based', ' Cloud S...",in charge of the design deployment installat...,Cloud Engineer,"{Master's, Bachelor's}","{Computer Science, Information Technology, Rel...",0,0,0
9,Senior Software and Automation Architect,Cisco International Limited Singapore Branch,permanent,"['Kubernetes', 'Cloud Computing', 'JIRA', 'Pyt...",What You'll Do\r\nCisco Customer Experience (C...,Employment agency,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,108000.0,...,10.0,15.0,"{'Cisco', 'CircleCI', 'CI', 'Orchestration', '...",what you ll do cisco customer experience cx ...,Solutions Architect,{},{},1,0,0


In [182]:
df.to_json("combined_data.json")