In [2]:
import pandas as pd
import regex as re
import spacy

### Reading each dataset

In [3]:
monster_data = pd.read_csv("../data/cleaned/individual_platforms/All_MonsterSg.csv", index_col=0)
monster_data.drop(["roles", "function"], axis=1, inplace=True)
monster_data.reset_index(inplace=True)
monster_data["source"] = 'monster'

monster_data.head(1)

Unnamed: 0,job_title,company,salary,job_type,years_experience,tech_stack,job_description,industry,date_posted,url,source
0,Senior Python Developer,Citi,,permanent,,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster


In [4]:
linkedin_data = pd.read_csv("../data/cleaned/individual_platforms/LinkedIn.csv", index_col=0)
linkedin_data.drop(["applicantsCount", "remoteAllowed", "job_functions", "companyUrl"], axis=1, inplace=True)
linkedin_data.reset_index(inplace=True)
linkedin_data["source"] = 'linkedin'

linkedin_data.head(1)

Unnamed: 0,job_title,date_posted,job_desc,company_name,industry,job_type,url,source
0,Cloud Consumption Analyst,2021-10-07,Job Description\r\n\r\nThe Cloud Consumption A...,Intel Corporation,Semiconductors,permanent,https://www.linkedin.com/jobs/view/2773797384/,linkedin


In [5]:
nodeflair_data = pd.read_csv("../data/cleaned/individual_platforms/cleaned_nodeflair_jobpostings.csv", index_col=0)
nodeflair_data["source"] = 'nodeflair'

nodeflair_data.head(1)

Unnamed: 0,url,job_title,company_name,salary,job_type,years_of_experience,tech_stack,job_desc,date_posted,source
0,https://www.nodeflair.com//jobs/53907,ReactJS Developer (Full Stack),Apar Technologies,"['6,419', '8,819']",Permanent,"[2, 3]","['Docker', 'CloudFoundry', 'Spring', 'SonarQub...",We are looking for a candidate to fill in the ...,2022-03-10,nodeflair


In [6]:
print("Monster columns:", monster_data.columns)
print("LinkedIn columns:", linkedin_data.columns)
print("NodeFlair columns:", nodeflair_data.columns)

Monster columns: Index(['job_title', 'company', 'salary', 'job_type', 'years_experience',
       'tech_stack', 'job_description', 'industry', 'date_posted', 'url',
       'source'],
      dtype='object')
LinkedIn columns: Index(['job_title', 'date_posted', 'job_desc', 'company_name', 'industry',
       'job_type', 'url', 'source'],
      dtype='object')
NodeFlair columns: Index(['url', 'job_title', 'company_name', 'salary', 'job_type',
       'years_of_experience', 'tech_stack', 'job_desc', 'date_posted',
       'source'],
      dtype='object')


### Standardising column names

In [7]:
monster_data.rename(columns={'company':'company_name', 'years_experience':'years_of_experience','job_description':'job_desc'}, inplace=True)

monster_data.head(1)

Unnamed: 0,job_title,company_name,salary,job_type,years_of_experience,tech_stack,job_desc,industry,date_posted,url,source
0,Senior Python Developer,Citi,,permanent,,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster


### Combining all 3 datasets

In [8]:
combined_data = pd.concat([monster_data, linkedin_data, nodeflair_data])

In [9]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11448 entries, 0 to 2900
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_title            11448 non-null  object
 1   company_name         11448 non-null  object
 2   salary               5711 non-null   object
 3   job_type             11441 non-null  object
 4   years_of_experience  8969 non-null   object
 5   tech_stack           7976 non-null   object
 6   job_desc             11437 non-null  object
 7   industry             9131 non-null   object
 8   date_posted          10908 non-null  object
 9   url                  11448 non-null  object
 10  source               11448 non-null  object
dtypes: object(11)
memory usage: 1.0+ MB


In [10]:
combined_data.head(1)

Unnamed: 0,job_title,company_name,salary,job_type,years_of_experience,tech_stack,job_desc,industry,date_posted,url,source
0,Senior Python Developer,Citi,,permanent,,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster


### Text Cleaning on Job Description

In [58]:
def clean_job_description(text):

    text = re.sub(r'\w*\d\w*','', str(text))
    text = text.replace("\r", " ").replace("\n", " ")
    text = re.sub(r'[^\w\s]', ' ', text)
    text = text.lower()

    return text

In [59]:
combined_data["cleaned_job_desc"] = combined_data["job_desc"].apply(clean_job_description)

## Information Extraction

### Extracting Salary Limits and Years of Experience Limits to individual columns

In [11]:
def get_lower_salary(text):
    if text == "NaN":
        return "NaN"

    elif type(text) != float:
        text = text[1:-1].split(", ")

        lower_salary = text[0].replace(",", "")
        return lower_salary[1:-1]

    else:
        return "NaN"

def get_upper_salary(text):
    if text == "NaN":
        return "NaN"

    elif type(text) != float:
        text = text[1:-1].split(", ")

        lower_salary = text[1].replace(",", "")
        return lower_salary[1:-1]

    else:
        return "NaN"

In [12]:
def get_lower_year(text):
    if text == "NaN":
        return "NaN"

    elif type(text) != float:
        if len(text.split(",")) == 1:
            return "NaN"
        else:
            text = text[1:-1].split(", ")

            lower_year = text[0].replace(",", "")
            return lower_year[1:-1]

    else:
        return "NaN"

def get_upper_year(text):
    if text == "NaN":
        return "NaN"

    elif type(text) != float:
        if len(text.split(",")) == 1:
            return text[1:-1]
        else:
            text = text[1:-1].split(", ")
            # print(text)
            # print(text[1])

            upper_year = text[1].replace(",", "")
            return upper_year[1:-1]

    else:
        return "NaN"

In [12]:
combined_data["lower_salary"] = combined_data["salary"].apply(get_lower_salary)
combined_data["upper_salary"] = combined_data["salary"].apply(get_upper_salary)
combined_data["lower_year"] = combined_data["years_of_experience"].apply(get_lower_year)
combined_data["upper_year"] = combined_data["years_of_experience"].apply(get_upper_year)

combined_data.drop(columns=["salary", "years_of_experience"], inplace=True)

combined_data.head()

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,
2,IT Technician,Ascend Com Pte. Ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,36000.0,2.0,5.0
3,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,84000.0,2.0,5.0
4,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,93600.0,2.0,5.0


### Consolidating Tech Stack for each Job Listing

In [13]:
skills_ner_model = spacy.load('../analysis/custom_ner_model')
ner=skills_ner_model.get_pipe('ner')

move_names = list(ner.move_names)
assert skills_ner_model.get_pipe("ner").move_names == move_names

In [14]:
def get_skills(text):
    skills_entities = []

    text = str(text).replace("\r", "").replace("\n", "")

    try:
        for sentence in text.split("."):
            doc = skills_ner_model(sentence)
            for ent in doc.ents:
                skills_entities.append(ent.text)
    except Exception as e:
        return ""

    return set(skills_entities)

In [22]:
combined_data["extracted_tech_stack"] = combined_data["job_desc"].apply(get_skills)

combined_data.head()

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,extracted_tech_stack
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{Mockito, Cucumber, Citigroup Inc, Oracle, BI,..."
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{Identity, Active Directory, API's, Microsoft ..."
2,IT Technician,Ascend Com Pte. Ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,36000.0,2.0,5.0,"{MaintenanceComputer Hardware, Office, 1st &, ..."
3,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,84000.0,2.0,5.0,"{Tier, SolvingRecognizes, IP, guidanceInterper..."
4,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,93600.0,2.0,5.0,"{Tier II, IP, Revs, BKM}"


In [47]:
# combined_data.reset_index(inplace=True)
# combined_data.to_json('../data cleaning/extracted_tech_stack.json', orient='columns')

combined_data = pd.read_json("extracted_tech_stack.json")
combined_data.drop(columns=["index"], inplace=True)
combined_data.head(5)

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,extracted_tech_stack
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"[Mockito, Cucumber, Citigroup Inc, Oracle, BI,..."
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"[Identity, Active Directory, API's, Microsoft ..."
2,IT Technician,Ascend Com Pte. Ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,36000.0,2.0,5.0,"[MaintenanceComputer Hardware, Office, 1st &, ..."
3,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,84000.0,2.0,5.0,"[Tier, SolvingRecognizes, IP, guidanceInterper..."
4,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,93600.0,2.0,5.0,"[Tier II, IP, Revs, BKM]"


In [41]:
def consolidate_tech_stack(row):
    all_tech_stack = []
    text = str(row["tech_stack"])

    if text != 'NaN':
        text = text[1:-1]
        text = text.replace("'", "")
        text = text.split(", ")

        for tech_stack in text:
            all_tech_stack.append(tech_stack)
    
        all_tech_stack += list(row["extracted_tech_stack"])

    return set(all_tech_stack)

In [53]:
combined_data["all_tech_stack"] = combined_data.apply(consolidate_tech_stack, axis=1)

combined_data.head()

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,extracted_tech_stack,all_tech_stack
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"[Mockito, Cucumber, Citigroup Inc, Oracle, BI,...","{RabbitMQ, Tableau Dashboards, Bitbucket, SOAP..."
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"[Identity, Active Directory, API's, Microsoft ...","{Queue, Containers, Azure, RFPs, Microsoft ELA..."
2,IT Technician,Ascend Com Pte. Ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,36000.0,2.0,5.0,"[MaintenanceComputer Hardware, Office, 1st &, ...","{Computer Hardware, Switches, LAN, Firewall, N..."
3,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,84000.0,2.0,5.0,"[Tier, SolvingRecognizes, IP, guidanceInterper...","{Knowledge Management, Problem Solving, BKM, R..."
4,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,93600.0,2.0,5.0,"[Tier II, IP, Revs, BKM]","{Knowledge Management, Problem Solving, BKM, R..."


In [54]:
combined_data.iloc[0]["all_tech_stack"]

{'APAC Cross',
 'AWS',
 'Angular',
 'Ant',
 'BI',
 'Big Data',
 'Bitbucket',
 'CMAD',
 'CSS',
 'Citi',
 'Citigroup Inc',
 'Cloud Computing',
 'Cloud Foundry)Good',
 'Cloud Platforms',
 'Container',
 'Core &',
 'Cucumber',
 'Dev',
 'Docker',
 'EHCache',
 'GIT',
 'Gemfire',
 'Google Cloud',
 'HTML 5',
 'Hazlecast',
 'Integration',
 'JIRA',
 'JMS',
 'JSON',
 'JUnit',
 'Jasmin',
 'Java',
 'JavaScript',
 'Jenkins',
 'Kafka',
 'Karma',
 'Kubernetes',
 'Maven',
 'MicroServices',
 'Mockito',
 'Mule',
 'Oracle',
 'Orchestration',
 'PostgreSQL',
 'Python',
 'REST',
 'RabbitMQ',
 'SOAP',
 'SQL',
 'Selenium',
 'Skills',
 'Sonar',
 'Spring Boot',
 'Spring Cloud',
 'Spring MVC',
 'Tableau',
 'Tableau Dashboards',
 'Testing',
 'Tools',
 'UI',
 'UX',
 'Web Technologies',
 'big data',
 'color',
 'national origin',
 'on',
 'religion',
 'resource-',
 'sex'}

### Extracting Education Level and Field of Study from Job Postings

In [55]:
level_dict = {
    "PhD": ["phd", "doctor", "doctorate"],
    "Master's": ["masters degree", "master", "master's degree", " ms "],
    "Bachelor's": ["bachelor's degree", "bachelor degree", "bachelor's", "bachelor", "degree", "bsc", " bs ", "college", "university"],
    "Diploma": ["diploma"]
}

field_dict = {
    "Relevant Field": ["related", "relevant", "equivalent", "related field", "related discipline", "technical field", "related area"],
    "Computer Science": ["computer science", " cs ", "comp sci", "com sci", "computing"],
    "Mathematics": ["mathematics", "math"],
    "Information Technology": ["information technology", " it ", "technology", "information systems", "digital technologies"],
    "Computer/Software Engineering": ["computer engineering", "software engineering"],
    "Electrical and Electronics Engineering": ["electrical and electronics engineering", "bsee", "electrical engineering", "electronics engineering"],
    "Physics": ["physics"],
    "Business": ["business", "MBA"]
}

In [56]:
# Initialise empty rows
combined_data['education_level'] = combined_data.apply(lambda x: set(), axis=1)
combined_data['education_field'] = combined_data.apply(lambda x: set(), axis=1)

In [60]:
# Iterate through level dictionary and level variations
for level, level_list in level_dict.items():
    for level_variation in level_list:
        # Iterate through dataframe
        for i, row in combined_data.iterrows():
            # If description is a string, and contains level_variation, append level
            if type(row['cleaned_job_desc']) == str and re.search(level_variation, row['cleaned_job_desc']) is not None:
                combined_data.at[i, "education_level"].add(level)
                # Iterate through matches of level_variations
                for match in re.finditer(level_variation, row['cleaned_job_desc']):
                    # Iterate through field dictionary and field variations
                    for field, field_list in field_dict.items():
                        for field_variation in field_list:
                            # If field_variation within 100 characters of match, append field
                            if field_variation in row['cleaned_job_desc'][match.start():match.end()+100]:
                                combined_data.at[i, "education_field"].add(field)
combined_data

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,extracted_tech_stack,all_tech_stack,education_level,education_field,cleaned_job_desc
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"[Mockito, Cucumber, Citigroup Inc, Oracle, BI,...","{RabbitMQ, Tableau Dashboards, Bitbucket, SOAP...",{},{},job description job purpose we are look...
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,['Internet/E-commerce'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"[Identity, Active Directory, API's, Microsoft ...","{Queue, Containers, Azure, RFPs, Microsoft ELA...",{Master's},{},job description job summary description d...
2,IT Technician,Ascend Com Pte. Ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,36000.0,2,5,"[MaintenanceComputer Hardware, Office, 1st &, ...","{Computer Hardware, Switches, LAN, Firewall, N...",{Diploma},"{Relevant Field, Information Technology}",responsibilities provide helpdesk support fo...
3,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,84000.0,2,5,"[Tier, SolvingRecognizes, IP, guidanceInterper...","{Knowledge Management, Problem Solving, BKM, R...",{},{},key responsibilities performs all standard se...
4,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,['Other'],2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,93600.0,2,5,"[Tier II, IP, Revs, BKM]","{Knowledge Management, Problem Solving, BKM, R...",{Bachelor's},{},has developed specialized skills or is multi s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11443,DevOps Engineer,2C2P,Permanent,"['GitLab', 'HTTP', 'UDP', 'TCP', 'ShellScript'...",2C2P is looking for a .NET DevOps Engineer to ...,,2022-01-11,https://www.nodeflair.com//jobs/46156,nodeflair,7000.0,10000.0,,3,"[Terraform, Python, TCP, Amazon Web, IP, IaC, ...","{IaC, TCP, .NET, Ansible Strong, Node.js, UDP,...",{},{},is looking for a net devops engineer to join...
11444,DevOps Engineer,FINXFLO,Permanent,"['Strategy', 'ShellScript', 'CI', 'Shell', 'UN...",Alpha Stone Capital is looking for an amazing ...,,2022-01-11,https://www.nodeflair.com//jobs/46153,nodeflair,,,,,"[SysOps, Infrastructure &, DevOps Engineer, Py...","{IaC, SysOps, Terraform Team, BS, Terraform, A...","{Master's, Bachelor's}","{Relevant Field, Computer Science}",alpha stone capital is looking for an amazing ...
11445,DevOps Engineer,Quilt.AI,Permanent,"['Next.js', 'Docker', 'Cloudflare', 'DockerSwa...",As part of a growing team consisting of ML exp...,,2022-01-11,https://www.nodeflair.com//jobs/46151,nodeflair,,,,3,"[Terraform, DevOps Engineer, Python, Cloudflar...","{IaC, React, Container, Docker, Ansible, Prefe...",{Bachelor's},"{Relevant Field, Computer Science, Computer/So...",as part of a growing team consisting of ml exp...
11446,Software Engineer,Zoku Integrated Commerce,Permanent,"['API', 'Magento', 'CI', 'DOM', 'Node.js', 'No...",We are hiring software engineers with expertis...,,2022-01-11,https://www.nodeflair.com//jobs/46149,nodeflair,,,,3,"[Experience Codes, Magento, NodeJS, Bachelor’s...","{DOM, Node.js, React, TypeScript, API, ERP, ba...",{Bachelor's},"{Relevant Field, Computer Science}",we are hiring software engineers with expertis...


### Extracting Cloud Service Provider

In [61]:
combined_data['aws'] = 0
combined_data['google_cloud'] = 0
combined_data['azure'] = 0

combined_data.head(1)

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,...,lower_year,upper_year,extracted_tech_stack,all_tech_stack,education_level,education_field,cleaned_job_desc,aws,google_cloud,azure
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,['Banking/Accounting/Financial Services'],2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,"[Mockito, Cucumber, Citigroup Inc, Oracle, BI,...","{RabbitMQ, Tableau Dashboards, Bitbucket, SOAP...",{},{},job description job purpose we are look...,0,0,0


In [62]:
for i, row in combined_data.iterrows():
    job_desc = str(row['cleaned_job_desc'])
    if "aws" in job_desc or "amazon web services" in job_desc:
        combined_data.at[i,'aws'] = 1
    if "google cloud platform" in job_desc or "gcp" in job_desc:
        combined_data.at[i,'google_cloud'] = 1
    if "azure" in job_desc:
        combined_data.at[i,'azure'] = 1

In [63]:
combined_data["aws"].value_counts()

0    6207
1    5241
Name: aws, dtype: int64

In [64]:
combined_data["google_cloud"].value_counts()

0    10084
1     1364
Name: google_cloud, dtype: int64

In [65]:
combined_data["azure"].value_counts()

0    8664
1    2784
Name: azure, dtype: int64

### Assign Industry for each Job Posting

In [66]:
industries_scraped = pd.read_csv("../data/cleaned/Posts_Industry.csv", index_col=0)
industries_scraped.reset_index(inplace=True)
industries_scraped = industries_scraped[["company_name", "industry"]]
industries_scraped.drop_duplicates(inplace=True)
industries_scraped.value_counts()

company_name                    industry                                                     
 Hitachi Vantara                ['Database management company in Singapore']                     1
People Profilers Pte. Ltd.      ['Employment agency in Singapore']                               1
Pearl Care Singapore Pte. Ltd.  [None]                                                           1
Pegasystems Pte Ltd             ['Software company in Singapore']                                1
Pelago                          ['Leisure', ' Travel & Tourism']                                 1
                                                                                                ..
GREAT EASTERN                   ['Insurance', ' Financial Services', ' Banking', 'Insurance']    1
GOOD FOR FOOD PTE. LTD.         ['Event planner in Singapore']                                   1
GO-JEK                          ['Fund management company in Singapore']                         1
GIC            

In [67]:
def get_industry(text):
    if text == 'NaN':
        return np.nan
    
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace("'", '')
    mid_working = text.strip().split(',')
    to_be_returned = mid_working[0]

    return to_be_returned

def clean_str(text):
    if text == 'NaN':
        return np.nan
    to_be_returned = ""
    text = text.replace('[', '')
    text = text.replace(']', '')
    text = text.replace("'", '')

    text = text.rstrip()

    return text



def remove_locations(text): 
    text = text.replace('in Singapore', '')
    text = text.replace("in London", "")
    text = text.replace("in Pune", "")
    text = text.replace("in South Jakarta", "")
    text = text.replace("in Hong Kong", "")
    
    text = text.replace("company", "")
    text = text.replace("office", "")
    
    text = text.replace("IT/Computers - Software","IT")
    text = text.replace("Software","IT")
    
    
    
#     text = text.replace("Bank", "Banking/Accounting/Financial Services")
#     text = text[0:-1]
    
    return text

industries_scraped['cleaned_industry'] = industries_scraped['industry'].apply(get_industry)
industries_scraped['cleaned_industry'] = industries_scraped['cleaned_industry'].apply(remove_locations)
industries_scraped['cleaned_industry'] = industries_scraped['cleaned_industry'].apply(clean_str)
industries_scraped['cleaned_industry'].value_counts()

None                                 507
IT                                   294
Corporate                            145
Employment agency                     83
Information Technology & Services     70
                                    ... 
Travel agency                          1
Engineering services                   1
Entertainment/Media/Publishing         1
Marketing consultant                   1
Management consulting services         1
Name: cleaned_industry, Length: 277, dtype: int64

In [68]:
for i, row in industries_scraped.iterrows():
    company = row['company_name']
    combined_data['industry'].loc[combined_data['company_name'] == company] = str(row['cleaned_industry'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data['industry'].loc[combined_data['company_name'] == company] = str(row['cleaned_industry'])


In [70]:
combined_data.head(5)

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,...,lower_year,upper_year,extracted_tech_stack,all_tech_stack,education_level,education_field,cleaned_job_desc,aws,google_cloud,azure
0,Senior Python Developer,Citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,Banking/Accounting/Financial Services,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,"[Mockito, Cucumber, Citigroup Inc, Oracle, BI,...","{RabbitMQ, Tableau Dashboards, Bitbucket, SOAP...",{},{},job description job purpose we are look...,1,1,0
1,Senior Partner Solutions Architect (Microsoft),Amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,Internet/E-commerce,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,"[Identity, Active Directory, API's, Microsoft ...","{Queue, Containers, Azure, RFPs, Microsoft ELA...",{Master's},{},job description job summary description d...,1,1,1
2,IT Technician,Ascend Com Pte. Ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,Corporate,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,...,2.0,5.0,"[MaintenanceComputer Hardware, Office, 1st &, ...","{Computer Hardware, Switches, LAN, Firewall, N...",{Diploma},"{Relevant Field, Information Technology}",responsibilities provide helpdesk support fo...,0,0,0
3,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,Semi conductor supplier,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,...,2.0,5.0,"[Tier, SolvingRecognizes, IP, guidanceInterper...","{Knowledge Management, Problem Solving, BKM, R...",{},{},key responsibilities performs all standard se...,0,0,0
4,Customer Engineer,Applied Materials South East Asia Pte. Ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,Semi conductor supplier,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,...,2.0,5.0,"[Tier II, IP, Revs, BKM]","{Knowledge Management, Problem Solving, BKM, R...",{Bachelor's},{},has developed specialized skills or is multi s...,0,0,0


### Lowercase Job Type and Company Name

In [71]:
combined_data['job_type'] = combined_data['job_type'].str.lower()
combined_data['company_name'] = combined_data['company_name'].str.lower()

combined_data.head(5)

Unnamed: 0,job_title,company_name,job_type,tech_stack,job_desc,industry,date_posted,url,source,lower_salary,...,lower_year,upper_year,extracted_tech_stack,all_tech_stack,education_level,education_field,cleaned_job_desc,aws,google_cloud,azure
0,Senior Python Developer,citi,permanent,,Job Description :\r\n\r\nJob Purpose:\r\nWe ar...,Banking/Accounting/Financial Services,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,"[Mockito, Cucumber, Citigroup Inc, Oracle, BI,...","{RabbitMQ, Tableau Dashboards, Bitbucket, SOAP...",{},{},job description job purpose we are look...,1,1,0
1,Senior Partner Solutions Architect (Microsoft),amazon,permanent,,Job Description :\r\nJob summary\r\nDESCRIPTIO...,Internet/E-commerce,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,...,,,"[Identity, Active Directory, API's, Microsoft ...","{Queue, Containers, Azure, RFPs, Microsoft ELA...",{Master's},{},job description job summary description d...,1,1,1
2,IT Technician,ascend com pte. ltd.,permanent,"['Switches', 'Mac', 'Windows 10', 'Cloud Compu...",Responsibilities:\r\nProvide helpdesk support ...,Corporate,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,...,2.0,5.0,"[MaintenanceComputer Hardware, Office, 1st &, ...","{Computer Hardware, Switches, LAN, Firewall, N...",{Diploma},"{Relevant Field, Information Technology}",responsibilities provide helpdesk support fo...,0,0,0
3,Customer Engineer,applied materials south east asia pte. ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Key Responsibilities\r\nPerforms all standard ...,Semi conductor supplier,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,...,2.0,5.0,"[Tier, SolvingRecognizes, IP, guidanceInterper...","{Knowledge Management, Problem Solving, BKM, R...",{},{},key responsibilities performs all standard se...,0,0,0
4,Customer Engineer,applied materials south east asia pte. ltd.,permanent,"['Scalability', 'Cloud Computing', 'Google Clo...",Has developed specialized skills or is multi-s...,Semi conductor supplier,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,...,2.0,5.0,"[Tier II, IP, Revs, BKM]","{Knowledge Management, Problem Solving, BKM, R...",{Bachelor's},{},has developed specialized skills or is multi s...,0,0,0


### Add Clusters for each Job Posting

In [None]:
clustered_roles = pd.read_csv("../data/cleaned/unfiltered_df.csv", index_col=0)
clustered_roles = clustered_roles["Cluster"]
clustered_roles

0         Software Developer
1        Solutions Architect
2                     Others
3                     Others
4                     Others
                ...         
11443        Devops Engineer
11444        Devops Engineer
11445        Devops Engineer
11446      Software Engineer
11447                 Others
Name: Cluster, Length: 11448, dtype: object

In [72]:
combined_data["cluster"] = clustered_roles

In [82]:
filtered_clusters = combined_data[combined_data["cluster"] != "Others"]
filtered_clusters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5405 entries, 0 to 11446
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         5405 non-null   object 
 1   company_name      5405 non-null   object 
 2   job_type          5402 non-null   object 
 3   industry          5405 non-null   object 
 4   date_posted       5046 non-null   object 
 5   url               5405 non-null   object 
 6   source            5405 non-null   object 
 7   lower_salary      2874 non-null   float64
 8   upper_salary      2874 non-null   float64
 9   lower_year        5405 non-null   object 
 10  upper_year        5405 non-null   object 
 11  all_tech_stack    5405 non-null   object 
 12  education_level   5405 non-null   object 
 13  education_field   5405 non-null   object 
 14  cleaned_job_desc  5405 non-null   object 
 15  aws               5405 non-null   int64  
 16  google_cloud      5405 non-null   int64  

## Remove redundant columns

In [73]:
combined_data.columns

Index(['job_title', 'company_name', 'job_type', 'tech_stack', 'job_desc',
       'industry', 'date_posted', 'url', 'source', 'lower_salary',
       'upper_salary', 'lower_year', 'upper_year', 'extracted_tech_stack',
       'all_tech_stack', 'education_level', 'education_field',
       'cleaned_job_desc', 'aws', 'google_cloud', 'azure', 'cluster'],
      dtype='object')

In [76]:
combined_data.drop(columns=["tech_stack", "extracted_tech_stack", "job_desc"], inplace=True)
combined_data.head(5)

Unnamed: 0,job_title,company_name,job_type,industry,date_posted,url,source,lower_salary,upper_salary,lower_year,upper_year,all_tech_stack,education_level,education_field,cleaned_job_desc,aws,google_cloud,azure,cluster
0,Senior Python Developer,citi,permanent,Banking/Accounting/Financial Services,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{RabbitMQ, Tableau Dashboards, Bitbucket, SOAP...",{},{},job description job purpose we are look...,1,1,0,Software Developer
1,Senior Partner Solutions Architect (Microsoft),amazon,permanent,Internet/E-commerce,2022-02-18,https://www.monster.com.sg/seeker/job-details?...,monster,,,,,"{Queue, Containers, Azure, RFPs, Microsoft ELA...",{Master's},{},job description job summary description d...,1,1,1,Solutions Architect
2,IT Technician,ascend com pte. ltd.,permanent,Corporate,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,24000.0,36000.0,2.0,5.0,"{Computer Hardware, Switches, LAN, Firewall, N...",{Diploma},"{Relevant Field, Information Technology}",responsibilities provide helpdesk support fo...,0,0,0,Others
3,Customer Engineer,applied materials south east asia pte. ltd.,permanent,Semi conductor supplier,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,42000.0,84000.0,2.0,5.0,"{Knowledge Management, Problem Solving, BKM, R...",{},{},key responsibilities performs all standard se...,0,0,0,Others
4,Customer Engineer,applied materials south east asia pte. ltd.,permanent,Semi conductor supplier,2022-02-19,https://www.monster.com.sg/seeker/job-details?...,monster,46800.0,93600.0,2.0,5.0,"{Knowledge Management, Problem Solving, BKM, R...",{Bachelor's},{},has developed specialized skills or is multi s...,0,0,0,Others


In [80]:
combined_data.to_json("../data/cleaned/combined_data.json", orient="columns")

In [None]:
filtered_clusters.to_json("../data/cleaned/filtered_data.json", orient="columns")