In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import warnings
warnings.filterwarnings("ignore")

# One-time NLTK resource downloads, disabled by wrapping the calls in a bare
# string literal (the string is evaluated and discarded; nothing is fetched).
# NOTE(review): presumably these resources back the tokenizer, stop-word and
# lemmatizer calls used further down -- run the calls once on a fresh environment.
'''
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
'''

# Load dataset
# Input CSV locations, relative to the notebook's working directory.
job_path = "jobstreet_all_job_dataset.csv"
admindiv_path = "daerah-working-set.csv"
loc_adm_path = "location_admindiv_mapping.csv"

# Read the three inputs in order: job postings, the admin-division working set,
# and the location -> admin-division lookup.
# NOTE(review): admindiv_df is not referenced again in the visible cells.
jobs_df, admindiv_df, admindiv_map = (
    pd.read_csv(csv_path) for csv_path in (job_path, admindiv_path, loc_adm_path)
)

print("Initial shape:", jobs_df.shape)
jobs_df.head()

Initial shape: (69024, 11)


Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate
0,74630583.0,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z
1,74660602.0,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z
2,74655679.0,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z
3,74657624.0,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Petaling,Engineering,Electrical/Electronic Engineering,services-engineer,Full time,"RM 3,000 – RM 3,500 per month",2024-03-22T05:32:09Z
4,74679363.0,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Hulu Langat,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Full time,"RM 2,800 – RM 3,500 per month",2024-03-23T03:56:39Z


# Location cleaning

In [2]:
# The State is whatever precedes the first comma in admindiv_match
admindiv_match_parts = admindiv_map['admindiv_match'].str.split(',')
admindiv_map['State'] = admindiv_match_parts.str.get(0)

In [3]:
# Drop the admindiv_match col.
# errors='ignore' keeps this cell idempotent: running it again after the column
# is already gone is a no-op instead of a KeyError.
admindiv_map = admindiv_map.drop(columns='admindiv_match', errors='ignore')

In [4]:
# Map this back to the jobs_df location field. Use the State field for subsequent location related analysis.
# A left join multiplies job rows whenever a location occurs more than once in the
# lookup, so keep a single lookup row per location (the first one wins) and let
# pandas verify the many-to-one relationship. The job row count is therefore preserved.
location_lookup = admindiv_map.drop_duplicates(subset='location')
jobs_df_merge = jobs_df.merge(location_lookup, how='left', on='location', validate='many_to_one')

In [5]:
# Preview the merged frame to check the State column that the merge attached
jobs_df_merge.head()

Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate,State
0,74630583.0,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z,Negeri Sembilan
1,74660602.0,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z,Selangor
2,74655679.0,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z,Selangor
3,74657624.0,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Petaling,Engineering,Electrical/Electronic Engineering,services-engineer,Full time,"RM 3,000 – RM 3,500 per month",2024-03-22T05:32:09Z,Selangor
4,74679363.0,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Hulu Langat,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Full time,"RM 2,800 – RM 3,500 per month",2024-03-23T03:56:39Z,Selangor


# Job role cleaning

In [6]:
def _tidy_role_slug(raw_role):
    """Turn a raw role slug into space-separated words.

    The value is stringified first, so a missing role becomes the text "nan".
    """
    slug = str(raw_role)
    slug = re.sub(r'_\d+', '', slug)    # strip "_<digits>" runs first
    slug = re.sub(r'[,_]', '', slug)    # then any remaining underscores and commas
    return re.sub(r'\-+', ' ', slug)    # each run of dashes becomes a single space


jobs_df_merge["role_clean"] = jobs_df_merge["role"].map(_tidy_role_slug)

In [7]:
# Title-case every cleaned role (the values are plain strings at this point)
jobs_df_merge["role_clean"] = jobs_df_merge["role_clean"].map(str.title)

In [8]:
# Collect the distinct cleaned roles into a one-column frame for inspection.
# NOTE(review): role_clean is not referenced again in the visible cells and the
# CSV export below is commented out.
role_clean = pd.DataFrame(jobs_df_merge['role_clean'].unique())
#role_clean.to_csv('role_clean.csv', index=False)

In [9]:
# Preview the frame to check the new role_clean column
jobs_df_merge.head()

Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate,State,role_clean
0,74630583.0,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z,Negeri Sembilan,Procurement Executive
1,74660602.0,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z,Selangor,Executive Assistant
2,74655679.0,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z,Selangor,Asset Management Analyst
3,74657624.0,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Petaling,Engineering,Electrical/Electronic Engineering,services-engineer,Full time,"RM 3,000 – RM 3,500 per month",2024-03-22T05:32:09Z,Selangor,Services Engineer
4,74679363.0,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Hulu Langat,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Full time,"RM 2,800 – RM 3,500 per month",2024-03-23T03:56:39Z,Selangor,Purchasing Executive


# Job type cleaning

In [10]:
# Strip double quotes, single quotes and square brackets from the raw job type.
# The value is stringified first, so a missing type would become the text "nan".
_TYPE_NOISE_RE = re.compile(r'["\'\[\]]')
jobs_df_merge["type_clean"] = jobs_df_merge["type"].map(
    lambda raw_type: _TYPE_NOISE_RE.sub('', str(raw_type))
)

In [11]:
# List the distinct cleaned job types to eyeball the result of the cleaning step
pd.DataFrame(jobs_df_merge['type_clean'].unique())

Unnamed: 0,0
0,Contract/Temp
1,Full time
2,Casual/Vacation
3,Part time
4,"Contract/Temp, Full time"
5,"Full time, Part time"
6,"Contract/Temp, Full time, Part time"


In [12]:
# Preview the frame to check the new type_clean column
jobs_df_merge.head()

Unnamed: 0,job_id,job_title,company,descriptions,location,category,subcategory,role,type,salary,listingDate,State,role_clean,type_clean
0,74630583.0,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",procurement-executive,Contract/Temp,,2024-03-21T05:58:35Z,Negeri Sembilan,Procurement Executive,Contract/Temp
1,74660602.0,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Petaling,Accounting,Bookkeeping & Small Practice Accounting,executive-assistant,Full time,"RM 2,800 – RM 3,200 per month",2024-03-22T06:52:57Z,Selangor,Executive Assistant,Full time
2,74655679.0,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Klang District,"Manufacturing, Transport & Logistics",Analysis & Reporting,asset-management-analyst,Full time,,2024-03-22T04:22:43Z,Selangor,Asset Management Analyst,Full time
3,74657624.0,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Petaling,Engineering,Electrical/Electronic Engineering,services-engineer,Full time,"RM 3,000 – RM 3,500 per month",2024-03-22T05:32:09Z,Selangor,Services Engineer,Full time
4,74679363.0,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Hulu Langat,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",purchasing-executive,Full time,"RM 2,800 – RM 3,500 per month",2024-03-23T03:56:39Z,Selangor,Purchasing Executive,Full time


# Overall cleaning

In [13]:
# Keep only the columns used downstream. Take an explicit copy so that the column
# assignments made in later cells write to an independent frame rather than to a
# slice of jobs_df_merge (the situation SettingWithCopyWarning reports).
df = jobs_df_merge[['job_id','job_title', 'company', 'descriptions', 'State', 'category', 
                   'subcategory', 'role_clean', 'type_clean','salary']].copy()

In [14]:
# Check the shape of the trimmed frame and preview its first rows
print(df.shape)
df.head()

(69024, 10)


Unnamed: 0,job_id,job_title,company,descriptions,State,category,subcategory,role_clean,type_clean,salary
0,74630583.0,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Procurement Executive,Contract/Temp,
1,74660602.0,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Selangor,Accounting,Bookkeeping & Small Practice Accounting,Executive Assistant,Full time,"RM 2,800 – RM 3,200 per month"
2,74655679.0,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Selangor,"Manufacturing, Transport & Logistics",Analysis & Reporting,Asset Management Analyst,Full time,
3,74657624.0,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Selangor,Engineering,Electrical/Electronic Engineering,Services Engineer,Full time,"RM 3,000 – RM 3,500 per month"
4,74679363.0,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Selangor,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Purchasing Executive,Full time,"RM 2,800 – RM 3,500 per month"


## Combining all text columns

In [15]:
# Column names folded into the combined "text" field (the function below carries
# the same list as its default argument).
cols = ['job_title', 'company', 'descriptions', 'State', 'category', 'subcategory', 'role_clean', 'type_clean']


def combine_row_values(row, cols=['job_title', 'company', 'descriptions', 'State', 'category', 'subcategory', 'role_clean', 'type_clean']):
    """Render the chosen fields of ``row`` as a single "name: value" string.

    Args:
        row: Record indexable by field name (e.g. a DataFrame row).
        cols: Names of the fields to include, in output order.

    Returns:
        The "name: value" fragments joined with ". "; an empty string when
        ``cols`` is empty.
    """
    return ". ".join(f"{field}: {row[field]}" for field in cols)

# Combine text in these cols to one string
df['text'] = df.apply(combine_row_values, axis=1)

# Convert job_id column to string. Remove only a literal trailing ".0" (the float
# rendering of an integer id) instead of blindly slicing off the last two
# characters: re-running this cell no longer eats real digits, and a missing id
# stays "nan" rather than being truncated to "n".
df['job_id'] = df['job_id'].astype(str).str.replace(r'\.0$', '', regex=True)
df.head()

Unnamed: 0,job_id,job_title,company,descriptions,State,category,subcategory,role_clean,type_clean,salary,text
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Procurement Executive,Contract/Temp,,job_title: Procurement Executive (Contract). c...
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Selangor,Accounting,Bookkeeping & Small Practice Accounting,Executive Assistant,Full time,"RM 2,800 – RM 3,200 per month",job_title: Account Executive/ Assistant. compa...
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Selangor,"Manufacturing, Transport & Logistics",Analysis & Reporting,Asset Management Analyst,Full time,,"job_title: Data Analyst - Asset Management, SP..."
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Selangor,Engineering,Electrical/Electronic Engineering,Services Engineer,Full time,"RM 3,000 – RM 3,500 per month",job_title: Service Engineer. company: Sun Medi...
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Selangor,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Purchasing Executive,Full time,"RM 2,800 – RM 3,500 per month",job_title: Purchasing Executive. company: Magn...


## Combining title + description

In [16]:
# Combine job title and description into one field (missing parts count as empty text)
df["title+desc"] = df["job_title"].fillna('') + " " + df["descriptions"].fillna('')

# Drop rows whose combined text is blank. The explicit copy makes the filtered
# frame independent, so the columns added in later cells are not written to a slice.
df = df[df["title+desc"].str.strip().astype(bool)].copy()

df.head()

Unnamed: 0,job_id,job_title,company,descriptions,State,category,subcategory,role_clean,type_clean,salary,text,title+desc
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Procurement Executive,Contract/Temp,,job_title: Procurement Executive (Contract). c...,Procurement Executive (Contract) Position Purp...
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Selangor,Accounting,Bookkeeping & Small Practice Accounting,Executive Assistant,Full time,"RM 2,800 – RM 3,200 per month",job_title: Account Executive/ Assistant. compa...,Account Executive/ Assistant We are looking fo...
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Selangor,"Manufacturing, Transport & Logistics",Analysis & Reporting,Asset Management Analyst,Full time,,"job_title: Data Analyst - Asset Management, SP...","Data Analyst - Asset Management, SPX Express P..."
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Selangor,Engineering,Electrical/Electronic Engineering,Services Engineer,Full time,"RM 3,000 – RM 3,500 per month",job_title: Service Engineer. company: Sun Medi...,Service Engineer You are important for trouble...
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Selangor,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Purchasing Executive,Full time,"RM 2,800 – RM 3,500 per month",job_title: Purchasing Executive. company: Magn...,Purchasing Executive MAG is a trailblazer in t...


In [17]:
def normalize_text(text):
    """Reduce ``text`` to lowercase ASCII letters/digits separated by single spaces.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased text with tags, newline markers and every other character
        outside ``A-Za-z0-9`` turned into whitespace, whitespace runs collapsed to
        one space, and leading/trailing whitespace removed.
    """
    # Applied in this order; every match is replaced by one space.
    space_out_patterns = (
        r'<.*?>',             # HTML tags
        r'\\n|\n',            # literal "\n" sequences and real newlines
        r'[^A-Za-z0-9\s]',    # anything that is not a letter, digit or whitespace
    )
    for pattern in space_out_patterns:
        text = re.sub(pattern, ' ', text)
    return re.sub(r'\s+', ' ', text.lower()).strip()

# Normalise both combined-text columns, storing each result under a "clean_" name
for clean_col, raw_col in (("clean_title+desc", "title+desc"), ("clean_text", "text")):
    df[clean_col] = df[raw_col].apply(normalize_text)
df.head()


Unnamed: 0,job_id,job_title,company,descriptions,State,category,subcategory,role_clean,type_clean,salary,text,title+desc,clean_title+desc,clean_text
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Procurement Executive,Contract/Temp,,job_title: Procurement Executive (Contract). c...,Procurement Executive (Contract) Position Purp...,procurement executive contract position purpos...,job title procurement executive contract compa...
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Selangor,Accounting,Bookkeeping & Small Practice Accounting,Executive Assistant,Full time,"RM 2,800 – RM 3,200 per month",job_title: Account Executive/ Assistant. compa...,Account Executive/ Assistant We are looking fo...,account executive assistant we are looking for...,job title account executive assistant company ...
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Selangor,"Manufacturing, Transport & Logistics",Analysis & Reporting,Asset Management Analyst,Full time,,"job_title: Data Analyst - Asset Management, SP...","Data Analyst - Asset Management, SPX Express P...",data analyst asset management spx express perf...,job title data analyst asset management spx ex...
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Selangor,Engineering,Electrical/Electronic Engineering,Services Engineer,Full time,"RM 3,000 – RM 3,500 per month",job_title: Service Engineer. company: Sun Medi...,Service Engineer You are important for trouble...,service engineer you are important for trouble...,job title service engineer company sun medical...
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Selangor,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Purchasing Executive,Full time,"RM 2,800 – RM 3,500 per month",job_title: Purchasing Executive. company: Magn...,Purchasing Executive MAG is a trailblazer in t...,purchasing executive mag is a trailblazer in t...,job title purchasing executive company magnet ...


In [18]:
# Extra words to drop on top of the language stop-word lists.
# NOTE(review): "self-motivated" can never match a token that preprocess_nltk
# keeps, because that function only keeps tokens for which str.isalpha() is true.
custom_stop = {
    "experience", "responsibilities", "requirements", "requirement", "knowledge",
    "skill", "skills", "advantage", "preferred", "strong", "good", "excellent",
    "degree", "diploma", "bachelor", "graduate", "title",
    "independent", "self-motivated", "hardworking",
    "deadline", "pressure", "benefits", "apply", "immediately",
    "writing", "spoken", "etc", "others", "job", "work", "company",
    "candidate", "menu", "new", "plus", "years", "revenue",
    "ensure", "provide", "including", "malaysia",
}

# English stop words + the custom list + Indonesian stop words.
# No Malay stop-word list was found, so Indonesian is used as a relatively close substitute.
stop_words = set(stopwords.words("english")).union(
    custom_stop,
    stopwords.words("indonesian"),
)
lemmatizer = WordNetLemmatizer()

def preprocess_nltk(text):
    """Tokenize ``text``, drop stop words and non-alphabetic tokens, then lemmatize.

    Args:
        text: The string to process. Stop-word matching is an exact,
            case-sensitive set lookup, so the text should already be lowercased.

    Returns:
        The lemmas of the surviving tokens joined by single spaces.
    """
    # word_tokenize() already splits its input into sentences internally, so the
    # former explicit sent_tokenize() pass only repeated that work per document.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        # The stop-word test is applied to the surface form, before lemmatization.
        if token.isalpha() and token not in stop_words
    ]
    return " ".join(lemmas)
# Run the NLTK pipeline over both normalised columns, storing each result under
# a "processed_" name
for processed_col, clean_col in (
    ("processed_title+desc", "clean_title+desc"),
    ("processed_text", "clean_text"),
):
    df[processed_col] = df[clean_col].apply(preprocess_nltk)
print(df.shape)
df.head()

(69024, 16)


Unnamed: 0,job_id,job_title,company,descriptions,State,category,subcategory,role_clean,type_clean,salary,text,title+desc,clean_title+desc,clean_text,processed_title+desc,processed_text
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Procurement Executive,Contract/Temp,,job_title: Procurement Executive (Contract). c...,Procurement Executive (Contract) Position Purp...,procurement executive contract position purpos...,job title procurement executive contract compa...,procurement executive contract position purpos...,procurement executive contract coca cola bottl...
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Selangor,Accounting,Bookkeeping & Small Practice Accounting,Executive Assistant,Full time,"RM 2,800 – RM 3,200 per month",job_title: Account Executive/ Assistant. compa...,Account Executive/ Assistant We are looking fo...,account executive assistant we are looking for...,job title account executive assistant company ...,account executive assistant looking account ex...,account executive assistant acoustic lighting ...
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Selangor,"Manufacturing, Transport & Logistics",Analysis & Reporting,Asset Management Analyst,Full time,,"job_title: Data Analyst - Asset Management, SP...","Data Analyst - Asset Management, SPX Express P...",data analyst asset management spx express perf...,job title data analyst asset management spx ex...,data analyst asset management spx express perf...,data analyst asset management spx express shop...
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Selangor,Engineering,Electrical/Electronic Engineering,Services Engineer,Full time,"RM 3,000 – RM 3,500 per month",job_title: Service Engineer. company: Sun Medi...,Service Engineer You are important for trouble...,service engineer you are important for trouble...,job title service engineer company sun medical...,service engineer important troubleshooting ins...,service engineer sun medical system sdn bhd de...
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Selangor,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Purchasing Executive,Full time,"RM 2,800 – RM 3,500 per month",job_title: Purchasing Executive. company: Magn...,Purchasing Executive MAG is a trailblazer in t...,purchasing executive mag is a trailblazer in t...,job title purchasing executive company magnet ...,purchasing executive mag trailblazer industry ...,purchasing executive magnet security automatio...


In [19]:
# Report how many distinct categories/subcategories exist and list the
# (up to) 30 most frequent categories.
category_col = df["category"]
print("Unique categories:", category_col.nunique())
print("Unique subcategories:", df["subcategory"].nunique())
top_categories = category_col.value_counts().head(30)
print("\nCategories:\n", top_categories)

Unique categories: 30
Unique subcategories: 310

Categories:
 category
Accounting                                11308
Engineering                                8978
Information & Communication Technology     8675
Administration & Office Support            7101
Sales                                      5715
Manufacturing, Transport & Logistics       5388
Human Resources & Recruitment              4055
Marketing & Communications                 3004
Banking & Financial Services               2641
Call Centre & Customer Service             1792
Construction                               1621
Retail & Consumer Products                 1504
Hospitality & Tourism                       974
Healthcare & Medical                        910
Design & Architecture                       879
Education & Training                        810
Real Estate & Property                      705
Trades & Services                           600
Science & Technology                        551
Advertising, Arts

In [20]:
# Mapping from old categories to new merged ones, written as
# merged category -> the original categories folded into it.
_merged_category_groups = {
    "Accounting & Finance": [
        "Accounting",
        "Banking & Financial Services",
    ],
    "Engineering & Technology": [
        "Engineering",
        "Information & Communication Technology",
        "Science & Technology",
        "Mining, Resources & Energy",
    ],
    "Manufacturing & Logistics": [
        "Manufacturing, Transport & Logistics",
    ],
    "Administration": [
        "Administration & Office Support",
    ],
    "Sales & Marketing": [
        "Sales",
        "Marketing & Communications",
    ],
    "Creative & Design": [
        "Advertising, Arts & Media",
        "Design & Architecture",
    ],
    "Hospitality & Services": [
        "Hospitality & Tourism",
        "Retail & Consumer Products",
        "Call Centre & Customer Service",
    ],
    "Healthcare": [
        "Healthcare & Medical",
    ],
    "Human Resources": [
        "Human Resources & Recruitment",
    ],
    "Legal & Compliance": [
        "Legal",
    ],
    "Management & Strategy": [
        "CEO & General Management",
        "Consulting & Strategy",
    ],
    "Construction & Trades": [
        "Construction",
        "Trades & Services",
        "Real Estate & Property",
    ],
    "Education": [
        "Education & Training",
    ],
    "Social & Community / Other": [
        "Farming, Animals & Conservation",
        "Sport & Recreation",
        "Community Services & Development",
        "Government & Defence",
        "Self Employment",
        "Insurance & Superannuation",
    ],
}

# Flatten the groups into the {old category: merged category} lookup
category_mapping = {
    original: merged
    for merged, originals in _merged_category_groups.items()
    for original in originals
}

# Create the new column based on the mapping.
# Series.map yields NaN for any category that has no entry in category_mapping.
df["merged_category"] = df["category"].map(category_mapping)


In [21]:
# Report the number of merged categories and their (up to 30) largest counts
merged_col = df["merged_category"]
print("Unique merged categories:", merged_col.nunique())
merged_counts = merged_col.value_counts().head(30)
print("\nNEW Categories:\n", merged_counts)

Unique merged categories: 14

NEW Categories:
 merged_category
Engineering & Technology      18331
Accounting & Finance          13949
Sales & Marketing              8719
Administration                 7101
Manufacturing & Logistics      5388
Hospitality & Services         4270
Human Resources                4055
Construction & Trades          2926
Creative & Design              1295
Healthcare                      910
Education                       810
Social & Community / Other      461
Management & Strategy           435
Legal & Compliance              374
Name: count, dtype: int64


In [22]:
# Shape before deduplication, for comparison with the next cell
df.shape

(69024, 17)

In [23]:
# Keep only the first row for each distinct processed title+description text
is_repeat = df.duplicated(subset=['processed_title+desc'], keep="first")
df = df[~is_repeat]
print("After deduplication:", df.shape)

After deduplication: (65077, 17)


In [24]:
# List any original categories that did not receive a merged category
missing_merged = df["merged_category"].isna()
unmapped = df.loc[missing_merged]
print(unmapped["category"].unique())

[]


In [25]:
# Columns carried into the cleaned output, in output order
export_columns = [
    'job_id', 'job_title', 'company', 'descriptions', 'State',
    'merged_category', 'category', 'subcategory', 'role_clean', 'type_clean',
    'salary', 'processed_title+desc', 'processed_text',
]
data_cleaned = df[export_columns]

In [26]:
# Check the shape of the export frame and preview its first rows
print(data_cleaned.shape)
data_cleaned.head()

(65077, 13)


Unnamed: 0,job_id,job_title,company,descriptions,State,merged_category,category,subcategory,role_clean,type_clean,salary,processed_title+desc,processed_text
0,74630583,Procurement Executive (Contract),Coca-Cola Bottlers (Malaysia) Sdn Bhd,Position Purpose\nManage aspects of procuremen...,Negeri Sembilan,Manufacturing & Logistics,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Procurement Executive,Contract/Temp,,procurement executive contract position purpos...,procurement executive contract coca cola bottl...
1,74660602,Account Executive/ Assistant,Acoustic & Lighting System Sdn Bhd,We are looking for a Account Executive/ Assist...,Selangor,Accounting & Finance,Accounting,Bookkeeping & Small Practice Accounting,Executive Assistant,Full time,"RM 2,800 – RM 3,200 per month",account executive assistant looking account ex...,account executive assistant acoustic lighting ...
2,74655679,"Data Analyst - Asset Management, SPX Express",Shopee Mobile Malaysia Sdn Bhd,Performs detailed data analysis on existing sp...,Selangor,Manufacturing & Logistics,"Manufacturing, Transport & Logistics",Analysis & Reporting,Asset Management Analyst,Full time,,data analyst asset management spx express perf...,data analyst asset management spx express shop...
3,74657624,Service Engineer,Sun Medical Systems Sdn Bhd,"You are important for troubleshooting, install...",Selangor,Engineering & Technology,Engineering,Electrical/Electronic Engineering,Services Engineer,Full time,"RM 3,000 – RM 3,500 per month",service engineer important troubleshooting ins...,service engineer sun medical system sdn bhd de...
4,74679363,Purchasing Executive,Magnet Security & Automation Sdn. Bhd.,"MAG is a trailblazer in the industry, boasting...",Selangor,Manufacturing & Logistics,"Manufacturing, Transport & Logistics","Purchasing, Procurement & Inventory",Purchasing Executive,Full time,"RM 2,800 – RM 3,500 per month",purchasing executive mag trailblazer industry ...,purchasing executive magnet security automatio...


In [27]:
# Write the cleaned dataset to disk. index=False leaves out the row index; the
# 'utf-8-sig' codec prefixes the file with a UTF-8 byte-order mark.
data_cleaned.to_csv('data_cleaned_1.csv', index=False, encoding='utf-8-sig')

In [28]:
# Count rows whose processed title+description repeats an earlier row
df.duplicated(subset=['processed_title+desc']).sum()

0