In [None]:
import pandas as pd
import os 
import numpy as np
import re

In [None]:
# os.chdir() returns None, so record the resulting working directory explicitly
# instead of storing the call's return value.
# The guard keeps this cell re-runnable: once the kernel is already inside
# MonsterSg, the relative path "../data/MonsterSg" would typically no longer resolve.
if os.path.basename(os.getcwd()) != "MonsterSg":
    os.chdir("../data/MonsterSg")
directory = os.getcwd()
files = os.listdir(".")
files

In [None]:
# Keep only the files whose name contains 'Cloud_MonsterSg'.
# os.listdir() returns names in arbitrary order, so sort them to make the
# concatenation order (and therefore which row survives a later
# drop_duplicates) reproducible across machines and runs.
relevant_csv = sorted(name for name in files if 'Cloud_MonsterSg' in name)
relevant_csv

In [None]:
# Read each selected CSV and stack the frames row-wise under one fresh index
df = pd.concat(
    [pd.read_csv(csv_name) for csv_name in relevant_csv],
    ignore_index=True,
    axis=0,
)
df

In [None]:
# Show the column labels of the combined frame
df.columns

# Drop NA and duplicates

In [None]:
'''
As Monster may direct users to the job posting in the company's website, the website format is different from Monster. 
Hence, data could not be scrapped for those job posting.
'''

# Drop job postings that are not from Monster, i.e. rows in which every one of
# the scraped fields below is missing.
# `subset` has to be a flat list of column labels, and `how` has to be the
# string 'all' (the builtin function `all` is not a valid option).
df.dropna(
    subset=['salary', 'job_type', 'years_experience', 'tech_stack', 'job_description',
            'industry', 'function', 'roles', 'last_updated', 'posted_on'],
    how='all',
    inplace=True,
)
df

In [None]:
# Print the frame summary (dtypes and non-null counts per column)
df.info()

In [None]:
# Rows whose job_description is missing
df.loc[df['job_description'].isna()]

In [None]:
'''
Companies may post the same job a few times over a period of time
'''

# Treat rows sharing both job_title and company as the same posting and keep
# only the first occurrence of each pair (modifies df in place).
df.drop_duplicates(
    subset=['job_title', 'company'],
    keep='first',
    inplace=True,
)
df

# Salary
Convert to list(int) --> [lower, upper]

In [None]:
# Check different types of data

# df['salary'].unique()

In [None]:
def clean_salary(salary):
    """Split a raw salary string into the parts of its range.

    Args:
        salary (str): raw salary text. The caller stringifies values first,
            so a missing value arrives as the text 'nan'.

    Returns:
        float | list[str]: np.nan for 'Not Specified' or 'nan'; otherwise the
        text with ASCII letters removed, outer whitespace trimmed, and split
        on '-'.
        NOTE(review): the parts stay strings (not int) and keep any
        whitespace that surrounded the '-'.
    """
    if salary in ('Not Specified', 'nan'):
        return np.nan

    # Remove ASCII letters, then split what is left on '-'
    without_letters = re.sub("[a-zA-Z]", "", salary)
    return without_letters.strip().split('-')

In [None]:
# Stringify every value first (a missing value becomes 'nan'), then parse it
df['salary'] = df['salary'].map(str).apply(clean_salary)
df

# Years of Experience
Convert to list(int) --> [lower, upper]

In [None]:
# Check different types of data

# df['years_experience'].unique()

In [None]:
def clean_yr_exp(yr_exp):
    """Split a raw years-of-experience string into the parts of its range.

    Args:
        yr_exp (str): raw experience text. The caller stringifies values
            first, so a missing value arrives as the text 'nan'.

    Returns:
        float | list: np.nan for 'Not Specified' or 'nan'; [0, 0] for
        'Fresher'; otherwise the first whitespace-separated token split
        on '-'.
        NOTE(review): the 'Fresher' result holds ints while every other
        result holds strings.
    """
    if yr_exp in ('Not Specified', 'nan'):
        return np.nan
    if yr_exp == 'Fresher':
        return [0, 0]

    # Only the first token is used; anything after it is ignored
    first_token = yr_exp.split()[0]
    return first_token.strip().split('-')

In [None]:
# Stringify every value first (a missing value becomes 'nan'), then parse it
df['years_experience'] = df['years_experience'].map(str).apply(clean_yr_exp)
df

# Tech Stack
Convert to list

In [None]:
# Check different types of data

# df['tech_stack'][0]

In [None]:
def clean_tech_stack(tech_stack):
    """Turn a raw tech-stack string into a list of entries.

    Args:
        tech_stack (str): raw text. The caller stringifies values first,
            so a missing value arrives as the text 'nan'.

    Returns:
        float | list[str]: np.nan for 'nan'; otherwise the text split on
        two consecutive spaces.
    """
    # Entries are split on a double space, not a single one
    return np.nan if tech_stack == 'nan' else tech_stack.split('  ')

In [None]:
# Stringify every value first (a missing value becomes 'nan'), then split it
df['tech_stack'] = df['tech_stack'].map(str).apply(clean_tech_stack)
df

# Industry & functions
Convert to list

In [None]:
# df['industry'].unique()

In [None]:
# df['function'].unique()

In [None]:
def str_to_list(text):
    """Split a ' , '-separated string into a list.

    Args:
        text (str): raw text. The callers stringify values first, so a
            missing value arrives as the text 'nan'.

    Returns:
        float | list[str]: np.nan for 'nan'; otherwise the text split on
        the exact separator ' , ' (space, comma, space).
    """
    return np.nan if text == 'nan' else text.split(' , ')

In [None]:
# Stringify every value first (a missing value becomes 'nan'), then split it
df['industry'] = df['industry'].map(str).apply(str_to_list)
df

In [None]:
# Stringify every value first (a missing value becomes 'nan'), then split it
df['function'] = df['function'].map(str).apply(str_to_list)
df

# Roles
Convert to list

In [None]:
# df['roles'].unique()

In [None]:
def clean_role(role):
    """Turn a raw roles string into a list, rejecting double-spaced text.

    Args:
        role (str): raw roles text, already stringified by the caller.

    Returns:
        float | list[str]: np.nan when the text contains two consecutive
        spaces; otherwise whatever str_to_list returns for it.
    """
    if '  ' not in role:
        return str_to_list(role)
    # NOTE(review): a double space presumably marks a value that is not a
    # real roles list — confirm against the scraped data.
    return np.nan

In [None]:
# Stringify every value first (a missing value becomes 'nan'), then parse it
df['roles'] = df['roles'].map(str).apply(clean_role)
df

# Date Posted
Extract date posted

In [None]:
# List the distinct values that occur in posted_on
df['posted_on'].unique()

In [None]:
# Frequency of each last_updated value among postings labelled '30+ days ago'
df.loc[df['posted_on'] == '30+ days ago', 'last_updated'].value_counts()

In [None]:
# Postings labelled '30+ days ago' that also have no last_updated value
df[df['posted_on'].eq('30+ days ago') & df['last_updated'].isna()]

In [None]:
# Boolean mask: True where last_updated is missing
df['last_updated'].isna()

In [None]:
def getDatePosted(text, date_of_scraping):
    """Convert a relative "posted" label into an absolute date string.

    Args:
        text (str): relative age of the posting, e.g. '5 hours ago',
            '3 days ago', '30+ days ago' or '2 months ago'.
        date_of_scraping (str): date in format 'YYYY-MM-DD'

    Returns:
        str: date in format 'YYYY-MM-DD' after subtracting the stated period,
        or '' when text is not a string (e.g. NaN) or mentions none of
        'hours', 'days' or 'months'.

    Raises:
        ValueError: if date_of_scraping is not in 'YYYY-MM-DD' format, or if
            a days/months label contains no number.
    """
    # Imported locally so the function does not rely on imports made in other cells.
    from datetime import datetime, timedelta

    def _count(label):
        # First run of digits in the label, so '30+ days ago' yields 30.
        found = re.search(r"\d+", label)
        if found is None:
            raise ValueError(f"no number found in {label!r}")
        return int(found.group())

    # First convert the scraping date to a datetime
    scraped_on = datetime.strptime(date_of_scraping, '%Y-%m-%d')

    # Missing values (NaN / None) carry no age information
    if not isinstance(text, str):
        return ''

    if "hours" in text:
        # Hour-level ages are always mapped to the day before the day of scraping
        posted = scraped_on - timedelta(days=1)
    elif "days" in text:
        posted = scraped_on - timedelta(days=_count(text))
    elif "months" in text:
        # Calendar-month subtraction; the day is clamped to the end of a shorter month
        posted = pd.Timestamp(scraped_on) - pd.DateOffset(months=_count(text))
    else:
        return ''

    return posted.strftime('%Y-%m-%d')

In [None]:
# If last_updated is available, use it; otherwise derive the date with getDatePosted