## library import

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import re

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30
pd.options.display.max_colwidth = None

## rename columns

In [2]:
def renameCol(df):
    return df.rename({
                '字段1': 'column1', 
                '字段1_link': 'column1_link',
                '字段2': 'column2', 
                '字段3': 'column3',
                '字段4': 'column4', 
                '字段5': 'column5'
            }, axis=1, inplace=True)

## Clean link

In [3]:
def cleanCol1_link(df, col_edit, col_add):
    df[col_add] = df[col_edit].apply(lambda row: re.search(r'=(\w+)#', row).group(1))
#     df.drop(col_drop, axis=1, inplace=True)
    return df

## Clean address and benefits

In [4]:
def address_area(address):
    lst_address = [x.strip() for x in address.split(":")][1]
    lst_area = re.findall('[A-Z][^A-Z]*', lst_address)
    area = ""
    for i in range(len(lst_area)//2):
        area += lst_area[i]
    return area

In [5]:
def address_suburb(address):
    lst_address = [x.strip() for x in address.split(":")]
    if len(lst_address) == 3:
        lst_suburb = re.findall('[A-Z][^A-Z]*', lst_address[2])
        suburb = ""
        for i in range(len(lst_suburb)//2):
            suburb += lst_suburb[i]
        return suburb

In [6]:
def address_benefit(address):
    benefit = address.split(',', 1)
    if len(benefit)>1:
        return benefit[1]

In [7]:
col_add_address = [
    'Company location (Area)',
    'Company location (suburb)',
    'Salary($)(Benefit)'
]

In [8]:
func_address = [
    address_area,
    address_suburb,
    address_benefit
]

In [9]:
def clean_address_benefit(df, col_edit, col_add_address, func_address):
    df[col_add_address[0]] = df[col_edit].apply(lambda row: func_address[0](row))
    df[col_add_address[1]] = df[col_edit].apply(lambda row: func_address[1](row))
    df[col_add_address[2]] = df[col_edit].apply(lambda row: func_address[2](row))
#     df.drop(col_drop, axis=1, inplace=True)
    return df

## Clean job posted

In [10]:
def job_posted_days(row):
    try:
        if re.search(r'(\d*)(\w)', re.search(r'(\d+)(\w)', row).group()).group(2) == 'd':
            return re.search(r'(\d*)(\w)', re.search(r'(\d+)(\w)', row).group()).group(1)
        else:
            return 0
    except:
      return None

In [11]:
def clean_jobPosted(df, col_edit, col_add, func):
    df[col_add] = df[col_edit].apply(lambda row: func(row))
#     df.drop(col_drop, axis=1, inplace=True)
    return df

## Clean Job class

In [12]:
def return_classification(row):
    lst_row = row.split(':')
    classfication = ""
    if len(lst_row)==1:
        return None
    else:
        lst_class = re.findall('[A-Z][^A-Z]*', lst_row[1].split('subClassfication')[0])
        for i in range(len(lst_class)//2):
            classfication += lst_class[i]
        return classfication

In [13]:
def return_subclass(row):
    lst_row = row.split(':')
    subclass = ""
    if len(lst_row)==1:
        return None
    else:
        lst_subclass = re.findall('[A-Z][^A-Z]*', lst_row[2])
        for i in range(len(lst_subclass)//2):
            subclass += lst_subclass[i]
        return subclass

In [14]:
col_add_jobClass = [
    'job_classification',
    'job_subclassification'
]

In [15]:
func_jobClass = [
    return_classification,
    return_subclass
]

In [16]:
def clean_jobClass(df, col_edit, col_add_jobClass, func_jobClass):
    df[col_add_jobClass[0]] = df[col_edit].apply(lambda row: func_jobClass[0](row))
    df[col_add_jobClass[1]] = df[col_edit].apply(lambda row: func_jobClass[1](row))
#     df.drop(col_drop, axis=1, inplace=True)
    return df

## Clean salary range

In [17]:
def salary_extract(s):
    temp1 = re.findall('.*\d+[.]*[,]*\d*k*K*', s)
    if len(temp1) != 0:
        temp2 = re.findall('\d+\s*[.]*[,]*\d*k*K*', temp1[0])
        if len(temp2) == 1 and len(temp2[0]) > 1:
            if ',' in temp2[0]:
                return ([float(temp2[0].replace(',', '')), float(temp2[0].replace(',', ''))])
            elif 'k' in temp2[0]:
                return ([float(temp2[0].replace('k','000')), float(temp2[0].replace('k','000'))])
            else:
                if float(temp2[0]) > 300:
                    return ([float(temp2[0]), float(temp2[0])])
                else:
                    return ([float(temp2[0])*8*200, float(temp2[0])*8*200])
        elif len(temp2) > 1:
            if ',' in temp2[0] or ',' in temp2[1]:
                if ',' in temp2[0] and ',' in temp2[0]:
                    return ([float(temp2[0].replace(',', '')), float(temp2[1].replace(',', ''))])
                elif ',' in temp2[0] and ',' not in temp2[1]:
                    return ([float(temp2[0].replace(',000', ''))*1000, float(temp2[1])]*1000)
                elif ',' in temp2[1] and ',' not in temp2[0]:
                    return ([float(temp2[0])*1000, float(temp2[1].replace(',000', ''))*1000])
            elif 'k' in temp2[0] or 'k' in temp2[1]:
                if 'k' in temp2[0] and 'k' in temp2[0]:
                    return ([float(temp2[0].replace('k', '000')), float(temp2[1].replace('k', '000'))])
                elif 'k' in temp2[0] and 'k' not in temp2[1]:
                    return ([float(temp2[0].replace('k', '000')), float(temp2[1])*1000])
                elif 'k' in temp2[1] and 'k' not in temp2[0]:
                    return ([float(temp2[0])*1000, float(temp2[1].replace('k', '000'))])
            elif 'K' in temp2[0] or 'K' in temp2[1]:
                if 'K' in temp2[0] and 'K' in temp2[0]:
                    return ([float(temp2[0].replace('K', '000')), float(temp2[1].replace('K', '000'))])
                elif 'K' in temp2[0] and 'K' not in temp2[1]:
                    return ([float(temp2[0].replace('K', '000')), float(temp2[1])*1000])
                elif 'K' in temp2[1] and 'K' not in temp2[0]:
                    return ([float(temp2[0])*1000, float(temp2[1].replace('K', '000'))])
            elif temp2[1] == '8':
                return ([float(temp2[0])*8*200, float(temp2[0])*8*200])
            else:
                if ' ' in temp2[0] or ' ' in temp2[1]:
                    if ' ' in temp2[0] and ' ' in temp2[1]:
                        if len(temp2[0])>4 and len(temp2[1])>4:
                            return ([float(temp2[0].replace(' ', '')), float(temp2[1].replace(' ', ''))])
                        else:
                            return ([float(temp2[0].replace(' ', ''))*8*200, float(temp2[1].replace(' ', ''))*8*200])
                    elif ' ' in temp2[0] and ' ' not in temp2[1]:
                        return ([float(temp2[0].replace(' ', ''))*8*200, float(temp2[1])*8*200])
                    if ' ' not in temp2[0] and ' ' in temp2[1]:
                        return ([float(temp2[0])*8*200, float(temp2[1].replace(' ', ''))*8*200])
                else:
                    return ([float(temp2[0])*8*200, float(temp2[1])*8*200])

## Clean salary

In [18]:
def salaryMinMax(row):
    try:
        s_min = row[0]
        s_max = row[1]
        return [int(s_min), int(s_max)]
    except:
        s_min = min(np.random.randint(45000, 75000, size=2))
        s_max = max(np.random.randint(45000, 75000, size=2))
        return [s_min, s_max]

In [19]:
col_add_salary = [
    'salary_range',
    'salary_range_filled',
    'salary_min',
    'salary_max'
]

In [20]:
func_salary = [
    salary_extract,
    salaryMinMax
]

In [21]:
def clean_salary(df, col_edit, col_add_salary, func_salary):
    df[col_add_salary[0]] = df[col_edit].apply(lambda row: func_salary[0](str(row)))
    df[col_add_salary[1]] = df[col_add_salary[0]].apply(lambda row: func_salary[1](row))
    df[[col_add_salary[2], col_add_salary[3]]] = df[col_add_salary[1]].apply(pd.Series)
    return df

## Clean redundancies

In [22]:
def clean_redundancies(df):
    df.rename({'column1': 'occupation', 'column2': 'Company Name'}, axis=1, inplace=True)
    df['Company Name'].fillna(value="Private Advertiser", inplace=True)
    df.drop([
        'salary_range', 
        'salary_range_filled', 
        'column1_link', 
        'column3', 
        'column4', 
        'column5'], axis=1, inplace=True)
    return df

## Main clean function

In [23]:
def main_clean(path):
    df = pd.read_excel(path)
    renameCol(df)
    df = cleanCol1_link(df, 'column1_link', 'job_advert_type')
    df = clean_address_benefit(df, 'column3', col_add_address, func_address)
    df = clean_jobPosted(df, 'column4', 'Job_Posted(days ago)', job_posted_days)
    df = clean_jobClass(df, 'column5', col_add_jobClass, func_jobClass)
    df = clean_salary(df, 'Salary($)(Benefit)', col_add_salary, func_salary)
    df = clean_redundancies(df)
    return df

In [24]:
df_admin = main_clean('NZ_Admin_JOBS.xlsx')
df_admin.head(10)

Unnamed: 0,occupation,Company Name,job_advert_type,Company location (Area),Company location (suburb),Salary($)(Benefit),Job_Posted(days ago),job_classification,job_subclassification,salary_min,salary_max
0,Administrator,Private Advertiser,promoted,Bay of Plenty,Tauranga,,,Administration & Office Support,Office Management,59468,50414
1,Receptionist,Avenues Orthodontics,promoted,Bay of Plenty,Tauranga,,,Administration & Office Support,Receptionists,58395,69790
2,Prosecutions Support Officer,New Zealand Police,standard,Auckland,,,4.0,Administration & Office Support,Other,58154,64290
3,Early Childhood Centre Administrator,Kew Pacific Island Early Learning Centre,standard,Southland,Invercargill,,0.0,Administration & Office Support,Administrative Assistants,53536,69886
4,Business Support Administrator,Private Advertiser,standout,Canterbury,Christchurch,,4.0,Administration & Office Support,Client & Sales Administration,54765,61617
5,Support Officer,Ministry for Primary Industries,standout,Northland,Whangarei,,0.0,Administration & Office Support,Administrative Assistants,71478,69797
6,Support Officer,"Ministry of Business, Innovation and Employment",standout,Wellington,Wellington Central,,5.0,Administration & Office Support,Other,61295,72710
7,office administrator,Hepburn Electrical Ltd,standard,Bay of Plenty,Rotorua,,0.0,Administration & Office Support,Administrative Assistants,60354,68812
8,Office Administrator,Webster Holland Ltd,standard,Bay of Plenty,Tauranga,,0.0,Administration & Office Support,Administrative Assistants,55567,71382
9,Administration Officer,New Zealand Police,standard,Canterbury,,,3.0,Administration & Office Support,Other,61883,56977


In [25]:
df_admin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   occupation                 2708 non-null   object
 1   Company Name               2708 non-null   object
 2   job_advert_type            2708 non-null   object
 3   Company location (Area)    2708 non-null   object
 4   Company location (suburb)  2167 non-null   object
 5   Salary($)(Benefit)         603 non-null    object
 6   Job_Posted(days ago)       2688 non-null   object
 7   job_classification         2105 non-null   object
 8   job_subclassification      2105 non-null   object
 9   salary_min                 2708 non-null   int64 
 10  salary_max                 2708 non-null   int64 
dtypes: int64(2), object(9)
memory usage: 232.8+ KB
