In [26]:
# import
import numpy as np
import pandas as pd
import random
import re
import warnings
warnings.filterwarnings('ignore')

### Create a header and an ID for each file 

In [27]:
admin_header_name = ["Job", "URL", "Company", "Location", "Posted Date", "Classification"]
admin_file_name = 'NZ_Admin_JOBS.xlsx'
admin_id = 0

banking_header_name = ["Job", "URL", "Company", "Location", "Classification_2", "Posted Date", "Classification"]
banking_file_name = 'NZ_Banking_JOBS.xlsx'
banking_id = 1

ceo_header_name = ["Job", "URL", "Company", "Location", "Classification", "Posted Date"]
ceo_file_name = 'NZ_CEO_JOBS.xlsx'
ceo_id = 2

construction_header_name = ["Job", "URL", "Company", "Location", "Classification_2", "Posted Date", "Classification"]
construction_file_name = 'NZ_Construction_JOBS.xlsx'
construction_id = 3

# Functions

### 1. read_file 
Read the file to dataframe and add a header to it

In [28]:
def read_file(file_name, header_name, file_id):
    # read the file and add a header
    df = pd.read_excel(file_name, header=None, names=header_name, engine='openpyxl')
    if file_id == 1 or file_id == 3:
        df = df.drop('Classification_2',axis=1)
    elif file_id == 2 or file_id == 3:
        df = df.iloc[1:]
    return df

### 2. cut_half 
In the raw data, there exists many dupications. This function is used to cut the duplicated strings half.
E.g., it cuts "TaurangaTauranga" to "Tauranga"

In [29]:
def cut_half(x):
    # a function used in data_cleaning
    # cut the duplicated string half
    # e.g., cut "TaurangaTauranga" to "Tauranga"
    
    if x != None:
        if x[-4:] == "area":
            return (x[0:((len(x)-4)//2)])
        else:
            return (x[0:(len(x)//2)])
    else:
        return None

### 3. find_number 
In the raw data, "salary" and "salary infor" are merged into the same column. This function separates "salary" and "salary info".

In [30]:
def find_number(x):
    # separate "salary" and "salary info"
    if x != None and not pd.isna(x):
        if any(char.isdigit() for char in x):
            return x + ";"
        else:
            return ";" + x 

### 4. low_high_salary
In the raw data, the format of the salary is messy. E.g., it has both per hour and yearly pay in the form of $20, $20k, or $40,000. However, the low and high salary are always separated by "-". This function separates the high and lowest and highest salary, and then we estimate the empty data.

In [31]:
def low_high_salary(x, file_id):
    # refine the salary using regular expression
    # get the lowest and the highest salary
    
    output = []
    if x != None:
        x = x.replace(',','')
        x = ''.join((ch if ch in '0123456789.' else ' ') for ch in x)
        x = x.replace(' .','').replace(' 0','0')
        listOfNumbers = [float(i) for i in x.split()]
        if len(listOfNumbers) == 1:
            output = [listOfNumbers[0], listOfNumbers[0]]
        elif len(listOfNumbers) > 1:
            output = [listOfNumbers[0], listOfNumbers[1]]
        # if "0k " in x or "5k " in x or "0K " in x or "5K " in x:
        if bool(re.match("\dk", x)):
            output = [i * 1000 for i in output]
        if output and output[0] < 1000 and output[1] < 1000:
            output = [i * 1760 for i in output]
        if output and output[0] < 1000 and output[1] > 1000:
            output[0] = output[0] * 1000
    if not output:
        if file_id == 0:
            lo = random.uniform(35000, 55000)
            output = [lo, random.uniform(lo, 55000)]
        elif file_id == 1:
            lo = random.uniform(77000, 141000)
            output = [lo, random.uniform(lo, 141000)]
        elif file_id == 2:
            lo = random.uniform(84000, 255000)
            output = [lo, random.uniform(lo, 255000)]
    listToStr = ' '.join([str(int(i)) for i in output])       
    return listToStr

### 5. fix_posted_data
In the raw data. the "posted data" could take the form of either "featured at" or $4d ago, at$. We refine them all to "days" (integer)

In [32]:
def fix_posted_date(x):
    # refine the posted date
    if "Featured" in x or bool(re.search("\d[h,m]", x)):
        return str(0)
    elif bool(re.search("\dd", x)):
        return ''.join((ch if ch in '0123456789' else '') for ch in x)

### 6. fix_company
In the raw data, sometimes the company has a "at " at the begining. This function fixs this problem. 

In [33]:
def fix_company(x):
    if len(x) > 3 and x[:3] == "at ":
        return x[3:]
    else:
        return x

### 7. The main data preprocessing function

In [34]:
def data_cleaning(df, file_id):
    # data cleaning
    # file_id identifies which file it is
    
    # 1. Refine the "posted date" column
    
    column_date_place = df["Posted Date"].str.split(",", n = -1, expand = True) 
    df["Posted Date"] = column_date_place[0]
    df["Posted Place"] = column_date_place[2]
    
    
    # 2. Refine the "location" column
    
    # 2.1 Some rows contain the salary after "," and we need to remove them
    df["Location"] = df["Location"].str.split(",", n = 1, expand = True)[0]
    
    # 2.2 Need to remove the "location:" at the beginning of the strings, and then separate the region and the city
    column_location = df["Location"].str.split(": ", n = 3, expand = True) 
    df['Region'] = column_location[1]
    df['City'] = column_location[2]
    df = df.drop('Location',axis=1)
    
    # 2.3 Remove duplications
    df["Region"] = df["Region"].apply(cut_half)
    df["City"] = df["City"].apply(cut_half)
    
    
    # 3. Refine "classification"
    # Some rows contain the "classification" and the "subclassification", others contain the salary
    # Analyse "classification" and "salary" separately as df_classification and df_salary
    
    # 3.1 analyse "classification" 
    
    df_classification = df[df.Classification.str.contains(':',case=False)]
    column_classifications = df_classification["Classification"].str.split("subClassification: ", n = -1, expand = True)
    df_classification['Classification'] = column_classifications[0].str.split("classification: ", n = -1, expand = True)[1].apply(cut_half)
    df_classification['Sub-classification'] = column_classifications[1].apply(cut_half)

    
    # 3.2 analyse "salary"
    df_salary = df[~df.Classification.str.contains(':',case=True)]
    df_salary = df_salary.rename(columns = {'Classification':'Salary'})
    
    # 3.3 merge "classification" and "salary"
    df = pd.merge(df_classification, df_salary, how='outer')
    df = df[["Job", "URL", "Company", "Posted Date", "Posted Place", "Classification", "Sub-classification", "Region", "City", "Salary"]]
    
    
    # 4. Refine "salary"
    
    # 4.1 separate into "salary" and "salary info"
    df["Salary"] = df["Salary"].apply(find_number)
    column_salary_info = df["Salary"].str.split(";", n = 2, expand = True) 
    df["Salary"] = column_salary_info[0]
    if column_salary_info.shape[1] != 1:
        df["Salary Info"] = column_salary_info[1]
    else:
        df["Salary Info"] = np.nan
    
    # 4.2 obtain the lowest and the highest salary
    column_low_high = df["Salary"].apply(low_high_salary, args=(file_id,))
    column_low_high = column_low_high.str.split(" ", n = -1, expand = True) 
    df["Lowest Salary"] = column_low_high[0]
    df["Higest Salary"] = column_low_high[1]
    df = df.drop('Salary',axis=1)
    
    
    # 5 Merge "Company" and "Posted Place"
    
    df["Company"] = df["Company"].fillna(df['Posted Place'])
    df = df.drop('Posted Place',axis=1)
    
    
    # 6 Refine the posted date
    df["Posted Date (Days Ago)"] = df["Posted Date"].apply(fix_posted_date)
    df = df.drop('Posted Date',axis=1)
    
    
    # 7 Refine "Company"
    df["Company"] = df["Company"].apply(fix_company)
    
    
    #8 Remove null
    df = df.fillna(value='NO DATA')
    
    return df


# Load data

### 1. Admin 

In [22]:
df_admin = read_file(admin_file_name, admin_header_name, admin_id)
df_admin_final = data_cleaning(df_admin, admin_id)
df_admin_final.to_excel('Cleaned_NZ_Admin_JOBS.xlsx')
df_admin_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,Private Advertiser,Administration & Office Support,Office Management,Bay of Plenty,Tauranga,NO DATA,39914,48400,0
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Administration & Office Support,Receptionists,Bay of Plenty,Tauranga,NO DATA,44433,53819,0
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Administration & Office Support,Other,Auckland,NO DATA,NO DATA,38244,39278,4
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Administration & Office Support,Administrative Assistants,Southland,Invercargill,NO DATA,54962,54966,0
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,Private Advertiser,Administration & Office Support,Client & Sales Administration,Canterbury,Christchurch,NO DATA,43383,54925,4


### 2. Banking

In [23]:
df_banking = read_file(banking_file_name, banking_header_name, banking_id)
df_banking_final = data_cleaning(df_banking, banking_id)
df_banking_final.to_excel('Cleaned_NZ_Banking_JOBS.xlsx')
df_banking_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Accounts Receivable,https://www.seek.co.nz/job/50568753?type=promo...,MTF Finance Mt Wellington,Banking & Financial Services,Credit,Auckland,Auckland Central,NO DATA,120119,126332,0
1,Internal Audit Manager,https://www.seek.co.nz/job/50556333?type=promo...,Industrial and Commercial Bank of China (New Z...,Banking & Financial Services,Compliance & Risk,Auckland,Auckland Central,NO DATA,130829,134811,0
2,Client Services Officer,https://www.seek.co.nz/job/50638706?type=stand...,NZ Funds Management Limited,Banking & Financial Services,Client Services,Auckland,Auckland Central,NO DATA,120719,122992,0
3,Private Wealth Assistant - Queenstown,https://www.seek.co.nz/job/50617226?type=stand...,Craigs Investment Partners,Banking & Financial Services,Client Services,Otago,Queenstown & Wanaka,NO DATA,135116,135914,5
4,Business Banking Credit Analyst,https://www.seek.co.nz/job/50615555?type=stand...,Kiwibank,Banking & Financial Services,Banking - Business,Auckland,Auckland Central,NO DATA,78452,105144,5


In [19]:
df_banking = read_file(banking_file_name, banking_header_name, banking_id)
df_banking_final = data_cleaning(df_banking, banking_id)
df_banking_final.to_excel('Cleaned_NZ_Banking_JOBS.xlsx')
df_banking_final

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Accounts Receivable,https://www.seek.co.nz/job/50568753?type=promo...,MTF Finance Mt Wellington,Banking & Financial Services,Credit,Auckland,Auckland Central,NO DATA,91799,104466,0
1,Internal Audit Manager,https://www.seek.co.nz/job/50556333?type=promo...,Industrial and Commercial Bank of China (New Z...,Banking & Financial Services,Compliance & Risk,Auckland,Auckland Central,NO DATA,81455,110535,0
2,Client Services Officer,https://www.seek.co.nz/job/50638706?type=stand...,NZ Funds Management Limited,Banking & Financial Services,Client Services,Auckland,Auckland Central,NO DATA,136781,140082,0
3,Private Wealth Assistant - Queenstown,https://www.seek.co.nz/job/50617226?type=stand...,Craigs Investment Partners,Banking & Financial Services,Client Services,Otago,Queenstown & Wanaka,NO DATA,103263,139048,5
4,Business Banking Credit Analyst,https://www.seek.co.nz/job/50615555?type=stand...,Kiwibank,Banking & Financial Services,Banking - Business,Auckland,Auckland Central,NO DATA,111751,121332,5
...,...,...,...,...,...,...,...,...,...,...,...
3879,"Head of Product, Performance and Operations",https://www.seek.co.nz/job/50495204?type=stand...,Find Recruitment Limited,NO DATA,NO DATA,Auckland,Auckland Central,,205000,205000,26
3880,Online Broker,https://www.seek.co.nz/job/50494688?type=stand...,Car Finance 2U,NO DATA,NO DATA,Gisborne,Gisborne,,86000,86000,26
3881,Online Broker,https://www.seek.co.nz/job/50494688?type=stand...,Car Finance 2U,NO DATA,NO DATA,Gisborne,Gisborne,,86000,86000,26
3882,Financial Advisor,https://www.seek.co.nz/job/50497173?type=stand...,NetYourJob,NO DATA,NO DATA,Auckland,Rodney & North Shore,Flexible Hours - Remuneration Negotiable,77099,111544,26


### 3. CEO 

In [24]:
df_ceo = read_file(ceo_file_name, ceo_header_name, ceo_id)
df_ceo_final = data_cleaning(df_ceo, ceo_id)
df_ceo_final.to_excel('Cleaned_NZ_CEO_JOBS.xlsx')
df_ceo_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Director (x2) - Auckland Transport,https://www.seek.co.nz/job/50540328?type=promo...,Auckland Council,CEO & General Management,Board Appointments,Auckland,Auckland Central,NO DATA,214546,244647,0
1,Director - Panuku Development,https://www.seek.co.nz/job/50540626?type=promo...,Auckland Council,CEO & General Management,Board Appointments,Auckland,Auckland Central,NO DATA,85802,89393,0
2,Chief Executive,https://www.seek.co.nz/job/50623700?type=stand...,JacksonStone & Partners,CEO & General Management,CEO,Wellington,Porirua & Kapiti Coast,NO DATA,101774,149184,4
3,Chief Executive,https://www.seek.co.nz/job/50623848?type=stand...,JacksonStone & Partners,CEO & General Management,CEO,Manawatu,Wanganui,NO DATA,240040,246981,4
4,Chief Executive,https://www.seek.co.nz/job/50638864?type=stand...,Asset Recruitment Ltd,CEO & General Management,CEO,Waikato,Hamilton,NO DATA,192626,231119,0


### Construction

In [25]:
df_construction = read_file(construction_file_name, construction_header_name, construction_id)
df_construction_final = data_cleaning(df_construction, construction_id)
df_construction_final.to_excel('Cleaned_NZ_Construction_JOBS.xlsx')
df_construction_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,REGIONAL FOREMAN TELCO SECTOR TEAM LEADER AUCK...,https://www.seek.co.nz/job/50541089?type=promo...,Millennium Electrical Ltd,Construction,Foreperson/Supervisors,Auckland,Rodney & North Shore,NO DATA,,NO DATA,0
1,Bitumen Sprayer Operator,https://www.seek.co.nz/job/50548669?type=promo...,Johnstone & Masters Ltd,Construction,Plant & Machinery Operators,Bay of Plenty,Rotorua,NO DATA,,NO DATA,0
2,General Labourers,https://www.seek.co.nz/job/50640801?type=stand...,AWF,Construction,Other,Northland,Whangarei,NO DATA,,NO DATA,0
3,Project Manager,https://www.seek.co.nz/job/50640890?type=stand...,Faye Homes New Zealand Ltd,Construction,Project Management,Canterbury,Christchurch,NO DATA,,NO DATA,0
4,OPERATIONS MANAGER,https://www.seek.co.nz/job/50639831?type=stand...,Fraemohs Homes NZ Ltd,Construction,Management,Canterbury,Christchurch,NO DATA,,NO DATA,0
