In [1]:
# import
import numpy as np
import pandas as pd
import random
import re
import warnings
warnings.filterwarnings('ignore')

### Create a header and an ID for each file 

In [2]:
admin_header_name = ["Job", "URL", "Company", "Location", "Posted Date", "Classification"]
admin_file_name = 'NZ_Admin_JOBS.xlsx'
admin_id = 0

banking_header_name = ["Job", "URL", "Company", "Location", "Classification_2", "Posted Date", "Classification"]
banking_file_name = 'NZ_Banking_JOBS.xlsx'
banking_id = 1

ceo_header_name = ["Job", "URL", "Company", "Location", "Classification", "Posted Date"]
ceo_file_name = 'NZ_CEO_JOBS.xlsx'
ceo_id = 2

construction_header_name = ["Job", "URL", "Company", "Location", "Classification_2", "Posted Date", "Classification"]
construction_file_name = 'NZ_Construction_JOBS.xlsx'
construction_id = 3

# ----------------------------------Functions----------------------------------

### 1. read_file 
Read the file to dataframe and add a header to it

In [3]:
def read_file(file_name, header_name, file_id):
    # read the file and add a header
    df = pd.read_excel(file_name, header=None, names=header_name, engine='openpyxl')
    if file_id == 1 or file_id == 3:
        df = df.drop('Classification_2',axis=1)
    elif file_id == 2 or file_id == 3:
        df = df.iloc[1:]
    return df

### 2. cut_half 
In the raw data, there exists many dupications. This function is used to cut the duplicated strings half.
E.g., it cuts "TaurangaTauranga" to "Tauranga"

In [4]:
def cut_half(x):
    # a function used in data_cleaning
    # cut the duplicated string half
    # e.g., cut "TaurangaTauranga" to "Tauranga"
    
    if x != None:
        if x[-4:] == "area":
            return (x[0:((len(x)-4)//2)])
        else:
            return (x[0:(len(x)//2)])
    else:
        return None

### 3. find_number 
In the raw data, "salary" and "salary infor" are merged into the same column. This function separates "salary" and "salary info".

In [5]:
def find_number(x):
    # separate "salary" and "salary info"
    if x != None and not pd.isna(x):
        if any(char.isdigit() for char in x):
            return x + ";"
        else:
            return ";" + x 

### 4. low_high_salary
In the raw data, the format of the salary is messy. E.g., it has both per hour and yearly pay in the form of $20, $20k, or $40,000. However, the low and high salary are always separated by "-". This function separates the high and lowest and highest salary, and then we estimate the empty data.

In [6]:
def low_high_salary(x, file_id):
    # refine the salary using regular expression
    # get the lowest and the highest salary
    
    output = []
    if x != None:
        x = x.replace(',','')
        x = ''.join((ch if ch in '0123456789.' else ' ') for ch in x)
        x = x.replace(' .','').replace(' 0','0')
        listOfNumbers = [float(i) for i in x.split()]
        if len(listOfNumbers) == 1:
            output = [listOfNumbers[0], listOfNumbers[0]]
        elif len(listOfNumbers) > 1:
            output = [listOfNumbers[0], listOfNumbers[1]]
        # if "0k " in x or "5k " in x or "0K " in x or "5K " in x:
        if bool(re.match("\dk", x)):
            output = [i * 1000 for i in output]
        if output and output[0] < 1000 and output[1] < 1000:
            output = [i * 1760 for i in output]
        if output and output[0] < 1000 and output[1] > 1000:
            output[0] = output[0] * 1000
    if not output:
        if file_id == 0:
            lo = random.uniform(35000, 55000)
            output = [lo, random.uniform(lo, 55000)]
        elif file_id == 1:
            lo = random.uniform(77000, 141000)
            output = [lo, random.uniform(lo, 141000)]
        elif file_id == 2:
            lo = random.uniform(84000, 255000)
            output = [lo, random.uniform(lo, 255000)]
    listToStr = ' '.join([str(int(i)) for i in output])       
    return listToStr

### 5. fix_posted_data
In the raw data. the "posted data" could take the form of either "featured at" or $4d ago, at$. We refine them all to "days" (integer)

In [7]:
def fix_posted_date(x):
    # refine the posted date
    if "Featured" in x or bool(re.search("\d[h,m]", x)):
        return str(0)
    elif bool(re.search("\dd", x)):
        return ''.join((ch if ch in '0123456789' else '') for ch in x)

### 6. fix_company
In the raw data, sometimes the company has a "at " at the begining. This function fixs this problem. 

In [8]:
def fix_company(x):
    if len(x) > 3 and x[:3] == "at ":
        return x[3:]
    else:
        return x

### 7. The main data preprocessing function

In [9]:
def data_cleaning(df, file_id):
    # data cleaning
    # file_id identifies which file it is
    
    # 1. Refine the "posted date" column
    
    column_date_place = df["Posted Date"].str.split(",", n = -1, expand = True) 
    df["Posted Date"] = column_date_place[0]
    df["Posted Place"] = column_date_place[2]
    
    
    # 2. Refine the "location" column
    
    # 2.1 Some rows contain the salary after "," and we need to remove them
    df["Location"] = df["Location"].str.split(",", n = 1, expand = True)[0]
    
    # 2.2 Need to remove the "location:" at the beginning of the strings, and then separate the region and the city
    column_location = df["Location"].str.split(": ", n = 3, expand = True) 
    df['Region'] = column_location[1]
    df['City'] = column_location[2]
    df = df.drop('Location',axis=1)
    
    # 2.3 Remove duplications
    df["Region"] = df["Region"].apply(cut_half)
    df["City"] = df["City"].apply(cut_half)
    
    
    # 3. Refine "classification"
    # Some rows contain the "classification" and the "subclassification", others contain the salary
    # Analyse "classification" and "salary" separately as df_classification and df_salary
    
    # 3.1 analyse "classification" 
    
    df_classification = df[df.Classification.str.contains(':',case=False)]
    column_classifications = df_classification["Classification"].str.split("subClassification: ", n = -1, expand = True)
    df_classification['Classification'] = column_classifications[0].str.split("classification: ", n = -1, expand = True)[1].apply(cut_half)
    df_classification['Sub-classification'] = column_classifications[1].apply(cut_half)

    
    # 3.2 analyse "salary"
    df_salary = df[~df.Classification.str.contains(':',case=True)]
    df_salary = df_salary.rename(columns = {'Classification':'Salary'})
    
    # 3.3 merge "classification" and "salary"
    df = pd.merge(df_classification, df_salary, how='outer')
    df = df[["Job", "URL", "Company", "Posted Date", "Posted Place", "Classification", "Sub-classification", "Region", "City", "Salary"]]
    
    
    # 4. Refine "salary"
    
    # 4.1 separate into "salary" and "salary info"
    df["Salary"] = df["Salary"].apply(find_number)
    column_salary_info = df["Salary"].str.split(";", n = 2, expand = True) 
    df["Salary"] = column_salary_info[0]
    if column_salary_info.shape[1] != 1:
        df["Salary Info"] = column_salary_info[1]
    else:
        df["Salary Info"] = np.nan
    
    # 4.2 obtain the lowest and the highest salary
    column_low_high = df["Salary"].apply(low_high_salary, args=(file_id,))
    column_low_high = column_low_high.str.split(" ", n = -1, expand = True) 
    df["Lowest Salary"] = column_low_high[0]
    df["Higest Salary"] = column_low_high[1]
    df = df.drop('Salary',axis=1)
    
    
    # 5 Merge "Company" and "Posted Place"
    
    df["Company"] = df["Company"].fillna(df['Posted Place'])
    df = df.drop('Posted Place',axis=1)
    
    
    # 6 Refine the posted date
    df["Posted Date (Days Ago)"] = df["Posted Date"].apply(fix_posted_date)
    df = df.drop('Posted Date',axis=1)
    
    
    # 7 Refine "Company"
    df["Company"] = df["Company"].apply(fix_company)
    
    
    #8 Remove null
    df = df.fillna(value='NO DATA')
    
    return df


In [10]:
def print_info(df):
    print("Total number of jobs: "+str(len(df.index)))
    print("Category of jobs: "+str(len(df.Job.unique())))

In [11]:
def load_col(df, col):
    pd.set_option('display.max_colwidth', None)
    nan_va = df[col].isna().sum()
    print(df[col].head(15))
    print("----------------------------------------------------------")
    print("There are "+ str(nan_va)+" NaN values out of "+str(len(df.index))+" jobs")

# --------------------------------Load data--------------------------------

##### 1. Admin jobs

In [12]:
df_admin = read_file(admin_file_name, admin_header_name, admin_id)

##### 2. Banking jobs

In [13]:
df_banking = read_file(banking_file_name, banking_header_name, banking_id)

##### 3. CEO jobs

In [14]:
df_ceo = read_file(ceo_file_name, ceo_header_name, ceo_id)

#### 4. Construction jobs

In [15]:
df_construction = read_file(construction_file_name, construction_header_name, construction_id)

# --------------------------------The raw data is faulty--------------------------------

### 1. The "Company" column

In [16]:
load_col(df_admin,"Company")

0                                                 NaN
1                                Avenues Orthodontics
2                                  New Zealand Police
3            Kew Pacific Island Early Learning Centre
4                                                 NaN
5                     Ministry for Primary Industries
6     Ministry of Business, Innovation and Employment
7                              Hepburn Electrical Ltd
8                                 Webster Holland Ltd
9                                  New Zealand Police
10                         Department of Conservation
11                                    Triple One Care
12     UIV Limited - Premier Insulation North Central
13                        Wraight and Associates Ltd.
14                    Morgan & Pollard Landscapes Ltd
Name: Company, dtype: object
----------------------------------------------------------
There are 22 NaN values out of 2708 jobs


There are 22 NaN values

### 2. The "Location" column

In [17]:
load_col(df_admin,"Location")

0                                      location: Bay of PlentyBay of Plentyarea: TaurangaTauranga
1                                      location: Bay of PlentyBay of Plentyarea: TaurangaTauranga
2                                                                      location: AucklandAuckland
3                                      location: SouthlandSouthlandarea: InvercargillInvercargill
4                                    location: CanterburyCanterburyarea: ChristchurchChristchurch
5                                            location: NorthlandNorthlandarea: WhangareiWhangarei
6                        location: WellingtonWellingtonarea: Wellington CentralWellington Central
7                                        location: Bay of PlentyBay of Plentyarea: RotoruaRotorua
8                                      location: Bay of PlentyBay of Plentyarea: TaurangaTauranga
9                                                                  location: CanterburyCanterbury
10                  

1. There is a "Location: " at the beginning of every cell
2. For some rows, the "location" and the "area" are merged together in one cell
3. The location is duplicated
4. In some rows, the payment is also merged into the cell

### 3. The "Posted Date" column

In [18]:
load_col(df_admin,"Posted Date")

0     Featured,at,Private Advertiser
1                        Featured,at
2                          4d ago,at
3                          1h ago,at
4       4d ago,at,Private Advertiser
5                          9m ago,at
6                          5d ago,at
7                          7m ago,at
8                          2h ago,at
9                          3d ago,at
10                         7d ago,at
11                         5d ago,at
12                         8d ago,at
13                         3d ago,at
14                         3d ago,at
Name: Posted Date, dtype: object
----------------------------------------------------------
There are 0 NaN values out of 2708 jobs


1. Some rows have "Featured,at,Private Advertiser"
2. Some rows have "Featured,at,"
3. Some rows have ",Private Advertiser" after the posted date
4. Some are measured by "m" (months) and others are measured by "d" (days)

### 4. The "Classification" column

In [19]:
load_col(df_admin,"Classification")

0                             classification: Administration & Office SupportAdministration & Office SupportsubClassification: Office ManagementOffice Management
1                                     classification: Administration & Office SupportAdministration & Office SupportsubClassification: ReceptionistsReceptionists
2                                                     classification: Administration & Office SupportAdministration & Office SupportsubClassification: OtherOther
3             classification: Administration & Office SupportAdministration & Office SupportsubClassification: Administrative AssistantsAdministrative Assistants
4     classification: Administration & Office SupportAdministration & Office SupportsubClassification: Client & Sales AdministrationClient & Sales Administration
5             classification: Administration & Office SupportAdministration & Office SupportsubClassification: Administrative AssistantsAdministrative Assistants
6                           

1. Every cell is merged with "classification" and "subclassification"
2. Every cell starts with "classification: "
3. Every cell has "subClassification: " in the middle
4. Every classification and every subClassification are duplicated

# ----------------------------------Data cleaning----------------------------------

### 1. Admin 

In [20]:
df_admin_final = data_cleaning(df_admin, admin_id)
df_admin_final.to_excel('Cleaned_NZ_Admin_JOBS.xlsx')
df_admin_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Administrator,https://www.seek.co.nz/job/50582301?type=promoted#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,Private Advertiser,Administration & Office Support,Office Management,Bay of Plenty,Tauranga,NO DATA,44469,49720,0
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promoted#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,Avenues Orthodontics,Administration & Office Support,Receptionists,Bay of Plenty,Tauranga,NO DATA,45928,46805,0
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=standard#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,New Zealand Police,Administration & Office Support,Other,Auckland,NO DATA,NO DATA,38776,44341,4
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=standard#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,Kew Pacific Island Early Learning Centre,Administration & Office Support,Administrative Assistants,Southland,Invercargill,NO DATA,54903,54961,0
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=standout#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,Private Advertiser,Administration & Office Support,Client & Sales Administration,Canterbury,Christchurch,NO DATA,50095,50788,4


##### Admin job info

In [21]:
print_info(df_admin_final)

Total number of jobs: 2708
Category of jobs: 548


### 2. Banking

In [22]:
df_banking_final = data_cleaning(df_banking, banking_id)
df_banking_final.to_excel('Cleaned_NZ_Banking_JOBS.xlsx')
df_banking_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Accounts Receivable,https://www.seek.co.nz/job/50568753?type=promoted#searchRequestToken=d0513dee-bb65-4c2d-ae6a-7f4b8838529d,MTF Finance Mt Wellington,Banking & Financial Services,Credit,Auckland,Auckland Central,NO DATA,133684,138822,0
1,Internal Audit Manager,https://www.seek.co.nz/job/50556333?type=promoted#searchRequestToken=d0513dee-bb65-4c2d-ae6a-7f4b8838529d,Industrial and Commercial Bank of China (New Zealand) Ltd,Banking & Financial Services,Compliance & Risk,Auckland,Auckland Central,NO DATA,102776,110790,0
2,Client Services Officer,https://www.seek.co.nz/job/50638706?type=standard#searchRequestToken=d0513dee-bb65-4c2d-ae6a-7f4b8838529d,NZ Funds Management Limited,Banking & Financial Services,Client Services,Auckland,Auckland Central,NO DATA,127174,129136,0
3,Private Wealth Assistant - Queenstown,https://www.seek.co.nz/job/50617226?type=standout#searchRequestToken=d0513dee-bb65-4c2d-ae6a-7f4b8838529d,Craigs Investment Partners,Banking & Financial Services,Client Services,Otago,Queenstown & Wanaka,NO DATA,95525,118825,5
4,Business Banking Credit Analyst,https://www.seek.co.nz/job/50615555?type=standout#searchRequestToken=d0513dee-bb65-4c2d-ae6a-7f4b8838529d,Kiwibank,Banking & Financial Services,Banking - Business,Auckland,Auckland Central,NO DATA,93855,134126,5


##### Banking job info

In [23]:
print_info(df_banking_final)

Total number of jobs: 3884
Category of jobs: 202


### 3. CEO 

In [24]:
df_ceo_final = data_cleaning(df_ceo, ceo_id)
df_ceo_final.to_excel('Cleaned_NZ_CEO_JOBS.xlsx')
df_ceo_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Director (x2) - Auckland Transport,https://www.seek.co.nz/job/50540328?type=promoted#searchRequestToken=d332ca7a-0567-4b31-98b9-38f138640c55,Auckland Council,CEO & General Management,Board Appointments,Auckland,Auckland Central,NO DATA,215151,239925,0
1,Director - Panuku Development,https://www.seek.co.nz/job/50540626?type=promoted#searchRequestToken=d332ca7a-0567-4b31-98b9-38f138640c55,Auckland Council,CEO & General Management,Board Appointments,Auckland,Auckland Central,NO DATA,86151,219829,0
2,Chief Executive,https://www.seek.co.nz/job/50623700?type=standout#searchRequestToken=d332ca7a-0567-4b31-98b9-38f138640c55,JacksonStone & Partners,CEO & General Management,CEO,Wellington,Porirua & Kapiti Coast,NO DATA,123544,205862,4
3,Chief Executive,https://www.seek.co.nz/job/50623848?type=standout#searchRequestToken=d332ca7a-0567-4b31-98b9-38f138640c55,JacksonStone & Partners,CEO & General Management,CEO,Manawatu,Wanganui,NO DATA,132343,189804,4
4,Chief Executive,https://www.seek.co.nz/job/50638864?type=standout#searchRequestToken=d332ca7a-0567-4b31-98b9-38f138640c55,Asset Recruitment Ltd,CEO & General Management,CEO,Waikato,Hamilton,NO DATA,137135,170382,0


##### CEO job info

In [25]:
print_info(df_ceo_final)

Total number of jobs: 2996
Category of jobs: 95


### 4. Construction

In [26]:
df_construction_final = data_cleaning(df_construction, construction_id)
df_construction_final.to_excel('Cleaned_NZ_Construction_JOBS.xlsx')
df_construction_final.head()

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,REGIONAL FOREMAN TELCO SECTOR TEAM LEADER AUCKLAND,https://www.seek.co.nz/job/50541089?type=promoted#searchRequestToken=4c2db3e6-3a96-4b1c-829d-b09d39b75a4c,Millennium Electrical Ltd,Construction,Foreperson/Supervisors,Auckland,Rodney & North Shore,NO DATA,,NO DATA,0
1,Bitumen Sprayer Operator,https://www.seek.co.nz/job/50548669?type=promoted#searchRequestToken=4c2db3e6-3a96-4b1c-829d-b09d39b75a4c,Johnstone & Masters Ltd,Construction,Plant & Machinery Operators,Bay of Plenty,Rotorua,NO DATA,,NO DATA,0
2,General Labourers,https://www.seek.co.nz/job/50640801?type=standout#searchRequestToken=4c2db3e6-3a96-4b1c-829d-b09d39b75a4c,AWF,Construction,Other,Northland,Whangarei,NO DATA,,NO DATA,0
3,Project Manager,https://www.seek.co.nz/job/50640890?type=standard#searchRequestToken=4c2db3e6-3a96-4b1c-829d-b09d39b75a4c,Faye Homes New Zealand Ltd,Construction,Project Management,Canterbury,Christchurch,NO DATA,,NO DATA,0
4,OPERATIONS MANAGER,https://www.seek.co.nz/job/50639831?type=standard#searchRequestToken=4c2db3e6-3a96-4b1c-829d-b09d39b75a4c,Fraemohs Homes NZ Ltd,Construction,Management,Canterbury,Christchurch,NO DATA,,NO DATA,0


##### Construction job info

In [27]:
print_info(df_construction_final)

Total number of jobs: 23
Category of jobs: 21
