In [1]:
# import
import numpy as np
import pandas as pd
import random
import re

In [2]:
admin_header_name = ["Job", "URL", "Company", "Location", "Posted Date", "Classification"]
admin_file_name = 'NZ_Admin_JOBS.xlsx'
admin_id = 0

banking_header_name = ["Job", "URL", "Company", "Location", "Classification_2", "Posted Date", "Classification"]
banking_file_name = 'NZ_Banking_JOBS.xlsx'
banking_id = 1

ceo_header_name = ["Job", "URL", "Company", "Location", "Classification", "Posted Date"]
ceo_file_name = 'NZ_CEO_JOBS.xlsx'
ceo_id = 2

construction_header_name = ["Job", "URL", "Company", "Location", "Classification_2", "Posted Date", "Classification"]
construction_file_name = 'NZ_Construction_JOBS.xlsx'
construction_id = 3

In [3]:
def read_file(file_name, header_name, file_id):
    # read the file and add a header
    df = pd.read_excel(file_name, header=None, names=header_name)
    if file_id == 1 or file_id == 3:
        df = df.drop('Classification_2',axis=1)
    elif file_id == 2 or file_id == 3:
        df = df.iloc[1:]
    return df

In [4]:
def cut_half(x):
    # a function used in data_cleaning
    # cut the duplicated string half
    # e.g., cut "TaurangaTauranga" to "Tauranga"
    
    if x != None:
        if x[-4:] == "area":
            return (x[0:((len(x)-4)//2)])
        else:
            return (x[0:(len(x)//2)])
    else:
        return None

In [5]:
def find_number(x):
    # separate "salary" and "salary info"
    if x != None and not pd.isna(x):
        if any(char.isdigit() for char in x):
            return x + ";"
        else:
            return ";" + x 

In [6]:
def low_high_salary(x, file_id):
    # refine the salary using regular expression
    # get the lowest and the highest salary
    
    output = []
    if x != None:
        x = x.replace(',','')
        x = ''.join((ch if ch in '0123456789.' else ' ') for ch in x)
        x = x.replace(' .','').replace(' 0','0')
        listOfNumbers = [float(i) for i in x.split()]
        if len(listOfNumbers) == 1:
            output = [listOfNumbers[0], listOfNumbers[0]]
        elif len(listOfNumbers) > 1:
            output = [listOfNumbers[0], listOfNumbers[1]]
        # if "0k " in x or "5k " in x or "0K " in x or "5K " in x:
        if bool(re.match("\dk", x)):
            output = [i * 1000 for i in output]
        if output and output[0] < 1000 and output[1] < 1000:
            output = [i * 1760 for i in output]
        if output and output[0] < 1000 and output[1] > 1000:
            output[0] = output[0] * 1000
    if not output:
        if file_id == 0:
            lo = random.uniform(35000, 55000)
            output = [lo, random.uniform(lo, 55000)]
        elif file_id == 1:
            lo = random.uniform(77000, 141000)
            output = [lo, random.uniform(lo, 141000)]
        elif file_id == 2:
            lo = random.uniform(84000, 255000)
            output = [lo, random.uniform(lo, 255000)]
    listToStr = ' '.join([str(int(i)) for i in output])       
    return listToStr

In [7]:
def fix_posted_date(x):
    # refine the posted date
    if "Featured" in x or bool(re.search("\d[h,m]", x)):
        return str(0)
    elif bool(re.search("\dd", x)):
        return ''.join((ch if ch in '0123456789' else '') for ch in x)

In [8]:
def fix_company(x):
    if len(x) > 3 and x[:3] == "at ":
        return x[3:]
    else:
        return x

In [9]:
def data_cleaning(df, file_id):
    # data cleaning
    # file_id identifies which file it is
    
    # 1. Refine the "posted date" column
    
    column_date_place = df["Posted Date"].str.split(",", n = -1, expand = True) 
    df["Posted Date"] = column_date_place[0]
    df["Posted Place"] = column_date_place[2]
    
    
    # 2. Refine the "location" column
    
    # 2.1 Some rows contain the salary after "," and we need to remove them
    df["Location"] = df["Location"].str.split(",", n = 1, expand = True)[0]
    
    # 2.2 Need to remove the "location:" at the beginning of the strings, and then separate the region and the city
    column_location = df["Location"].str.split(": ", n = 3, expand = True) 
    df['Region'] = column_location[1]
    df['City'] = column_location[2]
    df = df.drop('Location',axis=1)
    
    # 2.3 Remove duplications
    df["Region"] = df["Region"].apply(cut_half)
    df["City"] = df["City"].apply(cut_half)
    
    
    # 3. Refine "classification"
    # Some rows contain the "classification" and the "subclassification", others contain the salary
    # Analyse "classification" and "salary" separately as df_classification and df_salary
    
    # 3.1 analyse "classification" 
    
    df_classification = df[df.Classification.str.contains(':',case=False)]
    column_classifications = df_classification["Classification"].str.split("subClassification: ", n = -1, expand = True)
    df_classification['Classification'] = column_classifications[0].str.split("classification: ", n = -1, expand = True)[1].apply(cut_half)
    df_classification['Sub-classification'] = column_classifications[1].apply(cut_half)

    
    # 3.2 analyse "salary"
    df_salary = df[~df.Classification.str.contains(':',case=True)]
    df_salary = df_salary.rename(columns = {'Classification':'Salary'})
    
    # 3.3 merge "classification" and "salary"
    df = pd.merge(df_classification, df_salary, how='outer')
    df = df[["Job", "URL", "Company", "Posted Date", "Posted Place", "Classification", "Sub-classification", "Region", "City", "Salary"]]
    
    
    # 4. Refine "salary"
    
    # 4.1 separate into "salary" and "salary info"
    df["Salary"] = df["Salary"].apply(find_number)
    column_salary_info = df["Salary"].str.split(";", n = 2, expand = True) 
    df["Salary"] = column_salary_info[0]
    if column_salary_info.shape[1] != 1:
        df["Salary Info"] = column_salary_info[1]
    else:
        df["Salary Info"] = np.nan
    
    # 4.2 obtain the lowest and the highest salary
    column_low_high = df["Salary"].apply(low_high_salary, args=(file_id,))
    column_low_high = column_low_high.str.split(" ", n = -1, expand = True) 
    df["Lowest Salary"] = column_low_high[0]
    df["Higest Salary"] = column_low_high[1]
    df = df.drop('Salary',axis=1)
    
    
    # 5 Merge "Company" and "Posted Place"
    
    df["Company"] = df["Company"].fillna(df['Posted Place'])
    df = df.drop('Posted Place',axis=1)
    
    
    # 6 Refine the posted date
    df["Posted Date (Days Ago)"] = df["Posted Date"].apply(fix_posted_date)
    df = df.drop('Posted Date',axis=1)
    
    
    # 7 Refine "Company"
    df["Company"] = df["Company"].apply(fix_company)
    
    
    #8 Remove null
    df = df.fillna(value='NO DATA')
    
    return df


### Admin 

In [10]:
df_admin = read_file(admin_file_name, admin_header_name, admin_id)
df_admin_final = data_cleaning(df_admin, admin_id)
df_admin_final.to_excel('Cleaned_NZ_Admin_JOBS_01_02.xlsx',sheet_name='Sheet1')
df_admin_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,Private Advertiser,Administration & Office Support,Office Management,Bay of Plenty,Tauranga,NO DATA,38779,44715,0
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Administration & Office Support,Receptionists,Bay of Plenty,Tauranga,NO DATA,52548,54784,0
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Administration & Office Support,Other,Auckland,NO DATA,NO DATA,50772,52438,4
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Administration & Office Support,Administrative Assistants,Southland,Invercargill,NO DATA,35288,36931,0
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,Private Advertiser,Administration & Office Support,Client & Sales Administration,Canterbury,Christchurch,NO DATA,37519,41290,4
5,Support Officer,https://www.seek.co.nz/job/50640393?type=stand...,Ministry for Primary Industries,Administration & Office Support,Administrative Assistants,Northland,Whangarei,NO DATA,38064,41398,0
6,Support Officer,https://www.seek.co.nz/job/50615674?type=stand...,"Ministry of Business, Innovation and Employment",Administration & Office Support,Other,Wellington,Wellington Central,NO DATA,46058,50658,5
7,office administrator,https://www.seek.co.nz/job/50640166?type=stand...,Hepburn Electrical Ltd,Administration & Office Support,Administrative Assistants,Bay of Plenty,Rotorua,NO DATA,44063,49601,0
8,Office Administrator,https://www.seek.co.nz/job/50639248?type=stand...,Webster Holland Ltd,Administration & Office Support,Administrative Assistants,Bay of Plenty,Tauranga,NO DATA,38414,41209,0
9,Administration Officer,https://www.seek.co.nz/job/50629393?type=stand...,New Zealand Police,Administration & Office Support,Other,Canterbury,NO DATA,NO DATA,47632,48004,3


### Banking

In [11]:
df_banking = read_file(banking_file_name, banking_header_name, banking_id)
df_banking_final = data_cleaning(df_banking, banking_id)
df_banking_final.to_excel('Cleaned_NZ_Banking_JOBS_01_02.xlsx',sheet_name='Sheet1')
df_banking_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Accounts Receivable,https://www.seek.co.nz/job/50568753?type=promo...,MTF Finance Mt Wellington,Banking & Financial Services,Credit,Auckland,Auckland Central,NO DATA,102265,122734,0
1,Internal Audit Manager,https://www.seek.co.nz/job/50556333?type=promo...,Industrial and Commercial Bank of China (New Z...,Banking & Financial Services,Compliance & Risk,Auckland,Auckland Central,NO DATA,133833,133856,0
2,Client Services Officer,https://www.seek.co.nz/job/50638706?type=stand...,NZ Funds Management Limited,Banking & Financial Services,Client Services,Auckland,Auckland Central,NO DATA,92865,136380,0
3,Private Wealth Assistant - Queenstown,https://www.seek.co.nz/job/50617226?type=stand...,Craigs Investment Partners,Banking & Financial Services,Client Services,Otago,Queenstown & Wanaka,NO DATA,89538,122174,5
4,Business Banking Credit Analyst,https://www.seek.co.nz/job/50615555?type=stand...,Kiwibank,Banking & Financial Services,Banking - Business,Auckland,Auckland Central,NO DATA,119565,124173,5
5,Operations Analyst,https://www.seek.co.nz/job/50620895?type=stand...,Kin,Banking & Financial Services,Analysis & Reporting,Wellington,Wellington Central,NO DATA,81735,92646,4
6,Loan Administrator,https://www.seek.co.nz/job/50621868?type=stand...,Freehold Express Limited,Banking & Financial Services,Account & Relationship Management,Canterbury,Christchurch,NO DATA,85195,114967,4
7,Banking Specialist - Virtual Channel,https://www.seek.co.nz/job/50617084?type=stand...,SBS Bank,Banking & Financial Services,Banking - Retail/Branch,Southland,Invercargill,NO DATA,81237,86468,5
8,Junior Buyer,https://www.seek.co.nz/job/50613329?type=stand...,PAK'nSAVE,Banking & Financial Services,Analysis & Reporting,Hawkes Bay,Hastings,NO DATA,124587,126920,5
9,Adviser Support,https://www.seek.co.nz/job/50626143?type=stand...,NetYourJob,Banking & Financial Services,Mortgages,Auckland,Rodney & North Shore,NO DATA,93777,110863,4


In [12]:
df_banking = read_file(banking_file_name, banking_header_name, banking_id)
df_banking_final = data_cleaning(df_banking, banking_id)
df_banking_final.to_excel('Cleaned_NZ_Banking_JOBS_01_02.xlsx',sheet_name='Sheet1')
df_banking_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Accounts Receivable,https://www.seek.co.nz/job/50568753?type=promo...,MTF Finance Mt Wellington,Banking & Financial Services,Credit,Auckland,Auckland Central,NO DATA,93227,133863,0
1,Internal Audit Manager,https://www.seek.co.nz/job/50556333?type=promo...,Industrial and Commercial Bank of China (New Z...,Banking & Financial Services,Compliance & Risk,Auckland,Auckland Central,NO DATA,101254,120767,0
2,Client Services Officer,https://www.seek.co.nz/job/50638706?type=stand...,NZ Funds Management Limited,Banking & Financial Services,Client Services,Auckland,Auckland Central,NO DATA,107993,125308,0
3,Private Wealth Assistant - Queenstown,https://www.seek.co.nz/job/50617226?type=stand...,Craigs Investment Partners,Banking & Financial Services,Client Services,Otago,Queenstown & Wanaka,NO DATA,97431,123400,5
4,Business Banking Credit Analyst,https://www.seek.co.nz/job/50615555?type=stand...,Kiwibank,Banking & Financial Services,Banking - Business,Auckland,Auckland Central,NO DATA,102913,107474,5
5,Operations Analyst,https://www.seek.co.nz/job/50620895?type=stand...,Kin,Banking & Financial Services,Analysis & Reporting,Wellington,Wellington Central,NO DATA,84651,114207,4
6,Loan Administrator,https://www.seek.co.nz/job/50621868?type=stand...,Freehold Express Limited,Banking & Financial Services,Account & Relationship Management,Canterbury,Christchurch,NO DATA,122965,130566,4
7,Banking Specialist - Virtual Channel,https://www.seek.co.nz/job/50617084?type=stand...,SBS Bank,Banking & Financial Services,Banking - Retail/Branch,Southland,Invercargill,NO DATA,80510,103444,5
8,Junior Buyer,https://www.seek.co.nz/job/50613329?type=stand...,PAK'nSAVE,Banking & Financial Services,Analysis & Reporting,Hawkes Bay,Hastings,NO DATA,85721,119349,5
9,Adviser Support,https://www.seek.co.nz/job/50626143?type=stand...,NetYourJob,Banking & Financial Services,Mortgages,Auckland,Rodney & North Shore,NO DATA,105907,136403,4


### CEO 

In [13]:
df_ceo = read_file(ceo_file_name, ceo_header_name, ceo_id)
df_ceo_final = data_cleaning(df_ceo, ceo_id)
df_ceo_final.to_excel('Cleaned_NZ_CEO_JOBS_03_02.xlsx',sheet_name='Sheet1')
df_ceo_final

Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,Director (x2) - Auckland Transport,https://www.seek.co.nz/job/50540328?type=promo...,Auckland Council,CEO & General Management,Board Appointments,Auckland,Auckland Central,NO DATA,219786,251067,0
1,Director - Panuku Development,https://www.seek.co.nz/job/50540626?type=promo...,Auckland Council,CEO & General Management,Board Appointments,Auckland,Auckland Central,NO DATA,202289,253513,0
2,Chief Executive,https://www.seek.co.nz/job/50623700?type=stand...,JacksonStone & Partners,CEO & General Management,CEO,Wellington,Porirua & Kapiti Coast,NO DATA,103740,168818,4
3,Chief Executive,https://www.seek.co.nz/job/50623848?type=stand...,JacksonStone & Partners,CEO & General Management,CEO,Manawatu,Wanganui,NO DATA,248378,253261,4
4,Chief Executive,https://www.seek.co.nz/job/50638864?type=stand...,Asset Recruitment Ltd,CEO & General Management,CEO,Waikato,Hamilton,NO DATA,140898,162501,0
5,General Secretary (CEO),https://www.seek.co.nz/job/50626028?type=stand...,New Zealand Labour Party,CEO & General Management,CEO,Wellington,Wellington Central,NO DATA,85136,91013,4
6,Chief Executive,https://www.seek.co.nz/job/50642634?type=stand...,NES Global Talent,CEO & General Management,CEO,Taranaki,New Plymouth,NO DATA,213316,213673,0
7,Chief Executive Officer,https://www.seek.co.nz/job/50641284?type=stand...,Tribe (New Zealand) Limited,CEO & General Management,CEO,Auckland,NO DATA,NO DATA,131692,170016,0
8,CEO & General Management,https://www.seek.co.nz/job/50606372?type=stand...,EzyStream,CEO & General Management,CEO,Canterbury,Christchurch,NO DATA,88233,229913,6
9,Chief Operating Officer (COO),https://www.seek.co.nz/job/50571339?type=stand...,Talent Army,CEO & General Management,COO & MD,Auckland,Rodney & North Shore,NO DATA,142861,251688,13


In [14]:
df_construction = read_file(construction_file_name, construction_header_name, construction_id)
df_construction

Unnamed: 0,Job,URL,Company,Location,Posted Date,Classification
0,å­—æ®µ1,å­—æ®µ1_link,å­—æ®µ2,å­—æ®µ3,å­—æ®µ5,å­—æ®µ6
1,REGIONAL FOREMAN TELCO SECTOR TEAM LEADER AUCK...,https://www.seek.co.nz/job/50541089?type=promo...,at Millennium Electrical Ltd,location: AucklandAucklandarea: Rodney & North...,"Featured,at",classification: ConstructionConstructionsubCla...
2,Bitumen Sprayer Operator,https://www.seek.co.nz/job/50548669?type=promo...,at Johnstone & Masters Ltd,location: Bay of PlentyBay of Plentyarea: Roto...,"Featured,at",classification: ConstructionConstructionsubCla...
3,General Labourers,https://www.seek.co.nz/job/50640801?type=stand...,at AWF,location: NorthlandNorthlandarea: WhangareiWha...,"6h ago,at",classification: ConstructionConstructionsubCla...
4,Miller Operator/Trainee,https://www.seek.co.nz/job/50640842?type=stand...,at Fulton Hogan,"location: AucklandAuckland,To be discussed wit...","6h ago,at",To be discussed with the successful candidate
5,General Labourers,https://www.seek.co.nz/job/50639450?type=stand...,at Tradestaff,location: Bay of PlentyBay of Plentyarea: Roto...,"9h ago,at",NZD19.25 - NZD20 per hour
6,Project Manager,https://www.seek.co.nz/job/50640890?type=stand...,at Faye Homes New Zealand Ltd,location: CanterburyCanterburyarea: Christchur...,"6h ago,at",classification: ConstructionConstructionsubCla...
7,OPERATIONS MANAGER,https://www.seek.co.nz/job/50639831?type=stand...,at Fraemohs Homes NZ Ltd,location: CanterburyCanterburyarea: Christchur...,"8h ago,at",classification: ConstructionConstructionsubCla...
8,CONSTRUCTION LABOURERS,https://www.seek.co.nz/job/50588043?type=stand...,at Alignz Recruitment,location: AucklandAucklandarea: Manukau & East...,"10d ago,at",$20 - $24.99 per hour
9,Trainee Traffic Controller,https://www.seek.co.nz/job/50639052?type=stand...,at 1st Call Recruitment,location: WellingtonWellingtonarea: Wellington...,"10h ago,at",classification: ConstructionConstructionsubCla...


In [15]:
df_construction_final = data_cleaning(df_construction, construction_id)
df_construction_final.to_excel('Cleaned_NZ_Construction_JOBS_03_02.xlsx',sheet_name='Sheet1')
df_construction_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Job,URL,Company,Classification,Sub-classification,Region,City,Salary Info,Lowest Salary,Higest Salary,Posted Date (Days Ago)
0,REGIONAL FOREMAN TELCO SECTOR TEAM LEADER AUCK...,https://www.seek.co.nz/job/50541089?type=promo...,Millennium Electrical Ltd,Construction,Foreperson/Supervisors,Auckland,Rodney & North Shore,NO DATA,,NO DATA,0
1,Bitumen Sprayer Operator,https://www.seek.co.nz/job/50548669?type=promo...,Johnstone & Masters Ltd,Construction,Plant & Machinery Operators,Bay of Plenty,Rotorua,NO DATA,,NO DATA,0
2,General Labourers,https://www.seek.co.nz/job/50640801?type=stand...,AWF,Construction,Other,Northland,Whangarei,NO DATA,,NO DATA,0
3,Project Manager,https://www.seek.co.nz/job/50640890?type=stand...,Faye Homes New Zealand Ltd,Construction,Project Management,Canterbury,Christchurch,NO DATA,,NO DATA,0
4,OPERATIONS MANAGER,https://www.seek.co.nz/job/50639831?type=stand...,Fraemohs Homes NZ Ltd,Construction,Management,Canterbury,Christchurch,NO DATA,,NO DATA,0
5,Trainee Traffic Controller,https://www.seek.co.nz/job/50639052?type=stand...,1st Call Recruitment,Construction,Other,Wellington,Wellington Central,NO DATA,,NO DATA,0
6,Construction Supervisor - Residential,https://www.seek.co.nz/job/50632332?type=stand...,Fletcher Building Limited,Construction,Management,Auckland,Manukau & East Auckland,NO DATA,,NO DATA,3
7,Site Supervisor - Commercial,https://www.seek.co.nz/job/50628751?type=stand...,Summerset,Construction,Foreperson/Supervisors,Taranaki,New Plymouth,NO DATA,,NO DATA,3
8,Project Coordinator,https://www.seek.co.nz/job/50640012?type=stand...,Alpha Rail,Construction,Project Management,Auckland,Waitakere & West Auckland,NO DATA,,NO DATA,0
9,Construction Manager,https://www.seek.co.nz/job/50631119?type=stand...,Building Recruitment Limited,Construction,Foreperson/Supervisors,Auckland,Auckland Central,NO DATA,,NO DATA,3
