# Setup

## Library import
We import all the required Python libraries

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30
pd.options.display.max_colwidth = None

# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.


# Data import
We retrieve all the required data for the analysis.

In [2]:
nz_admin = pd.read_excel(r'NZ_Admin_JOBS.xlsx')

In [3]:
nz_admin.head()

Unnamed: 0,字段1,字段1_link,字段2,字段3,字段4,字段5
0,Administrator,https://www.seek.co.nz/job/50582301?type=promoted#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,,location: Bay of PlentyBay of Plentyarea: TaurangaTauranga,"Featured,at,Private Advertiser",classification: Administration & Office SupportAdministration & Office SupportsubClassification: Office ManagementOffice Management
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promoted#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,Avenues Orthodontics,location: Bay of PlentyBay of Plentyarea: TaurangaTauranga,"Featured,at",classification: Administration & Office SupportAdministration & Office SupportsubClassification: ReceptionistsReceptionists
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=standard#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,New Zealand Police,location: AucklandAuckland,"4d ago,at",classification: Administration & Office SupportAdministration & Office SupportsubClassification: OtherOther
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=standard#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,Kew Pacific Island Early Learning Centre,location: SouthlandSouthlandarea: InvercargillInvercargill,"1h ago,at",classification: Administration & Office SupportAdministration & Office SupportsubClassification: Administrative AssistantsAdministrative Assistants
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=standout#searchRequestToken=feee129e-c80f-4f79-ac5f-98ddb6d6c22b,,location: CanterburyCanterburyarea: ChristchurchChristchurch,"4d ago,at,Private Advertiser",classification: Administration & Office SupportAdministration & Office SupportsubClassification: Client & Sales AdministrationClient & Sales Administration


In [4]:
nz_admin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   字段1       2708 non-null   object
 1   字段1_link  2708 non-null   object
 2   字段2       2686 non-null   object
 3   字段3       2708 non-null   object
 4   字段4       2708 non-null   object
 5   字段5       2708 non-null   object
dtypes: object(6)
memory usage: 127.1+ KB


## Function "renameCol"

In [5]:
def renameCol(df):
    return df.rename({
                '字段1': 'column1', 
                '字段1_link': 'column1_link',
                '字段2': 'column2', 
                '字段3': 'column3',
                '字段4': 'column4', 
                '字段5': 'column5'
            }, axis=1, inplace=True)

In [6]:
renameCol(nz_admin)

# Data processing
Put here the core of the notebook. Feel free di further split this section into subsections.

## Column1_link - types of job posted
## Function "cleanCol1_link"

In [7]:
def cleanCol1_link(df, col_edit, col_add, col_drop):
    df[col_add] = df[col_edit].apply(lambda row: re.search(r'=(\w+)#', row).group(1))
    df.drop(col_drop, axis=1, inplace=True)
    return df

In [8]:
df = cleanCol1_link(nz_admin, 'column1_link', 'job_advert_type', 'column1_link')

NameError: name 're' is not defined

## Column1 - Occupation

In [None]:
nz_admin.rename({'column1': 'occupation'}, axis=1, inplace=True)

## Column2 - company name

In [None]:
nz_admin['column2'].fillna(value="Private Advertiser", inplace=True)

In [None]:
nz_admin.rename({'column2': 'Company Name'}, axis=1, inplace=True)

## Column3 - Company location (Area) / Company location (suburb) / Salary (Benefit)
## Function "clean_Col3"

In [None]:
def address_area(address):
    lst_address = [x.strip() for x in address.split(":")][1]
    lst_area = re.findall('[A-Z][^A-Z]*', lst_address)
    area = ""
    for i in range(len(lst_area)//2):
        area += lst_area[i]
    return area

In [None]:
def address_suburb(address):
    lst_address = [x.strip() for x in address.split(":")]
    if len(lst_address) == 3:
        lst_suburb = re.findall('[A-Z][^A-Z]*', lst_address[2])
        suburb = ""
        for i in range(len(lst_suburb)//2):
            suburb += lst_suburb[i]
        return suburb

In [None]:
def address_benefit(address):
    benefit = address.split(',', 1)
    if len(benefit)>1:
        return benefit[1]

In [None]:
col_add = [
    'Company location (Area)',
    'Company location (suburb)',
    'Salary($)(Benefit)'
]

In [None]:
func = [
    address_area,
    address_suburb,
    address_benefit
]

In [None]:
def clean_Col3(df, col_edit, col_add, col_drop, func):
    df[col_add[0]] = df[col_edit].apply(lambda row: func[0](row))
    df[col_add[1]] = df[col_edit].apply(lambda row: func[1](row))
    df[col_add[2]] = df[col_edit].apply(lambda row: func[2](row))
    df.drop(col_drop, axis=1, inplace=True)
    return df

In [None]:
df = clean_Col3(nz_admin, 'column3', col_add, 'column3', func)

## Column4 - Job_Posted(days ago)

In [None]:
def job_posted_days(row):
    try:
        if re.search(r'(\d*)(\w)', re.search(r'(\d+)(\w)', row).group()).group(2) == 'd':
            return re.search(r'(\d*)(\w)', re.search(r'(\d+)(\w)', row).group()).group(1)
        else:
            return 0
    except:
      return None

In [None]:
def clean_Col4(df, col_edit, col_add, col_drop, func):
    df[col_add] = df[col_edit].apply(lambda row: func(row))
    df.drop(col_drop, axis=1, inplace=True)
    return df

In [None]:
df = clean_Col4(nz_admin, 'column4', 'Job_Posted(days ago)', 'column4', job_posted_days)

## Column5 - job_classification / job_subclassification

In [None]:
def return_classification(row):
    lst_row = row.split(':')
    classfication = ""
    if len(lst_row)==1:
        return None
    else:
        lst_class = re.findall('[A-Z][^A-Z]*', lst_row[1].split('subClassfication')[0])
        for i in range(len(lst_class)//2):
            classfication += lst_class[i]
        return classfication

In [None]:
def return_subclass(row):
    lst_row = row.split(':')
    subclass = ""
    if len(lst_row)==1:
        return None
    else:
        lst_subclass = re.findall('[A-Z][^A-Z]*', lst_row[2])
        for i in range(len(lst_subclass)//2):
            subclass += lst_subclass[i]
        return subclass

In [None]:
col_add = [
    'job_classification',
    'job_subclassification'
]

In [None]:
func = [
    return_classification,
    return_subclass
]

In [None]:
def clean_Col5(df, col_edit, col_add, col_drop, func):
    df[col_add[0]] = df[col_edit].apply(lambda row: func[0](row))
    df[col_add[1]] = df[col_edit].apply(lambda row: func[1](row))
    df.drop(col_drop, axis=1, inplace=True)
    return df

In [None]:
df = clean_Col5(nz_admin, 'column5', col_add, 'column5', func)

## Clean dataset

In [None]:
nz_admin_clean = nz_admin

In [None]:
nz_admin_clean.head(20)

# References
We report here relevant references:
1. author1, article1, journal1, year1, url1
2. author2, article2, journal2, year2, url2