In [47]:
import pandas as pd
import numpy as np

In [48]:
# Show every column when a DataFrame is rendered (None disables the column cap).
pd.options.display.max_columns = None

In [49]:
# Read every column as text so identifiers with leading zeros
# (postcodes, phone numbers) are not coerced to numbers.
df = pd.read_csv(filepath_or_buffer='education.csv', dtype=str)

In [50]:
# Normalise whitespace in every column: trim both ends, then collapse any run
# of two or more spaces into a single space. A single-pass '  ' -> ' ' replace
# would leave runs of three or more spaces only partly collapsed, and
# regex=True is spelled out because the default of str.replace differs between
# pandas versions.
for col in df.columns:
    df[col] = (
        df[col]
        .str.strip()
        .str.replace(r' {2,}', ' ', regex=True)
    )

In [51]:
# Credit card numbers and phone numbers should contain no spaces at all,
# so drop every space character from both columns.
cols = ['credit_card_number', 'phone']
df[cols] = df[cols].apply(lambda series: series.str.replace(' ', ''))

In [52]:
# Lower-case the text of every column so later comparisons are case-insensitive.
df = df.apply(lambda column: column.str.lower())

In [53]:
# strptime's %H only accepts hours 00-23, so an hour written as '24' is
# rewritten to '00'. The date/time separator is a lower-case 't' here because
# every column has already been lower-cased.
# NOTE(review): the date part is left untouched, so 24:00 becomes 00:00 of the
# same day rather than of the following day - confirm this is intended.
cols = ['employment_timestamp']
df[cols] = df[cols].apply(lambda series: series.str.replace('t24', 't00'))

In [54]:
def fix_email(row):
    """Repair an e-mail address whose '@' is missing.

    The repair assumes the '@' was written as a '.', i.e. the address looks
    like ``user.domain.com[suffix]``. The last '.' before the final '.com' is
    treated as the position of the missing '@'.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'email'`` (a DataFrame row or a dict).

    Returns
    -------
    str
        The original address when it already contains '@'; the repaired
        address when the pattern above matches; otherwise ``'unknown'``
        (missing value, no '.com', or no usable user/domain part).
    """
    email = str(row['email'])
    if '@' in email:
        return email

    # Split on the LAST '.com' so nothing after an earlier '.com' substring
    # (e.g. inside '.company') is silently dropped.
    pre, sep, post = email.rpartition('.com')
    if not sep:
        return 'unknown'

    pre_dot = pre.rfind('.')
    # rfind returns -1 when there is no dot; slicing with -1 would build a
    # garbage address, so bail out. Also reject an empty user or domain part.
    if pre_dot <= 0 or pre_dot == len(pre) - 1:
        return 'unknown'

    user = pre[:pre_dot]
    domain = pre[pre_dot + 1:]
    return f'{user}@{domain}.com{post}'

In [55]:
from datetime import datetime
# Parse the (lower-cased) ISO-like timestamp text, including its UTC offset,
# into timezone-aware datetimes. A list is assigned so pandas infers the dtype.
df['employ_ts'] = list(
    map(
        lambda stamp: datetime.strptime(stamp, '%Y-%m-%dt%H:%M%z'),
        df['employment_timestamp'],
    )
)

In [56]:
# Break the parsed employment timestamp into one column per calendar/time
# component, followed by a date-only column.
for part in ('year', 'month', 'day', 'hour', 'minute'):
    df[f'employ_{part}'] = [getattr(ts, part) for ts in df['employ_ts']]
df['employ_dt'] = [ts.date() for ts in df['employ_ts']]

In [57]:
# Row-wise repair of e-mail addresses that are missing their '@'.
df['email_fix'] = df.apply(fix_email, axis=1)

In [58]:
# Render the frame to eyeball the derived employ_* and email_fix columns.
# NOTE(review): the rendered rows include personal fields (names, addresses,
# phone and card numbers) - consider clearing this output before sharing.
display(df)

Unnamed: 0,rec_id,ssn,first_name,middle_name,last_name,gender,current_age,birth_date,street_address,suburb,postcode,state,phone,email,education,occupation,salary,credit_card_number,years_of_experience,employment_timestamp,employ_ts,employ_year,employ_month,employ_day,employ_hour,employ_minute,employ_dt,email_fix
0,r64930958,f169308549,danny,harold,wallace,f,53,18/6/1964,42 bank street,west ryde,2114,nsw,0297335601,danny.wallace@gmail.com,advance-diploma,legal-professional,112638,3321271655315968,37,2017-06-02t23:53+00:00,2017-06-02 23:53:00+00:00,2017,6,2,23,53,2017-06-02,danny.wallace@gmail.com
1,r49823664,g196248364,george,allen,morrison,m,10,12/3/2010,1 leura lane watersley cottage,hamilton,3300,vic,,spfefrcmeq@mail.com,primary-school,,0,,0,2020-04-20t03:39+00:00,2020-04-20 03:39:00+00:00,2020,4,20,3,39,2020-04-20,spfefrcmeq@mail.com
2,r93715161,e111796351,stephon,hill,warren,f,31,25/12/1989,39 mcnabb loop collier park vlg,como,6152,wa,0871245514,stephon77@hotmail.com,secondary-school,casual-worker,9203,8296553479632282,12,2020-05-20t06:28+00:00,2020-05-20 06:28:00+00:00,2020,5,20,6,28,2020-05-20,stephon77@hotmail.com
3,r13797518,f113787195,matthew,e,heuer,m,23,3/11/1997,16 walker avenue flt 2,mitcham,3132,vic,0306974454,heuer21@mail.com,bachelor-degree,legal-professional,,9146477569607818,4,2020-06-26t11:01+00:00,2020-06-26 11:01:00+00:00,2020,6,26,11,1,2020-06-26,heuer21@mail.com
4,r56226376,g166722356,melisa,t,kellogg,f,63,12/11/1957,9 hallett close riverside garden,douglas,4814,qld,0732902798,,certificate-iv,other-technician,102427,1038252846505843,41,2017-08-16t03:18+00:00,2017-08-16 03:18:00+00:00,2017,8,16,3,18,2017-08-16,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,r52518161,i161581251,gracie,little,ratcliffe,f,12,12/8/2008,5 caporn avenue belgrave house,belgrave,3160,vic,0350799998,gracie.ratcliffe@aol.com,secondary-school,,0,,0,2020-07-12t17:35+00:00,2020-07-12 17:35:00+00:00,2020,7,12,17,35,2020-07-12,gracie.ratcliffe@aol.com
19996,r02061556,g152500166,allan,landon,hurdle,m,52,3/8/1968,5 barrett drive floreat village,alice springs,0870,nt,0860047509,sspgooetal@aol.com,certificate-iii,ict-technician,81792,6942787195925603,31,2018-06-01t20:09+00:00,2018-06-01 20:09:00+00:00,2018,6,1,20,9,2018-06-01,sspgooetal@aol.com
19997,r78814782,g178482871,joshua,michael,munn,f,12,22/4/2008,5 tiwi gardens road tiwi professional suites,tiwi,0810,nt,0869895100,joshua.munn@mail.com.au,secondary-school,,0,,0,2020-05-21t01:32+00:00,2020-05-21 01:32:00+00:00,2020,5,21,1,32,2020-05-21,joshua.munn@mail.com.au
19998,r74078696,h169776084,carol,lucille,harms,f,64,1/12/1956,3 ford avenue harmony,medowie,2318,nsw,0251342443,harms.carol@mail.com,certificate-i,clerks,-9999,1065138163971671,45,2020-04-24t00:34+00:00,2020-04-24 00:34:00+00:00,2020,4,24,0,34,2020-04-24,harms.carol@mail.com


In [59]:
# Reference data for checking that the suburb entered belongs to the postcode
# entered (pcode_valid yields 1 = match, 0 = no match, 'invalid postcode' =
# postcode not in the reference file).
pc = pd.read_csv('australian_postcodes.csv', dtype=str)[['postcode', 'locality', 'state']]
# postcode -> list of locality names sharing that postcode
pc1 = pc['locality'].groupby(pc['postcode']).apply(list)
# Blank out missing values so the validators can compare against ''.
df = df.replace(to_replace=np.nan, value='', regex=True)

In [60]:
def pcode_valid(row, lookup=None):
    """Check whether the row's suburb is listed under the row's postcode.

    Parameters
    ----------
    row : mapping
        Must provide ``'postcode'`` and ``'suburb'``. The suburb is compared
        as-is, so it is expected to be lower-cased and trimmed already.
    lookup : mapping, optional
        Maps a postcode to an iterable of locality names and raises KeyError
        for an unknown postcode. Defaults to the module-level ``pc1``.

    Returns
    -------
    1 if the suburb is listed under the postcode, 0 if it is not,
    ``'invalid postcode'`` if the postcode is absent from the lookup, and
    None when the postcode or the suburb is blank.
    """
    pcode_src = row['postcode']
    suburb_src = row['suburb']
    if pcode_src == '' or suburb_src == '':
        return None

    if lookup is None:
        lookup = pc1

    try:
        localities = lookup[pcode_src]
    except KeyError:
        return 'invalid postcode'

    # Skip non-string entries (e.g. NaN from a blank locality cell), which
    # would otherwise raise AttributeError on .lower(), and normalise the
    # reference names the same way the source suburbs were normalised.
    suburbs_lookup = {
        name.strip().lower() for name in localities if isinstance(name, str)
    }
    return 1 if suburb_src in suburbs_lookup else 0

In [61]:
# Row-wise check that each suburb is listed under its postcode.
df['pcode_check'] = df.apply(pcode_valid, axis=1)

In [62]:
# Build the locality+state lookup as a NEW frame. A plain `pc2 = pc` only
# aliases the same object, so adding the column would also mutate `pc` (itself
# a column slice of the raw file, which can trigger SettingWithCopyWarning).
# The key is the lower-cased locality name immediately followed by the state.
pc2 = pc.assign(key=(pc['locality'] + pc['state']).str.lower())

In [63]:
def suburb_valid(row, lookup=None):
    """Check whether the row's postcode is one registered for its suburb+state.

    Parameters
    ----------
    row : mapping
        Must provide ``'postcode'``, ``'suburb'`` and ``'state'``.
    lookup : pandas.DataFrame, optional
        Frame with a ``'key'`` column (locality immediately followed by state,
        lower-cased) and a ``'postcode'`` column. Defaults to the module-level
        ``pc2``.

    Returns
    -------
    1 if any reference row for the suburb+state carries the row's postcode,
    0 if none does, ``'invalid locality'`` if the suburb+state is absent from
    the lookup, and None when postcode, suburb or state is blank.
    """
    pcode_src = row['postcode']
    suburb_src = row['suburb']
    state_src = row['state']
    if pcode_src == '' or suburb_src == '' or state_src == '':
        return None

    if lookup is None:
        lookup = pc2

    src_key = suburb_src + state_src
    matches = lookup.loc[lookup['key'] == src_key, 'postcode']
    if matches.empty:
        return 'invalid locality'

    # A locality can appear under several postcodes; comparing only the first
    # match would wrongly report a correct postcode as a mismatch.
    return 1 if (matches == pcode_src).any() else 0

In [64]:
# Row-wise check that each postcode matches the one on file for suburb + state.
df['suburb_check'] = df.apply(suburb_valid, axis=1)

In [65]:
# Parse day/month/year text into datetime objects, stored in a sibling
# '<column>_dt' column so the original text is kept.
cols = ['birth_date']
for col in cols:
    df[f'{col}_dt'] = list(
        map(lambda text: datetime.strptime(text, '%d/%m/%Y'), df[col])
    )

In [66]:
# Persist the cleaned and validated frame; the row index is not written.
df.to_csv(path_or_buf='education_clean_2.csv', index=False)