In [1]:
import pandas as pd

## Process the relationship between SA2 SUA and State
data collected from: https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1270.0.55.001July%202016?OpenDocument

In [2]:
SA2_S = pd.read_csv('../data/SUDO/SA2_State.csv')
SA2_SUA = pd.read_csv('../data/SUDO/SA2_SUA.csv')
join_SA2_SUA = SA2_SUA[['SA2_MAINCODE_2016','SUA_NAME_2016','SUA_CODE_2016']]
join_SA2_S = SA2_S[['SA2_MAINCODE_2016']]
STATE_CODE = {1: "new south wales", 2: "victoria", 3: "queensland",
            4: "south australia", 5: "western australia", 6: "tasmania",
            7: "northern territory", 8: "australian capital territory", 
            9: "offshore territories"}

In [3]:
joined_area = pd.merge(join_SA2_SUA, join_SA2_S, how='inner', on = 'SA2_MAINCODE_2016')
joined_area['state'] = None
for idx,row in joined_area.iterrows():
    joined_area.loc[idx,'state'] = STATE_CODE[(row['SUA_CODE_2016']//1000)]

## Preprocess SUDO data
dataset name: 
- SA2-P40a Labour Force Status by Age by Sex-Census 2016
- SA2-P40b Labour Force Status by Age by Sex-Census 2016


In [4]:
all_a = pd.read_csv(r'../data/SUDO\SA2-P40_Labour_Force_Status-CSV\sa2_p40a_data.csv').drop(' sa2_name16',axis = 1)
all_b = pd.read_csv(r'../data/SUDO\SA2-P40_Labour_Force_Status-CSV\sa2_p40b_data.csv')

In [5]:
joined_all = pd.merge(all_a, all_b, how='inner', on = ' sa2_main16')

In [6]:
joined_all

Unnamed: 0,m_n_the_lf_15_19,m_emp_a_f_wrk_75_84,m_ue_lk_fr_ft_wrk_25_34,f_emp_wrk_ft_75_84,f_tot_emp_65_74,f_emp_a_f_wrk_75_84,f_emp_wrk_ft_65_74,f_tot_unemp_15_19,m_ue_lk_for_pt_wk_15_19,f_tot_unemp_total,...,p_tot_75_84_yr,p_tot_25_34_yr,f_tot_lf_45_54_yr,f_n_the_lf_tot,p_tot_unemp_35_44,p_emp_wrk_pt_65_74,f_lf_st_ns_65_74,p_lf_st_ns_75_84,p_ue_lk_fr_ft_wrk_15_19,f_tot_85_yr_over
0,41,0,4,0,58,0,26,0,3,28,...,234,292,203,616,5,75,43,26,4,51
1,115,0,15,0,46,0,18,23,12,94,...,302,1083,495,1045,34,49,20,25,17,59
2,57,0,0,9,59,0,27,4,0,24,...,197,258,224,460,3,54,22,28,0,19
3,199,3,14,8,160,0,55,12,15,73,...,438,2883,958,1363,28,221,124,77,8,42
4,67,0,22,3,48,0,14,9,7,101,...,703,677,310,1663,37,65,60,111,13,189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2286,0,0,0,0,0,0,0,0,0,0,...,0,4,0,0,0,0,0,0,0,0
2287,19,0,8,0,9,0,0,0,0,20,...,19,435,60,128,14,9,0,0,0,0
2288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0


### employment state 
for employment ratio calculation: ER = employed people / (total people - people not give labour force state) * 100%
- Assumption: the distribution for labour force states of persons labour force status not stated is the same as the area's labour force states distribution 

In [7]:
import re

for col_name in list(joined_all.columns):
    if 'tot' in col_name and not re.search('[a-zA-Z].*\d|\d.*[a-zA-Z]', col_name) and col_name[1] == 'p':
        print(col_name)

 p_tot_emp_tot
 p_n_the_lf_tot
 p_tot_unemp_total
 p_emp_wrk_pt_tot
 p_hr_wrk_ns_tot
 p_emp_a_f_wrk_tot
 p_tot_tot
 p_emp_wrk_ft_tot
 p_lf_st_ns_total
 p_ue_looking_for_ft_wk_tot
 p_ue_lk_for_pt_wrk_tot
 p_tot_lf_tot


- p_tot_emp_tot: Persons Total employed Total
- p_n_the_lf_tot: Persons Not in the labour force Total
- p_tot_unemp_total: Persons Total unemployed Total
- p_emp_wrk_pt_tot: Persons Employed worked Part-time Total
- p_hr_wrk_ns_tot: Persons Hours worked not stated Total
- p_emp_a_f_wrk_tot: Persons Employed away from work Total
- p_tot_tot: Persons Total Total
- p_emp_wrk_ft_tot: Persons Employed worked Full-time Total
- p_lf_st_ns_total: Persons Labour force status not stated Total
- p_ue_looking_for_ft_wk_tot: Persons Unemployed looking for Full-time work Total
- p_ue_lk_for_pt_wrk_tot: Persons Unemployed looking for Part-time work Total
- p_tot_lf_tot: Persons Total labour force Total

In [8]:
emp_df = joined_all[[' p_tot_emp_tot',' p_tot_tot',' p_lf_st_ns_total',' sa2_main16']].rename(columns={' p_tot_emp_tot': 'people_employed',' sa2_main16':'SA2_maincode'})
emp_df['people_total'] = emp_df[' p_tot_tot']-emp_df[' p_lf_st_ns_total']
emp_df = emp_df[['people_total','people_employed','SA2_maincode']]
emp_df

Unnamed: 0,people_total,people_employed,SA2_maincode
0,2781,1600,101021007
1,6202,4187,101021008
2,2614,1681,101031015
3,13017,10089,101031016
4,5901,2606,101041017
...,...,...,...
2286,4,3,801101134
2287,1242,1004,801101135
2288,0,0,801101136
2289,5,4,801101138


### gender
for Gender ratio calculation: GR = (Number of males / Number of females) * 100%

- m_tot_tot: Males Total Total
- f_tot_tot: Females Total Total

In [9]:
gender_df = joined_all[[' m_tot_tot',' f_tot_tot',' sa2_main16']].rename(columns={' m_tot_tot': 'males',' f_tot_tot':'females',' sa2_main16':'SA2_maincode'})
gender_df

Unnamed: 0,males,females,SA2_maincode
0,1569,1575,101021007
1,3211,3364,101021008
2,1466,1423,101031015
3,8877,7066,101031016
4,3187,3292,101041017
...,...,...,...
2286,16,14,801101134
2287,634,695,801101135
2288,0,0,801101136
2289,7,3,801101138


### age range
find the most popular age range in each area: mode 

calculate the percentage of people in this age stage in the area: (mode / tot_people) * 100%

In [10]:
for col_name in list(joined_all.columns):
    if col_name[1:7] == 'p_tot_' and 'emp' not in col_name and 'lf' not in col_name and col_name != ' p_tot_tot':
        print(col_name)

 p_tot_15_19_yr
 p_tot_55_64_yr
 p_tot_45_54_yr
 p_tot_85_yr_over
 p_tot_65_74_yr
 p_tot_20_24_yr
 p_tot_35_44_yr
 p_tot_75_84_yr
 p_tot_25_34_yr


In [11]:
import numpy as np

year_range = ['SA2_maincode','65_74_yr','75_84_yr','85_yr_over','total_people']
year_full = joined_all[[' sa2_main16',' p_tot_65_74_yr', ' p_tot_75_84_yr', ' p_tot_85_yr_over',' p_tot_tot']]
col_dict = dict(zip(year_full.columns, year_range))
year_full = year_full.rename(columns=col_dict)
year_full

Unnamed: 0,SA2_maincode,65_74_yr,75_84_yr,85_yr_over,total_people
0,101021007,582,234,83,3142
1,101021008,624,302,83,6570
2,101031015,486,197,42,2889
3,101031016,1722,438,98,15944
4,101041017,1254,703,296,6483
...,...,...,...,...,...
2286,801101134,3,0,0,29
2287,801101135,55,19,0,1335
2288,801101136,0,0,0,0
2289,801101138,0,0,0,5


## Merge SUDO with SUA

In [12]:
SUA_employ = pd.merge(joined_area, emp_df, how='inner', left_on = 'SA2_MAINCODE_2016', right_on='SA2_maincode')
SUA_employ = SUA_employ.drop(columns=['SA2_MAINCODE_2016','SA2_maincode'])

In [13]:
SUA_employ = SUA_employ.groupby(['SUA_NAME_2016','state'])['people_total','people_employed'].sum()
SUA_employ = SUA_employ.reset_index()
SUA_employ['employment_rate'] = SUA_employ['people_employed'] / SUA_employ['people_total'] * 100
SUA_employ

  SUA_employ = SUA_employ.groupby(['SUA_NAME_2016','state'])['people_total','people_employed'].sum()


Unnamed: 0,SUA_NAME_2016,state,people_total,people_employed,employment_rate
0,Adelaide,south australia,993435,572587,57.637087
1,Albany,western australia,24753,14137,57.112269
2,Albury - Wodonga,new south wales,66004,40396,61.202351
3,Alice Springs,northern territory,18594,13102,70.463590
4,Armidale,new south wales,18045,9737,53.959546
...,...,...,...,...,...
105,Warwick,queensland,11320,6033,53.295053
106,Whyalla,south australia,16214,8279,51.060812
107,Wollongong,new south wales,218570,122653,56.116118
108,Yanchep,western australia,7847,4692,59.793552


In [14]:
SUA_gender = pd.merge(joined_area, gender_df, how='inner', left_on = 'SA2_MAINCODE_2016', right_on='SA2_maincode')
SUA_gender = SUA_gender.drop(columns=['SA2_MAINCODE_2016','SA2_maincode'])

SUA_gender = SUA_gender.groupby(['SUA_NAME_2016','state'])['males','females'].sum()
SUA_gender = SUA_gender.reset_index()

SUA_gender['gender_ratio'] = SUA_gender['males'] / SUA_gender['females'] * 100
SUA_gender

  SUA_gender = SUA_gender.groupby(['SUA_NAME_2016','state'])['males','females'].sum()


Unnamed: 0,SUA_NAME_2016,state,males,females,gender_ratio
0,Adelaide,south australia,502651,539410,93.185332
1,Albany,western australia,12702,13638,93.136824
2,Albury - Wodonga,new south wales,34155,36751,92.936247
3,Alice Springs,northern territory,10439,11276,92.577155
4,Armidale,new south wales,9285,10340,89.796905
...,...,...,...,...,...
105,Warwick,queensland,5633,6348,88.736610
106,Whyalla,south australia,8756,8705,100.585870
107,Wollongong,new south wales,112172,118162,94.930688
108,Yanchep,western australia,4137,4537,91.183601


In [15]:
# define a function to extract lower and upper bounds of age
def extract_age_range(age_str):
    lower, upper = re.findall(r'\d+', age_str)
    return f"{lower} to {upper} years-old"

In [19]:
SUA_age = pd.merge(joined_area, year_full, how='inner', left_on = 'SA2_MAINCODE_2016', right_on='SA2_maincode')
SUA_age = SUA_age.drop(columns=['SA2_MAINCODE_2016','SA2_maincode'])

SUA_age = SUA_age.groupby(['SUA_NAME_2016','state'])['65_74_yr','75_84_yr','85_yr_over','total_people'].sum()
SUA_age = SUA_age.reset_index()

SUA_age['ageing_population'] = SUA_age['65_74_yr']+SUA_age['75_84_yr']+SUA_age['85_yr_over']
SUA_age['ageing_population_percentage'] = SUA_age['ageing_population'] / SUA_age['total_people'] * 100

SUA_puplar_age_range = SUA_age[['SUA_NAME_2016','state','ageing_population','total_people','ageing_population_percentage']]

  SUA_age = SUA_age.groupby(['SUA_NAME_2016','state'])['65_74_yr','75_84_yr','85_yr_over','total_people'].sum()


## Combine 3 datasets

In [20]:
SUA_age_gender = pd.merge(SUA_puplar_age_range, SUA_gender, how='inner', on=['SUA_NAME_2016','state'])
SUA_all = pd.merge(SUA_age_gender, SUA_employ, how='inner', on=['SUA_NAME_2016','state'])
SUA_all = SUA_all.rename(columns={'SUA_NAME_2016': 'SUA_NAME'})

with open('../data/SUDO/SUA_jsonfile.json', 'w') as f:
    SUA_all.to_json(f, orient='records')