In [1]:
# Setup: imports and display options
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

DATA_PATH = Path('/Users/dionpapadopoulos/Downloads/project-2-group-buy-now-pay-later-industry-project-44/data')

# Use 2024 Data by Region (DbR) SA2 income release + 2024 locality→SA2 mapping
SA2_INCOME_CSV = DATA_PATH / 'income/sa2_income.csv'
LOCALITY_TO_SA2_CSV = DATA_PATH / 'income/2024 Locality to 2021 SA2 Coding Index.csv'

sa2_income = pd.read_csv(SA2_INCOME_CSV)
locality_to_sa2 = pd.read_csv(LOCALITY_TO_SA2_CSV)

# Align key names
sa2_income = sa2_income.rename(columns={
    'Statistical Areas Level 2 2021 code': 'SA2_CODE_2021',
    'Statistical Areas Level 2 2021 name': 'SA2_NAME_2021',
})

# Coerce keys to consistent dtypes and trim whitespace
for df in (sa2_income, locality_to_sa2):
    for col in ('SA2_CODE_2021', 'SA2_NAME_2021'):
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip()

# Choose an income column from the 2024 DbR release (personal median total income)
income_candidates = [
    c for c in sa2_income.columns
    if c.startswith('Personal income: Median total income (excl. Government pensions and allowances)')
]
INCOME_COL = income_candidates[0] if income_candidates else None

cols_to_keep = ['SA2_CODE_2021', 'SA2_NAME_2021'] + ([INCOME_COL] if INCOME_COL else [])

merged = locality_to_sa2.merge(
    sa2_income[cols_to_keep],
    on=['SA2_CODE_2021', 'SA2_NAME_2021'],
    how='left'
)

print('Using income column:', INCOME_COL)
print(merged.shape)
merged.head(3)


Using income column: Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)
(34213, 8)


Unnamed: 0,LOCALITY_ID,LOCALITY_NAME,LOCALITY_TYPE,POSTCODE,STATE,SA2_CODE_2021,SA2_NAME_2021,Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)
0,ABS10742,RIVERLEA,EXTRA LOCALITY,2902.0,ACT,801111140,ACT - South West,68987.0
1,ABS10807,KINLYSIDE,EXTRA LOCALITY,2913.0,ACT,801041117,Gungahlin - West,
2,ABS11184,PARKWOOD,EXTRA LOCALITY,2618.0,ACT,801011144,West Belconnen,


In [2]:
# Missing values overview
na_counts = merged.isna().sum().sort_values(ascending=False)
na_pct = (merged.isna().mean() * 100).round(2).sort_values(ascending=False)

na_summary = (
    pd.concat([na_counts.rename('missing_count'), na_pct.rename('missing_pct')], axis=1)
    .sort_values(['missing_count', 'missing_pct'], ascending=False)
)

print(merged.shape)
na_summary.head(30)


(34213, 8)


Unnamed: 0,missing_count,missing_pct
Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020),1699,4.97
POSTCODE,185,0.54
LOCALITY_NAME,0,0.0
LOCALITY_ID,0,0.0
LOCALITY_TYPE,0,0.0
STATE,0,0.0
SA2_CODE_2021,0,0.0
SA2_NAME_2021,0,0.0


In [3]:
# Duplicate analysis
# 1) Full-row duplicates
full_dupe_count = merged.duplicated().sum()
print(f"Full-row duplicates: {full_dupe_count}")

# 2) Key subset duplicates: choose a join key
# Guessing columns from merged file header
candidate_keys = [
    'POSTCODE',
    'LOCALITY_NAME',
    'STATE',
    'SA2_CODE_2021',
    'SA2_NAME_2021'
]
existing_keys = [c for c in candidate_keys if c in merged.columns]
print('Existing key columns:', existing_keys)

if existing_keys:
    key_dupes = merged.duplicated(subset=existing_keys, keep=False)
    dupe_groups = (
        merged.loc[key_dupes, existing_keys]
        .value_counts()
        .reset_index(name='rows_per_key')
        .sort_values('rows_per_key', ascending=False)
    )
    print('Rows with duplicate keys:', key_dupes.sum())
    dupe_groups.head(20)
else:
    print('No expected key columns found; adjust candidate_keys if needed.')


Full-row duplicates: 0
Existing key columns: ['POSTCODE', 'LOCALITY_NAME', 'STATE', 'SA2_CODE_2021', 'SA2_NAME_2021']
Rows with duplicate keys: 4


In [4]:
# Relationship checks between locality and SA2
cols = merged.columns
has_locality = 'LOCALITY_NAME' in cols and 'POSTCODE' in cols and 'STATE' in cols
has_sa2 = 'SA2_CODE_2021' in cols and 'SA2_NAME_2021' in cols

if has_locality and has_sa2:
    locality_key = ['POSTCODE', 'STATE', 'LOCALITY_NAME']
    sa2_key = ['SA2_CODE_2021', 'SA2_NAME_2021']

    # Locality -> how many distinct SA2s?
    locality_to_sa2 = (
        merged.groupby(locality_key)[sa2_key]
        .nunique()
        .rename(columns={'SA2_CODE_2021': 'distinct_sa2_codes', 'SA2_NAME_2021': 'distinct_sa2_names'})
        .reset_index()
    )
    locality_multi = locality_to_sa2.query('distinct_sa2_codes > 1 or distinct_sa2_names > 1')

    # SA2 -> how many distinct localities?
    sa2_to_locality = (
        merged.groupby(sa2_key)[['POSTCODE', 'STATE', 'LOCALITY_NAME']]
        .nunique()
        .rename(columns={'POSTCODE': 'distinct_postcodes', 'STATE': 'distinct_states', 'LOCALITY_NAME': 'distinct_localities'})
        .reset_index()
    )
    sa2_multi = sa2_to_locality.query('distinct_postcodes > 1 or distinct_states > 1 or distinct_localities > 1')

    print('Localities mapping to multiple SA2s:', len(locality_multi))
    display(locality_multi.head(20))

    print('SA2s mapping to multiple localities/postcodes:', len(sa2_multi))
    display(sa2_multi.head(20))
else:
    print('Expected columns not present; check merge output.')


Localities mapping to multiple SA2s: 0


Unnamed: 0,POSTCODE,STATE,LOCALITY_NAME,distinct_sa2_codes,distinct_sa2_names


SA2s mapping to multiple localities/postcodes: 1936


Unnamed: 0,SA2_CODE_2021,SA2_NAME_2021,distinct_postcodes,distinct_states,distinct_localities
0,101021007,Braidwood,2,1,49
2,101021009,Queanbeyan,1,1,4
3,101021010,Queanbeyan - East,1,1,4
4,101021012,Queanbeyan West - Jerrabomberra,2,1,4
5,101021610,Googong,1,1,2
6,101021611,Queanbeyan Surrounds,6,1,41
7,101031013,Bombala,5,1,40
8,101031014,Cooma,1,1,6
9,101031015,Cooma Surrounds,4,1,61
10,101031016,Jindabyne - Berridale,5,1,68


In [5]:
# Outlier prep: create SA2-level frame and numeric income
assert INCOME_COL is not None, "Income column not detected. Check column selection in cell 0."

sa2_income_2024 = (
    merged[['SA2_CODE_2021', 'SA2_NAME_2021', INCOME_COL]]
    .drop_duplicates()
    .copy()
)

sa2_income_2024['income_numeric'] = pd.to_numeric(sa2_income_2024[INCOME_COL], errors='coerce')

coverage = {
    'sa2_rows': len(sa2_income_2024),
    'income_non_null': int(sa2_income_2024['income_numeric'].notna().sum()),
    'income_null': int(sa2_income_2024['income_numeric'].isna().sum()),
}
coverage


{'sa2_rows': 2312, 'income_non_null': 2008, 'income_null': 304}

In [6]:
# Distribution stats for SA2 income (2024)
stats = sa2_income_2024['income_numeric'].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])
stats


count      2008.000000
mean      52878.791335
std       11015.610128
min        8169.000000
1%        32266.910000
5%        37656.550000
25%       46227.750000
50%       51733.000000
75%       58600.250000
95%       71144.300000
99%       84696.910000
max      147857.000000
Name: income_numeric, dtype: float64

In [7]:
# IQR-based outliers
q1 = sa2_income_2024['income_numeric'].quantile(0.25)
q3 = sa2_income_2024['income_numeric'].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

sa2_income_2024['outlier_iqr_low'] = sa2_income_2024['income_numeric'] < lower
sa2_income_2024['outlier_iqr_high'] = sa2_income_2024['income_numeric'] > upper

summary_iqr = {
    'Q1': q1,
    'Q3': q3,
    'IQR': iqr,
    'LowerFence': lower,
    'UpperFence': upper,
    'num_low': int(sa2_income_2024['outlier_iqr_low'].sum()),
    'num_high': int(sa2_income_2024['outlier_iqr_high'].sum()),
}

extreme_low = sa2_income_2024.loc[sa2_income_2024['outlier_iqr_low']].sort_values('income_numeric').head(20)
extreme_high = sa2_income_2024.loc[sa2_income_2024['outlier_iqr_high']].sort_values('income_numeric', ascending=False).head(20)

summary_iqr, extreme_low[['SA2_CODE_2021','SA2_NAME_2021','income_numeric']].head(10), extreme_high[['SA2_CODE_2021','SA2_NAME_2021','income_numeric']].head(10)


({'Q1': np.float64(46227.75),
  'Q3': np.float64(58600.25),
  'IQR': np.float64(12372.5),
  'LowerFence': np.float64(27669.0),
  'UpperFence': np.float64(77159.0),
  'num_low': 9,
  'num_high': 51},
       SA2_CODE_2021                   SA2_NAME_2021  income_numeric
 20229     404031104                Adelaide Airport          8169.0
 24839     205051099                     Alps - West         10949.0
 115       801051049                           Acton         10976.0
 10318     308031221                  Shoalwater Bay         19214.0
 1908      124011451          Blue Mountains - North         19249.0
 399       114011275  Ettrema - Sassafras - Budawang         20020.0
 25134     205031088                   French Island         21302.0
 7700      702031061                     West Arnhem         24641.0
 11703     315011395                         Aurukun         26086.0,
       SA2_CODE_2021           SA2_NAME_2021  income_numeric
 31340     503021037         Kings Park (WA)     

In [8]:
# Z-score outliers and mapping flags back to localities
mean_val = sa2_income_2024['income_numeric'].mean()
std_val = sa2_income_2024['income_numeric'].std(ddof=0)
sa2_income_2024['z_score'] = (sa2_income_2024['income_numeric'] - mean_val) / std_val

# Typical 3-sigma rule; adjust threshold if needed
z_thresh = 3.0
sa2_income_2024['outlier_z'] = sa2_income_2024['z_score'].abs() > z_thresh

z_summary = {
    'mean': mean_val,
    'std': std_val,
    'z_thresh': z_thresh,
    'num_z_outliers': int(sa2_income_2024['outlier_z'].sum()),
}

# Map flags back to localities
flags = sa2_income_2024[['SA2_CODE_2021','outlier_iqr_low','outlier_iqr_high','outlier_z']]
merged_with_flags = merged.merge(flags, on='SA2_CODE_2021', how='left')

# Show some flagged localities
flagged_localities = merged_with_flags[
    merged_with_flags[['outlier_iqr_low','outlier_iqr_high','outlier_z']].any(axis=1)
][['POSTCODE','STATE','LOCALITY_NAME','SA2_CODE_2021','SA2_NAME_2021',INCOME_COL,'outlier_iqr_low','outlier_iqr_high','outlier_z']]

z_summary, flagged_localities.head(20)


({'mean': np.float64(52878.79133466136),
  'std': 11012.866855242528,
  'z_thresh': 3.0,
  'num_z_outliers': 22},
      POSTCODE STATE                   LOCALITY_NAME SA2_CODE_2021  \
 10     2620.0   ACT                           KOWEN     801031032   
 33     2611.0   ACT                         CHAPMAN     801081091   
 34     2611.0   ACT                STROMLO DISTRICT     801081091   
 36     2603.0   ACT                         FORREST     801061063   
 58     2611.0   ACT                          WRIGHT     801101139   
 59     2611.0   ACT                    WESTON CREEK     801101139   
 61     2605.0   ACT                          GARRAN     801091102   
 84     2600.0   ACT                          BARTON     801061129   
 95     2603.0   ACT                        GRIFFITH     801061064   
 96     2603.0   ACT                CANBERRA CENTRAL     801061064   
 97     2603.0   ACT                          MANUKA     801061064   
 98     2914.0   ACT                          

In [9]:
# Create 5 income bins (quintiles) and map to localities
# Quintiles give roughly equal-sized groups: Very Low → Very High
values = sa2_income_2024['income_numeric']
qres = pd.qcut(values, q=5, duplicates='drop')
num_bins = qres.cat.categories.size
labels_all = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
labels = labels_all[:num_bins]

sa2_income_2024['income_quintile'] = pd.qcut(values, q=num_bins, labels=labels, duplicates='drop')

# Optional: also store a numeric rank (1..num_bins)
sa2_income_2024['income_quintile_rank'] = sa2_income_2024['income_quintile'].cat.codes + 1

# Map bins back to localities
merged_with_bins = merged.merge(
    sa2_income_2024[['SA2_CODE_2021','income_quintile','income_quintile_rank']],
    on='SA2_CODE_2021',
    how='left'
)

# Show counts per bin and a preview
bin_counts = sa2_income_2024['income_quintile'].value_counts(dropna=False).sort_index()
bin_counts, merged_with_bins[['POSTCODE','STATE','LOCALITY_NAME','SA2_NAME_2021','income_quintile','income_quintile_rank']].head(20)


(income_quintile
 Very Low     402
 Low          401
 Medium       402
 High         401
 Very High    402
 NaN          304
 Name: count, dtype: int64,
     POSTCODE STATE   LOCALITY_NAME     SA2_NAME_2021 income_quintile  \
 0     2902.0   ACT        RIVERLEA  ACT - South West       Very High   
 1     2913.0   ACT       KINLYSIDE  Gungahlin - West             NaN   
 2     2618.0   ACT        PARKWOOD    West Belconnen             NaN   
 3     2611.0   ACT         STROMLO  ACT - South West       Very High   
 4     2620.0   ACT         TENNENT           Namadgi        Very Low   
 5     2611.0   ACT          BRACKS           Namadgi        Very Low   
 6     2601.0   ACT  BLACK MOUNTAIN    Black Mountain             NaN   
 7     2620.0   ACT     CASTLE HILL  ACT - South West       Very High   
 8     2620.0   ACT           BOOTH           Namadgi        Very Low   
 9     2609.0   ACT   JERRABOMBERRA     Canberra East          Medium   
 10    2620.0   ACT           KOWEN         

In [10]:
# Impute missing locality-level income using SA2 averages
# Step 1: numeric income at locality level
merged['income_numeric_locality'] = pd.to_numeric(merged[INCOME_COL], errors='coerce')

before_missing = int(merged['income_numeric_locality'].isna().sum())

# Step 2: SA2 mean from available SA2-level values (exclude NaN and zero incomes)
valid_sa2 = sa2_income_2024.loc[
    sa2_income_2024['income_numeric'].notna() & (sa2_income_2024['income_numeric'] > 0)
]
sa2_mean_income = (
    valid_sa2.groupby('SA2_CODE_2021')['income_numeric']
    .mean()
)

# Step 3: fill NaNs with SA2 mean mapped by SA2 code
fill_map = merged['SA2_CODE_2021'].map(sa2_mean_income)
merged['income_filled'] = merged['income_numeric_locality'].fillna(fill_map)

filled_count = int((merged['income_numeric_locality'].isna() & merged['income_filled'].notna()).sum())
after_missing = int(merged['income_filled'].isna().sum())

{
    'missing_before': before_missing,
    'filled_with_sa2_mean': filled_count,
    'missing_after': after_missing
}, merged.loc[merged['income_numeric_locality'].isna() & merged['income_filled'].notna(), ['POSTCODE','STATE','LOCALITY_NAME','SA2_CODE_2021','SA2_NAME_2021']].head(10)


({'missing_before': 1699, 'filled_with_sa2_mean': 0, 'missing_after': 1699},
 Empty DataFrame
 Columns: [POSTCODE, STATE, LOCALITY_NAME, SA2_CODE_2021, SA2_NAME_2021]
 Index: [])

In [11]:
# Alternative imputation: use source SA2 file means (positive incomes only)
# Build source SA2 numeric income
sa2_income['income_numeric_source'] = pd.to_numeric(sa2_income[INCOME_COL], errors='coerce')
valid_src = sa2_income.loc[sa2_income['income_numeric_source'].notna() & (sa2_income['income_numeric_source'] > 0)]
source_sa2_mean = valid_src.groupby('SA2_CODE_2021')['income_numeric_source'].mean()

# Map to missing localities
missing_mask = merged['income_numeric_locality'].isna()
sa2_in_missing = merged.loc[missing_mask, 'SA2_CODE_2021']
sa2_has_mean = sa2_in_missing.isin(source_sa2_mean.index)

merged['income_filled_source'] = merged['income_numeric_locality'].where(~missing_mask, sa2_in_missing.map(source_sa2_mean))

filled_count_src = int((missing_mask & merged['income_filled_source'].notna()).sum())
missing_after_src = int(merged['income_filled_source'].isna().sum())

report = {
    'missing_before': int(missing_mask.sum()),
    'sa2_with_mean_for_missing': int(sa2_has_mean.sum()),
    'sa2_without_mean_for_missing': int((~sa2_has_mean).sum()),
    'filled_with_source_sa2_mean': filled_count_src,
    'missing_after': missing_after_src,
}
report


{'missing_before': 1699,
 'sa2_with_mean_for_missing': 0,
 'sa2_without_mean_for_missing': 1699,
 'filled_with_source_sa2_mean': 0,
 'missing_after': 1699}

In [12]:
# Reprocessing: fill locality incomes from SA2-locality means (exclude zeros)
# Build locality-level numeric income
merged['income_numeric_locality'] = pd.to_numeric(merged[INCOME_COL], errors='coerce')

# Compute SA2 means from available localities with positive income
locality_valid = merged.loc[merged['income_numeric_locality'].notna() & (merged['income_numeric_locality'] > 0)]
sa2_mean_from_localities = locality_valid.groupby('SA2_CODE_2021')['income_numeric_locality'].mean()

# Fill missing locality incomes with SA2 mean computed above
miss_mask = merged['income_numeric_locality'].isna()
merged['income_imputed'] = merged['income_numeric_locality'].where(~miss_mask, merged['SA2_CODE_2021'].map(sa2_mean_from_localities))

result_counts = {
    'missing_before': int(miss_mask.sum()),
    'filled_from_sa2_localities': int((miss_mask & merged['income_imputed'].notna()).sum()),
    'missing_after': int(merged['income_imputed'].isna().sum())
}

print(result_counts)

# Worked example: pick one imputed row and show the calculation
example = merged.loc[miss_mask & merged['income_imputed'].notna()].head(1).copy()
if not example.empty:
    ex_sa2 = example.iloc[0]['SA2_CODE_2021']
    ex_postcode = example.iloc[0]['POSTCODE']
    ex_locality = example.iloc[0]['LOCALITY_NAME']
    # All contributing localities in same SA2 with positive income
    contributors = locality_valid.loc[locality_valid['SA2_CODE_2021'] == ex_sa2, ['POSTCODE','LOCALITY_NAME','income_numeric_locality']]
    ex_mean = sa2_mean_from_localities.loc[ex_sa2]
    print({'example_sa2': ex_sa2, 'example_postcode': ex_postcode, 'example_locality': ex_locality, 'computed_mean_used': ex_mean})
    display(contributors.head(10))
else:
    print('No imputations occurred from SA2-locality means; check earlier cells or data coverage.')


{'missing_before': 1699, 'filled_from_sa2_localities': 0, 'missing_after': 1699}
No imputations occurred from SA2-locality means; check earlier cells or data coverage.
