# Preprocess population density data from 2001 to 2022

This notebook preprocesses the population data downloaded from the Australian Bureau of Statistics (ABS) and calculates population density from it.

### Import packages

In [80]:
import pandas as pd
import requests


# Download and read data

In [81]:
# Download 2021-2022 population data to landing

url = "https://www.abs.gov.au/statistics/people/population/regional-population/2021-22/32180DS0001_2021-22r.xlsx"
# A timeout keeps the cell from hanging forever on an unresponsive server
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a .xlsx file
response.raise_for_status()

with open('../data/landing/population2021-22.xlsx', 'wb') as file:
    file.write(response.content)

# Read sheet "Table 2", skipping the first 7 rows of the sheet
df = pd.read_excel('../data/landing/population2021-22.xlsx', sheet_name="Table 2", engine="openpyxl", skiprows=7)
df.head(10)

Unnamed: 0,S/T code,S/T name,GCCSA code,GCCSA name,SA4 code,SA4 name,SA3 code,SA3 name,SA2 code,SA2 name,...,Unnamed: 12,no..2,%,Unnamed: 15,no..3,no..4,no..5,Unnamed: 19,km2,persons/km2
0,,,,,,,,,,,...,,,,,,,,,,
1,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011001.0,Alfredton,...,,1150.0,6.8,,169.0,899.0,82.0,,52.7,341.3
2,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011002.0,Ballarat,...,,-140.0,-1.2,,-83.0,-134.0,77.0,,12.4,963.8
3,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011005.0,Buninyong,...,,15.0,0.2,,32.0,-53.0,36.0,,51.6,140.4
4,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011006.0,Delacombe,...,,1142.0,10.7,,123.0,996.0,23.0,,34.2,345.1
5,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011007.0,Smythes Creek,...,,11.0,0.3,,17.0,-8.0,2.0,,104.7,40.3
6,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011008.0,Wendouree - Miners Rest,...,,68.0,0.4,,19.0,-17.0,66.0,,67.6,227.6
7,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011481.0,Ballarat East - Warrenheip,...,,-19.0,-0.2,,-39.0,-16.0,36.0,,19.2,501.4
8,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011482.0,Ballarat North - Invermay,...,,185.0,1.3,,67.0,67.0,51.0,,73.1,203.5
9,2.0,Victoria,2RVIC,Rest of Vic.,201.0,Ballarat,20101.0,Ballarat,201011483.0,Canadian - Mount Clear,...,,182.0,1.5,,34.0,59.0,89.0,,22.3,556.2


In [82]:
# Download 2001-2022 population data to landing

url = "https://www.abs.gov.au/statistics/people/population/regional-population/2021-22/32180DS0003_2001-22.xlsx"
# A timeout keeps the cell from hanging forever on an unresponsive server
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a .xlsx file
response.raise_for_status()

with open('../data/landing/population2001-22.xlsx', 'wb') as file:
    file.write(response.content)

# Read sheet "Table 1", skipping the first 7 rows of the sheet
df = pd.read_excel('../data/landing/population2001-22.xlsx', sheet_name="Table 1", engine="openpyxl", skiprows=7)
df.columns


Index(['S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code',
       'SA4 name', 'SA3 code', 'SA3 name', 'SA2 code', 'SA2 name', 'no.',
       'no..1', 'no..2', 'no..3', 'no..4', 'no..5', 'no..6', 'no..7', 'no..8',
       'no..9', 'no..10', 'no..11', 'no..12', 'no..13', 'no..14', 'no..15',
       'no..16', 'no..17', 'no..18', 'no..19', 'no..20', 'no..21'],
      dtype='object')

In [83]:
# Keep only the Victorian rows of the 2001-2022 table and write them to raw

is_victoria = df['S/T name'] == 'Victoria'
df_vic = df.loc[is_victoria]

# The first 10 columns keep their labels; every column after them is
# relabelled with the years 2001..2022, in order
years = list(range(2001, 2023))
cols = df_vic.columns.tolist()[:10] + years
df_vic.columns = cols

df_vic.to_csv('../data/raw/population2001-22_VIC.csv', index=False)

df_vic.columns

Index([  'S/T code',   'S/T name', 'GCCSA code', 'GCCSA name',   'SA4 code',
         'SA4 name',   'SA3 code',   'SA3 name',   'SA2 code',   'SA2 name',
               2001,         2002,         2003,         2004,         2005,
               2006,         2007,         2008,         2009,         2010,
               2011,         2012,         2013,         2014,         2015,
               2016,         2017,         2018,         2019,         2020,
               2021,         2022],
      dtype='object')


# Calculate population density for 2001-2022

In [84]:
# Load the Victorian population table and the SA2 area table from raw
df_population = pd.read_csv('../data/raw/population2001-22_VIC.csv')
df_area = pd.read_csv('../data/raw//SA2_area.csv')

# Inner join on 'SA2 code': only codes present in both tables are kept.
# 'SA2 name' exists in both inputs, so it comes out as 'SA2 name_x' / 'SA2 name_y'.
merged_df = df_population.merge(df_area, how='inner', on='SA2 code')

merged_df.columns

Index(['S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code',
       'SA4 name', 'SA3 code', 'SA3 name', 'SA2 code', 'SA2 name_x', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022', 'SA2 name_y', 'km2'],
      dtype='object')

In [87]:
# Population density per year = that year's population column divided by 'km2'.
# After the CSV round trip the year columns are labelled with strings,
# hence the str(year) lookup.
density_by_year = {
    f'{year} Density': merged_df[str(year)] / merged_df['km2']
    for year in range(2001, 2023)
}

# Attach the new columns to merged_df in place, in chronological order
for density_label, density_values in density_by_year.items():
    merged_df[density_label] = density_values


merged_df.columns

Index(['S/T code', 'S/T name', 'GCCSA code', 'GCCSA name', 'SA4 code',
       'SA4 name', 'SA3 code', 'SA3 name', 'SA2 code', 'SA2 name_x', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022', 'SA2 name_y', 'km2', '2001 Density',
       '2002 Density', '2003 Density', '2004 Density', '2005 Density',
       '2006 Density', '2007 Density', '2008 Density', '2009 Density',
       '2010 Density', '2011 Density', '2012 Density', '2013 Density',
       '2014 Density', '2015 Density', '2016 Density', '2017 Density',
       '2018 Density', '2019 Density', '2020 Density', '2021 Density',
       '2022 Density'],
      dtype='object')