# Preprocessing 2021 ABS census data on population

This notebook preprocesses the external 2021 ABS Census population data for each SA2 (Statistical Area Level 2).

## Import package

In [28]:
import pandas as pd

#### Preprocessing Population data
First, we want to know how many people there are in each SA2 in 2021.


In [135]:
# Read the 2021 ABS Census "Family Number" (FNOF) table, which lists counts per SA2.
family_count_2021_path = "../data/landing/ABS_data/SA2_(EN)_by_FNOF_Family_Number.csv"
# Number of leading lines of the export that are skipped before the header row.
skip_rows = 9
family_count_2021_data = pd.read_csv(
    filepath_or_buffer=family_count_2021_path,
    skiprows=skip_rows,
)
family_count_2021_data

Unnamed: 0,FNOF Family Number,Primary family,Second family,Third family,Not applicable,Total,Unnamed: 6
0,SA2 (EN),,,,,,
1,Alfredton,4485,77.0,0.0,1728.0,6287.0,
2,Ballarat,3028,20.0,0.0,2426.0,5486.0,
3,Buninyong,1989,23.0,0.0,708.0,2716.0,
4,Delacombe,2887,45.0,0.0,1332.0,4266.0,
...,...,...,...,...,...,...,...
524,Total,1676132,41094.0,1546.0,931140.0,2649914.0,
525,"Dataset: Census of Population and Housing, 202...",,,,,,
526,INFO,Cells in this table have been randomly adjuste...,,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,,


In [136]:
# Inspect the dtype and non-null count of every column in the freshly loaded table.
family_count_2021_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   FNOF Family Number  529 non-null    object 
 1   Primary family      525 non-null    object 
 2   Second family       524 non-null    float64
 3   Third family        524 non-null    float64
 4   Not applicable      524 non-null    float64
 5   Total               524 non-null    float64
 6   Unnamed: 6          0 non-null      float64
dtypes: float64(5), object(2)
memory usage: 29.1+ KB


In [137]:
# Text of the ABS confidentiality footnote that ends up inside a data cell.
confidentiality_note = (
    "Cells in this table have been randomly adjusted to avoid the release of confidential data. "
    "No reliance should be placed on small cells."
)

# Flag every cell whose string form contains the footnote (literal match, not a regex).
# `DataFrame.applymap` is deprecated since pandas 2.1, so the check is done with
# vectorised string methods instead of an element-wise lambda.
is_footnote = family_count_2021_data.astype(str).apply(
    lambda column: column.str.contains(confidentiality_note, regex=False)
)

# Replace the flagged cells with 0 so the affected column can be cast to float afterwards.
family_count_2021_data = family_count_2021_data.mask(is_footnote, 0)

In [138]:
# 'Primary family' is read as object dtype; convert it to float like the other count columns.
primary_family_col = 'Primary family'
family_count_2021_data = family_count_2021_data.astype({primary_family_col: float})
family_count_2021_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   FNOF Family Number  529 non-null    object 
 1   Primary family      525 non-null    float64
 2   Second family       524 non-null    float64
 3   Third family        524 non-null    float64
 4   Not applicable      524 non-null    float64
 5   Total               524 non-null    float64
 6   Unnamed: 6          0 non-null      float64
dtypes: float64(6), object(1)
memory usage: 29.1+ KB


In [139]:
# Remove the export artefacts that surround the real data.
# Both drops are label-based with errors="ignore" so that re-running this cell is a
# no-op; the positional form (index[0] / columns[-1]) would remove one more real
# row and one more real column on every re-run.
family_count_2021_data = (
    family_count_2021_data
    # index label 0 is the "SA2 (EN)" sub-header line, which holds no counts
    .drop(index=0, errors="ignore")
    # "Unnamed: 6" is the all-null last column of the loaded table
    .drop(columns="Unnamed: 6", errors="ignore")
)
family_count_2021_data

Unnamed: 0,FNOF Family Number,Primary family,Second family,Third family,Not applicable,Total
1,Alfredton,4485.0,77.0,0.0,1728.0,6287.0
2,Ballarat,3028.0,20.0,0.0,2426.0,5486.0
3,Buninyong,1989.0,23.0,0.0,708.0,2716.0
4,Delacombe,2887.0,45.0,0.0,1332.0,4266.0
5,Smythes Creek,1174.0,41.0,0.0,314.0,1528.0
...,...,...,...,...,...,...
524,Total,1676132.0,41094.0,1546.0,931140.0,2649914.0
525,"Dataset: Census of Population and Housing, 202...",,,,,
526,INFO,0.0,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,


In [140]:
# Replace the ABS column headers with more descriptive names in a single pass.
# NOTE(review): 'Not applicable' is relabelled 'Extended Family' here — confirm that
# this matches the ABS definition of the category.
column_renames = {
    'Not applicable': 'Extended Family',
    'FNOF Family Number': 'SA2 Name',
    'Total': 'Total Suburb Population',
}
family_count_2021_data = family_count_2021_data.rename(columns=column_renames)

family_count_2021_data

Unnamed: 0,SA2 Name,Primary family,Second family,Third family,Extended Family,Total Suburb Population
1,Alfredton,4485.0,77.0,0.0,1728.0,6287.0
2,Ballarat,3028.0,20.0,0.0,2426.0,5486.0
3,Buninyong,1989.0,23.0,0.0,708.0,2716.0
4,Delacombe,2887.0,45.0,0.0,1332.0,4266.0
5,Smythes Creek,1174.0,41.0,0.0,314.0,1528.0
...,...,...,...,...,...,...
524,Total,1676132.0,41094.0,1546.0,931140.0,2649914.0
525,"Dataset: Census of Population and Housing, 202...",,,,,
526,INFO,0.0,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,


In [141]:
# 'Immediate Family' is defined as the total count minus the 'Extended Family' count.
total_population = family_count_2021_data['Total Suburb Population']
extended_family = family_count_2021_data['Extended Family']
family_count_2021_data['Immediate Family'] = total_population.sub(extended_family)
family_count_2021_data

Unnamed: 0,SA2 Name,Primary family,Second family,Third family,Extended Family,Total Suburb Population,Immediate Family
1,Alfredton,4485.0,77.0,0.0,1728.0,6287.0,4559.0
2,Ballarat,3028.0,20.0,0.0,2426.0,5486.0,3060.0
3,Buninyong,1989.0,23.0,0.0,708.0,2716.0,2008.0
4,Delacombe,2887.0,45.0,0.0,1332.0,4266.0,2934.0
5,Smythes Creek,1174.0,41.0,0.0,314.0,1528.0,1214.0
...,...,...,...,...,...,...,...
524,Total,1676132.0,41094.0,1546.0,931140.0,2649914.0,1718774.0
525,"Dataset: Census of Population and Housing, 202...",,,,,,
526,INFO,0.0,,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,,


In [142]:
# Remove the three per-family count columns, keeping the SA2 name and the
# aggregated columns.
# The columns are dropped by name with errors="ignore" rather than by position
# (columns[1:4]): a positional drop is not idempotent, and re-running the cell would
# delete 'Extended Family', 'Total Suburb Population' and 'Immediate Family' instead.
family_breakdown_columns = ['Primary family', 'Second family', 'Third family']
family_count_2021_data = family_count_2021_data.drop(
    columns=family_breakdown_columns,
    errors="ignore",
)
family_count_2021_data

Unnamed: 0,SA2 Name,Extended Family,Total Suburb Population,Immediate Family
1,Alfredton,1728.0,6287.0,4559.0
2,Ballarat,2426.0,5486.0,3060.0
3,Buninyong,708.0,2716.0,2008.0
4,Delacombe,1332.0,4266.0,2934.0
5,Smythes Creek,314.0,1528.0,1214.0
...,...,...,...,...
524,Total,931140.0,2649914.0,1718774.0
525,"Dataset: Census of Population and Housing, 202...",,,
526,INFO,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,


In [143]:
# Put the columns in reading order: name, the two family groups, then the total.
column_order = [
    'SA2 Name',
    'Immediate Family',
    'Extended Family',
    'Total Suburb Population',
]
family_count_2021_data = family_count_2021_data.loc[:, column_order]
family_count_2021_data

Unnamed: 0,SA2 Name,Immediate Family,Extended Family,Total Suburb Population
1,Alfredton,4559.0,1728.0,6287.0
2,Ballarat,3060.0,2426.0,5486.0
3,Buninyong,2008.0,708.0,2716.0
4,Delacombe,2934.0,1332.0,4266.0
5,Smythes Creek,1214.0,314.0,1528.0
...,...,...,...,...
524,Total,1718774.0,931140.0,2649914.0
525,"Dataset: Census of Population and Housing, 202...",,,
526,INFO,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,


In [145]:
# Discard the last six rows, which include the 'Total' line and the
# dataset / INFO / copyright footer lines, then preview the end of the trimmed table.
# NOTE(review): this cell is not idempotent — every re-run trims six more rows.
family_count_2021_data = family_count_2021_data.head(-6)
family_count_2021_data.tail(10)

Unnamed: 0,SA2 Name,Immediate Family,Extended Family,Total Suburb Population
513,Camperdown,929.0,710.0,1639.0
514,Colac,3210.0,2394.0,5604.0
515,Colac Surrounds,1549.0,823.0,2372.0
516,Corangamite - North,1431.0,1016.0,2447.0
517,Corangamite - South,1948.0,1053.0,3001.0
518,Otway,1004.0,1078.0,2082.0
519,Moyne - East,1871.0,996.0,2867.0
520,Moyne - West,2793.0,1417.0,4210.0
521,Warrnambool - North,6111.0,3379.0,9490.0
522,Warrnambool - South,3406.0,2639.0,6045.0


In [146]:
# Destination of the cleaned family table under the curated data directory.
family_file_path = "../data/curated/ABS_data/SA2_Family_Number_2021.csv"

# Write the frame as CSV; index=False keeps the row index out of the file.
family_count_2021_data.to_csv(path_or_buf=family_file_path, index=False)

#### Then, we want to count how many people are in each family-size category per SA2

In [148]:
# Read the 2021 ABS Census "Count of Persons in Family" (CPRF) table, which lists counts per SA2.
person_count_2021_path = "../data/landing/ABS_data/SA2_(EN)_by_CPRF_Count_of_Persons_in_Family.csv"

# Number of leading lines of the export that are skipped before the header row.
skip_rows = 9
person_count_2021_data = pd.read_csv(
    filepath_or_buffer=person_count_2021_path,
    skiprows=skip_rows,
)
person_count_2021_data

Unnamed: 0,CPRF Count of Persons in Family,Two persons in family,Three persons in family,Four persons in family,Five persons in family,Six or more persons in family,Not applicable,Total,Unnamed: 8
0,SA2 (EN),,,,,,,,
1,Alfredton,1812,952.0,1144.0,506.0,151.0,1728.0,6287.0,
2,Ballarat,1687,598.0,512.0,205.0,48.0,2426.0,5486.0,
3,Buninyong,954,359.0,442.0,188.0,60.0,708.0,2716.0,
4,Delacombe,1361,687.0,588.0,215.0,82.0,1332.0,4266.0,
...,...,...,...,...,...,...,...,...,...
524,Total,789690,376100.0,370279.0,135812.0,46901.0,931140.0,2649914.0,
525,"Dataset: Census of Population and Housing, 202...",,,,,,,,
526,INFO,Cells in this table have been randomly adjuste...,,,,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,,,,


In [149]:
# Remove the export artefacts that surround the real data.
# Both drops are label-based with errors="ignore" so that re-running this cell is a
# no-op; the positional form (index[0] / columns[-1]) would remove one more real
# row and one more real column on every re-run.
person_count_2021_data = (
    person_count_2021_data
    # index label 0 is the "SA2 (EN)" sub-header line, which holds no counts
    .drop(index=0, errors="ignore")
    # "Unnamed: 8" is the empty last column of the loaded table
    .drop(columns="Unnamed: 8", errors="ignore")
)
person_count_2021_data

Unnamed: 0,CPRF Count of Persons in Family,Two persons in family,Three persons in family,Four persons in family,Five persons in family,Six or more persons in family,Not applicable,Total
1,Alfredton,1812,952.0,1144.0,506.0,151.0,1728.0,6287.0
2,Ballarat,1687,598.0,512.0,205.0,48.0,2426.0,5486.0
3,Buninyong,954,359.0,442.0,188.0,60.0,708.0,2716.0
4,Delacombe,1361,687.0,588.0,215.0,82.0,1332.0,4266.0
5,Smythes Creek,496,241.0,279.0,133.0,64.0,314.0,1528.0
...,...,...,...,...,...,...,...,...
524,Total,789690,376100.0,370279.0,135812.0,46901.0,931140.0,2649914.0
525,"Dataset: Census of Population and Housing, 202...",,,,,,,
526,INFO,Cells in this table have been randomly adjuste...,,,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,,,


In [150]:
# Inspect the dtype and non-null count of every column after the artefacts were dropped.
person_count_2021_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 1 to 528
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   CPRF Count of Persons in Family  528 non-null    object 
 1   Two persons in family            525 non-null    object 
 2   Three persons in family          524 non-null    float64
 3   Four persons in family           524 non-null    float64
 4   Five persons in family           524 non-null    float64
 5   Six or more persons in family    524 non-null    float64
 6   Not applicable                   524 non-null    float64
 7   Total                            524 non-null    float64
dtypes: float64(6), object(2)
memory usage: 33.1+ KB


In [151]:
# Text of the ABS confidentiality footnote that ends up inside a data cell.
confidentiality_note = (
    "Cells in this table have been randomly adjusted to avoid the release of confidential data. "
    "No reliance should be placed on small cells."
)

# Flag every cell whose string form contains the footnote (literal match, not a regex).
# `DataFrame.applymap` is deprecated since pandas 2.1, so the check is done with
# vectorised string methods instead of an element-wise lambda.
is_footnote = person_count_2021_data.astype(str).apply(
    lambda column: column.str.contains(confidentiality_note, regex=False)
)

# Replace the flagged cells with 0 so the affected column can be cast to float afterwards.
person_count_2021_data = person_count_2021_data.mask(is_footnote, 0)

In [152]:

# 'Two persons in family' is read as object dtype; convert it to float like the other count columns.
two_persons_col = 'Two persons in family'
person_count_2021_data = person_count_2021_data.astype({two_persons_col: float})
person_count_2021_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 1 to 528
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   CPRF Count of Persons in Family  528 non-null    object 
 1   Two persons in family            525 non-null    float64
 2   Three persons in family          524 non-null    float64
 3   Four persons in family           524 non-null    float64
 4   Five persons in family           524 non-null    float64
 5   Six or more persons in family    524 non-null    float64
 6   Not applicable                   524 non-null    float64
 7   Total                            524 non-null    float64
dtypes: float64(7), object(1)
memory usage: 33.1+ KB


In [153]:
# Replace the ABS column headers with more descriptive names in a single pass.
column_renames = {
    'CPRF Count of Persons in Family': 'SA2 Name',
    'Total': 'Total Suburb Population',
}
person_count_2021_data = person_count_2021_data.rename(columns=column_renames)

person_count_2021_data

Unnamed: 0,SA2 Name,Two persons in family,Three persons in family,Four persons in family,Five persons in family,Six or more persons in family,Not applicable,Total Suburb Population
1,Alfredton,1812.0,952.0,1144.0,506.0,151.0,1728.0,6287.0
2,Ballarat,1687.0,598.0,512.0,205.0,48.0,2426.0,5486.0
3,Buninyong,954.0,359.0,442.0,188.0,60.0,708.0,2716.0
4,Delacombe,1361.0,687.0,588.0,215.0,82.0,1332.0,4266.0
5,Smythes Creek,496.0,241.0,279.0,133.0,64.0,314.0,1528.0
...,...,...,...,...,...,...,...,...
524,Total,789690.0,376100.0,370279.0,135812.0,46901.0,931140.0,2649914.0
525,"Dataset: Census of Population and Housing, 202...",,,,,,,
526,INFO,0.0,,,,,,
527,"Copyright Commonwealth of Australia, 2021, see...",,,,,,,


In [154]:

# Discard the last six rows, which include the 'Total' line and the
# dataset / INFO / copyright footer lines, then preview the end of the trimmed table.
# NOTE(review): this cell is not idempotent — every re-run trims six more rows.
person_count_2021_data = person_count_2021_data.head(-6)
person_count_2021_data.tail(10)

Unnamed: 0,SA2 Name,Two persons in family,Three persons in family,Four persons in family,Five persons in family,Six or more persons in family,Not applicable,Total Suburb Population
513,Camperdown,557.0,150.0,127.0,68.0,25.0,710.0,1639.0
514,Colac,1673.0,621.0,554.0,264.0,93.0,2394.0,5604.0
515,Colac Surrounds,811.0,266.0,249.0,167.0,54.0,823.0,2372.0
516,Corangamite - North,831.0,241.0,192.0,125.0,41.0,1016.0,2447.0
517,Corangamite - South,1057.0,345.0,279.0,175.0,91.0,1053.0,3001.0
518,Otway,656.0,149.0,144.0,45.0,14.0,1078.0,2082.0
519,Moyne - East,943.0,341.0,308.0,184.0,100.0,996.0,2867.0
520,Moyne - West,1512.0,469.0,503.0,242.0,69.0,1417.0,4210.0
521,Warrnambool - North,3080.0,1282.0,1104.0,508.0,149.0,3379.0,9490.0
522,Warrnambool - South,1879.0,669.0,573.0,229.0,62.0,2639.0,6045.0


#### Save the data

In [155]:
# Destination of the cleaned persons-in-family table under the curated data directory.
person_file_path = "../data/curated/ABS_data/SA2_Person_in_Family_2021.csv"

# Write the frame as CSV; index=False keeps the row index out of the file.
person_count_2021_data.to_csv(path_or_buf=person_file_path, index=False)

