# Census Datasets 2011, 2016 & 2021: Clean & Transform Datasets

Census data for each year contains population by age, sex and personal income in separate files.

This notebook imports, cleans and combines the census datasets for 2011, 2016 and 2021, and exports one combined dataset per year.

In [143]:
import pandas as pd
import numpy as np
import os

## Import & Combine 2011 Census Datasets

### Import & Clean 2011 Population-Age Dataset

In [144]:
# Set the datasets root folder and import the 2011 population-by-age dataset.
# The root can be overridden with the MBS_DATASETS_DIR environment variable so the
# notebook can run on another machine; when the variable is unset it falls back to
# the original local folder, so existing behaviour is unchanged.
path = os.environ.get(
    "MBS_DATASETS_DIR",
    r"/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/",
)

df_census_age_2011 = pd.read_csv(
    os.path.join(path, "original_datasets/census_data/2011/2011_population_age.csv"),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_age_2011.head()

Unnamed: 0,SA3,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+
0,10101,13648,7590,15585,18904,8708,3072
1,10102,11025,6762,15554,14760,4171,1239
2,10103,3641,2198,4485,5530,2264,818
3,10104,11507,6347,12033,21878,11980,4123
4,10201,30502,19699,37671,43996,20940,10297


In [145]:
df_census_age_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   SA3        351 non-null    int64
 1   age_0-14   351 non-null    int64
 2   age_15-24  351 non-null    int64
 3   age_25-44  351 non-null    int64
 4   age_45-64  351 non-null    int64
 5   age_65-79  351 non-null    int64
 6   age_80+    351 non-null    int64
dtypes: int64(7)
memory usage: 19.3 KB


In [146]:
# SA3 is an area code used as the join key with the other census and MBS datasets,
# so it is stored as a string rather than an integer
df_census_age_2011 = df_census_age_2011.astype({"SA3": str})
df_census_age_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   SA3        351 non-null    object
 1   age_0-14   351 non-null    int64 
 2   age_15-24  351 non-null    int64 
 3   age_25-44  351 non-null    int64 
 4   age_45-64  351 non-null    int64 
 5   age_65-79  351 non-null    int64 
 6   age_80+    351 non-null    int64 
dtypes: int64(6), object(1)
memory usage: 19.3+ KB


In [147]:
df_census_age_2011.describe()

Unnamed: 0,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+
count,351.0,351.0,351.0,351.0,351.0,351.0
mean,11806.190883,8166.660969,17179.524217,15541.148148,6189.88604,2392.233618
std,8339.142607,6068.821251,13064.111914,10225.126635,4261.153964,1902.499656
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,6461.0,4083.5,8002.0,8539.5,3072.5,1019.5
50%,9796.0,6702.0,13932.0,13716.0,5595.0,1987.0
75%,15842.5,10846.0,23419.0,21342.5,8696.5,3215.0
max,39077.0,28164.0,87146.0,45569.0,22249.0,10297.0


The census population-by-age dataset contains 351 SA3 areas in total. This is in line with the MBS 2013-19 data, which contains 346 unique SA3 areas. Further checks for missing SA3 areas will be conducted before merging the datasets.

### Import & Clean 2011 Population Personal Income Dataset

In [148]:
# Import the 2011 population-by-personal-income dataset.
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_personal_income_2011 = pd.read_csv(
    os.path.join(
        path, "original_datasets/census_data/2011/2011_population_personal_income.csv"
    ),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_personal_income_2011.head()

Unnamed: 0,SA3,Negative income,Nil income,"$1-$199 ($1-$10,399)","$200-$299 ($10,400-$15,599)","$300-$399 ($15,600-$20,799)","$400-$599 ($20,800-$31,199)","$600-$799 ($31,200-$41,599)","$800-$999 ($41,600-$51,999)","$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,999 ($78,000-$103,999)","$2,000 or more ($104,000 or more)",Not stated,Not applicable,Total
0,10101,333,3126,3900,6157,6630,7080,5958,4256,3934,2656,3222,2258,4338,13648,67500
1,10102,174,2503,2489,2660,3051,3978,4239,3859,4422,3528,4495,4085,3007,11025,53511
2,10103,83,799,1103,1479,1746,2064,1878,1344,1134,695,787,530,1642,3641,18931
3,10104,295,2846,4480,8533,8105,9082,6388,4243,3191,1679,1965,1123,4432,11507,67880
4,10201,640,8641,10113,13944,15751,17655,13977,10481,9723,6587,8093,6307,10701,30502,163110


In [149]:
df_census_personal_income_2011.shape

(351, 16)

In [150]:
df_census_personal_income_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   SA3                                351 non-null    int64
 1   Negative income                    351 non-null    int64
 2   Nil income                         351 non-null    int64
 3   $1-$199 ($1-$10,399)               351 non-null    int64
 4   $200-$299 ($10,400-$15,599)        351 non-null    int64
 5   $300-$399 ($15,600-$20,799)        351 non-null    int64
 6   $400-$599 ($20,800-$31,199)        351 non-null    int64
 7   $600-$799 ($31,200-$41,599)        351 non-null    int64
 8   $800-$999 ($41,600-$51,999)        351 non-null    int64
 9   $1,000-$1,249 ($52,000-$64,999)    351 non-null    int64
 10  $1,250-$1,499 ($65,000-$77,999)    351 non-null    int64
 11  $1,500-$1,999 ($78,000-$103,999)   351 non-null    int64
 12  $2,000 or more ($104,0

In [151]:
# SA3 is an area code used as the join key with the other census and MBS datasets,
# so it is stored as a string rather than an integer
df_census_personal_income_2011 = df_census_personal_income_2011.astype({"SA3": str})
df_census_personal_income_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   SA3                                351 non-null    object
 1   Negative income                    351 non-null    int64 
 2   Nil income                         351 non-null    int64 
 3   $1-$199 ($1-$10,399)               351 non-null    int64 
 4   $200-$299 ($10,400-$15,599)        351 non-null    int64 
 5   $300-$399 ($15,600-$20,799)        351 non-null    int64 
 6   $400-$599 ($20,800-$31,199)        351 non-null    int64 
 7   $600-$799 ($31,200-$41,599)        351 non-null    int64 
 8   $800-$999 ($41,600-$51,999)        351 non-null    int64 
 9   $1,000-$1,249 ($52,000-$64,999)    351 non-null    int64 
 10  $1,250-$1,499 ($65,000-$77,999)    351 non-null    int64 
 11  $1,500-$1,999 ($78,000-$103,999)   351 non-null    int64 
 12  $2,000 o

In [152]:
df_census_personal_income_2011.describe()

Unnamed: 0,Negative income,Nil income,"$1-$199 ($1-$10,399)","$200-$299 ($10,400-$15,599)","$300-$399 ($15,600-$20,799)","$400-$599 ($20,800-$31,199)","$600-$799 ($31,200-$41,599)","$800-$999 ($41,600-$51,999)","$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,999 ($78,000-$103,999)","$2,000 or more ($104,000 or more)",Not stated,Not applicable,Total
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,290.849003,3744.478632,3662.404558,5120.683761,4888.470085,5715.242165,5127.193732,4092.131054,3908.675214,2733.763533,3193.450142,3080.592593,3910.615385,11806.193732,61275.410256
std,215.583083,3165.936958,2641.206295,3776.026852,3382.369116,3862.770369,3568.099756,2917.502148,2842.593911,2061.148821,2553.227144,3521.87954,2887.13986,8339.139214,41775.190657
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,146.0,1678.0,1863.0,2451.0,2439.5,2989.0,2527.0,2109.5,1919.0,1292.5,1402.0,871.5,1997.5,6461.0,33165.5
50%,245.0,2844.0,3079.0,4415.0,4464.0,5119.0,4493.0,3610.0,3329.0,2247.0,2552.0,1936.0,3264.0,9796.0,53373.0
75%,377.0,4752.5,4871.0,7091.5,6687.0,7828.0,6700.5,5469.0,5286.5,3795.5,4480.0,4029.5,5182.0,15842.5,82641.0
max,1211.0,16505.0,13276.0,23608.0,16338.0,18110.0,16934.0,12938.0,13685.0,11634.0,15848.0,21922.0,22876.0,39077.0,178432.0


In [153]:
# Drop the "Total" column: it is not needed here because the total population per
# SA3 is available in the population-by-sex dataset.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run after "Total" has already been removed.
df_census_personal_income_2011 = df_census_personal_income_2011.drop(
    columns=["Total"], errors="ignore"
)
df_census_personal_income_2011.columns

Index(['SA3', 'Negative income', 'Nil income', '$1-$199 ($1-$10,399)',
       '$200-$299 ($10,400-$15,599)', '$300-$399 ($15,600-$20,799)',
       '$400-$599 ($20,800-$31,199)', '$600-$799 ($31,200-$41,599)',
       '$800-$999 ($41,600-$51,999)', '$1,000-$1,249 ($52,000-$64,999)',
       '$1,250-$1,499 ($65,000-$77,999)', '$1,500-$1,999 ($78,000-$103,999)',
       '$2,000 or more ($104,000 or more)', 'Not stated', 'Not applicable'],
      dtype='object')

In [154]:
# Shorten the income-bracket headers: keep only the annual range that the source
# file shows in brackets after the weekly range.
# NOTE(review): the 2016 income columns keep their original long names in this
# notebook - confirm whether the same renaming is wanted there.
rename_columns = {
    "$1-$199 ($1-$10,399)": "$1-$10,399",
    "$200-$299 ($10,400-$15,599)": "$10,400-$15,599",
    "$300-$399 ($15,600-$20,799)": "$15,600-$20,799",
    "$400-$599 ($20,800-$31,199)": "$20,800-$31,199",
    "$600-$799 ($31,200-$41,599)": "$31,200-$41,599",
    "$800-$999 ($41,600-$51,999)": "$41,600-$51,999",
    "$1,000-$1,249 ($52,000-$64,999)": "$52,000-$64,999",
    "$1,250-$1,499 ($65,000-$77,999)": "$65,000-$77,999",
    "$1,500-$1,999 ($78,000-$103,999)": "$78,000-$103,999",
    "$2,000 or more ($104,000 or more)": "$104,000+",
}

# columns that are not keys of the mapping are left as they are
df_census_personal_income_2011 = df_census_personal_income_2011.rename(
    columns=rename_columns
)
df_census_personal_income_2011

Unnamed: 0,SA3,Negative income,Nil income,"$1-$10,399","$10,400-$15,599","$15,600-$20,799","$20,800-$31,199","$31,200-$41,599","$41,600-$51,999","$52,000-$64,999","$65,000-$77,999","$78,000-$103,999","$104,000+",Not stated,Not applicable
0,10101,333,3126,3900,6157,6630,7080,5958,4256,3934,2656,3222,2258,4338,13648
1,10102,174,2503,2489,2660,3051,3978,4239,3859,4422,3528,4495,4085,3007,11025
2,10103,83,799,1103,1479,1746,2064,1878,1344,1134,695,787,530,1642,3641
3,10104,295,2846,4480,8533,8105,9082,6388,4243,3191,1679,1965,1123,4432,11507
4,10201,640,8641,10113,13944,15751,17655,13977,10481,9723,6587,8093,6307,10701,30502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,90101,3,68,27,39,38,50,109,128,181,110,116,88,834,263
347,90102,8,35,27,49,45,71,47,21,24,17,23,18,36,129
348,90103,0,20,15,44,30,27,22,22,15,22,20,10,26,103
349,99797,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Import & Clean 2011 Population-Gender Dataset

In [155]:
# Import the 2011 population-by-sex dataset (male, female and total population per SA3).
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_gender_2011 = pd.read_csv(
    os.path.join(path, "original_datasets/census_data/2011/2011_population_sex.csv"),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_gender_2011.head()

Unnamed: 0,SA3,male_pop,female_pop,total_population
0,10101,33658,33841,67500
1,10102,26852,26660,53511
2,10103,9736,9193,18931
3,10104,33368,34514,67880
4,10201,78677,84435,163110


In [156]:
df_census_gender_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   SA3               351 non-null    int64
 1   male_pop          351 non-null    int64
 2   female_pop        351 non-null    int64
 3   total_population  351 non-null    int64
dtypes: int64(4)
memory usage: 11.1 KB


In [157]:
# store the SA3 area code as a string so it can be used as the merge key
df_census_gender_2011 = df_census_gender_2011.astype({"SA3": str})
df_census_gender_2011["SA3"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 351 entries, 0 to 350
Series name: SA3
Non-Null Count  Dtype 
--------------  ----- 
351 non-null    object
dtypes: object(1)
memory usage: 2.9+ KB


In [158]:
df_census_gender_2011.describe()

Unnamed: 0,male_pop,female_pop,total_population
count,351.0,351.0,351.0
mean,30296.333333,30979.34188,61275.410256
std,20556.012001,21249.844061,41775.190657
min,0.0,0.0,0.0
25%,16518.5,16347.5,33165.5
50%,26221.0,26755.0,53373.0
75%,40762.0,42135.0,82641.0
max,92088.0,90795.0,178432.0


### Combine 2011 Age, Gender, Income Datasets

In [159]:
# Check that the three 2011 datasets cover exactly the same SA3 areas before merging.
# The merges that follow are inner joins, so an SA3 code missing from any one
# dataset would otherwise be dropped from the combined output without warning.

# Collect the SA3 codes of each dataset as sets (duplicates, if any, collapse here,
# so this compares the distinct codes only).
sa3_age = set(df_census_age_2011["SA3"])
sa3_income = set(df_census_personal_income_2011["SA3"])
sa3_gender = set(df_census_gender_2011["SA3"])

# NOTE: despite its name, diff_sa3 is True when all three sets are identical and
# False when they differ (the name is kept so existing references still work).
diff_sa3 = sa3_age == sa3_income == sa3_gender
print(diff_sa3)

# Fail fast on a mismatch, reporting the codes that are not present in every dataset.
assert diff_sa3, sorted(
    (sa3_age | sa3_income | sa3_gender) - (sa3_age & sa3_income & sa3_gender)
)

True


In [160]:
# Combine the 2011 age, personal-income and sex datasets into one row per SA3.
# validate="one_to_one" makes pandas raise a MergeError if an SA3 code is duplicated
# on either side, instead of silently multiplying rows in the joined result.
df_census_age_income_2011 = df_census_age_2011.merge(
    df_census_personal_income_2011, how="inner", on="SA3", validate="one_to_one"
)
df_census_combined_2011 = df_census_age_income_2011.merge(
    df_census_gender_2011, how="inner", on="SA3", validate="one_to_one"
)
# rows should equal the number of SA3 areas in the inputs
df_census_combined_2011.shape

(351, 24)

In [161]:
df_census_combined_2011.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 0 to 350
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   SA3               351 non-null    object
 1   age_0-14          351 non-null    int64 
 2   age_15-24         351 non-null    int64 
 3   age_25-44         351 non-null    int64 
 4   age_45-64         351 non-null    int64 
 5   age_65-79         351 non-null    int64 
 6   age_80+           351 non-null    int64 
 7   Negative income   351 non-null    int64 
 8   Nil income        351 non-null    int64 
 9   $1-$10,399        351 non-null    int64 
 10  $10,400-$15,599   351 non-null    int64 
 11  $15,600-$20,799   351 non-null    int64 
 12  $20,800-$31,199   351 non-null    int64 
 13  $31,200-$41,599   351 non-null    int64 
 14  $41,600-$51,999   351 non-null    int64 
 15  $52,000-$64,999   351 non-null    int64 
 16  $65,000-$77,999   351 non-null    int64 
 17  $78,000-$103,999

In [162]:
df_census_combined_2011.describe()

Unnamed: 0,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+,Negative income,Nil income,"$1-$10,399","$10,400-$15,599",...,"$41,600-$51,999","$52,000-$64,999","$65,000-$77,999","$78,000-$103,999","$104,000+",Not stated,Not applicable,male_pop,female_pop,total_population
count,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
mean,11806.190883,8166.660969,17179.524217,15541.148148,6189.88604,2392.233618,290.849003,3744.478632,3662.404558,5120.683761,...,4092.131054,3908.675214,2733.763533,3193.450142,3080.592593,3910.615385,11806.193732,30296.333333,30979.34188,61275.410256
std,8339.142607,6068.821251,13064.111914,10225.126635,4261.153964,1902.499656,215.583083,3165.936958,2641.206295,3776.026852,...,2917.502148,2842.593911,2061.148821,2553.227144,3521.87954,2887.13986,8339.139214,20556.012001,21249.844061,41775.190657
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6461.0,4083.5,8002.0,8539.5,3072.5,1019.5,146.0,1678.0,1863.0,2451.0,...,2109.5,1919.0,1292.5,1402.0,871.5,1997.5,6461.0,16518.5,16347.5,33165.5
50%,9796.0,6702.0,13932.0,13716.0,5595.0,1987.0,245.0,2844.0,3079.0,4415.0,...,3610.0,3329.0,2247.0,2552.0,1936.0,3264.0,9796.0,26221.0,26755.0,53373.0
75%,15842.5,10846.0,23419.0,21342.5,8696.5,3215.0,377.0,4752.5,4871.0,7091.5,...,5469.0,5286.5,3795.5,4480.0,4029.5,5182.0,15842.5,40762.0,42135.0,82641.0
max,39077.0,28164.0,87146.0,45569.0,22249.0,10297.0,1211.0,16505.0,13276.0,23608.0,...,12938.0,13685.0,11634.0,15848.0,21922.0,22876.0,39077.0,92088.0,90795.0,178432.0


In [163]:
# Export the combined 2011 census dataset as a pickle under clean_datasets/census_data.
# NOTE(review): the file name spells "cenus" (presumably "census"); it is left as-is
# because other notebooks may load the file by this name - confirm before renaming.
df_census_combined_2011.to_pickle(
    os.path.join(path, "clean_datasets/census_data/2011_cenus_combined.pkl")
)

## Import & Combine 2016 Census Datasets

### Import & Clean 2016 Population-Age Dataset

In [164]:
# Import the 2016 population-by-age dataset.
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_age_2016 = pd.read_csv(
    os.path.join(path, "original_datasets/census_data/2016/2016_population_age.csv"),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_age_2016.head()

Unnamed: 0,SA3,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+
0,10102,11235,6877,15839,16352,5629,1395
1,10103,3275,2164,4489,5677,2830,936
2,10104,10610,6067,11644,22307,15245,4768
3,10105,6398,3988,8133,9835,5391,1822
4,10106,7255,3921,7649,10066,5398,1650


In [165]:
df_census_age_2016.tail()

Unnamed: 0,SA3,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+
353,90102,116,48,139,158,68,14
354,90103,78,72,121,97,24,0
355,90104,296,110,341,577,322,91
356,99797,0,0,6,0,0,0
357,99999,9,12,8,14,14,0


In [166]:
df_census_age_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   SA3        358 non-null    int64
 1   age_0-14   358 non-null    int64
 2   age_15-24  358 non-null    int64
 3   age_25-44  358 non-null    int64
 4   age_45-64  358 non-null    int64
 5   age_65-79  358 non-null    int64
 6   age_80+    358 non-null    int64
dtypes: int64(7)
memory usage: 19.7 KB


In [167]:
df_census_age_2016["SA3"].value_counts()

10102    1
31402    1
39797    1
31905    1
31904    1
        ..
20607    1
20606    1
20605    1
20604    1
99999    1
Name: SA3, Length: 358, dtype: int64

In [168]:
# store the SA3 area code as a string so it can be used as the merge key,
# then confirm each code still appears exactly once
df_census_age_2016 = df_census_age_2016.astype({"SA3": str})
df_census_age_2016["SA3"].value_counts()

10102    1
31402    1
39797    1
31905    1
31904    1
        ..
20607    1
20606    1
20605    1
20604    1
99999    1
Name: SA3, Length: 358, dtype: int64

### Import & Clean 2016 Population Personal Income Dataset

In [169]:
# Import the 2016 population-by-personal-income dataset.
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_personal_income_2016 = pd.read_csv(
    os.path.join(
        path, "original_datasets/census_data/2016/2016_population_personal_income.csv"
    ),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_personal_income_2016.head()

Unnamed: 0,SA3,Negative income,Nil income,"$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)","$300-$399 ($15,600-$20,799)","$400-$499 ($20,800-$25,999)","$500-$649 ($26,000-$33,799)","$650-$799 ($33,800-$41,599)","$800-$999 ($41,600-$51,999)","$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$2,000-$2,999 ($104,000-$155,999)","$3,000 or more ($156,000 or more)",Not stated,Not applicable,Total
0,10102,161,3063,1697,2188,2658,2732,2657,3187,3921,4450,3639,3312,2220,4091,1812,4308,11235,57333
1,10103,69,839,603,1028,1435,1437,1462,1490,1543,1362,871,669,459,561,292,1973,3275,19365
2,10104,250,3168,2481,5055,7702,7132,6035,5404,4999,4212,2405,1633,1122,1224,671,6536,10610,70642
3,10105,143,1895,1099,2051,2774,2833,2487,2473,2430,2360,1429,1219,879,1055,399,3640,6398,35561
4,10106,169,1757,1186,1897,2699,2605,2432,2349,2418,2226,1465,1406,948,1513,817,2803,7255,35939


In [170]:
df_census_personal_income_2016.shape

(358, 19)

In [171]:
df_census_personal_income_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   SA3                                358 non-null    int64
 1   Negative income                    358 non-null    int64
 2   Nil income                         358 non-null    int64
 3   $1-$149 ($1-$7,799)                358 non-null    int64
 4   $150-$299 ($7,800-$15,599)         358 non-null    int64
 5   $300-$399 ($15,600-$20,799)        358 non-null    int64
 6   $400-$499 ($20,800-$25,999)        358 non-null    int64
 7   $500-$649 ($26,000-$33,799)        358 non-null    int64
 8   $650-$799 ($33,800-$41,599)        358 non-null    int64
 9   $800-$999 ($41,600-$51,999)        358 non-null    int64
 10  $1,000-$1,249 ($52,000-$64,999)    358 non-null    int64
 11  $1,250-$1,499 ($65,000-$77,999)    358 non-null    int64
 12  $1,500-$1,749 ($78,000

In [172]:
# SA3 is an area code used as the join key with the other census and MBS datasets,
# so it is stored as a string rather than an integer
df_census_personal_income_2016 = df_census_personal_income_2016.astype({"SA3": str})
df_census_personal_income_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   SA3                                358 non-null    object
 1   Negative income                    358 non-null    int64 
 2   Nil income                         358 non-null    int64 
 3   $1-$149 ($1-$7,799)                358 non-null    int64 
 4   $150-$299 ($7,800-$15,599)         358 non-null    int64 
 5   $300-$399 ($15,600-$20,799)        358 non-null    int64 
 6   $400-$499 ($20,800-$25,999)        358 non-null    int64 
 7   $500-$649 ($26,000-$33,799)        358 non-null    int64 
 8   $650-$799 ($33,800-$41,599)        358 non-null    int64 
 9   $800-$999 ($41,600-$51,999)        358 non-null    int64 
 10  $1,000-$1,249 ($52,000-$64,999)    358 non-null    int64 
 11  $1,250-$1,499 ($65,000-$77,999)    358 non-null    int64 
 12  $1,500-$

In [173]:
df_census_personal_income_2016.describe()

Unnamed: 0,Negative income,Nil income,"$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)","$300-$399 ($15,600-$20,799)","$400-$499 ($20,800-$25,999)","$500-$649 ($26,000-$33,799)","$650-$799 ($33,800-$41,599)","$800-$999 ($41,600-$51,999)","$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$2,000-$2,999 ($104,000-$155,999)","$3,000 or more ($156,000 or more)",Not stated,Not applicable,Total
count,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0
mean,276.243017,4769.539106,2238.494413,3754.502793,4523.703911,4341.162011,3981.318436,3984.639665,4343.335196,4437.089385,3043.907821,2577.860335,1784.885475,2686.421788,1666.256983,4767.371508,12191.51676,65368.477654
std,236.839885,4431.003748,1698.844405,2815.911278,3343.960031,3096.220392,2788.142379,2884.480722,3226.960756,3324.680308,2342.484175,2014.595213,1440.788992,2507.418336,2262.967387,3286.956303,9266.882438,46736.444696
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,129.0,1929.5,1123.0,1855.0,2085.25,2042.5,1895.25,1768.0,2038.75,2160.75,1359.5,1138.5,767.0,899.25,419.75,2593.0,6281.5,33696.5
50%,214.0,3335.5,1844.0,3238.0,3937.5,3910.0,3572.5,3489.0,3753.0,3658.0,2466.0,2110.5,1419.0,1900.0,984.0,4245.5,9552.5,54549.0
75%,351.75,6106.5,2915.0,5052.25,6391.0,6006.25,5451.5,5297.25,5719.0,5962.75,4289.0,3670.25,2463.0,3715.5,1984.0,6157.0,16259.5,89195.0
max,1338.0,23876.0,7737.0,16764.0,18964.0,15022.0,12895.0,14044.0,16188.0,16378.0,12913.0,12585.0,9411.0,18404.0,15561.0,27238.0,54610.0,221898.0


All values are integers, with no missing or negative values.

In [174]:
# Drop the "Total" column: it is not required because the total population per SA3
# is available in the population-by-sex dataset.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run after "Total" has already been removed.
df_census_personal_income_2016 = df_census_personal_income_2016.drop(
    columns=["Total"], errors="ignore"
)
df_census_personal_income_2016.columns

Index(['SA3', 'Negative income', 'Nil income', '$1-$149 ($1-$7,799)',
       '$150-$299 ($7,800-$15,599)', '$300-$399 ($15,600-$20,799)',
       '$400-$499 ($20,800-$25,999)', '$500-$649 ($26,000-$33,799)',
       '$650-$799 ($33,800-$41,599)', '$800-$999 ($41,600-$51,999)',
       '$1,000-$1,249 ($52,000-$64,999)', '$1,250-$1,499 ($65,000-$77,999)',
       '$1,500-$1,749 ($78,000-$90,999)', '$1,750-$1,999 ($91,000-$103,999)',
       '$2,000-$2,999 ($104,000-$155,999)',
       '$3,000 or more ($156,000 or more)', 'Not stated', 'Not applicable'],
      dtype='object')

### Import & Clean 2016 Population-Gender Dataset

In [175]:
# Import the 2016 population-by-sex dataset (male, female and total population per SA3).
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_gender_2016 = pd.read_csv(
    os.path.join(path, "original_datasets/census_data/2016/2016_population_sex.csv"),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_gender_2016.head()

Unnamed: 0,SA3,male_pop,female_pop,total_population
0,10102,28830,28502,57333
1,10103,10013,9356,19365
2,10104,34606,36035,70642
3,10105,17991,17571,35561
4,10106,17677,18259,35939


In [176]:
df_census_gender_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   SA3               358 non-null    int64
 1   male_pop          358 non-null    int64
 2   female_pop        358 non-null    int64
 3   total_population  358 non-null    int64
dtypes: int64(4)
memory usage: 11.3 KB


In [177]:
# store the SA3 area code as a string so it can be used as the merge key
df_census_gender_2016 = df_census_gender_2016.astype({"SA3": str})
df_census_gender_2016["SA3"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 358 entries, 0 to 357
Series name: SA3
Non-Null Count  Dtype 
--------------  ----- 
358 non-null    object
dtypes: object(1)
memory usage: 2.9+ KB


In [178]:
df_census_gender_2016.describe()

Unnamed: 0,male_pop,female_pop,total_population
count,358.0,358.0,358.0
mean,32253.069832,33115.25419,65368.477654
std,22998.45448,23764.832765,46736.444696
min,0.0,0.0,0.0
25%,16946.25,17047.25,33696.5
50%,27028.0,27617.5,54549.0
75%,43462.75,44968.25,89195.0
max,111369.0,110528.0,221898.0


### Combine 2016 Age, Gender, Income Datasets

In [179]:
# Check that the three 2016 datasets cover exactly the same SA3 areas before merging.
# The merges that follow are inner joins, so an SA3 code missing from any one
# dataset would otherwise be dropped from the combined output without warning.

# Collect the SA3 codes of each dataset as sets (duplicates, if any, collapse here,
# so this compares the distinct codes only).
sa3_age = set(df_census_age_2016["SA3"])
sa3_income = set(df_census_personal_income_2016["SA3"])
sa3_gender = set(df_census_gender_2016["SA3"])

# NOTE: despite its name, diff_sa3 is True when all three sets are identical and
# False when they differ (the name is kept so existing references still work).
diff_sa3 = sa3_age == sa3_income == sa3_gender
print(diff_sa3)

# Fail fast on a mismatch, reporting the codes that are not present in every dataset.
assert diff_sa3, sorted(
    (sa3_age | sa3_income | sa3_gender) - (sa3_age & sa3_income & sa3_gender)
)

True


In [180]:
# Combine the 2016 age, personal-income and sex datasets into one row per SA3.
# validate="one_to_one" makes pandas raise a MergeError if an SA3 code is duplicated
# on either side, instead of silently multiplying rows in the joined result.
df_census_age_income_2016 = df_census_age_2016.merge(
    df_census_personal_income_2016, how="inner", on="SA3", validate="one_to_one"
)
df_census_combined_2016 = df_census_age_income_2016.merge(
    df_census_gender_2016, how="inner", on="SA3", validate="one_to_one"
)
# rows should equal the number of SA3 areas in the inputs
df_census_combined_2016.shape

(358, 27)

In [181]:
df_census_combined_2016.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 358 entries, 0 to 357
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   SA3                                358 non-null    object
 1   age_0-14                           358 non-null    int64 
 2   age_15-24                          358 non-null    int64 
 3   age_25-44                          358 non-null    int64 
 4   age_45-64                          358 non-null    int64 
 5   age_65-79                          358 non-null    int64 
 6   age_80+                            358 non-null    int64 
 7   Negative income                    358 non-null    int64 
 8   Nil income                         358 non-null    int64 
 9   $1-$149 ($1-$7,799)                358 non-null    int64 
 10  $150-$299 ($7,800-$15,599)         358 non-null    int64 
 11  $300-$399 ($15,600-$20,799)        358 non-null    int64 
 12  $400-$49

In [182]:
df_census_combined_2016.describe()

Unnamed: 0,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+,Negative income,Nil income,"$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)",...,"$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$2,000-$2,999 ($104,000-$155,999)","$3,000 or more ($156,000 or more)",Not stated,Not applicable,male_pop,female_pop,total_population
count,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,...,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0
mean,12191.51676,8347.365922,18193.726257,16365.203911,7623.905028,2646.23743,276.243017,4769.539106,2238.494413,3754.502793,...,3043.907821,2577.860335,1784.885475,2686.421788,1666.256983,4767.371508,12191.51676,32253.069832,33115.25419,65368.477654
std,9266.882438,6712.229282,15020.39028,11054.287733,5209.452452,2065.665474,236.839885,4431.003748,1698.844405,2815.911278,...,2342.484175,2014.595213,1440.788992,2507.418336,2262.967387,3286.956303,9266.882438,22998.45448,23764.832765,46736.444696
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6281.5,3861.5,8011.5,8766.75,3858.0,1170.5,129.0,1929.5,1123.0,1855.0,...,1359.5,1138.5,767.0,899.25,419.75,2593.0,6281.5,16946.25,17047.25,33696.5
50%,9552.5,6574.5,14170.5,14375.5,6989.5,2267.0,214.0,3335.5,1844.0,3238.0,...,2466.0,2110.5,1419.0,1900.0,984.0,4245.5,9552.5,27028.0,27617.5,54549.0
75%,16259.5,11345.5,25371.25,22578.5,10747.0,3674.0,351.75,6106.5,2915.0,5052.25,...,4289.0,3670.25,2463.0,3715.5,1984.0,6157.0,16259.5,43462.75,44968.25,89195.0
max,54610.0,40357.0,105848.0,49083.0,28099.0,10468.0,1338.0,23876.0,7737.0,16764.0,...,12913.0,12585.0,9411.0,18404.0,15561.0,27238.0,54610.0,111369.0,110528.0,221898.0


In [183]:
# Export the combined 2016 census dataset as a pickle under clean_datasets/census_data.
# NOTE(review): the file name spells "cenus" (presumably "census"); it is left as-is
# because other notebooks may load the file by this name - confirm before renaming.
df_census_combined_2016.to_pickle(
    os.path.join(path, "clean_datasets/census_data/2016_cenus_combined.pkl")
)

## Import & Combine 2021 Census Datasets

### Import & Clean 2021 Population-Age Dataset

In [184]:
# Import the 2021 population-by-age dataset.
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_age_2021 = pd.read_csv(
    os.path.join(path, "original_datasets/census_data/2021/2021_population_age.csv"),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_age_2021.head()

Unnamed: 0,SA3,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+
0,10102,12643,6973,18711,17687,7006,1767
1,10103,3354,2275,5029,5915,3148,1001
2,10104,11075,6302,13133,22453,18433,5339
3,10105,6771,4065,8928,10245,6322,2071
4,10106,7401,4045,8076,10522,6198,1911


In [185]:
df_census_age_2021.tail()

Unnamed: 0,SA3,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+
353,90102,130,49,147,166,80,21
354,90103,65,42,69,84,38,10
355,90104,351,186,404,705,423,117
356,99797,0,0,0,0,0,0
357,99999,0,0,4,3,0,0


In [186]:
df_census_age_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   SA3        358 non-null    int64
 1   age_0-14   358 non-null    int64
 2   age_15-24  358 non-null    int64
 3   age_25-44  358 non-null    int64
 4   age_45-64  358 non-null    int64
 5   age_65-79  358 non-null    int64
 6   age_80+    358 non-null    int64
dtypes: int64(7)
memory usage: 19.7 KB


In [187]:
df_census_age_2021["SA3"].value_counts()

10102    1
31402    1
39797    1
31905    1
31904    1
        ..
20607    1
20606    1
20605    1
20604    1
99999    1
Name: SA3, Length: 358, dtype: int64

In [188]:
# store the SA3 area code as a string so it can be used as the merge key,
# then confirm each code still appears exactly once
df_census_age_2021 = df_census_age_2021.astype({"SA3": str})
df_census_age_2021["SA3"].value_counts()

10102    1
31402    1
39797    1
31905    1
31904    1
        ..
20607    1
20606    1
20605    1
20604    1
99999    1
Name: SA3, Length: 358, dtype: int64

### Import & Clean 2021 Population Personal Income Dataset

In [189]:
# Import the 2021 population-by-personal-income dataset.
# `path` is the datasets root folder set in the 2011 population-by-age import cell.

df_census_personal_income_2021 = pd.read_csv(
    os.path.join(
        path, "original_datasets/census_data/2021/2021_population_personal_income.csv"
    ),
    index_col=False,  # force pandas not to use the first column as the index
    encoding="ISO-8859-1",  # decode the file as Latin-1
)
# preview the first rows
df_census_personal_income_2021.head()

Unnamed: 0,SA3,Negative income,Nil income,"$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)","$300-$399 ($15,600-$20,799)","$400-$499 ($20,800-$25,999)","$500-$649 ($26,000-$33,799)","$650-$799 ($33,800-$41,599)","$800-$999 ($41,600-$51,999)","$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$2,000-$2,999 ($104,000-$155,999)","$3,000-$3,499 ($156,000-$181,999)","$3,500 or more ($182,000 or more)",Not stated,Not applicable,Total
0,10102,258,3127,1472,1796,2460,2849,2713,2900,3841,4916,4288,4387,3454,6822,1378,2286,3193,12643,64793
1,10103,127,911,516,679,1243,1345,1315,1474,1623,1655,1194,1018,795,1099,234,402,1722,3354,20717
2,10104,468,3564,2315,3611,7147,7214,6426,5676,5766,5771,3585,2760,1838,2465,469,846,5731,11075,76736
3,10105,270,2052,1014,1463,2576,2938,2500,2400,2706,2947,2030,1763,1257,1923,319,516,2962,6771,38403
4,10106,331,1922,1021,1437,2359,2556,2169,2221,2511,2777,1874,1834,1414,2353,538,886,2544,7401,38159


In [190]:
# (rows, columns) of the income dataset.
df_census_personal_income_2021.shape

(358, 20)

In [191]:
# Summarise column names, non-null counts and dtypes of the income dataset.
df_census_personal_income_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   SA3                                358 non-null    int64
 1   Negative income                    358 non-null    int64
 2   Nil income                         358 non-null    int64
 3   $1-$149 ($1-$7,799)                358 non-null    int64
 4   $150-$299 ($7,800-$15,599)         358 non-null    int64
 5   $300-$399 ($15,600-$20,799)        358 non-null    int64
 6   $400-$499 ($20,800-$25,999)        358 non-null    int64
 7   $500-$649 ($26,000-$33,799)        358 non-null    int64
 8   $650-$799 ($33,800-$41,599)        358 non-null    int64
 9   $800-$999 ($41,600-$51,999)        358 non-null    int64
 10  $1,000-$1,249 ($52,000-$64,999)    358 non-null    int64
 11  $1,250-$1,499 ($65,000-$77,999)    358 non-null    int64
 12  $1,500-$1,749 ($78,000

In [192]:
# SA3 is used as the merge key with the other datasets (census and mbs),
# so it is stored as a string rather than an integer.
df_census_personal_income_2021["SA3"] = df_census_personal_income_2021["SA3"].astype(str)

# Confirm the SA3 column dtype after the conversion.
df_census_personal_income_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 20 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   SA3                                358 non-null    object
 1   Negative income                    358 non-null    int64 
 2   Nil income                         358 non-null    int64 
 3   $1-$149 ($1-$7,799)                358 non-null    int64 
 4   $150-$299 ($7,800-$15,599)         358 non-null    int64 
 5   $300-$399 ($15,600-$20,799)        358 non-null    int64 
 6   $400-$499 ($20,800-$25,999)        358 non-null    int64 
 7   $500-$649 ($26,000-$33,799)        358 non-null    int64 
 8   $650-$799 ($33,800-$41,599)        358 non-null    int64 
 9   $800-$999 ($41,600-$51,999)        358 non-null    int64 
 10  $1,000-$1,249 ($52,000-$64,999)    358 non-null    int64 
 11  $1,250-$1,499 ($65,000-$77,999)    358 non-null    int64 
 12  $1,500-$

In [193]:
# Summary statistics for the numeric income-bracket columns
# (SA3 is not included because it has been converted to a string).
df_census_personal_income_2021.describe()

Unnamed: 0,Negative income,Nil income,"$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)","$300-$399 ($15,600-$20,799)","$400-$499 ($20,800-$25,999)","$500-$649 ($26,000-$33,799)","$650-$799 ($33,800-$41,599)","$800-$999 ($41,600-$51,999)","$1,000-$1,249 ($52,000-$64,999)","$1,250-$1,499 ($65,000-$77,999)","$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$2,000-$2,999 ($104,000-$155,999)","$3,000-$3,499 ($156,000-$181,999)","$3,500 or more ($182,000 or more)",Not stated,Not applicable,Total
count,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0
mean,410.128492,4663.458101,1898.835196,2807.393855,4395.575419,4417.972067,4221.01676,4019.379888,4694.360335,5214.455307,3918.687151,3433.002793,2538.002793,4399.527933,1023.340782,1850.969274,4151.452514,12955.231844,71013.212291
std,358.365204,4315.454299,1386.842498,2096.979903,3319.643804,3177.695822,3022.077089,2926.346033,3542.528195,3940.877698,3017.026662,2639.316859,1982.135688,3793.497265,1046.862931,2527.800581,2892.789254,10472.254115,51836.230309
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,182.5,1902.25,975.25,1398.75,1991.5,2151.75,2012.5,1863.5,2165.75,2492.75,1792.0,1536.25,1088.25,1653.0,291.75,444.75,2115.0,6407.5,36142.5
50%,325.0,3274.5,1659.0,2440.0,3752.0,3959.0,3729.0,3442.5,4006.5,4469.5,3292.5,2815.0,2041.5,3272.5,690.5,1076.0,3588.5,10077.5,58863.5
75%,517.25,5820.5,2575.5,3672.75,6067.75,6215.5,5751.75,5484.0,6405.75,6991.25,5466.5,4827.25,3571.75,6180.25,1377.5,2166.0,5259.75,17197.25,96576.5
max,2204.0,26787.0,6729.0,11954.0,20746.0,15831.0,14272.0,14949.0,20376.0,22909.0,16555.0,14067.0,11893.0,26688.0,7737.0,17716.0,16110.0,74874.0,296748.0


In [194]:
# Remove the "Total" column from the income dataset. Its values appear to match
# total_population in the gender dataset, so it would be redundant after merging.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second run no longer raises KeyError for the missing column.
df_census_personal_income_2021 = df_census_personal_income_2021.drop(
    columns=["Total"], errors="ignore"
)

# Show the remaining columns to confirm "Total" is gone.
df_census_personal_income_2021.columns

Index(['SA3', 'Negative income', 'Nil income', '$1-$149 ($1-$7,799)',
       '$150-$299 ($7,800-$15,599)', '$300-$399 ($15,600-$20,799)',
       '$400-$499 ($20,800-$25,999)', '$500-$649 ($26,000-$33,799)',
       '$650-$799 ($33,800-$41,599)', '$800-$999 ($41,600-$51,999)',
       '$1,000-$1,249 ($52,000-$64,999)', '$1,250-$1,499 ($65,000-$77,999)',
       '$1,500-$1,749 ($78,000-$90,999)', '$1,750-$1,999 ($91,000-$103,999)',
       '$2,000-$2,999 ($104,000-$155,999)',
       '$3,000-$3,499 ($156,000-$181,999)',
       '$3,500 or more ($182,000 or more)', 'Not stated', 'Not applicable'],
      dtype='object')

### Import & Clean 2021 Population-Gender Dataset

In [195]:
# Load the 2021 population-by-sex CSV from the original census data.
gender_2021_csv = os.path.join(
    path, "original_datasets/census_data/2021/2021_population_sex.csv"
)
df_census_gender_2021 = pd.read_csv(
    gender_2021_csv,
    encoding="ISO-8859-1",
    index_col=False,
)

# Preview the first rows to check the columns were parsed as expected.
df_census_gender_2021.head()

Unnamed: 0,SA3,male_pop,female_pop,total_population
0,10102,32664,32122,64793
1,10103,10674,10046,20717
2,10104,37632,39101,76736
3,10105,19335,19072,38403
4,10106,18787,19369,38159


In [196]:
# Summarise column names, non-null counts and dtypes of the gender dataset.
df_census_gender_2021.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   SA3               358 non-null    int64
 1   male_pop          358 non-null    int64
 2   female_pop        358 non-null    int64
 3   total_population  358 non-null    int64
dtypes: int64(4)
memory usage: 11.3 KB


In [197]:
# Store the SA3 codes as strings so they match the string SA3 keys of the
# age and income datasets when the three frames are merged on SA3.
df_census_gender_2021["SA3"] = df_census_gender_2021["SA3"].astype(str)

# Confirm the SA3 column dtype after the conversion.
df_census_gender_2021["SA3"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 358 entries, 0 to 357
Series name: SA3
Non-Null Count  Dtype 
--------------  ----- 
358 non-null    object
dtypes: object(1)
memory usage: 2.9+ KB


In [198]:
# Summary statistics for the numeric population columns
# (SA3 is not included because it has been converted to a string).
df_census_gender_2021.describe()

Unnamed: 0,male_pop,female_pop,total_population
count,358.0,358.0,358.0
mean,35042.22905,35971.013966,71013.212291
std,25608.538012,26254.400918,51836.230309
min,0.0,0.0,0.0
25%,17899.0,18410.0,36142.5
50%,28869.5,29892.0,58863.5
75%,46866.25,49601.0,96576.5
max,149962.0,146783.0,296748.0


### Combine 2021 Age, Gender, Income Datasets

In [199]:
# Check that the age, income and gender datasets contain the same SA3 values
# before they are merged on SA3.

# Collect each dataset's SA3 values into a set (a set removes duplicates, if any).
sa3_age = set(df_census_age_2021["SA3"])
sa3_income = set(df_census_personal_income_2021["SA3"])
sa3_gender = set(df_census_gender_2021["SA3"])

# Number of distinct SA3 values in the age dataset - expected to be 358.
print(len(sa3_age))

# True only when all three sets hold exactly the same SA3 values; False means at
# least one dataset is missing, or has extra, SA3 values.
# (Despite its name, diff_sa3 == True means there is NO difference.)
diff_sa3 = sa3_age == sa3_income == sa3_gender
print(diff_sa3)

# Fail loudly instead of only printing: with mismatched or duplicated keys the
# inner merges on SA3 would silently drop or multiply rows.
assert len(sa3_age) == len(df_census_age_2021), "duplicate SA3 values in 2021 age dataset"
assert len(sa3_income) == len(
    df_census_personal_income_2021
), "duplicate SA3 values in 2021 income dataset"
assert len(sa3_gender) == len(
    df_census_gender_2021
), "duplicate SA3 values in 2021 gender dataset"
assert diff_sa3, "2021 age, income and gender datasets do not share the same SA3 values"

358
True


In [200]:
# Combine the three 2021 datasets on the SA3 code: age + income first, then gender.
# Inner joins keep only SA3 values present on both sides. validate="one_to_one"
# makes pandas raise a MergeError if SA3 is duplicated in either frame, instead
# of silently multiplying rows in the combined dataset.
df_census_age_income_2021 = df_census_age_2021.merge(
    df_census_personal_income_2021, how="inner", on="SA3", validate="one_to_one"
)
df_census_combined_2021 = df_census_age_income_2021.merge(
    df_census_gender_2021, how="inner", on="SA3", validate="one_to_one"
)

# (rows, columns) of the combined frame.
df_census_combined_2021.shape

(358, 28)

In [201]:
# Summarise column names, non-null counts and dtypes of the combined 2021 dataset.
df_census_combined_2021.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 358 entries, 0 to 357
Data columns (total 28 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   SA3                                358 non-null    object
 1   age_0-14                           358 non-null    int64 
 2   age_15-24                          358 non-null    int64 
 3   age_25-44                          358 non-null    int64 
 4   age_45-64                          358 non-null    int64 
 5   age_65-79                          358 non-null    int64 
 6   age_80+                            358 non-null    int64 
 7   Negative income                    358 non-null    int64 
 8   Nil income                         358 non-null    int64 
 9   $1-$149 ($1-$7,799)                358 non-null    int64 
 10  $150-$299 ($7,800-$15,599)         358 non-null    int64 
 11  $300-$399 ($15,600-$20,799)        358 non-null    int64 
 12  $400-$49

In [202]:
# Summary statistics for the numeric columns of the combined 2021 dataset.
df_census_combined_2021.describe()

Unnamed: 0,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+,Negative income,Nil income,"$1-$149 ($1-$7,799)","$150-$299 ($7,800-$15,599)",...,"$1,500-$1,749 ($78,000-$90,999)","$1,750-$1,999 ($91,000-$103,999)","$2,000-$2,999 ($104,000-$155,999)","$3,000-$3,499 ($156,000-$181,999)","$3,500 or more ($182,000 or more)",Not stated,Not applicable,male_pop,female_pop,total_population
count,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,...,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0,358.0
mean,12955.231844,8484.231844,19867.153631,17477.293296,9165.231844,3064.178771,410.128492,4663.458101,1898.835196,2807.393855,...,3433.002793,2538.002793,4399.527933,1023.340782,1850.969274,4151.452514,12955.231844,35042.22905,35971.013966,71013.212291
std,10472.254115,6732.953512,16925.289416,12001.958004,6232.718495,2326.576573,358.365204,4315.454299,1386.842498,2096.979903,...,2639.316859,1982.135688,3793.497265,1046.862931,2527.800581,2892.789254,10472.254115,25608.538012,26254.400918,51836.230309
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6407.5,3908.0,8553.5,9219.75,4794.25,1394.25,182.5,1902.25,975.25,1398.75,...,1536.25,1088.25,1653.0,291.75,444.75,2115.0,6407.5,17899.0,18410.0,36142.5
50%,10077.5,6530.5,15206.0,15096.5,8262.0,2674.5,325.0,3274.5,1659.0,2440.0,...,2815.0,2041.5,3272.5,690.5,1076.0,3588.5,10077.5,28869.5,29892.0,58863.5
75%,17197.25,11344.75,27403.0,23725.75,12972.5,4213.5,517.25,5820.5,2575.5,3672.75,...,4827.25,3571.75,6180.25,1377.5,2166.0,5259.75,17197.25,46866.25,49601.0,96576.5
max,74874.0,34675.0,110560.0,55602.0,33680.0,12294.0,2204.0,26787.0,6729.0,11954.0,...,14067.0,11893.0,26688.0,7737.0,17716.0,16110.0,74874.0,149962.0,146783.0,296748.0


In [203]:
# Export the combined 2021 census dataset as a pickle file.
# NOTE(review): the file name spells "cenus" (not "census"); it is left unchanged
# because other code may load the file by this exact name - confirm before renaming.
combined_2021_pkl = os.path.join(
    path, "clean_datasets/census_data/2021_cenus_combined.pkl"
)
df_census_combined_2021.to_pickle(combined_2021_pkl)