# Census & MBS: Combine Datasets

In [662]:
# import libraries
import pandas as pd
import numpy as np
import os

# let pandas display up to 100 rows before truncating printed frames/series
pd.set_option("display.max_rows", 100)

## Import & Ready MBS Combined Dataset

In [663]:
# import the transformed mbs file and assign to a dataframe

# Root folder that holds the project datasets. The MBS_DATASETS_DIR environment
# variable overrides it so the notebook can run on another machine without
# editing this cell; the original location is kept as the default.
path = os.environ.get(
    "MBS_DATASETS_DIR",
    r"/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/",
)

# NOTE(review): unpickling executes code stored in the file - only load pickles
# produced by this project's own scripts.
df_mbs_2014_23 = pd.read_pickle(
    os.path.join(path, "clean_datasets/mbs_data/2014-22_phc_combined_mbs.pkl")
)
# The first positional argument of info() is `verbose`, so the former info(10)
# simply requested the full per-column summary; say so explicitly.
df_mbs_2014_23.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258535 entries, 0 to 258534
Data columns (total 16 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Year                                          258535 non-null  int64  
 1   StateTerritory                                258535 non-null  object 
 2   GeographicCode                                258535 non-null  object 
 3   GeographicAreaName                            258535 non-null  object 
 4   GeographicGroup                               258535 non-null  object 
 5   ServiceLevel                                  258535 non-null  object 
 6   Service                                       258535 non-null  object 
 7   DemographicGroup                              258535 non-null  object 
 8   Medicare benefits per 100 people ($)          234301 non-null  float64
 9   No. of patients                               23

In [664]:
# shorter, code-friendly names for the MBS measure columns
mbs_column_aliases = {
    "Medicare benefits per 100 people ($)": "MBS_per_100",
    "No. of patients": "No_of_patients",
    "No. of services": "No_of_services",
    "Percentage of people who had the service (%)": "%_People_had_service",
    "Services per 100 people": "Services_100_people",
    "Total Medicare benefits paid ($)": "Total_mbs_paid_$",
    "Total provider fees ($)": "Total_provider_fees_$",
    "Estimated resident population": "ERP",
}
df_mbs_2014_23 = df_mbs_2014_23.rename(columns=mbs_column_aliases)

In [665]:
# build a "<Year>-<GeographicCode>" key for reference and comparison
df_mbs_2014_23["key"] = (
    df_mbs_2014_23["Year"]
    .astype(str)
    .str.cat(df_mbs_2014_23["GeographicCode"], sep="-")
)
df_mbs_2014_23.head(3)

Unnamed: 0,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,MBS_per_100,No_of_patients,No_of_services,%_People_had_service,Services_100_people,Total_mbs_paid_$,Total_provider_fees_$,ERP,key
0,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,2576.0,5624,10879,17.27,33.41,838549.0,1026474.0,32558,2014-80101
1,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,4004.0,7714,15870,24.75,50.93,1247656.0,1600846.0,31163,2014-80101
2,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,4672.0,8998,15754,41.32,72.35,1017264.0,1197133.0,21774,2014-80101


### MBS: Subset National & State Level Data

Subset the MBS data by separating State-level from National-level data. This is done by creating 3 new datasets containing:
1. National-SA3Group
2. National
3. State

Rows whose StateTerritory is National-SA3Group or National will be removed from the State MBS dataset.

In [666]:
# row count per StateTerritory value, to see which labels are not actual states
df_mbs_2014_23["StateTerritory"].value_counts()

NSW                  68503
Qld                  61334
Vic                  49367
WA                   25414
SA                   20941
Tas                  11210
ACT                   7480
NT                    6721
National-SA3Group     4488
Other Territories     2827
National               250
Name: StateTerritory, dtype: int64

#### Subset National Level Data

In [667]:
# National-level subset of the MBS data. Note: this only has data from 2020-2022
df_national_au = df_mbs_2014_23.loc[df_mbs_2014_23["StateTerritory"] == "National"]
print(df_national_au.shape)
print(df_national_au["Year"].value_counts(dropna=False))

(250, 17)
2022    84
2020    83
2021    83
Name: Year, dtype: int64


#### Subset SA3 Group Level Data

In [668]:
# National-SA3Group subset: rows whose State value was updated in script 1 before pivoting
df_sa3_group_au = df_mbs_2014_23.loc[
    df_mbs_2014_23["StateTerritory"] == "National-SA3Group"
]
print(df_sa3_group_au.shape)
print(df_sa3_group_au["Year"].value_counts(dropna=False))

(4488, 17)
2022    504
2014    498
2015    498
2016    498
2017    498
2018    498
2019    498
2020    498
2021    498
Name: Year, dtype: int64


There are 6 more rows in 2022 than in the other years. The investigation below found that "Other GP Services" was added to the Service Level 3 list in 2022 (one extra row for each of the 6 SA3 groups).

In [669]:
# count rows per (Year, GeographicCode) pair; 2022 shows one extra service row for each area
df_sa3_group_au[["Year", "GeographicCode"]].value_counts(dropna=False)

Year  GeographicCode
2022  004-06            84
      004-05            84
      004-04            84
      004-03            84
      004-02            84
      004-01            84
2020  004-03            83
2018  004-06            83
2019  004-01            83
      004-02            83
      004-03            83
      004-04            83
      004-05            83
      004-06            83
2020  004-01            83
      004-02            83
      004-06            83
      004-04            83
      004-05            83
2014  004-02            83
2021  004-01            83
      004-02            83
      004-03            83
      004-04            83
      004-05            83
      004-06            83
2018  004-05            83
2014  004-01            83
2018  004-03            83
      004-02            83
2014  004-03            83
      004-04            83
      004-05            83
      004-06            83
2015  004-01            83
      004-02            83
      0

In [670]:
# distinct Level 3 service names reported in 2021 and in 2022, to identify
# the service that was newly added in 2022
df_service_level_3_2021 = df_sa3_group_au.loc[
    df_sa3_group_au["Year"].eq(2021) & df_sa3_group_au["ServiceLevel"].eq("Level 3"),
    "Service",
].unique()
df_service_level_3_2022 = df_sa3_group_au.loc[
    df_sa3_group_au["Year"].eq(2022) & df_sa3_group_au["ServiceLevel"].eq("Level 3"),
    "Service",
].unique()

# services present in the 2022 list but absent from the 2021 list
np.setdiff1d(df_service_level_3_2022, df_service_level_3_2021)

array(['Other GP Services'], dtype=object)

### Subset State Level SA3 Data

In [671]:
# keep only state-level rows: everything not labelled National / National-SA3Group
national_labels = ["National", "National-SA3Group"]
is_national_row = df_mbs_2014_23["StateTerritory"].isin(national_labels)
df_mbs_state_sa3_complete = df_mbs_2014_23[~is_national_row]
df_mbs_state_sa3_complete.shape

(253797, 17)

In [672]:
# confirm the National and National-SA3Group labels no longer appear in the state subset
df_mbs_state_sa3_complete["StateTerritory"].value_counts(dropna=False)

NSW                  68503
Qld                  61334
Vic                  49367
WA                   25414
SA                   20941
Tas                  11210
ACT                   7480
NT                    6721
Other Territories     2827
Name: StateTerritory, dtype: int64

In [673]:
# number of rows per GeographicCode in the state subset
df_mbs_state_sa3_complete["GeographicCode"].value_counts(dropna=False)

80101    748
40203    748
40703    748
40702    748
40701    748
        ... 
90104    720
10803    691
90101    691
90102    668
12402    498
Name: GeographicCode, Length: 340, dtype: int64

In [674]:
# first and last year covered by the state subset
state_years = df_mbs_state_sa3_complete["Year"]
print(state_years.min())
print(state_years.max())

2014
2022


### Pivot State MBS by Demographic Data

In [675]:
# list the distinct DemographicGroup values; these become the column suffixes after the pivot
df_mbs_state_sa3_complete["DemographicGroup"].unique()

array(['0-24', '25-44', '45-64', '65+', 'All persons', 'Females', 'Males'],
      dtype=object)

In [676]:
# preview the long-format rows (one row per DemographicGroup) before pivoting
df_mbs_state_sa3_complete.head(5)

Unnamed: 0,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,MBS_per_100,No_of_patients,No_of_services,%_People_had_service,Services_100_people,Total_mbs_paid_$,Total_provider_fees_$,ERP,key
0,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,2576.0,5624,10879,17.27,33.41,838549.0,1026474.0,32558,2014-80101
1,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,4004.0,7714,15870,24.75,50.93,1247656.0,1600846.0,31163,2014-80101
2,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,4672.0,8998,15754,41.32,72.35,1017264.0,1197133.0,21774,2014-80101
3,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),65+,5819.0,6397,12316,55.07,106.01,675946.0,761837.0,11617,2014-80101
4,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),All persons,3892.0,28733,54818,29.59,56.45,3779415.0,4586290.0,97112,2014-80101


In [677]:
# Reshape with pivot_table: one row per key/area/service combination, with each
# measure spread across the DemographicGroup values as separate columns.
pivot_row_keys = [
    "key",
    "Year",
    "StateTerritory",
    "GeographicCode",
    "GeographicAreaName",
    "GeographicGroup",
    "ServiceLevel",
    "Service",
]
pivot_measures = [
    "MBS_per_100",
    "No_of_patients",
    "No_of_services",
    "%_People_had_service",
    "Services_100_people",
    "Total_mbs_paid_$",
    "Total_provider_fees_$",
    "ERP",
]
df_mbs_state_sa3_pivot = df_mbs_state_sa3_complete.pivot_table(
    index=pivot_row_keys,
    columns="DemographicGroup",
    values=pivot_measures,
    aggfunc="first",  # pick the first value in case of duplicates
).reset_index()

In [678]:
# Flatten the two-level column labels: "<measure>_<demographic>" for pivoted
# measures, and just the first level where the second level is empty.
flat_column_names = []
for col in df_mbs_state_sa3_pivot.columns.values:
    if col[1]:
        flat_column_names.append("_".join(col).strip())
    else:
        flat_column_names.append(col[0])
df_mbs_state_sa3_pivot.columns = flat_column_names

In [679]:
# column summary of the pivoted frame, including non-null counts per demographic column
df_mbs_state_sa3_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162087 entries, 0 to 162086
Data columns (total 64 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   key                                162087 non-null  object 
 1   Year                               162087 non-null  int64  
 2   StateTerritory                     162087 non-null  object 
 3   GeographicCode                     162087 non-null  object 
 4   GeographicAreaName                 162087 non-null  object 
 5   GeographicGroup                    162087 non-null  object 
 6   ServiceLevel                       162087 non-null  object 
 7   Service                            162087 non-null  object 
 8   %_People_had_service_0-24          14724 non-null   float64
 9   %_People_had_service_25-44         14802 non-null   float64
 10  %_People_had_service_45-64         14767 non-null   float64
 11  %_People_had_service_65+           1474

In [680]:
# preview the wide-format result of the pivot
df_mbs_state_sa3_pivot.head(3)

Unnamed: 0,key,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,%_People_had_service_0-24,%_People_had_service_25-44,...,Total_mbs_paid_$_All persons,Total_mbs_paid_$_Females,Total_mbs_paid_$_Males,Total_provider_fees_$_0-24,Total_provider_fees_$_25-44,Total_provider_fees_$_45-64,Total_provider_fees_$_65+,Total_provider_fees_$_All persons,Total_provider_fees_$_Females,Total_provider_fees_$_Males
0,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),18.8,20.65,...,1938341.0,1201578.0,736763.0,580491.0,633972.0,717555.0,360781.0,2292799.0,1428964.0,863835.0
1,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Diagnostic Imaging (total),17.59,27.72,...,4702507.0,2761452.0,1941056.0,657658.0,1586180.0,2387889.0,1558873.0,6190599.0,3768680.0,2421920.0
2,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,GP attendances (total),76.02,74.84,...,10780077.0,6339946.0,4440131.0,3207438.0,3764233.0,4500909.0,3067848.0,14540427.0,8587817.0,5952610.0


In [681]:
# spot-check the pivot: two Level 1 services for SA3 80101 in 2014, to compare
# with the long-format rows previewed before the pivot
spot_check_services = ["Allied Health attendances (total)", "Diagnostic Imaging (total)"]
df_mbs_state_sa3_pivot.loc[
    df_mbs_state_sa3_pivot["Year"].eq(2014)
    & df_mbs_state_sa3_pivot["GeographicCode"].eq("80101")
    & df_mbs_state_sa3_pivot["ServiceLevel"].eq("Level 1")
    & df_mbs_state_sa3_pivot["Service"].isin(spot_check_services)
]

Unnamed: 0,key,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,%_People_had_service_0-24,%_People_had_service_25-44,...,Total_mbs_paid_$_All persons,Total_mbs_paid_$_Females,Total_mbs_paid_$_Males,Total_provider_fees_$_0-24,Total_provider_fees_$_25-44,Total_provider_fees_$_45-64,Total_provider_fees_$_65+,Total_provider_fees_$_All persons,Total_provider_fees_$_Females,Total_provider_fees_$_Males
17278,2014-80101,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),17.27,24.75,...,3779415.0,2394671.0,1384744.0,1026474.0,1600846.0,1197133.0,761837.0,4586290.0,2929743.0,1656547.0
17279,2014-80101,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Diagnostic Imaging (total),17.18,30.48,...,8274560.0,5024192.0,3250368.0,1051718.0,3173663.0,3601039.0,3040462.0,10866882.0,6807431.0,4059451.0


Note: there is no demographic breakdown for Service Level 2 and 3. Because of this, a significant number of blank values exist in the measure_demographic columns, except for the "All persons" columns.

In [682]:
# write the pivoted state dataset to CSV so the pivot can be analysed outside the notebook
pivot_csv_file = os.path.join(
    path, "clean_datasets/mbs_data/mbs_state_pivot_datatset_complete.csv"
)
df_mbs_state_sa3_pivot.to_csv(pivot_csv_file)

## Import & Ready Census Dataset

In [683]:
# import the transformed census file and assign to a dataframe

# NOTE(review): unpickling executes code stored in the file - only load pickles
# produced by this project's own scripts.
df_census_2011_22 = pd.read_pickle(
    os.path.join(path, "clean_datasets/census_data/2011_22_census_complete.pkl")
)
# The first positional argument of info() is `verbose`, so the former info(10)
# simply requested the full per-column summary; say so explicitly.
df_census_2011_22.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4296 entries, 0 to 4295
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   SA3                      4296 non-null   object 
 1   Year                     4296 non-null   int64  
 2   age_0-14                 4296 non-null   float64
 3   age_15-24                4296 non-null   float64
 4   age_25-44                4296 non-null   float64
 5   age_45-64                4296 non-null   float64
 6   age_65-79                4296 non-null   float64
 7   age_80+                  4296 non-null   float64
 8   negative_income          4296 non-null   float64
 9   no_income                4296 non-null   float64
 10  average_income_$5200     4296 non-null   float64
 11  average_income_$13000    4296 non-null   float64
 12  average_income_$18200    4296 non-null   float64
 13  average_income_$26000    4296 non-null   float64
 14  average_income_$36400   

### Extract data from 2014

In [684]:
# keep census rows from 2014 onwards so the years line up with the MBS range
from_2014_onwards = df_census_2011_22["Year"].ge(2014)
df_census_2014_22 = df_census_2011_22[from_2014_onwards]
df_census_2014_22["Year"].value_counts(dropna=False)

2014    358
2015    358
2016    358
2017    358
2018    358
2019    358
2020    358
2021    358
2022    358
Name: Year, dtype: int64

### Align Age Groups with MBS

1. 0-24
2. 25-44
3. 45-64
4. 65+

In [685]:
# work on an independent (deep) copy so the filtered census frame stays untouched
df_census_2014_22_new = df_census_2014_22.copy(deep=True)

In [686]:
# combine the 0-14 and 15-24 age brackets into a single 0-24 bracket
df_census_2014_22_new["age_0-24"] = df_census_2014_22_new["age_0-14"].add(
    df_census_2014_22_new["age_15-24"]
)
# combine the 65-79 and 80+ age brackets into a single 65+ bracket
df_census_2014_22_new["age_65+"] = df_census_2014_22_new["age_65-79"].add(
    df_census_2014_22_new["age_80+"]
)

In [687]:
# preview the census frame with the new age_0-24 and age_65+ columns appended
df_census_2014_22_new.head(3)

Unnamed: 0,SA3,Year,age_0-14,age_15-24,age_25-44,age_45-64,age_65-79,age_80+,negative_income,no_income,...,average_income_$91000,average_income_$130000,not_stated,not_applicable,male_pop,female_pop,total_population,average_income_$169000+,age_0-24,age_65+
3,10102,2014,11151.0,6831.0,15725.0,15715.2,5045.8,1332.6,166.2,2839.0,...,5117.2,4088.6,3787.6,11151.0,28038.8,27765.2,55804.2,,17982.0,6378.4
4,10102,2015,11193.0,6854.0,15782.0,16033.6,5337.4,1363.8,163.6,2951.0,...,5324.6,4089.8,4047.8,11193.0,28434.4,28133.6,56568.6,,18047.0,6701.2
5,10102,2016,11235.0,6877.0,15839.0,16352.0,5629.0,1395.0,161.0,3063.0,...,5532.0,4091.0,4308.0,11235.0,28830.0,28502.0,57333.0,1812.0,18112.0,7024.0


In [688]:
# The aggregate age_0-24 and age_65+ columns now exist, so the four source
# brackets they were built from are no longer needed.
superseded_age_columns = ["age_0-14", "age_15-24", "age_65-79", "age_80+"]
df_census_2014_22_new = df_census_2014_22_new.drop(columns=superseded_age_columns)
df_census_2014_22_new.columns

Index(['SA3', 'Year', 'age_25-44', 'age_45-64', 'negative_income', 'no_income',
       'average_income_$5200', 'average_income_$13000',
       'average_income_$18200', 'average_income_$26000',
       'average_income_$36400', 'average_income_$46800',
       'average_income_$58500', 'average_income_$71500',
       'average_income_$91000', 'average_income_$130000', 'not_stated',
       'not_applicable', 'male_pop', 'female_pop', 'total_population',
       'average_income_$169000+', 'age_0-24', 'age_65+'],
      dtype='object')

### Compare SA3 values

Some SA3 values exist in the census data but not in MBS. This is expected and is not an issue for the join.

Create a Year-SA3 key on the census data to find any Year-SA3 combinations that are missing.

In [689]:
# build a "<Year>-<SA3>" key for reference and investigations
df_census_2014_22_new["key"] = (
    df_census_2014_22_new["Year"]
    .astype(str)
    .str.cat(df_census_2014_22_new["SA3"], sep="-")
)

In [690]:
# reminder of the MBS pivot layout (including its key column) before comparing keys
df_mbs_state_sa3_pivot.head(3)

Unnamed: 0,key,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,%_People_had_service_0-24,%_People_had_service_25-44,...,Total_mbs_paid_$_All persons,Total_mbs_paid_$_Females,Total_mbs_paid_$_Males,Total_provider_fees_$_0-24,Total_provider_fees_$_25-44,Total_provider_fees_$_45-64,Total_provider_fees_$_65+,Total_provider_fees_$_All persons,Total_provider_fees_$_Females,Total_provider_fees_$_Males
0,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),18.8,20.65,...,1938341.0,1201578.0,736763.0,580491.0,633972.0,717555.0,360781.0,2292799.0,1428964.0,863835.0
1,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Diagnostic Imaging (total),17.59,27.72,...,4702507.0,2761452.0,1941056.0,657658.0,1586180.0,2387889.0,1558873.0,6190599.0,3768680.0,2421920.0
2,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,GP attendances (total),76.02,74.84,...,10780077.0,6339946.0,4440131.0,3207438.0,3764233.0,4500909.0,3067848.0,14540427.0,8587817.0,5952610.0


In [691]:
# unique Year-SA3 keys present in each dataset, used to compare their coverage
census_sa3_list = pd.Series(df_census_2014_22_new["key"].unique())
mbs_sa3_list = pd.Series(df_mbs_state_sa3_pivot["key"].unique())

# The former `census_sa3_list.to_clipboard()` call was removed: it was an
# interactive convenience that overwrote the user's clipboard and can raise on
# a kernel without a clipboard backend, which breaks Restart & Run All.

In [692]:
# MBS keys that have no matching census key; an empty result means every MBS
# Year-SA3 combination will find a census row in the join
mbs_key_in_census = mbs_sa3_list.isin(census_sa3_list)
diff_values = mbs_sa3_list[~mbs_key_in_census]
diff_values

Series([], dtype: object)

`diff_values` is empty, which shows that every key in MBS also exists in the census data.

In [693]:
# inspect the 2014-2015 census rows of SA3 90104, a code added in 2016 whose
# earlier years carry backfilled population values
df_census_2014_22_new.loc[
    df_census_2014_22_new["Year"].isin([2014, 2015])
    & df_census_2014_22_new["SA3"].eq("90104")
]

Unnamed: 0,SA3,Year,age_25-44,age_45-64,negative_income,no_income,average_income_$5200,average_income_$13000,average_income_$18200,average_income_$26000,...,average_income_$130000,not_stated,not_applicable,male_pop,female_pop,total_population,average_income_$169000+,age_0-24,age_65+,key
4263,90104,2014,341.0,577.0,9.0,67.0,54.0,98.0,146.0,344.0,...,33.0,138.0,296.0,820.0,925.0,1748.0,,406.0,413.0,2014-90104
4264,90104,2015,341.0,577.0,9.0,67.0,54.0,98.0,146.0,344.0,...,33.0,138.0,296.0,820.0,925.0,1748.0,,406.0,413.0,2015-90104


## Combine MBS and Census

Combining the MBS state complete dataset with the census data. Expected 162,087 rows and 88 columns (89 including the `_merge` indicator column added by the join). The join occurs on SA3 (GeographicCode) and Year.

In [694]:
# size of the left side of the join (MBS pivot)
df_mbs_state_sa3_pivot.shape

(162087, 64)

In [695]:
# size of the right side of the join (census)
df_census_2014_22_new.shape

(3222, 25)

In [696]:
# Left-join the census attributes onto every MBS row by Year and SA3 code.
# Both frames carry a "key" column, so the result holds key_x (MBS) and
# key_y (census); indicator=True adds a `_merge` column showing whether a
# census match was found for each row.
# validate="many_to_one" makes pandas raise a MergeError if the census frame
# has more than one row for a (Year, SA3) pair, instead of silently
# duplicating MBS rows and inflating the combined row count.
df_census_mbs_combined = df_mbs_state_sa3_pivot.merge(
    df_census_2014_22_new,
    how="left",
    left_on=["Year", "GeographicCode"],
    right_on=["Year", "SA3"],
    indicator=True,
    validate="many_to_one",
)

In [697]:
# the row count should equal the MBS pivot row count, since this is a left join
df_census_mbs_combined.shape

(162087, 89)

In [698]:
# column summary of the combined MBS + census frame
df_census_mbs_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162087 entries, 0 to 162086
Data columns (total 89 columns):
 #   Column                             Non-Null Count   Dtype   
---  ------                             --------------   -----   
 0   key_x                              162087 non-null  object  
 1   Year                               162087 non-null  int64   
 2   StateTerritory                     162087 non-null  object  
 3   GeographicCode                     162087 non-null  object  
 4   GeographicAreaName                 162087 non-null  object  
 5   GeographicGroup                    162087 non-null  object  
 6   ServiceLevel                       162087 non-null  object  
 7   Service                            162087 non-null  object  
 8   %_People_had_service_0-24          14724 non-null   float64 
 9   %_People_had_service_25-44         14802 non-null   float64 
 10  %_People_had_service_45-64         14767 non-null   float64 
 11  %_People_had_service_65+  

In [699]:
# number of missing values in each column of the combined frame
df_census_mbs_combined.isnull().sum()

key_x                                     0
Year                                      0
StateTerritory                            0
GeographicCode                            0
GeographicAreaName                        0
GeographicGroup                           0
ServiceLevel                              0
Service                                   0
%_People_had_service_0-24            147363
%_People_had_service_25-44           147285
%_People_had_service_45-64           147320
%_People_had_service_65+             147345
%_People_had_service_All persons      21276
%_People_had_service_Females         147226
%_People_had_service_Males           147225
ERP_0-24                             146812
ERP_25-44                            146802
ERP_45-64                            146817
ERP_65+                              146807
ERP_All persons                           0
ERP_Females                          146802
ERP_Males                            146802
MBS_per_100_0-24                

Investigated the large number of nulls. They are due to Service Level 2 and 3 not having gender and age-bracket information. Because of this, the data is split into 3 datasets for the analysis:

1. 2014_22_mbs_census_combined : full combined dataset
2. 2014_22_mbs_census_service_level_1
3. 2014_22_mbs_census_service_level_2_3

In [700]:
# save the complete combined census + MBS dataset as a pickle
combined_pickle_file = os.path.join(
    path, "clean_datasets/2014_22_mbs_census_combined.pkl"
)
df_census_mbs_combined.to_pickle(combined_pickle_file)

#### Subset Service Level 1

In [701]:
# Service Level 1 subset of the combined dataset
is_service_level_1 = df_census_mbs_combined["ServiceLevel"].eq("Level 1")
df_mbs_census_service_level_1 = df_census_mbs_combined[is_service_level_1]
df_mbs_census_service_level_1.shape

(15285, 89)

In [702]:
# missing values per column within the Service Level 1 subset
df_mbs_census_service_level_1.isnull().sum()

key_x                                   0
Year                                    0
StateTerritory                          0
GeographicCode                          0
GeographicAreaName                      0
GeographicGroup                         0
ServiceLevel                            0
Service                                 0
%_People_had_service_0-24             561
%_People_had_service_25-44            483
%_People_had_service_45-64            518
%_People_had_service_65+              543
%_People_had_service_All persons      420
%_People_had_service_Females          424
%_People_had_service_Males            423
ERP_0-24                               10
ERP_25-44                               0
ERP_45-64                              15
ERP_65+                                 5
ERP_All persons                         0
ERP_Females                             0
ERP_Males                               0
MBS_per_100_0-24                      561
MBS_per_100_25-44                 

In [703]:
# display the Service Level 1 rows for 2014 (the notebook shows a truncated view)
df_mbs_census_service_level_1[df_mbs_census_service_level_1["Year"] == 2014]

Unnamed: 0,key_x,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,%_People_had_service_0-24,%_People_had_service_25-44,...,not_stated,not_applicable,male_pop,female_pop,total_population,average_income_$169000+,age_0-24,age_65+,key_y,_merge
0,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),18.80,20.65,...,3787.6,11151.0,28038.8,27765.2,55804.2,,17982.0,6378.4,2014-10102,both
1,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Diagnostic Imaging (total),17.59,27.72,...,3787.6,11151.0,28038.8,27765.2,55804.2,,17982.0,6378.4,2014-10102,both
2,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,GP attendances (total),76.02,74.84,...,3787.6,11151.0,28038.8,27765.2,55804.2,,17982.0,6378.4,2014-10102,both
3,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Nursing and Aboriginal Health Workers (total),1.14,0.59,...,3787.6,11151.0,28038.8,27765.2,55804.2,,17982.0,6378.4,2014-10102,both
4,2014-10102,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Specialist attendances (total),14.60,17.92,...,3787.6,11151.0,28038.8,27765.2,55804.2,,17982.0,6378.4,2014-10102,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17967,2014-90104,2014,Other Territories,90104,Norfolk Island,Remote (incl. very remote),Level 1,Allied Health attendances (total),,,...,138.0,296.0,820.0,925.0,1748.0,,406.0,413.0,2014-90104,both
17968,2014-90104,2014,Other Territories,90104,Norfolk Island,Remote (incl. very remote),Level 1,Diagnostic Imaging (total),,,...,138.0,296.0,820.0,925.0,1748.0,,406.0,413.0,2014-90104,both
17969,2014-90104,2014,Other Territories,90104,Norfolk Island,Remote (incl. very remote),Level 1,GP attendances (total),,,...,138.0,296.0,820.0,925.0,1748.0,,406.0,413.0,2014-90104,both
17970,2014-90104,2014,Other Territories,90104,Norfolk Island,Remote (incl. very remote),Level 1,Nursing and Aboriginal Health Workers (total),,,...,138.0,296.0,820.0,925.0,1748.0,,406.0,413.0,2014-90104,both


In [704]:
# For the SA3 codes introduced in 2016, look for rows still missing a
# total_population value; an empty result means the backfill covered every year.
sa3_codes_added_2016 = [
    "80110", "80111", "10106", "90104", "30805", "31608", "21704", "51003"
]
old_sa3_codes = df_mbs_census_service_level_1.loc[
    df_mbs_census_service_level_1["GeographicCode"].isin(sa3_codes_added_2016)
    & df_mbs_census_service_level_1["total_population"].isnull()
]
old_sa3_codes["Year"].value_counts(dropna=False)

Series([], Name: Year, dtype: int64)

In [705]:
# remove the columns that only served the join: the census-side key, the merge
# indicator and the SA3 code that was matched against GeographicCode
join_helper_columns = ["key_y", "_merge", "SA3"]
df_mbs_census_service_level_1_new = df_mbs_census_service_level_1.drop(
    join_helper_columns, axis="columns"
)

In [706]:
# save the cleaned Service Level 1 dataset (df_mbs_census_service_level_1_new) as a pickle
level_1_pickle_file = os.path.join(
    path, "clean_datasets/2014_22_mbs_census_service_level_1.pkl"
)
df_mbs_census_service_level_1_new.to_pickle(level_1_pickle_file)

#### Subset Service Level 2 & 3

In [707]:
# Service Level 2 and Level 3 subset of the combined dataset
is_service_level_2_or_3 = df_census_mbs_combined["ServiceLevel"].isin(
    ["Level 2", "Level 3"]
)
df_mbs_census_service_level_2_3 = df_census_mbs_combined.loc[is_service_level_2_or_3]
df_mbs_census_service_level_2_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146802 entries, 5 to 162086
Data columns (total 89 columns):
 #   Column                             Non-Null Count   Dtype   
---  ------                             --------------   -----   
 0   key_x                              146802 non-null  object  
 1   Year                               146802 non-null  int64   
 2   StateTerritory                     146802 non-null  object  
 3   GeographicCode                     146802 non-null  object  
 4   GeographicAreaName                 146802 non-null  object  
 5   GeographicGroup                    146802 non-null  object  
 6   ServiceLevel                       146802 non-null  object  
 7   Service                            146802 non-null  object  
 8   %_People_had_service_0-24          0 non-null       float64 
 9   %_People_had_service_25-44         0 non-null       float64 
 10  %_People_had_service_45-64         0 non-null       float64 
 11  %_People_had_service_65+  

In [708]:
# list every column name, to pick out the demographic-split columns to drop next
df_mbs_census_service_level_2_3.columns

Index(['key_x', 'Year', 'StateTerritory', 'GeographicCode',
       'GeographicAreaName', 'GeographicGroup', 'ServiceLevel', 'Service',
       '%_People_had_service_0-24', '%_People_had_service_25-44',
       '%_People_had_service_45-64', '%_People_had_service_65+',
       '%_People_had_service_All persons', '%_People_had_service_Females',
       '%_People_had_service_Males', 'ERP_0-24', 'ERP_25-44', 'ERP_45-64',
       'ERP_65+', 'ERP_All persons', 'ERP_Females', 'ERP_Males',
       'MBS_per_100_0-24', 'MBS_per_100_25-44', 'MBS_per_100_45-64',
       'MBS_per_100_65+', 'MBS_per_100_All persons', 'MBS_per_100_Females',
       'MBS_per_100_Males', 'No_of_patients_0-24', 'No_of_patients_25-44',
       'No_of_patients_45-64', 'No_of_patients_65+',
       'No_of_patients_All persons', 'No_of_patients_Females',
       'No_of_patients_Males', 'No_of_services_0-24', 'No_of_services_25-44',
       'No_of_services_45-64', 'No_of_services_65+',
       'No_of_services_All persons', 'No_of_services

In [709]:
# Level 2 and 3 rows hold no values in the age/gender split columns, so every
# "<measure>_<demographic split>" column is removed (the "All persons" columns
# stay), together with the join helper columns.
# The names are built from measure x split rather than matched by suffix,
# because census columns such as age_25-44 share the same endings and must stay.
demographic_splits = ["0-24", "25-44", "45-64", "65+", "Females", "Males"]
pivoted_measures = [
    "Total_provider_fees_$",
    "Services_100_people",
    "No_of_services",
    "%_People_had_service",
    "ERP",
    "MBS_per_100",
    "No_of_patients",
    "Total_mbs_paid_$",
]
demographic_split_columns = [
    f"{measure}_{split}"
    for measure in pivoted_measures
    for split in demographic_splits
]
df_mbs_census_service_level_2_3_new = df_mbs_census_service_level_2_3.drop(
    columns=demographic_split_columns + ["_merge", "SA3", "key_y"]
)
df_mbs_census_service_level_2_3_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146802 entries, 5 to 162086
Data columns (total 38 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   key_x                              146802 non-null  object 
 1   Year                               146802 non-null  int64  
 2   StateTerritory                     146802 non-null  object 
 3   GeographicCode                     146802 non-null  object 
 4   GeographicAreaName                 146802 non-null  object 
 5   GeographicGroup                    146802 non-null  object 
 6   ServiceLevel                       146802 non-null  object 
 7   Service                            146802 non-null  object 
 8   %_People_had_service_All persons   125946 non-null  float64
 9   ERP_All persons                    146802 non-null  Int64  
 10  MBS_per_100_All persons            125946 non-null  float64
 11  No_of_patients_All persons         1259

In [710]:
# save the cleaned Service Level 2 & 3 dataset (df_mbs_census_service_level_2_3_new) as a pickle
level_2_3_pickle_file = os.path.join(
    path, "clean_datasets/2014_22_mbs_census_service_level_2_3.pkl"
)
df_mbs_census_service_level_2_3_new.to_pickle(level_2_3_pickle_file)