# Combine: Census & MBS Datasets

This notebook combines the non-pivoted versions of the MBS and Census datasets, which makes the columns easier to manage and simplifies EDA. The MBS dataset contains service levels 1, 2 and 3. Out-of-pocket cost as a % of income is calculated again here.

1. Imported the Census data, pivoted the demographic columns into rows, and standardized the age values to be the same as in MBS.
2. Joined Census to the MBS dataset.
3. Derived OOP % by income for each income bracket.

In [4]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

## Import Census and MBS Datasets

### Import Census

In [5]:
# Load the cleaned, standardized census extract into a dataframe.

# Root folder holding the datasets; later cells reuse `path` for their own
# reads and writes, so it stays a notebook-level variable.
path = r"/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/"
census_pickle_path = os.path.join(
    path, "clean_datasets/census_data/2014_22_census_complelete_standardized.pkl"
)
df_census_2014_22 = pd.read_pickle(census_pickle_path)

# The original call was `info(10)`, which binds 10 positionally to the
# `verbose` flag. Spell the flag out so the full per-column summary is
# clearly what is being requested.
df_census_2014_22.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3222 entries, 3 to 4295
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   SA3                          3222 non-null   object
 1   Year                         3222 non-null   int64 
 2   age_25-44                    3222 non-null   int64 
 3   age_45-64                    3222 non-null   int64 
 4   negative_income              3222 non-null   int64 
 5   average_income_$5200         3222 non-null   int64 
 6   average_income_$13000        3222 non-null   int64 
 7   average_income_$18200        3222 non-null   int64 
 8   average_income_$26000        3222 non-null   int64 
 9   average_income_$36400        3222 non-null   int64 
 10  average_income_$46800        3222 non-null   int64 
 11  average_income_$58500        3222 non-null   int64 
 12  average_income_$71500        3222 non-null   int64 
 13  average_income_$91000        3222

### Pivot Demographic Columns to Rows

In [6]:
# Demographic measures that get unpivoted from columns into rows.
age_gender_p_columns = [
    f"age_{band}" for band in ("0-24", "25-44", "45-64", "65+")
] + ["male_pop", "female_pop", "total_population"]

# Identifier columns carried through unchanged; their values are expected to
# repeat once for every demographic row produced by the melt.
income_bracket_labels = (
    "5200",
    "13000",
    "18200",
    "26000",
    "36400",
    "46800",
    "58500",
    "71500",
    "91000",
    "130000",
    "169000+",
)
id_columns = [
    "key",
    "SA3",
    "Year",
    "negative_income",
    "no_income_or_not_applicable",
] + [f"average_income_${label}" for label in income_bracket_labels]

# Unpivot: the former column name lands in Population_demographic and its
# value in Population.
df_census_2014_22_pivot = pd.melt(
    df_census_2014_22,
    id_vars=id_columns,
    value_vars=age_gender_p_columns,
    var_name="Population_demographic",
    value_name="Population",
)
df_census_2014_22_pivot

Unnamed: 0,key,SA3,Year,negative_income,no_income_or_not_applicable,average_income_$5200,average_income_$13000,average_income_$18200,average_income_$26000,average_income_$36400,average_income_$46800,average_income_$58500,average_income_$71500,average_income_$91000,average_income_$130000,average_income_$169000+,Population_demographic,Population
0,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,age_0-24,17982
1,2015-10102,10102,2015,163,18191,1855,2282,2736,5106,3397,3908,4444,3616,5324,4089,0,age_0-24,18047
2,2016-10102,10102,2016,161,18606,1697,2188,2658,5389,3187,3921,4450,3639,5532,4091,1812,age_0-24,18112
3,2017-10102,10102,2017,180,18677,1652,2109,2618,5423,3129,3905,4543,3768,5993,4637,2182,age_0-24,18412
4,2018-10102,10102,2018,199,18748,1607,2031,2578,5458,3072,3889,4636,3898,6455,5183,2552,age_0-24,18713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22549,2018-99999,99999,2018,3,15,0,2,0,7,3,0,1,0,0,0,0,total_population,36
22550,2019-99999,99999,2019,2,11,0,1,0,5,3,0,1,0,0,0,0,total_population,28
22551,2020-99999,99999,2020,1,7,0,0,0,2,3,0,0,0,0,0,0,total_population,19
22552,2021-99999,99999,2021,0,3,0,0,0,0,3,0,0,0,0,0,0,total_population,11


In [7]:
# Summarise the pivoted census frame (row count, columns, non-null counts, dtypes).
df_census_2014_22_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22554 entries, 0 to 22553
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   key                          22554 non-null  object
 1   SA3                          22554 non-null  object
 2   Year                         22554 non-null  int64 
 3   negative_income              22554 non-null  int64 
 4   no_income_or_not_applicable  22554 non-null  int64 
 5   average_income_$5200         22554 non-null  int64 
 6   average_income_$13000        22554 non-null  int64 
 7   average_income_$18200        22554 non-null  int64 
 8   average_income_$26000        22554 non-null  int64 
 9   average_income_$36400        22554 non-null  int64 
 10  average_income_$46800        22554 non-null  int64 
 11  average_income_$58500        22554 non-null  int64 
 12  average_income_$71500        22554 non-null  int64 
 13  average_income_$91000        22

In [8]:
# Spot check: the single wide row for key 2014-10102 before the pivot,
# to compare against the pivoted rows for the same key.
df_census_2014_22[df_census_2014_22["key"] == "2014-10102"]

Unnamed: 0,SA3,Year,age_25-44,age_45-64,negative_income,average_income_$5200,average_income_$13000,average_income_$18200,average_income_$26000,average_income_$36400,...,average_income_$91000,average_income_$130000,male_pop,female_pop,total_population,average_income_$169000+,no_income_or_not_applicable,age_0-24,age_65+,key
3,10102,2014,15725,15715,166,2013,2376,2815,4824,3607,...,5117,4088,28038,27765,55804,0,17777,17982,6378,2014-10102


In [9]:
# Spot check: the same key after the pivot - one row per demographic column
# that was melted, with the id columns repeated.
df_census_2014_22_pivot[df_census_2014_22_pivot["key"] == "2014-10102"]

Unnamed: 0,key,SA3,Year,negative_income,no_income_or_not_applicable,average_income_$5200,average_income_$13000,average_income_$18200,average_income_$26000,average_income_$36400,average_income_$46800,average_income_$58500,average_income_$71500,average_income_$91000,average_income_$130000,average_income_$169000+,Population_demographic,Population
0,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,age_0-24,17982
3222,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,age_25-44,15725
6444,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,age_45-64,15715
9666,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,age_65+,6378
12888,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,male_pop,28038
16110,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,female_pop,27765
19332,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,total_population,55804


In [10]:
# List the distinct demographic labels; these need to be renamed to the
# labels used by MBS before the two datasets can be joined.
df_census_2014_22_pivot["Population_demographic"].unique()

array(['age_0-24', 'age_25-44', 'age_45-64', 'age_65+', 'male_pop',
       'female_pop', 'total_population'], dtype=object)

In [11]:
# Translate the census demographic labels into the labels MBS uses so the
# two datasets can be joined on demographic group.
demographic_mapping = {
    **{f"age_{band}": band for band in ("0-24", "25-44", "45-64", "65+")},
    "male_pop": "Males",
    "female_pop": "Females",
    "total_population": "All persons",
}
# Series.map yields NaN for any label that is missing from the mapping.
mapped_demographic = df_census_2014_22_pivot["Population_demographic"].map(
    demographic_mapping
)
df_census_2014_22_pivot["Population_demographic"] = mapped_demographic
df_census_2014_22_pivot["Population_demographic"].unique()

array(['0-24', '25-44', '45-64', '65+', 'Males', 'Females', 'All persons'],
      dtype=object)

In [12]:
# Spot check: key 2014-10102 again, now showing the renamed demographic labels.
df_census_2014_22_pivot[df_census_2014_22_pivot["key"] == "2014-10102"]

Unnamed: 0,key,SA3,Year,negative_income,no_income_or_not_applicable,average_income_$5200,average_income_$13000,average_income_$18200,average_income_$26000,average_income_$36400,average_income_$46800,average_income_$58500,average_income_$71500,average_income_$91000,average_income_$130000,average_income_$169000+,Population_demographic,Population
0,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,0-24,17982
3222,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,25-44,15725
6444,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,45-64,15715
9666,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,65+,6378
12888,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,Males,28038
16110,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,Females,27765
19332,2014-10102,10102,2014,166,17777,2013,2376,2815,4824,3607,3896,4438,3594,5117,4088,0,All persons,55804


In [13]:
# Load the cleaned, non-pivoted MBS extract into a dataframe.
# NOTE(review): the variable is named 2014_23 while the file name says
# 2014-22 - confirm which year range the data actually covers.
mbs_pickle_path = os.path.join(
    path, "clean_datasets/mbs_data/2014-22_mbs_state_complete_no_pivot.pkl"
)
df_mbs_2014_23 = pd.read_pickle(mbs_pickle_path)
df_mbs_2014_23.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229432 entries, 0 to 258533
Data columns (total 22 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Year                           229432 non-null  int64  
 1   StateTerritory                 229432 non-null  object 
 2   GeographicCode                 229432 non-null  object 
 3   GeographicAreaName             229432 non-null  object 
 4   GeographicGroup                229432 non-null  object 
 5   ServiceLevel                   229432 non-null  object 
 6   Service                        229432 non-null  object 
 7   DemographicGroup               229432 non-null  object 
 8   MBS_per_100                    229432 non-null  float64
 9   No_of_patients                 229432 non-null  Int32  
 10  No_of_services                 229432 non-null  Int64  
 11  %_People_had_service           229432 non-null  float64
 12  Services_100_people           

## Combine Census & MBS

Census will be joined onto MBS using a left join (MBS on the left). The key columns are Year, GeographicCode (SA3) and DemographicGroup (Population_demographic).

In [14]:
# (rows, columns) of the pivoted census frame, recorded before the join.
df_census_2014_22_pivot.shape

(22554, 18)

In [15]:
# (rows, columns) of the MBS frame; the joined frame should keep this row count.
df_mbs_2014_23.shape

(229432, 22)

Expected 229,432 rows and 40 columns (22 MBS + 18 Census − 1 shared `Year` key + 1 `_merge` indicator) - the number of rows will be the same as MBS. No nulls expected.

In [16]:
# Left join Census onto MBS: every MBS row is kept and gains the census
# columns for its Year / SA3 code / demographic group.
df_mbs_census_combined_np = df_mbs_2014_23.merge(
    df_census_2014_22_pivot,
    how="left",
    left_on=["Year", "GeographicCode", "DemographicGroup"],
    right_on=["Year", "SA3", "Population_demographic"],
    indicator=True,  # adds `_merge` so unmatched MBS rows can be counted
)

# A left join only preserves the MBS row count while each key combination
# occurs at most once in the census frame. Fail loudly instead of silently
# carrying duplicated MBS rows into the rest of the analysis.
if len(df_mbs_census_combined_np) != len(df_mbs_2014_23):
    raise ValueError(
        "Census join changed the MBS row count: "
        f"{len(df_mbs_2014_23)} -> {len(df_mbs_census_combined_np)}"
    )
df_mbs_census_combined_np.shape

(229432, 40)

In [17]:
# Count rows per merge outcome; any "left_only" rows would be MBS rows with
# no matching census record.
df_mbs_census_combined_np["_merge"].value_counts(dropna=False)

both          229432
left_only          0
right_only         0
Name: _merge, dtype: int64

In [18]:
# Count missing values per column in the joined frame (none are expected).
df_mbs_census_combined_np.isna().sum()

Year                             0
StateTerritory                   0
GeographicCode                   0
GeographicAreaName               0
GeographicGroup                  0
ServiceLevel                     0
Service                          0
DemographicGroup                 0
MBS_per_100                      0
No_of_patients                   0
No_of_services                   0
%_People_had_service             0
Services_100_people              0
Total_mbs_paid_$                 0
Total_provider_fees_$            0
ERP                              0
key_x                            0
Out_of_Pocket                    0
Out_of_pocket_cost_%             0
Out_of_pocket_cost_per_person    0
No_of_service_per_person         0
Patient_ERP_Flag                 0
key_y                            0
SA3                              0
negative_income                  0
no_income_or_not_applicable      0
average_income_$5200             0
average_income_$13000            0
average_income_$1820

In [19]:
# Spot check one key and one service to confirm the join attached the right
# census population to each demographic group.
df_mbs_census_combined_np[
    (df_mbs_census_combined_np["key_x"] == "2014-10102")
    & (df_mbs_census_combined_np["Service"] == "Allied Health attendances (total)")
]

Unnamed: 0,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,MBS_per_100,No_of_patients,...,average_income_$36400,average_income_$46800,average_income_$58500,average_income_$71500,average_income_$91000,average_income_$130000,average_income_$169000+,Population_demographic,Population,_merge
508,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),0-24,2534.0,3536,...,3607,3896,4438,3594,5117,4088,0,0-24,17982,both
509,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),25-44,3160.0,3377,...,3607,3896,4438,3594,5117,4088,0,25-44,15725,both
510,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),45-64,3924.0,5547,...,3607,3896,4438,3594,5117,4088,0,45-64,15715,both
511,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),65+,5215.0,3128,...,3607,3896,4438,3594,5117,4088,0,65+,6378,both
512,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),All persons,3390.0,15587,...,3607,3896,4438,3594,5117,4088,0,All persons,55804,both
513,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),Females,4240.0,9172,...,3607,3896,4438,3594,5117,4088,0,Females,27765,both
514,2014,NSW,10102,Queanbeyan,Major cities - higher SES,Level 1,Allied Health attendances (total),Males,2554.0,6415,...,3607,3896,4438,3594,5117,4088,0,Males,28038,both


### Drop Unrequired Columns

In [20]:
# List all columns to identify the ones that are duplicated by the join or
# no longer required, so they can be dropped.
df_mbs_census_combined_np.columns

Index(['Year', 'StateTerritory', 'GeographicCode', 'GeographicAreaName',
       'GeographicGroup', 'ServiceLevel', 'Service', 'DemographicGroup',
       'MBS_per_100', 'No_of_patients', 'No_of_services',
       '%_People_had_service', 'Services_100_people', 'Total_mbs_paid_$',
       'Total_provider_fees_$', 'ERP', 'key_x', 'Out_of_Pocket',
       'Out_of_pocket_cost_%', 'Out_of_pocket_cost_per_person',
       'No_of_service_per_person', 'Patient_ERP_Flag', 'key_y', 'SA3',
       'negative_income', 'no_income_or_not_applicable',
       'average_income_$5200', 'average_income_$13000',
       'average_income_$18200', 'average_income_$26000',
       'average_income_$36400', 'average_income_$46800',
       'average_income_$58500', 'average_income_$71500',
       'average_income_$91000', 'average_income_$130000',
       'average_income_$169000+', 'Population_demographic', 'Population',
       '_merge'],
      dtype='object')

In [21]:
# Remove the join leftovers: the merge indicator, the census copy of the key
# and the SA3 code.
join_leftover_columns = ["_merge", "key_y", "SA3"]
df_mbs_census_combined_np = df_mbs_census_combined_np.drop(
    columns=join_leftover_columns
)
df_mbs_census_combined_np.columns

Index(['Year', 'StateTerritory', 'GeographicCode', 'GeographicAreaName',
       'GeographicGroup', 'ServiceLevel', 'Service', 'DemographicGroup',
       'MBS_per_100', 'No_of_patients', 'No_of_services',
       '%_People_had_service', 'Services_100_people', 'Total_mbs_paid_$',
       'Total_provider_fees_$', 'ERP', 'key_x', 'Out_of_Pocket',
       'Out_of_pocket_cost_%', 'Out_of_pocket_cost_per_person',
       'No_of_service_per_person', 'Patient_ERP_Flag', 'negative_income',
       'no_income_or_not_applicable', 'average_income_$5200',
       'average_income_$13000', 'average_income_$18200',
       'average_income_$26000', 'average_income_$36400',
       'average_income_$46800', 'average_income_$58500',
       'average_income_$71500', 'average_income_$91000',
       'average_income_$130000', 'average_income_$169000+',
       'Population_demographic', 'Population'],
      dtype='object')

## Derive %OOP from Income

In [22]:
# Work on a copy so the derived columns added below do not modify the joined frame.
df_mbs_census_combined_np_new = df_mbs_census_combined_np.copy()

In [23]:
# Express the out-of-pocket cost per person as a percentage of each income
# bracket value. Keys are the bracket labels used in the new column names;
# values are the dollar amounts the cost is divided by.
oop_income_divisors = {
    "5200": 5200,
    "13000": 13000,
    "18200": 18200,
    "26000": 26000,
    "36400": 36400,
    "46800": 46800,
    "58500": 58500,
    "71500": 71500,
    "91000": 91000,
    "130000": 130000,
    "169000+": 169000,
}
oop_per_person = df_mbs_census_combined_np_new["Out_of_pocket_cost_per_person"]
for bracket_label, income_amount in oop_income_divisors.items():
    df_mbs_census_combined_np_new[f"%_out_of_pocket_by_${bracket_label}"] = (
        oop_per_person / income_amount
    ) * 100

In [24]:
# Preview the first rows to eyeball the new %_out_of_pocket_by_* columns.
df_mbs_census_combined_np_new.head(3)

Unnamed: 0,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,MBS_per_100,No_of_patients,...,%_out_of_pocket_by_$13000,%_out_of_pocket_by_$18200,%_out_of_pocket_by_$26000,%_out_of_pocket_by_$36400,%_out_of_pocket_by_$46800,%_out_of_pocket_by_$58500,%_out_of_pocket_by_$71500,%_out_of_pocket_by_$91000,%_out_of_pocket_by_$130000,%_out_of_pocket_by_$169000+
0,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,2576.0,5624,...,0.257037,0.183598,0.128519,0.091799,0.071399,0.057119,0.046734,0.03672,0.025704,0.019772
1,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,4004.0,7714,...,0.352197,0.251569,0.176098,0.125785,0.097832,0.078266,0.064036,0.050314,0.03522,0.027092
2,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,4672.0,8998,...,0.153768,0.109835,0.076884,0.054917,0.042713,0.034171,0.027958,0.021967,0.015377,0.011828


In [25]:
# Count missing values per column after adding the derived percentage columns.
df_mbs_census_combined_np_new.isna().sum()

Year                             0
StateTerritory                   0
GeographicCode                   0
GeographicAreaName               0
GeographicGroup                  0
ServiceLevel                     0
Service                          0
DemographicGroup                 0
MBS_per_100                      0
No_of_patients                   0
No_of_services                   0
%_People_had_service             0
Services_100_people              0
Total_mbs_paid_$                 0
Total_provider_fees_$            0
ERP                              0
key_x                            0
Out_of_Pocket                    0
Out_of_pocket_cost_%             0
Out_of_pocket_cost_per_person    0
No_of_service_per_person         0
Patient_ERP_Flag                 0
negative_income                  0
no_income_or_not_applicable      0
average_income_$5200             0
average_income_$13000            0
average_income_$18200            0
average_income_$26000            0
average_income_$3640

In [26]:
# Round every derived out-of-pocket percentage column to 3 decimal places.
oop_pct_columns = [
    name
    for name in df_mbs_census_combined_np_new.columns
    if name.startswith("%_out_of_pocket_by")
]
for col in oop_pct_columns:
    rounded_values = df_mbs_census_combined_np_new[col].round(3)
    df_mbs_census_combined_np_new[col] = rounded_values

df_mbs_census_combined_np_new.head(3)

Unnamed: 0,Year,StateTerritory,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,MBS_per_100,No_of_patients,...,%_out_of_pocket_by_$13000,%_out_of_pocket_by_$18200,%_out_of_pocket_by_$26000,%_out_of_pocket_by_$36400,%_out_of_pocket_by_$46800,%_out_of_pocket_by_$58500,%_out_of_pocket_by_$71500,%_out_of_pocket_by_$91000,%_out_of_pocket_by_$130000,%_out_of_pocket_by_$169000+
0,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,2576.0,5624,...,0.257,0.184,0.129,0.092,0.071,0.057,0.047,0.037,0.026,0.02
1,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,4004.0,7714,...,0.352,0.252,0.176,0.126,0.098,0.078,0.064,0.05,0.035,0.027
2,2014,ACT,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,4672.0,8998,...,0.154,0.11,0.077,0.055,0.043,0.034,0.028,0.022,0.015,0.012


In [27]:
# Final (rows, columns) of the combined frame before export.
df_mbs_census_combined_np_new.shape

(229432, 48)

### Export MBS Census Combined Dataset

In [28]:
# Persist the combined MBS + census frame as a pickle under clean_datasets.
# NOTE(review): the file name spells "cenus"; it is left unchanged here in
# case other notebooks read this exact path - confirm before renaming.
df_mbs_census_combined_np_new.to_pickle(
    os.path.join(path, "clean_datasets/2014_22_mbs_cenus_combined_no_pivot.pkl")
)

#### Exporting SA3 & Names

In [29]:
# Build the distinct list of SA3 codes with their area names and persist it.
# Selecting with a list of columns already returns a new frame, and
# drop_duplicates() returns another, so no explicit copy / inplace step is
# needed. The original bare `sa_codes_names` expression in the middle of the
# cell was never displayed (only a cell's last expression is) and is removed.
sa_codes_names = df_mbs_census_combined_np_new[
    ["GeographicCode", "GeographicAreaName"]
].drop_duplicates()
sa_codes_names.to_pickle(
    os.path.join(path, "clean_datasets/geojson/sa3_name_mbs_list.pkl")
)