# Combine ERP & MBS Datasets

Estimated Resident Population data is missing from 2013-19 and 2019-21 datasets. The script checks and cleans respective SA3 datasets and is combined with corresponding MBS data. 

In [42]:
# import libraries
import pandas as pd
import numpy as np
import os

## 1.Load and Combine 2013-19 MBS and ERP dataset

#### 1.1 Import MBS and SA3 ERP files

In [43]:
# import the transformed mbs file and assign to a dataframe

# setup path to original dataset
path = r'/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/'

df_mbs_201319 = pd.read_csv(os.path.join(path, 'clean_datasets/mbs_data/2013-19_phc_mbs.csv'), encoding='ISO-8859-1', index_col=[0])
df_mbs_201319.head(3)

  df_mbs_201319 = pd.read_csv(os.path.join(path, 'clean_datasets/mbs_data/2013-19_phc_mbs.csv'), encoding='ISO-8859-1', index_col=[0])


Unnamed: 0,Year,StateTerritory,GeographicUnit,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,Medicare benefits per 100 people ($),No. of patients,No. of services,Percentage of people who had the service (%),Services per 100 people,Total Medicare benefits paid ($),Total provider fees ($)
0,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,2576.0,5624.0,10879.0,17.27,33.41,838549.0,1026474.0
1,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,4004.0,7714.0,15870.0,24.75,50.93,1247656.0,1600846.0
2,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,4672.0,8998.0,15754.0,41.32,72.35,1017264.0,1197133.0


In [44]:
df_mbs_201319.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172323 entries, 0 to 172322
Data columns (total 16 columns):
 #   Column                                        Non-Null Count   Dtype 
---  ------                                        --------------   ----- 
 0   Year                                          172323 non-null  object
 1   StateTerritory                                172323 non-null  object
 2   GeographicUnit                                172323 non-null  object
 3   GeographicCode                                172323 non-null  object
 4   GeographicAreaName                            172323 non-null  object
 5   GeographicGroup                               172323 non-null  object
 6   ServiceLevel                                  172323 non-null  object
 7   Service                                       172323 non-null  object
 8   DemographicGroup                              172323 non-null  object
 9   Medicare benefits per 100 people ($)          172308 non-nu

In [45]:
mixed_type_columns = df_mbs_201319.applymap(type).nunique() > 1
mixed_type_columns

Year                                            False
StateTerritory                                  False
GeographicUnit                                  False
GeographicCode                                   True
GeographicAreaName                              False
GeographicGroup                                 False
ServiceLevel                                    False
Service                                         False
DemographicGroup                                False
Medicare benefits per 100 people ($)             True
No. of patients                                  True
No. of services                                  True
Percentage of people who had the service (%)     True
Services per 100 people                          True
Total Medicare benefits paid ($)                 True
Total provider fees ($)                          True
dtype: bool

In [46]:
# import sa3 file containing estimated resident population for 2013-19

# import relevant columns for merging purposes
filter_cols = ['Year', 'GeographicCode', 'DemographicGroup', 'EstimatedResidentPopulation']
df_sa3_erp_1319 = pd.read_csv(os.path.join(path, 'original_datasets/mbs_data/phc-mbs-2013-2019/SA3_ERP_CSV.csv'), 
                              usecols=filter_cols, encoding='ISO-8859-1', index_col=None)
df_sa3_erp_1319.head(5)

Unnamed: 0,Year,GeographicCode,DemographicGroup,EstimatedResidentPopulation
0,2013-14,001NAT,0-14,4377926
1,2013-14,001NAT,0-24,7489910
2,2013-14,001NAT,0-64,19797751
3,2013-14,001NAT,15-24,3111984
4,2013-14,001NAT,25-44,6596790


#### 1.2 Compare SA3 geographical area list in mbs and sa3 datasets

In [47]:
df_sa3_erp_1319.nunique()

Year                               6
GeographicCode                   347
DemographicGroup                  10
EstimatedResidentPopulation    17390
dtype: int64

347 unique SA3 values. Expected the same for mbs data file.

In [48]:
df_mbs_201319['GeographicCode'] = df_mbs_201319['GeographicCode'].astype('str')
df_mbs_201319.nunique()

Year                                                 6
StateTerritory                                      10
GeographicUnit                                       1
GeographicCode                                     346
GeographicAreaName                                 346
GeographicGroup                                      7
ServiceLevel                                         3
Service                                             53
DemographicGroup                                     7
Medicare benefits per 100 people ($)             29547
No. of patients                                  35793
No. of services                                  58679
Percentage of people who had the service (%)      9802
Services per 100 people                          33996
Total Medicare benefits paid ($)                137047
Total provider fees ($)                         138572
dtype: int64

#### 1.3 Combine SA3 and MBS datasets

In [49]:
# left merge mbs data with SA3 to retrieve corresponding ERP
df_mbs_201319_sa3_combined = df_mbs_201319.merge(df_sa3_erp_1319, how='left', 
                                                 on=['Year', 'GeographicCode', 'DemographicGroup'], indicator=True)
df_mbs_201319_sa3_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172323 entries, 0 to 172322
Data columns (total 18 columns):
 #   Column                                        Non-Null Count   Dtype   
---  ------                                        --------------   -----   
 0   Year                                          172323 non-null  object  
 1   StateTerritory                                172323 non-null  object  
 2   GeographicUnit                                172323 non-null  object  
 3   GeographicCode                                172323 non-null  object  
 4   GeographicAreaName                            172323 non-null  object  
 5   GeographicGroup                               172323 non-null  object  
 6   ServiceLevel                                  172323 non-null  object  
 7   Service                                       172323 non-null  object  
 8   DemographicGroup                              172323 non-null  object  
 9   Medicare benefits per 100 people ($) 

In [50]:
df_mbs_201319_sa3_combined['_merge'].value_counts()

both          172323
left_only          0
right_only         0
Name: _merge, dtype: int64

In [51]:
df_mbs_201319_sa3_combined.head()

Unnamed: 0,Year,StateTerritory,GeographicUnit,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,Medicare benefits per 100 people ($),No. of patients,No. of services,Percentage of people who had the service (%),Services per 100 people,Total Medicare benefits paid ($),Total provider fees ($),EstimatedResidentPopulation,_merge
0,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,2576.0,5624.0,10879.0,17.27,33.41,838549.0,1026474.0,32558,both
1,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,4004.0,7714.0,15870.0,24.75,50.93,1247656.0,1600846.0,31163,both
2,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,4672.0,8998.0,15754.0,41.32,72.35,1017264.0,1197133.0,21774,both
3,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),65+,5819.0,6397.0,12316.0,55.07,106.01,675946.0,761837.0,11617,both
4,2013-14,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),All persons,3892.0,28733.0,54818.0,29.59,56.45,3779415.0,4586290.0,97112,both


## 2.Load and Combine 2019-21 MBS and ERP dataset

#### 2.1 Import MBS and SA3 ERP files

In [52]:
# import the transformed mbs file and assign to a dataframe

# setup path to original dataset
path = r'/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/'

df_mbs_201921 = pd.read_csv(os.path.join(path, 'clean_datasets/mbs_data/2019-21_phc_mbs.csv'), encoding='ISO-8859-1', index_col=[0])
df_mbs_201921.head(3)

Unnamed: 0,Year,StateTerritory,GeographicUnit,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,Medicare benefits per 100 people ($),No. of patients,No. of services,Percentage of people who had the service (%),Services per 100 people,Total Medicare benefits paid ($),Total provider fees ($)
0,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,3275.33,6314,14134,18.89,42.29,1094649,1612037
1,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,5036.77,8464,19537,27.12,62.6,1571825,2307710
2,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,5641.12,9429,18660,43.35,85.8,1226832,1605423


In [53]:
df_mbs_201921.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57263 entries, 0 to 57262
Data columns (total 16 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   Year                                          57263 non-null  object
 1   StateTerritory                                57263 non-null  object
 2   GeographicUnit                                57263 non-null  object
 3   GeographicCode                                57263 non-null  object
 4   GeographicAreaName                            57263 non-null  object
 5   GeographicGroup                               57263 non-null  object
 6   ServiceLevel                                  57263 non-null  object
 7   Service                                       57263 non-null  object
 8   DemographicGroup                              57263 non-null  object
 9   Medicare benefits per 100 people ($)          57263 non-null  object
 10

In [54]:
# import sa3 file containing estimated resident population for 2019-21

# import relevant columns for merging purposes
filter_cols = ['Year', 'GeographicCode', 'DemographicGroup', 'EstimatedResidentPopulation']
df_sa3_erp_1921 = pd.read_csv(os.path.join(path, 'original_datasets/mbs_data/phc-mbs-2019-2021/SA3_ERP_CSV_1920_2021.csv'), 
                              usecols=filter_cols, encoding='ISO-8859-1', index_col=None)
df_sa3_erp_1921.tail(5)

Unnamed: 0,Year,GeographicCode,DemographicGroup,EstimatedResidentPopulation
4839,2020-21,90104,45-64,574
4840,2020-21,90104,65+,482
4841,2020-21,90104,All persons,1734
4842,2020-21,90104,F,901
4843,2020-21,90104,M,833


#### 1.2 Compare SA3 geographical area list in mbs and sa3 datasets

In [55]:
df_sa3_erp_1921.nunique()

Year                              2
GeographicCode                  346
DemographicGroup                  7
EstimatedResidentPopulation    4642
dtype: int64

346 unique SA3 values. Expected the same for mbs data file.

In [56]:
df_mbs_201921['GeographicCode'] = df_mbs_201921['GeographicCode'].astype('str')
df_mbs_201921.nunique()

Year                                                2
StateTerritory                                     11
GeographicUnit                                      2
GeographicCode                                    346
GeographicAreaName                                346
GeographicGroup                                     8
ServiceLevel                                        3
Service                                            53
DemographicGroup                                    7
Medicare benefits per 100 people ($)            45311
No. of patients                                 21132
No. of services                                 29779
Percentage of people who had the service (%)     8618
Services per 100 people                         19058
Total Medicare benefits paid ($)                51273
Total provider fees ($)                         51548
dtype: int64

#### 2.3 Combine SA3 and MBS datasets

In [57]:
# left merge mbs data with SA3 to retrieve corresponding ERP
df_mbs_201921_sa3_combined = df_mbs_201921.merge(df_sa3_erp_1921, how='left', 
                                                 on=['Year', 'GeographicCode', 'DemographicGroup'], indicator=True)
df_mbs_201921_sa3_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57263 entries, 0 to 57262
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   Year                                          57263 non-null  object  
 1   StateTerritory                                57263 non-null  object  
 2   GeographicUnit                                57263 non-null  object  
 3   GeographicCode                                57263 non-null  object  
 4   GeographicAreaName                            57263 non-null  object  
 5   GeographicGroup                               57263 non-null  object  
 6   ServiceLevel                                  57263 non-null  object  
 7   Service                                       57263 non-null  object  
 8   DemographicGroup                              57263 non-null  object  
 9   Medicare benefits per 100 people ($)          5726

In [58]:
df_mbs_201921_sa3_combined['_merge'].value_counts()

both          57263
left_only         0
right_only        0
Name: _merge, dtype: int64

In [59]:
df_mbs_201921_sa3_combined.head()

Unnamed: 0,Year,StateTerritory,GeographicUnit,GeographicCode,GeographicAreaName,GeographicGroup,ServiceLevel,Service,DemographicGroup,Medicare benefits per 100 people ($),No. of patients,No. of services,Percentage of people who had the service (%),Services per 100 people,Total Medicare benefits paid ($),Total provider fees ($),EstimatedResidentPopulation,_merge
0,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),0-24,3275.33,6314,14134,18.89,42.29,1094649,1612037,33421,both
1,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),25-44,5036.77,8464,19537,27.12,62.6,1571825,2307710,31207,both
2,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),45-64,5641.12,9429,18660,43.35,85.8,1226832,1605423,21748,both
3,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),65+,7827.74,8973,19925,61.57,136.71,1140815,1421204,14574,both
4,2019-20,ACT,SA3,80101,Belconnen,Major cities - medium SES,Level 1,Allied Health attendances (total),All persons,4986.75,33180,72256,32.87,71.58,5034121,6946375,100950,both


## Combine MBS Datasets