# MBS Out of Pocket Cost Per Service By State & Year Dataset

Cleaning and transforming the dataset prior to combining it with the master dataset.

In [130]:
import pandas as pd
import os

## Import Dataset

In [131]:
# Root folder that holds the project's datasets.
# Set the MBS_DATASETS_DIR environment variable to run this notebook on another
# machine; when it is unset the original local folder below is used unchanged.
path = os.environ.get(
    "MBS_DATASETS_DIR",
    r"/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/",
)

# Load the raw out-of-pocket cost extract and print its column/dtype summary.
df_oop_service = pd.read_csv(
    os.path.join(path, "original_datasets/mbs_data/state_year_oop_cost.csv")
)
df_oop_service.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1815 entries, 0 to 1814
Data columns (total 4 columns):
 #   Column                                                                  Non-Null Count  Dtype 
---  ------                                                                  --------------  ----- 
 0   State                                                                   1815 non-null   object
 1   Financial Year                                                          1815 non-null   object
 2   Broad Type of Service                                                   1815 non-null   object
 3    Avg Patient Contribution Per Service: Out of Hospital Patient Billed   1815 non-null   object
dtypes: object(4)
memory usage: 56.8+ KB


## Cleaning Dataset

### Renaming Columns

In [132]:
# Renaming columns
# The raw header of the cost column is padded with a leading and trailing space.
# Trim all headers first so the mapping below does not depend on that incidental
# whitespace, then rename by the trimmed names. A new frame is bound to the same
# name instead of mutating in place, so the cell can be re-run safely.
df_oop_service = df_oop_service.rename(columns=str.strip).rename(
    columns={
        "Financial Year": "Year",
        "Broad Type of Service": "Service",
        "Avg Patient Contribution Per Service: Out of Hospital Patient Billed": "Patient_OOP_per_service",
    }
)

# rename() silently skips keys that do not match, so verify the result explicitly.
assert {"State", "Year", "Service", "Patient_OOP_per_service"} <= set(
    df_oop_service.columns
), "expected columns missing after rename"
df_oop_service

Unnamed: 0,State,Year,Service,Patient_OOP_per_service
0,Australia,2022-23,Total Medicare,74.28
1,NSW,2022-23,Total Medicare,81.37
2,Vic,2022-23,Total Medicare,74.64
3,Qld,2022-23,Total Medicare,71.12
4,SA,2022-23,Total Medicare,61.95
...,...,...,...,...
1810,WA,2009-10,Other MBS Services,55.37
1811,Tas,2009-10,Other MBS Services,50.16
1812,NT,2009-10,Other MBS Services,137.70
1813,ACT,2009-10,Other MBS Services,91.29


### Subsetting to Extract 2014-2022 Dataset

In [133]:
# Distinct financial-year labels present in the full dataset
pd.unique(df_oop_service["Year"])

array(['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18',
       '2016-17', '2015-16', '2014-15', '2013-14', '2012-13', '2011-12',
       '2010-11', '2009-10'], dtype=object)

In [134]:
# Distinct service categories present in the full dataset
pd.unique(df_oop_service["Service"])

array(['Total Medicare', 'Total GP Non-Referred Attendances',
       'Practice Nurse', 'Other Allied Health', 'Specialist Attendances',
       'Obstetrics', 'Anaesthetics', 'Total Pathology',
       'Diagnostic Imaging', 'Total Operations', 'Optometry',
       'Radiotherapy and Therapeutic Nuclear Medicine',
       'Other MBS Services'], dtype=object)

In [135]:
# Values to keep for each dimension of the subset.
years_to_keep = [
    "2021-22",
    "2020-21",
    "2019-20",
    "2018-19",
    "2017-18",
    "2016-17",
    "2015-16",
    "2014-15",
    "2013-14",
]
services_to_keep = [
    "Total GP Non-Referred Attendances",
    "Practice Nurse",
    "Other Allied Health",
    "Specialist Attendances",
    "Diagnostic Imaging",
]
# Only the eight states/territories; the national "Australia" rows are not listed
# here and are therefore dropped.
states_to_keep = ["NSW", "Vic", "Qld", "SA", "WA", "Tas", "NT", "ACT"]

# A row is kept only when its year, service and state are all in the lists above.
row_filter = (
    df_oop_service["Year"].isin(years_to_keep)
    & df_oop_service["Service"].isin(services_to_keep)
    & df_oop_service["State"].isin(states_to_keep)
)

# .copy() gives an independent frame so later column assignments do not touch
# (or warn about) the original.
df_oop_service_subset = df_oop_service.loc[row_filter].copy()
df_oop_service_subset

Unnamed: 0,State,Year,Service,Patient_OOP_per_service
141,NSW,2021-22,Total GP Non-Referred Attendances,41.29
142,Vic,2021-22,Total GP Non-Referred Attendances,43.05
143,Qld,2021-22,Total GP Non-Referred Attendances,43.18
144,SA,2021-22,Total GP Non-Referred Attendances,35.77
145,WA,2021-22,Total GP Non-Referred Attendances,42.65
...,...,...,...,...
1251,SA,2013-14,Diagnostic Imaging,86.53
1252,WA,2013-14,Diagnostic Imaging,103.68
1253,Tas,2013-14,Diagnostic Imaging,92.99
1254,NT,2013-14,Diagnostic Imaging,105.32


In [136]:
# Check: only the five selected services remain
pd.unique(df_oop_service_subset["Service"])

array(['Total GP Non-Referred Attendances', 'Practice Nurse',
       'Other Allied Health', 'Specialist Attendances',
       'Diagnostic Imaging'], dtype=object)

In [137]:
# Check: only the eight states/territories remain
pd.unique(df_oop_service_subset["State"])

array(['NSW', 'Vic', 'Qld', 'SA', 'WA', 'Tas', 'NT', 'ACT'], dtype=object)

In [138]:
# Check: only financial years 2013-14 through 2021-22 remain
pd.unique(df_oop_service_subset["Year"])

array(['2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17',
       '2015-16', '2014-15', '2013-14'], dtype=object)

### Standardisation of Values

#### Year

In [139]:
# Each financial-year label ("2021-22") is relabelled with the calendar year in
# which it ends ("2022"). Built for ending years 2022 down to 2014.
year_replacement = {
    f"{ending_year - 1}-{ending_year % 100}": str(ending_year)
    for ending_year in range(2022, 2013, -1)
}

financial_year_labels = df_oop_service_subset["Year"]
df_oop_service_subset["Year"] = financial_year_labels.replace(
    to_replace=year_replacement
)

In [140]:
# Check: year labels are now single four-digit years
pd.unique(df_oop_service_subset["Year"])

array(['2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015',
       '2014'], dtype=object)

#### Service Names

In [141]:
# (source service name, replacement service name) pairs
service_name_replacement = dict(
    [
        ("Total GP Non-Referred Attendances", "GP attendances (total)"),
        ("Practice Nurse", "Nursing and Aboriginal Health Workers (total)"),
        ("Other Allied Health", "Allied Health attendances (total)"),
        ("Specialist Attendances", "Specialist attendances (total)"),
        ("Diagnostic Imaging", "Diagnostic Imaging (total)"),
    ]
)

source_service_names = df_oop_service_subset["Service"]
df_oop_service_subset["Service"] = source_service_names.replace(
    to_replace=service_name_replacement
)

In [142]:
# Check: service names now use the replacement labels
pd.unique(df_oop_service_subset["Service"])

array(['GP attendances (total)',
       'Nursing and Aboriginal Health Workers (total)',
       'Allied Health attendances (total)',
       'Specialist attendances (total)', 'Diagnostic Imaging (total)'],
      dtype=object)

#### Updating Data Types

In [143]:
# Year holds digit strings after the relabelling step; store it as integers.
year_as_int = df_oop_service_subset["Year"].astype(int)
df_oop_service_subset["Year"] = year_as_int

In [144]:
# The cost column was read as text (object dtype); convert it to floats.
oop_cost_text = df_oop_service_subset["Patient_OOP_per_service"]
df_oop_service_subset["Patient_OOP_per_service"] = oop_cost_text.astype(float)

In [145]:
# Confirm the dtype conversions and non-null counts of the cleaned subset.
df_oop_service_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 360 entries, 141 to 1255
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   State                    360 non-null    object 
 1   Year                     360 non-null    int64  
 2   Service                  360 non-null    object 
 3   Patient_OOP_per_service  360 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 14.1+ KB


## Export to Pickle & CSV Files

In [146]:
# Export the cleaned out-of-pocket cost per service (by state and year) to CSV.
# index=False: the row labels are leftovers from the unfiltered source frame
# (141..1255) and carry no meaning; writing them would add an unnamed first
# column to the file.
df_oop_service_subset.to_csv(
    os.path.join(
        path, "clean_datasets/cleaned_csv/mbs_oop_service_state_year_clean.csv"
    ),
    index=False,
)

In [148]:
# Save the cleaned frame as a pickle alongside the other cleaned MBS data.
pickle_target = os.path.join(
    path, "clean_datasets/mbs_data/mbs_oop_service_state_year_clean.pkl"
)
df_oop_service_subset.to_pickle(pickle_target)