# Clean & Format Patient Experience Dataset

In [228]:
import pandas as pd
import numpy as np
import os
import matplotlib as mp
import seaborn as sns

In [229]:
# Root folder holding the project datasets. The default is the original
# machine-specific location; set the MBS_DATASETS_DIR environment variable to
# point the notebook at another copy of the data without editing this cell.
path = os.environ.get(
    "MBS_DATASETS_DIR",
    r"/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/",
)

# load the GP-services patient experience CSV, then show dtypes and non-null counts
df_px_gp_2018_23 = pd.read_csv(
    os.path.join(path, "clean_datasets/patient_experience/2023-2018_gp_services.csv")
)
df_px_gp_2018_23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             40 non-null     object 
 1   survey option                    40 non-null     object 
 2   15–24                            40 non-null     object 
 3   25–34                            40 non-null     object 
 4   35–44                            40 non-null     object 
 5   45–54                            40 non-null     float64
 6   55–64                            40 non-null     float64
 7   65–74                            40 non-null     object 
 8   75–84                            40 non-null     object 
 9   85 and over                      40 non-null     object 
 10  Males                            40 non-null     object 
 11  Females                          40 non-null     object 
 12  major cities of australi

In [230]:
# preview the first five rows of the freshly loaded frame
df_px_gp_2018_23.head(5)

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
0,2022-23,Did not need to see a GP,26.9,21.9,19.8,15.7,11.1,6.1,2.8,3.0,20.8,12.0,16.2,16.0,17.7
1,2022-23,Needed to see a GP,72.8,78.0,80.0,84.3,88.8,93.9,97.3,96.2,79.1,87.9,83.7,83.9,82
2,2022-23,Needed to but did not see a GP at all,2.3,2.5,1.7,1.5,0.9,0.6,0.4,0.0,1.8,1.2,1.4,1.8,1.4
3,2022-23,Needed to and saw a GP,97.5,97.5,98.3,98.3,99.2,99.6,99.4,99.9,98.1,98.8,98.6,98.1,#98.7
4,2022-23,At no time delayed seeing or did not see a GP ...,69.9,64.0,64.2,64.6,71.9,77.5,82.3,82.6,73.6,66.3,71.0,66.0,67


## Data Consistency & Cleaning

### Year

In [231]:
# number of rows recorded under each survey-year label
df_px_gp_2018_23["year"].value_counts()

2022-23    8
2021-22    8
2020-21    8
2019-20    8
2018-19    8
Name: year, dtype: int64

Map the survey years as shown below so that they are consistent with the MBS and census datasets:

- 2022-23 : 2023
- 2021-22 : 2022
-  2020-21 : 2021
- 2019-20 : 2020
- 2018-19 : 2019


In [232]:
# standardise the year field: each two-year survey label becomes its second year
year_replacement = dict(
    [
        ("2022-23", "2023"),
        ("2021-22", "2022"),
        ("2020-21", "2021"),
        ("2019-20", "2020"),
        ("2018-19", "2019"),
    ]
)

survey_year = df_px_gp_2018_23["year"]
df_px_gp_2018_23["year"] = survey_year.replace(to_replace=year_replacement)
df_px_gp_2018_23["year"].value_counts()

2023    8
2022    8
2021    8
2020    8
2019    8
Name: year, dtype: int64

In [233]:
# preview the first ten rows to inspect the replaced year labels
df_px_gp_2018_23.head(10)

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
0,2023,Did not need to see a GP,26.9,21.9,19.8,15.7,11.1,6.1,2.8,3,20.8,12,16.2,16.0,17.7
1,2023,Needed to see a GP,72.8,78.0,80.0,84.3,88.8,93.9,97.3,96.2,79.1,87.9,83.7,83.9,82
2,2023,Needed to but did not see a GP at all,2.3,2.5,1.7,1.5,0.9,0.6,0.4,0,1.8,1.2,1.4,1.8,1.4
3,2023,Needed to and saw a GP,97.5,97.5,98.3,98.3,99.2,99.6,99.4,99.9,98.1,98.8,98.6,98.1,#98.7
4,2023,At no time delayed seeing or did not see a GP ...,69.9,64.0,64.2,64.6,71.9,77.5,82.3,82.6,73.6,66.3,71.0,66.0,67
5,2023,At least once delayed seeing or did not see a ...,29.8,35.9,35.9,35.3,28.3,22.7,17.4,17.8,26.4,33.7,28.9,34.1,33.2
6,2023,At least once delayed seeing or did not see a ...,8.2,10.2,8.2,6.7,6.1,4.5,3.3,2.3,5.5,8.4,6.7,8.3,6.9
7,2023,At least once delayed seeing or did not see a ...,21.6,25.7,27.6,28.6,22.2,18.3,14.1,15.2,20.9,25.4,22.2,25.9,26.2
8,2022,Did not need to see a GP,27.1,20.3,18.6,15.6,11.2,5.4,2.7,3,#2.3,#2.2,15.3,16.1,17.8
9,2022,Needed to see a GP,72.9,79.7,81.5,84.6,88.8,94.4,97.1,#97.3,#97.1,97.9,84.7,83.9,82.2


In [234]:
# convert to int64 for consistency with census and mbs.
# "int64" is spelled out because plain "int" resolves to the platform C long,
# which is only 32 bits wide on Windows with NumPy < 2.
df_px_gp_2018_23["year"] = df_px_gp_2018_23["year"].astype("int64")
df_px_gp_2018_23.dtypes

year                                 int64
survey option                       object
15–24                               object
25–34                               object
35–44                               object
45–54                              float64
55–64                              float64
65–74                               object
75–84                               object
85 and over                         object
Males                               object
Females                             object
major cities of australia          float64
inner regional australia            object
outer regional remote australia     object
dtype: object

### Age Columns

#### Age 15-24

In [235]:
# frequency of each distinct raw value in the "15–24" column
df_px_gp_2018_23["15–24"].value_counts()

72       2
24.8     2
26.9     1
76.7     1
3        1
20.3     1
28.1     1
1.4      1
98.4     1
75       1
3.8      1
20.8     1
28.9     1
71.3     1
0.9      1
98.7     1
75.9     1
23.9     1
3.7      1
23.2     1
98.9     1
72.8     1
27.1     1
2.3      1
97.5     1
69.9     1
29.8     1
8.2      1
21.6     1
72.9     1
0.8      1
1        1
#99.0    1
70.1     1
29.9     1
5.3      1
28.2     1
20.4     1
Name: 15–24, dtype: int64

In [236]:
# rows whose "15–24" value contains "#" (only one)
flagged_15_24 = df_px_gp_2018_23["15–24"].str.contains("#")
df_px_gp_2018_23.loc[flagged_15_24]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
11,2022,Needed to and saw a GP,#99.0,98.9,#98.4,98.9,99.5,99.6,100,99.9,100,99.6,99.2,#98.8,98.8


This value contains a `#` before the percentage. The `#` indicates that the number has a high margin of error and should be used with caution. For the purpose of this project, the `#` will be removed and the data points will be converted from object to numeric.

In [237]:
# strip the "#" marker so the column can later be converted to float
age_15_24_raw = df_px_gp_2018_23["15–24"]
df_px_gp_2018_23["15–24"] = age_15_24_raw.str.replace("#", "", regex=False)

In [238]:
# convert to numeric; errors="raise" stops the cell on any value that cannot be parsed
age_15_24_numeric = pd.to_numeric(df_px_gp_2018_23["15–24"], errors="raise")
df_px_gp_2018_23["15–24"] = age_15_24_numeric
df_px_gp_2018_23["15–24"].value_counts(dropna=False)

72.0    2
24.8    2
26.9    1
76.7    1
3.0     1
20.3    1
28.1    1
1.4     1
98.4    1
75.0    1
3.8     1
20.8    1
28.9    1
71.3    1
0.9     1
98.7    1
75.9    1
23.9    1
3.7     1
23.2    1
98.9    1
72.8    1
27.1    1
2.3     1
97.5    1
69.9    1
29.8    1
8.2     1
21.6    1
72.9    1
0.8     1
1.0     1
99.0    1
70.1    1
29.9    1
5.3     1
28.2    1
20.4    1
Name: 15–24, dtype: int64

#### Age 25–34

In [239]:
# rows whose "25–34" value contains "#"
flagged_25_34 = df_px_gp_2018_23["25–34"].str.contains("#")
df_px_gp_2018_23.loc[flagged_25_34]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
35,2019,Needed to and saw a GP,98.7,#99.2,99,99.1,99.7,99.7,99.7,#99.0,99,99.6,99.4,99,#98.9


In [240]:
# strip the "#" marker so the column can later be converted to float
age_25_34_raw = df_px_gp_2018_23["25–34"]
df_px_gp_2018_23["25–34"] = age_25_34_raw.str.replace("#", "", regex=False)

In [241]:
# verify the replacement: an empty result means no "#" remains in "25–34"
still_flagged_25_34 = df_px_gp_2018_23["25–34"].str.contains("#")
df_px_gp_2018_23.loc[still_flagged_25_34]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia


In [242]:
# convert to numeric and raise on any value that cannot be parsed
age_25_34_numeric = pd.to_numeric(df_px_gp_2018_23["25–34"], errors="raise")
df_px_gp_2018_23["25–34"] = age_25_34_numeric
df_px_gp_2018_23["25–34"].value_counts(dropna=False)

26.5    2
5.0     2
21.9    1
68.3    1
3.6     1
22.9    1
20.5    1
79.5    1
0.7     1
99.3    1
31.7    1
78.0    1
20.9    1
79.0    1
0.9     1
99.2    1
72.6    1
27.4    1
73.5    1
98.6    1
1.4     1
78.3    1
2.5     1
97.5    1
64.0    1
35.9    1
10.2    1
25.7    1
20.3    1
79.7    1
1.1     1
98.9    1
67.3    1
32.9    1
4.4     1
28.6    1
21.8    1
22.5    1
Name: 25–34, dtype: int64

In [243]:
# re-check column dtypes and non-null counts after the conversions so far
df_px_gp_2018_23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             40 non-null     int64  
 1   survey option                    40 non-null     object 
 2   15–24                            40 non-null     float64
 3   25–34                            40 non-null     float64
 4   35–44                            40 non-null     object 
 5   45–54                            40 non-null     float64
 6   55–64                            40 non-null     float64
 7   65–74                            40 non-null     object 
 8   75–84                            40 non-null     object 
 9   85 and over                      40 non-null     object 
 10  Males                            40 non-null     object 
 11  Females                          40 non-null     object 
 12  major cities of australi

In [244]:
# frequency of each distinct raw value in the "35–44" column (missing values included)
df_px_gp_2018_23["35–44"].value_counts(dropna=False)

99       2
19.8     1
30.5     1
3.3      1
26       1
17.7     1
82.3     1
1        1
69.5     1
5.2      1
70.8     1
25.4     1
18       1
82.1     1
0.9      1
71.3     1
28.5     1
4.7      1
29.2     1
99.4     1
80       1
81.5     1
1.7      1
98.3     1
64.2     1
35.9     1
8.2      1
27.6     1
18.6     1
1.4      1
0.7      1
#98.4    1
63.9     1
36       1
4.2      1
31.8     1
19.9     1
79.9     1
24       1
Name: 35–44, dtype: int64

In [245]:
# rows whose "35–44" value contains "#"
flagged_35_44 = df_px_gp_2018_23["35–44"].str.contains("#")
df_px_gp_2018_23.loc[flagged_35_44]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
11,2022,Needed to and saw a GP,99.0,98.9,#98.4,98.9,99.5,99.6,100,99.9,100,99.6,99.2,#98.8,98.8


In [246]:
# drop the "#" marker, then convert the cleaned strings to float
age_35_44_no_hash = df_px_gp_2018_23["35–44"].str.replace("#", "", regex=False)
df_px_gp_2018_23["35–44"] = pd.to_numeric(age_35_44_no_hash, errors="raise")
df_px_gp_2018_23["35–44"].value_counts(dropna=False)

99.0    2
19.8    1
30.5    1
3.3     1
26.0    1
17.7    1
82.3    1
1.0     1
69.5    1
5.2     1
70.8    1
25.4    1
18.0    1
82.1    1
0.9     1
71.3    1
28.5    1
4.7     1
29.2    1
99.4    1
80.0    1
81.5    1
1.7     1
98.3    1
64.2    1
35.9    1
8.2     1
27.6    1
18.6    1
1.4     1
0.7     1
98.4    1
63.9    1
36.0    1
4.2     1
31.8    1
19.9    1
79.9    1
24.0    1
Name: 35–44, dtype: int64

In [247]:
# show the dtype of the "35–44" column
df_px_gp_2018_23["35–44"].dtype

dtype('float64')

#### Age 65–74

In [248]:
# rows whose "65–74" value contains "#"
flagged_65_74 = df_px_gp_2018_23["65–74"].str.contains("#")
df_px_gp_2018_23.loc[flagged_65_74]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
34,2019,Needed to but did not see a GP at all,0.9,0.9,0.9,0.9,0.5,#0.2,#0.2,0.8,1,0.4,0.6,1,1.2


In [249]:
# drop the "#" marker, then convert the cleaned strings to float
df_px_gp_2018_23["65–74"] = (
    df_px_gp_2018_23["65–74"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)
df_px_gp_2018_23["65–74"].value_counts(dropna=False)

99.6    2
0.2     2
6.1     1
1.4     1
15.1    1
6.2     1
93.6    1
99.9    1
82.3    1
17.8    1
16.3    1
16.2    1
6.4     1
93.8    1
99.7    1
85.7    1
14.3    1
1.0     1
1.1     1
83.7    1
93.9    1
99.5    1
0.6     1
77.5    1
22.7    1
4.5     1
18.3    1
5.4     1
94.4    1
0.5     1
80.5    1
19.5    1
1.6     1
17.9    1
6.3     1
93.7    1
0.4     1
13.3    1
Name: 65–74, dtype: int64

#### Age 75–84

In [250]:
# rows whose "75–84" value contains "#"
flagged_75_84 = df_px_gp_2018_23["75–84"].str.contains("#")
df_px_gp_2018_23.loc[flagged_75_84]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
10,2022,Needed to but did not see a GP at all,1.0,1.1,1.4,1.1,0.5,0.5,#0.1,0.0,0.0,0.0,0.8,1.0,1.2
26,2020,Needed to but did not see a GP at all,1.4,0.7,1.0,0.6,0.6,0.2,#0.1,0.0,0.9,0.5,0.6,0.9,0.9
34,2019,Needed to but did not see a GP at all,0.9,0.9,0.9,0.9,0.5,0.2,#0.2,0.8,1.0,0.4,0.6,1.0,1.2


In [251]:
# drop the "#" marker, then convert the cleaned strings to float
age_75_84_no_hash = df_px_gp_2018_23["75–84"].str.replace("#", "", regex=False)
df_px_gp_2018_23["75–84"] = pd.to_numeric(age_75_84_no_hash, errors="raise")
df_px_gp_2018_23["75–84"].value_counts(dropna=False)

2.3      2
99.7     2
0.3      2
10.2     2
0.1      2
0.2      2
2.7      2
97.3     2
87.5     1
12.2     1
97.7     1
12.0     1
90.0     1
0.9      1
10.6     1
89.6     1
99.9     1
2.8      1
97.4     1
15.6     1
0.8      1
16.5     1
83.8     1
100.0    1
97.1     1
14.1     1
3.3      1
17.4     1
82.3     1
99.4     1
0.4      1
9.5      1
Name: 75–84, dtype: int64

In [252]:
# re-check column dtypes and non-null counts after the conversions so far
df_px_gp_2018_23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             40 non-null     int64  
 1   survey option                    40 non-null     object 
 2   15–24                            40 non-null     float64
 3   25–34                            40 non-null     float64
 4   35–44                            40 non-null     float64
 5   45–54                            40 non-null     float64
 6   55–64                            40 non-null     float64
 7   65–74                            40 non-null     float64
 8   75–84                            40 non-null     float64
 9   85 and over                      40 non-null     object 
 10  Males                            40 non-null     object 
 11  Females                          40 non-null     object 
 12  major cities of australi

#### Age 85 and over

In [253]:
# shorten the "85 and over" label to "85+"; the result of rename() is
# reassigned rather than using inplace=True, so the step is an explicit
# transformation of the frame
df_px_gp_2018_23 = df_px_gp_2018_23.rename(columns={"85 and over": "85+"})
df_px_gp_2018_23.head()

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
0,2023,Did not need to see a GP,26.9,21.9,19.8,15.7,11.1,6.1,2.8,3.0,20.8,12.0,16.2,16.0,17.7
1,2023,Needed to see a GP,72.8,78.0,80.0,84.3,88.8,93.9,97.3,96.2,79.1,87.9,83.7,83.9,82
2,2023,Needed to but did not see a GP at all,2.3,2.5,1.7,1.5,0.9,0.6,0.4,0.0,1.8,1.2,1.4,1.8,1.4
3,2023,Needed to and saw a GP,97.5,97.5,98.3,98.3,99.2,99.6,99.4,99.9,98.1,98.8,98.6,98.1,#98.7
4,2023,At no time delayed seeing or did not see a GP ...,69.9,64.0,64.2,64.6,71.9,77.5,82.3,82.6,73.6,66.3,71.0,66.0,67


In [254]:
# rows whose "85+" value contains "#"
flagged_85_plus = df_px_gp_2018_23["85+"].str.contains("#")
df_px_gp_2018_23.loc[flagged_85_plus]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
9,2022,Needed to see a GP,72.9,79.7,81.5,84.6,88.8,94.4,97.1,#97.3,#97.1,97.9,84.7,83.9,82.2
14,2022,At least once delayed seeing or did not see a ...,5.3,4.4,4.2,4.0,3.3,1.6,0.8,#1.7,#1.9,3.3,3.1,4.6,5
22,2021,At least once delayed seeing or did not see a ...,3.0,3.6,3.3,2.3,2.0,1.1,0.3,#0.2,1.6,3.0,2.3,2.9,2.4
25,2020,Needed to see a GP,72.0,79.5,82.3,82.9,89.3,93.6,97.7,#98.7,79.3,88.1,83.7,84.8,82.3
30,2020,At least once delayed seeing or did not see a ...,3.8,5.0,5.2,4.6,3.6,1.4,0.3,#0.3,2.5,4.8,3.3,5.2,4.4
35,2019,Needed to and saw a GP,98.7,99.2,99.0,99.1,99.7,99.7,99.7,#99.0,99,99.6,99.4,99.0,#98.9
38,2019,At least once delayed seeing or did not see a ...,3.7,5.0,4.7,4.2,2.8,1.0,0.9,#0.4,2.7,4.0,3.2,3.9,4.2


In [255]:
# drop the "#" marker, then convert the cleaned strings to float
df_px_gp_2018_23["85+"] = (
    df_px_gp_2018_23["85+"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)
df_px_gp_2018_23["85+"].value_counts(dropna=False)

0.0     4
99.9    3
3.0     2
12.5    2
10.9    1
98.7    1
99.6    1
88.1    1
11.9    1
0.3     1
97.9    1
2.0     1
11.7    1
0.8     1
99.0    1
86.7    1
1.4     1
12.3    1
0.2     1
96.2    1
88.9    1
94.9    1
4.4     1
13.8    1
1.7     1
15.7    1
84.6    1
97.3    1
15.2    1
2.3     1
17.8    1
82.6    1
0.4     1
Name: 85+, dtype: int64

### Gender

#### Males

In [256]:
# rows whose "Males" value contains "#"
flagged_males = df_px_gp_2018_23["Males"].str.contains("#")
df_px_gp_2018_23.loc[flagged_males]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
8,2022,Did not need to see a GP,27.1,20.3,18.6,15.6,11.2,5.4,2.7,3.0,#2.3,#2.2,15.3,16.1,17.8
9,2022,Needed to see a GP,72.9,79.7,81.5,84.6,88.8,94.4,97.1,97.3,#97.1,97.9,84.7,83.9,82.2
14,2022,At least once delayed seeing or did not see a ...,5.3,4.4,4.2,4.0,3.3,1.6,0.8,1.7,#1.9,3.3,3.1,4.6,5.0


In [257]:
# drop the "#" marker, then convert the cleaned strings to float
males_no_hash = df_px_gp_2018_23["Males"].str.replace("#", "", regex=False)
df_px_gp_2018_23["Males"] = pd.to_numeric(males_no_hash, errors="raise")
df_px_gp_2018_23["Males"].value_counts(dropna=False)

99.2     2
21.9     2
2.5      1
1.6      1
17.4     1
20.7     1
79.3     1
0.9      1
78.1     1
19.4     1
81.0     1
21.5     1
78.4     1
1.0      1
99.0     1
79.5     1
20.6     1
2.7      1
19.0     1
20.8     1
79.1     1
0.8      1
1.8      1
98.1     1
73.6     1
26.4     1
5.5      1
20.9     1
2.3      1
97.1     1
0.0      1
100.0    1
88.4     1
12.1     1
1.9      1
11.8     1
78.0     1
17.9     1
Name: Males, dtype: int64

#### Females

In [258]:
# rows whose "Females" value contains "#"
flagged_females = df_px_gp_2018_23["Females"].str.contains("#")
df_px_gp_2018_23.loc[flagged_females]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,Males,Females,major cities of australia,inner regional australia,outer regional remote australia
8,2022,Did not need to see a GP,27.1,20.3,18.6,15.6,11.2,5.4,2.7,3.0,2.3,#2.2,15.3,16.1,17.8


In [259]:
# drop the "#" marker, then convert the cleaned strings to float
df_px_gp_2018_23["Females"] = (
    df_px_gp_2018_23["Females"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)
df_px_gp_2018_23["Females"].value_counts(dropna=False)

88.1    2
99.6    2
12.0    1
71.1    1
3.0     1
23.4    1
11.9    1
0.5     1
99.5    1
29.0    1
73.6    1
4.8     1
24.1    1
11.8    1
0.4     1
75.2    1
24.8    1
4.0     1
26.4    1
99.3    1
87.9    1
0.7     1
1.2     1
98.8    1
66.3    1
33.7    1
8.4     1
25.4    1
2.2     1
97.9    1
0.0     1
83.1    1
17.9    1
3.3     1
15.8    1
12.2    1
87.8    1
20.8    1
Name: Females, dtype: int64

### Region Type

#### Major Cities of Australia

In [260]:
# list the current column labels to get the exact spelling of the region columns
df_px_gp_2018_23.columns

Index(['year', 'survey option', '15–24', '25–34', '35–44', '45–54', '55–64',
       '65–74', '75–84', '85+', 'Males', 'Females',
       'major cities of australia', 'inner regional australia',
       'outer regional remote australia'],
      dtype='object')

In [261]:
# rename the column to a shorter label without spaces; the result of rename()
# is reassigned rather than using inplace=True, so the step is an explicit
# transformation of the frame
df_px_gp_2018_23 = df_px_gp_2018_23.rename(
    columns={"major cities of australia": "major_cities"}
)

In [262]:
# re-check column labels, dtypes and non-null counts
df_px_gp_2018_23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             40 non-null     int64  
 1   survey option                    40 non-null     object 
 2   15–24                            40 non-null     float64
 3   25–34                            40 non-null     float64
 4   35–44                            40 non-null     float64
 5   45–54                            40 non-null     float64
 6   55–64                            40 non-null     float64
 7   65–74                            40 non-null     float64
 8   75–84                            40 non-null     float64
 9   85+                              40 non-null     float64
 10  Males                            40 non-null     float64
 11  Females                          40 non-null     float64
 12  major_cities            

#### Inner Regional Australia 

In [263]:
# rename the column to a shorter label without spaces; the result of rename()
# is reassigned rather than using inplace=True, so the step is an explicit
# transformation of the frame
df_px_gp_2018_23 = df_px_gp_2018_23.rename(
    columns={"inner regional australia": "inner_regional"}
)

In [264]:
# list the column labels to check the current names
df_px_gp_2018_23.columns

Index(['year', 'survey option', '15–24', '25–34', '35–44', '45–54', '55–64',
       '65–74', '75–84', '85+', 'Males', 'Females', 'major_cities',
       'inner_regional', 'outer regional remote australia'],
      dtype='object')

In [265]:
# rows whose "inner_regional" value contains "#"
flagged_inner_regional = df_px_gp_2018_23["inner_regional"].str.contains("#")
df_px_gp_2018_23.loc[flagged_inner_regional]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,Males,Females,major_cities,inner_regional,outer regional remote australia
11,2022,Needed to and saw a GP,99.0,98.9,98.4,98.9,99.5,99.6,100.0,99.9,100.0,99.6,99.2,#98.8,98.8


In [266]:
# drop the "#" marker, then convert the cleaned strings to float
inner_regional_no_hash = df_px_gp_2018_23["inner_regional"].str.replace(
    "#", "", regex=False
)
df_px_gp_2018_23["inner_regional"] = pd.to_numeric(
    inner_regional_no_hash, errors="raise"
)
df_px_gp_2018_23["inner_regional"].value_counts(dropna=False)

1.0     3
99.0    2
83.9    2
24.8    1
75.1    1
84.6    1
15.4    1
22.4    1
5.2     1
2.9     1
27.6    1
72.4    1
99.2    1
0.9     1
84.8    1
15.1    1
21.7    1
3.9     1
16.0    1
75.4    1
24.6    1
83.7    1
16.2    1
24.2    1
4.6     1
28.9    1
71.2    1
98.8    1
16.1    1
25.9    1
8.3     1
34.1    1
66.0    1
98.1    1
1.8     1
20.9    1
Name: inner_regional, dtype: int64

#### Outer Regional Remote Australia

In [267]:
# list the current column labels
df_px_gp_2018_23.columns

Index(['year', 'survey option', '15–24', '25–34', '35–44', '45–54', '55–64',
       '65–74', '75–84', '85+', 'Males', 'Females', 'major_cities',
       'inner_regional', 'outer regional remote australia'],
      dtype='object')

In [268]:
# rename the column to a shorter label without spaces; the result of rename()
# is reassigned rather than using inplace=True, so the step is an explicit
# transformation of the frame
df_px_gp_2018_23 = df_px_gp_2018_23.rename(
    columns={"outer regional remote australia": "outer_regional_remote"}
)

In [269]:
# list the column labels to check the current names
df_px_gp_2018_23.columns

Index(['year', 'survey option', '15–24', '25–34', '35–44', '45–54', '55–64',
       '65–74', '75–84', '85+', 'Males', 'Females', 'major_cities',
       'inner_regional', 'outer_regional_remote'],
      dtype='object')

In [270]:
# rows whose "outer_regional_remote" value contains "#"
flagged_outer_regional = df_px_gp_2018_23["outer_regional_remote"].str.contains("#")
df_px_gp_2018_23.loc[flagged_outer_regional]

Unnamed: 0,year,survey option,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,Males,Females,major_cities,inner_regional,outer_regional_remote
3,2023,Needed to and saw a GP,97.5,97.5,98.3,98.3,99.2,99.6,99.4,99.9,98.1,98.8,98.6,98.1,#98.7
27,2020,Needed to and saw a GP,98.4,99.3,99.0,99.4,99.5,99.9,99.7,99.6,99.2,99.5,99.4,99.2,#99.0
35,2019,Needed to and saw a GP,98.7,99.2,99.0,99.1,99.7,99.7,99.7,99.0,99.0,99.6,99.4,99.0,#98.9


In [271]:
# drop the "#" marker, then convert the cleaned strings to float
df_px_gp_2018_23["outer_regional_remote"] = (
    df_px_gp_2018_23["outer_regional_remote"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)
df_px_gp_2018_23["outer_regional_remote"].value_counts(dropna=False)

17.7    2
1.2     2
99.0    2
25.0    1
2.4     1
22.7    1
82.3    1
0.9     1
73.3    1
26.7    1
4.4     1
22.2    1
19.2    1
80.8    1
98.9    1
78.6    1
21.3    1
4.2     1
75.0    1
1.1     1
82.0    1
17.8    1
1.4     1
98.7    1
67.0    1
33.2    1
6.9     1
26.2    1
82.2    1
81.0    1
98.8    1
69.6    1
30.2    1
5.0     1
25.4    1
18.9    1
17.3    1
Name: outer_regional_remote, dtype: int64

### Export to Pickle File

In [272]:
# write the cleaned frame to a pickle file in the same folder as the source CSV
cleaned_pickle_file = os.path.join(
    path, "clean_datasets/patient_experience/2023-2018_gp_services_cleaned.pkl"
)
df_px_gp_2018_23.to_pickle(cleaned_pickle_file)