# Clean & Format Patient Experience After Hours GP Services Dataset

This dataset contains survey data on patients' experience with after-hours GP services. All values are percentages of survey respondents who gave each answer.

In [11]:
import pandas as pd
import os

In [12]:
# Import the GP after-hours service survey data.
# The dataset root can be overridden with the MBS_DATASETS_DIR environment
# variable so the notebook can run on another machine; when the variable is
# unset, the original local directory is used unchanged.
# NOTE: "Anaylst" is the spelling used in this default path string; it is
# kept as-is because it is a runtime path, not prose.
path = os.environ.get(
    "MBS_DATASETS_DIR",
    r"/Users/patel/Documents/CF-Data Anaylst Course/portfolio_projects/mbs_analysis/datasets/",
)

df_px_gp_ah_2018_23 = pd.read_csv(
    os.path.join(
        path, "clean_datasets/patient_experience/2022-23_gp_after_hours_services.csv"
    )
)
# Column names, non-null counts and dtypes of the loaded frame.
df_px_gp_ah_2018_23.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             40 non-null     object 
 1   survey option                    40 non-null     object 
 2   Male                             40 non-null     float64
 3   Female                           40 non-null     float64
 4   15–24                            40 non-null     object 
 5   25–34                            40 non-null     object 
 6   35–44                            40 non-null     float64
 7   45–54                            40 non-null     float64
 8   55–64                            40 non-null     object 
 9   65–74                            40 non-null     object 
 10  75–84                            40 non-null     object 
 11  85 and over                      40 non-null     object 
 12  major cities of australi

## Data Consistency & Cleaning

### Year

In [13]:
# How many rows each financial-year label has before standardising
df_px_gp_ah_2018_23.loc[:, "year"].value_counts()

2022-23    8
2021-22    8
2020-21    8
2019-20    8
2018-19    8
Name: year, dtype: int64

Map the financial-year labels to single calendar years as shown below, so that the years are consistent with the MBS and census datasets:

- 2022-23 : 2023
- 2021-22 : 2022
- 2020-21 : 2021
- 2019-20 : 2020
- 2018-19 : 2019


In [14]:
# Standardise the year field: each financial-year label "YYYY-YY" is mapped to
# the calendar year in which it ends, e.g. "2022-23" -> "2023".
# Built from the rule rather than typed out; covers 2022-23 down to 2018-19.
year_replacement = {
    f"{start}-{str(start + 1)[-2:]}": str(start + 1)
    for start in range(2022, 2017, -1)
}

df_px_gp_ah_2018_23["year"] = df_px_gp_ah_2018_23["year"].replace(
    to_replace=year_replacement
)
df_px_gp_ah_2018_23.loc[:, "year"].value_counts()

2023    8
2022    8
2021    8
2020    8
2019    8
Name: year, dtype: int64

In [15]:
# Preview the first ten rows after the year standardisation
df_px_gp_ah_2018_23.iloc[:10]

Unnamed: 0,year,survey option,Male,Female,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,major cities of australia,inner regional australia,outer regional remote australia
0,2023,Did not need to see an after hours GP,92.9,89.5,92.1,88.3,89.7,91.1,92.5,93.6,93.3,93.5,90.9,91.5,92.8
1,2023,Needed to see an after hours GP,7.1,10.5,8.0,11.8,10.3,9.1,7.4,6.5,6.7,6,9.1,8.6,7.2
2,2023,Needed to but did not see an after hours GP at...,36.4,38.6,30.1,37.7,35.4,36.7,39.8,46.6,44.6,#48.8,34.7,46.4,48.9
3,2023,Needed to and saw an after hours GP,63.5,61.1,66.4,61.6,64.8,63.3,60.8,53.9,52.6,70.1,65.6,52.6,#51.8
4,2023,Always saw an after hours GP when needed,53.9,52.4,54.9,50.7,57.0,51.4,56.1,48.4,51.1,#45.3,56.3,43.9,44.3
5,2023,At least once did not see an after hours GP wh...,45.2,47.7,42.2,49.5,43.4,47.4,46.5,48.7,51.9,#57.1,43.8,55.8,57.1
6,2023,At least once did not see an after hours GP wh...,3.4,4.8,7.3,6.8,2.2,4.4,2.0,#2.7,#1.3,0,4.2,5.3,4.6
7,2023,At least once did not see an after hours GP wh...,41.6,42.5,37.5,42.6,40.9,43.1,44.0,47.6,49.8,#59.8,39.2,49.1,55
8,2022,Did not need to see an after hours GP,93.4,91.2,92.3,90.1,91.4,91.2,93.0,94.8,95.8,95,91.7,93.5,94
9,2022,Needed to see an after hours GP,6.6,8.8,7.7,9.8,8.6,8.7,7.0,5.2,4.2,5.9,8.3,6.5,6.1


In [16]:
# Convert to int64 for consistency with census and mbs.
# "int64" is spelled out on purpose: a bare "int" follows the platform's
# default integer width, which may not be 64-bit on every platform/pandas
# version, whereas "int64" always yields the dtype this comment promises.
df_px_gp_ah_2018_23["year"] = df_px_gp_ah_2018_23["year"].astype("int64")
# Display the resulting dtype as confirmation.
df_px_gp_ah_2018_23["year"].dtype

dtype('int64')

### Gender

In [17]:
# Gender columns become lower-case plurals; the frame is rebound to the
# renamed copy rather than mutated in place.
df_px_gp_ah_2018_23 = df_px_gp_ah_2018_23.rename(
    columns={"Male": "males", "Female": "females"}
)
df_px_gp_ah_2018_23.columns

Index(['year', 'survey option', 'males', 'females', '15–24', '25–34', '35–44',
       '45–54', '55–64', '65–74', '75–84', '85 and over',
       'major cities of australia', 'inner regional australia',
       'outer regional remote australia'],
      dtype='object')

### Age Columns

Some values in the age columns are prefixed with `#`, indicating that the estimate has a high margin of error. For analysis purposes, the `#` marker is removed and the numeric value is kept.

#### Age 15–24 

In [18]:
# Rows whose 15–24 value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["15–24"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,major cities of australia,inner regional australia,outer regional remote australia
10,2022,Needed to but did not see an after hours GP at...,25.6,31.3,#33.5,32.1,24.5,25.8,29.5,27.2,#32.8,#37.6,24.1,43.9,#45.7
13,2022,At least once did not see an after hours GP wh...,32.7,41.4,#40.9,39.7,35.6,37.5,34.9,35.8,#38.1,#67.9,33.5,52.1,#55.6
15,2022,At least once did not see an after hours GP wh...,30.4,38.2,#34.6,36.5,32.4,35.1,30.9,33.0,#36.2,#59.5,30.8,47.8,#44.1
22,2021,At least once did not see an after hours GP wh...,1.2,2.2,#3.4,#0.6,1.7,2.5,#0.7,0.0,#4.7,0,1.4,3.4,#1.2
30,2020,At least once did not see an after hours GP wh...,2.1,2.8,#1.3,2.6,2.8,2.8,#0.6,2.8,2.7,0,2.2,3.1,4.5


In [19]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["15–24"] = (
    df_px_gp_ah_2018_23["15–24"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [20]:
# Frequency table of the cleaned 15–24 values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "15–24"].value_counts(dropna=False)

92.1    1
8.0     1
3.4     1
15.1    1
91.1    1
9.1     1
25.3    1
74.0    1
66.6    1
34.1    1
1.3     1
31.9    1
89.4    1
10.7    1
19.8    1
80.6    1
73.2    1
28.2    1
2.2     1
21.4    1
81.7    1
85.7    1
7.7     1
30.1    1
66.4    1
54.9    1
42.2    1
7.3     1
37.5    1
92.3    1
33.5    1
18.7    1
68.3    1
57.8    1
40.9    1
7.6     1
34.6    1
93.4    1
6.5     1
24.3    1
Name: 15–24, dtype: int64

#### Age 25–34

In [21]:
# Rows whose 25–34 value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["25–34"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,major cities of australia,inner regional australia,outer regional remote australia
22,2021,At least once did not see an after hours GP wh...,1.2,2.2,3.4,#0.6,1.7,2.5,#0.7,0,#4.7,0,1.4,3.4,#1.2


In [22]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["25–34"] = (
    df_px_gp_ah_2018_23["25–34"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [23]:
# Frequency table of the cleaned 25–34 values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "25–34"].value_counts(dropna=False)

61.6    2
88.3    1
2.6     1
24.8    1
87.9    1
12.1    1
18.6    1
80.4    1
70.8    1
29.2    1
26.2    1
24.6    1
88.1    1
12.0    1
14.8    1
84.4    1
72.3    1
27.0    1
1.7     1
0.6     1
74.7    1
11.8    1
83.9    1
37.7    1
50.7    1
49.5    1
6.8     1
42.6    1
90.1    1
9.8     1
32.1    1
69.3    1
39.7    1
2.1     1
36.5    1
91.7    1
8.4     1
16.5    1
25.0    1
Name: 25–34, dtype: int64

#### Age 55–64

In [24]:
# Rows whose 55–64 value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["55–64"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,major cities of australia,inner regional australia,outer regional remote australia
14,2022,At least once did not see an after hours GP wh...,2.1,3.6,7.6,2.1,3.4,3.8,#1.4,#1.4,0,#12.7,2.6,3.1,#8.8
22,2021,At least once did not see an after hours GP wh...,1.2,2.2,3.4,0.6,1.7,2.5,#0.7,0,#4.7,0,1.4,3.4,#1.2
30,2020,At least once did not see an after hours GP wh...,2.1,2.8,1.3,2.6,2.8,2.8,#0.6,2.8,2.7,0,2.2,3.1,4.5


In [25]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["55–64"] = (
    df_px_gp_ah_2018_23["55–64"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [26]:
# Frequency table of the cleaned 55–64 values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "55–64"].value_counts(dropna=False)

7.0     2
93.0    2
70.4    2
63.2    1
0.7     1
27.7    1
91.5    1
8.5     1
26.7    1
73.0    1
92.5    1
37.3    1
0.6     1
35.8    1
18.0    1
81.3    1
29.0    1
2.1     1
28.9    1
25.9    1
74.2    1
29.5    1
39.8    1
60.8    1
56.1    1
46.5    1
2.0     1
44.0    1
70.7    1
7.4     1
66.2    1
34.9    1
1.4     1
30.9    1
95.1    1
5.0     1
26.8    1
Name: 55–64, dtype: int64

#### Age 65–74

In [27]:
# Rows whose 65–74 value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["65–74"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,major cities of australia,inner regional australia,outer regional remote australia
6,2023,At least once did not see an after hours GP wh...,3.4,4.8,7.3,6.8,2.2,4.4,2.0,#2.7,#1.3,0,4.2,5.3,4.6
14,2022,At least once did not see an after hours GP wh...,2.1,3.6,7.6,2.1,3.4,3.8,1.4,#1.4,0,#12.7,2.6,3.1,#8.8
38,2019,At least once did not see an after hours GP wh...,1.5,2.3,2.2,1.7,2.3,1.7,2.1,#1.3,0,0,1.7,4.4,2.3


In [28]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["65–74"] = (
    df_px_gp_ah_2018_23["65–74"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [29]:
# Frequency table of the cleaned 65–74 values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "65–74"].value_counts(dropna=False)

34.6    2
27.2    2
94.8    2
0.0     1
5.3     1
72.4    1
69.6    1
28.5    1
2.8     1
26.3    1
93.6    1
68.8    1
5.1     1
25.1    1
70.9    1
61.3    1
37.7    1
1.3     1
95.0    1
27.7    1
72.3    1
5.2     1
46.6    1
53.9    1
48.4    1
48.7    1
2.7     1
47.6    1
72.9    1
6.5     1
65.0    1
35.8    1
1.4     1
33.0    1
96.4    1
3.6     1
33.1    1
Name: 65–74, dtype: int64

#### Age 75–84

In [30]:
# Rows whose 75–84 value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["75–84"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85 and over,major cities of australia,inner regional australia,outer regional remote australia
6,2023,At least once did not see an after hours GP wh...,3.4,4.8,7.3,6.8,2.2,4.4,2.0,2.7,#1.3,0,4.2,5.3,4.6
10,2022,Needed to but did not see an after hours GP at...,25.6,31.3,33.5,32.1,24.5,25.8,29.5,27.2,#32.8,#37.6,24.1,43.9,#45.7
11,2022,Needed to and saw an after hours GP,75.0,68.9,68.3,69.3,76.4,75.6,70.7,72.9,#63.3,#41.8,75.8,#58.0,#55.1
12,2022,Always saw an after hours GP when needed,68.1,58.3,57.8,61.6,65.4,63.2,66.2,65.0,#59.9,#44.7,66.3,48.7,#43.7
13,2022,At least once did not see an after hours GP wh...,32.7,41.4,40.9,39.7,35.6,37.5,34.9,35.8,#38.1,#67.9,33.5,52.1,#55.6
15,2022,At least once did not see an after hours GP wh...,30.4,38.2,34.6,36.5,32.4,35.1,30.9,33.0,#36.2,#59.5,30.8,47.8,#44.1
22,2021,At least once did not see an after hours GP wh...,1.2,2.2,3.4,0.6,1.7,2.5,0.7,0.0,#4.7,0,1.4,3.4,#1.2
27,2020,Needed to and saw an after hours GP,81.3,73.6,74.0,80.4,78.3,71.9,73.0,72.4,#75.2,#82.7,80.2,62.7,64.4
28,2020,Always saw an after hours GP when needed,73.0,63.8,66.6,70.8,69.5,60.5,63.2,69.6,#68.6,#82.7,70.2,55.7,57.9
35,2019,Needed to and saw an after hours GP,79.4,80.7,80.6,84.4,80.6,76.5,81.3,70.9,#74.1,#67.8,83.9,69.8,59.2


In [31]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["75–84"] = (
    df_px_gp_ah_2018_23["75–84"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [32]:
# Frequency table of the cleaned 75–84 values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "75–84"].value_counts(dropna=False)

25.9    2
0.0     2
68.6    1
4.7     1
18.3    1
93.4    1
6.6     1
21.6    1
75.2    1
28.2    1
78.6    1
2.7     1
26.9    1
95.6    1
4.3     1
27.1    1
74.1    1
73.1    1
21.9    1
93.3    1
6.7     1
19.9    1
44.6    1
52.6    1
51.1    1
51.9    1
1.3     1
49.8    1
95.8    1
4.2     1
32.8    1
63.3    1
59.9    1
38.1    1
36.2    1
96.4    1
3.4     1
79.7    1
Name: 75–84, dtype: int64

#### Age 85 and over

In [33]:
# Shorten "85 and over" to "85+"; the frame is rebound to the renamed copy
# rather than mutated in place.
df_px_gp_ah_2018_23 = df_px_gp_ah_2018_23.rename(columns={"85 and over": "85+"})

In [34]:
# Rows whose 85+ value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["85+"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,major cities of australia,inner regional australia,outer regional remote australia
2,2023,Needed to but did not see an after hours GP at...,36.4,38.6,30.1,37.7,35.4,36.7,39.8,46.6,44.6,#48.8,34.7,46.4,48.9
4,2023,Always saw an after hours GP when needed,53.9,52.4,54.9,50.7,57.0,51.4,56.1,48.4,51.1,#45.3,56.3,43.9,44.3
5,2023,At least once did not see an after hours GP wh...,45.2,47.7,42.2,49.5,43.4,47.4,46.5,48.7,51.9,#57.1,43.8,55.8,57.1
7,2023,At least once did not see an after hours GP wh...,41.6,42.5,37.5,42.6,40.9,43.1,44.0,47.6,49.8,#59.8,39.2,49.1,55
10,2022,Needed to but did not see an after hours GP at...,25.6,31.3,33.5,32.1,24.5,25.8,29.5,27.2,32.8,#37.6,24.1,43.9,#45.7
11,2022,Needed to and saw an after hours GP,75.0,68.9,68.3,69.3,76.4,75.6,70.7,72.9,63.3,#41.8,75.8,#58.0,#55.1
12,2022,Always saw an after hours GP when needed,68.1,58.3,57.8,61.6,65.4,63.2,66.2,65.0,59.9,#44.7,66.3,48.7,#43.7
13,2022,At least once did not see an after hours GP wh...,32.7,41.4,40.9,39.7,35.6,37.5,34.9,35.8,38.1,#67.9,33.5,52.1,#55.6
14,2022,At least once did not see an after hours GP wh...,2.1,3.6,7.6,2.1,3.4,3.8,1.4,1.4,0.0,#12.7,2.6,3.1,#8.8
15,2022,At least once did not see an after hours GP wh...,30.4,38.2,34.6,36.5,32.4,35.1,30.9,33.0,36.2,#59.5,30.8,47.8,#44.1


In [35]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["85+"] = (
    df_px_gp_ah_2018_23["85+"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [36]:
# Frequency table of the cleaned 85+ values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "85+"].value_counts(dropna=False)

0.0     4
25.3    2
82.7    2
37.4    2
13.7    2
68.3    1
64.7    1
94.3    1
6.2     1
15.3    1
3.7     1
90.7    1
9.5     1
26.2    1
67.8    1
69.3    1
27.3    1
93.5    1
6.0     1
59.5    1
12.7    1
67.9    1
44.7    1
41.8    1
37.6    1
5.9     1
95.0    1
59.8    1
57.1    1
45.3    1
70.1    1
48.8    1
95.8    1
Name: 85+, dtype: int64

### Region Type

#### Major Cities in Australia

In [37]:
# Give the major-cities column a short snake_case name; the frame is rebound
# to the renamed copy rather than mutated in place.
df_px_gp_ah_2018_23 = df_px_gp_ah_2018_23.rename(
    columns={"major cities of australia": "major_cities"}
)

#### Inner Regional in Australia

In [38]:
# Give the inner-regional column a short snake_case name; the frame is rebound
# to the renamed copy rather than mutated in place.
df_px_gp_ah_2018_23 = df_px_gp_ah_2018_23.rename(
    columns={"inner regional australia": "inner_regional"}
)

In [39]:
# Rows whose inner_regional value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[df_px_gp_ah_2018_23["inner_regional"].str.contains("#")]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,major_cities,inner_regional,outer regional remote australia
11,2022,Needed to and saw an after hours GP,75.0,68.9,68.3,69.3,76.4,75.6,70.7,72.9,63.3,41.8,75.8,#58.0,#55.1


In [40]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["inner_regional"] = (
    df_px_gp_ah_2018_23["inner_regional"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [41]:
# Frequency table of the cleaned inner_regional values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "inner_regional"].value_counts(dropna=False)

43.9    2
3.1     2
91.5    1
43.7    1
39.0    1
92.2    1
7.7     1
37.5    1
62.7    1
55.7    1
40.8    1
42.1    1
92.1    1
7.9     1
30.3    1
69.8    1
60.4    1
39.8    1
4.4     1
3.4     1
59.9    1
8.6     1
6.5     1
46.4    1
52.6    1
55.8    1
5.3     1
49.1    1
93.5    1
58.0    1
65.0    1
48.7    1
52.1    1
47.8    1
94.6    1
5.4     1
36.1    1
34.3    1
Name: inner_regional, dtype: int64

#### Outer Regional Remote Australia

In [42]:
# Give the outer-regional/remote column a short snake_case name; the frame is
# rebound to the renamed copy rather than mutated in place.
df_px_gp_ah_2018_23 = df_px_gp_ah_2018_23.rename(
    columns={"outer regional remote australia": "outer_regional_remote"}
)

In [43]:
# Rows whose outer_regional_remote value carries the "#" (high margin of error) marker
df_px_gp_ah_2018_23.loc[
    df_px_gp_ah_2018_23["outer_regional_remote"].str.contains("#")
]

Unnamed: 0,year,survey option,males,females,15–24,25–34,35–44,45–54,55–64,65–74,75–84,85+,major_cities,inner_regional,outer_regional_remote
3,2023,Needed to and saw an after hours GP,63.5,61.1,66.4,61.6,64.8,63.3,60.8,53.9,52.6,70.1,65.6,52.6,#51.8
10,2022,Needed to but did not see an after hours GP at...,25.6,31.3,33.5,32.1,24.5,25.8,29.5,27.2,32.8,37.6,24.1,43.9,#45.7
11,2022,Needed to and saw an after hours GP,75.0,68.9,68.3,69.3,76.4,75.6,70.7,72.9,63.3,41.8,75.8,58.0,#55.1
12,2022,Always saw an after hours GP when needed,68.1,58.3,57.8,61.6,65.4,63.2,66.2,65.0,59.9,44.7,66.3,48.7,#43.7
13,2022,At least once did not see an after hours GP wh...,32.7,41.4,40.9,39.7,35.6,37.5,34.9,35.8,38.1,67.9,33.5,52.1,#55.6
14,2022,At least once did not see an after hours GP wh...,2.1,3.6,7.6,2.1,3.4,3.8,1.4,1.4,0.0,12.7,2.6,3.1,#8.8
15,2022,At least once did not see an after hours GP wh...,30.4,38.2,34.6,36.5,32.4,35.1,30.9,33.0,36.2,59.5,30.8,47.8,#44.1
19,2021,Needed to and saw an after hours GP,83.0,77.8,85.7,83.9,78.1,80.8,74.2,72.3,79.7,68.3,84.2,65.0,#63.2
20,2021,Always saw an after hours GP when needed,75.5,71.1,81.7,74.7,67.8,69.0,70.4,68.8,78.6,64.7,77.1,59.9,#54.8
22,2021,At least once did not see an after hours GP wh...,1.2,2.2,3.4,0.6,1.7,2.5,0.7,0.0,4.7,0.0,1.4,3.4,#1.2


In [44]:
# Strip the literal "#" marker, then parse the strings as numbers.
# errors="raise" makes any value that still fails to parse stop the cell.
df_px_gp_ah_2018_23["outer_regional_remote"] = (
    df_px_gp_ah_2018_23["outer_regional_remote"]
    .str.replace("#", "", regex=False)
    .pipe(pd.to_numeric, errors="raise")
)

In [45]:
# Frequency table of the cleaned outer_regional_remote values; NaN would be listed too
df_px_gp_ah_2018_23.loc[:, "outer_regional_remote"].value_counts(dropna=False)

92.8    1
7.2     1
1.2     1
45.8    1
92.9    1
7.1     1
35.4    1
64.4    1
57.9    1
42.1    1
4.5     1
38.2    1
93.6    1
6.5     1
41.3    1
59.2    1
53.6    1
45.4    1
2.3     1
46.1    1
54.8    1
63.2    1
6.1     1
48.9    1
51.8    1
44.3    1
57.1    1
4.6     1
55.0    1
94.0    1
45.7    1
36.2    1
55.1    1
43.7    1
55.6    1
8.8     1
44.1    1
94.3    1
5.7     1
44.6    1
Name: outer_regional_remote, dtype: int64

### Export to Pickle File

In [46]:
# Persist the cleaned frame as a pickle under the dataset root so later
# analysis notebooks can load it with its dtypes intact.
pd.to_pickle(
    df_px_gp_ah_2018_23,
    os.path.join(
        path, "clean_datasets/patient_experience/2023-2018_gp_after_hours_cleaned.pkl"
    ),
)