In [1]:
import pandas as pd

In [2]:
# Access the stored PPV1 value
%store -r PPV1

In [3]:
# Import future appointments data
data = pd.read_csv("../future_appts_trans.csv")

In [4]:
# Filter for adult ENT appointments only
ent_codes = [1,  2,  7,  8,  0,  6,  4, 10,  9,  5,  3, 11, 12]
data = data[data["reason_display_trans"].isin(ent_codes)]

In [5]:
# Transform date from object to datetime
data["appt_date"] = pd.to_datetime(data["appt_date"])
data

Unnamed: 0,appointment_id,age_deid,bmi_deid,ethnic_category_black,ethnic_category_unknown,gender_male,imd19_quintile,main_spoken_language_other,current_smoker,substance_misuse,...,appt_date,ethnicity_trans,tfc_name_trans,patient_on_multiple_pathways_trans,appt_month_trans,hospital_service_display_trans,service_mapping_trans,reason_display_trans,appointment_type_trans,did_not_attend_risk_group_trans
19,17479744,57,27.45,1,0,1,1,0,0,0,...,2024-06-26,3,7,0,6,0,10,1,0,3
41,17631073,17,27.45,0,1,0,2,0,0,0,...,2024-05-07,3,7,0,5,0,10,1,0,3
44,17659262,27,21.70,0,0,0,4,0,0,0,...,2024-05-07,5,7,0,5,0,10,2,0,2
46,17659348,57,21.70,0,0,0,3,0,0,0,...,2024-05-07,5,7,0,5,0,10,2,0,3
47,17659361,27,27.45,0,1,0,2,0,0,0,...,2024-05-07,3,7,0,5,0,10,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15720,20183628,47,27.45,0,0,1,2,0,0,0,...,2024-06-19,5,7,0,6,0,10,1,0,1
15721,20183649,57,40.00,0,1,0,1,0,0,0,...,2024-06-08,3,7,0,5,0,10,2,0,3
15722,20183782,32,21.70,0,0,0,4,0,0,0,...,2024-05-25,3,7,0,7,0,10,6,0,1
15723,20183795,62,39.45,0,0,0,1,0,0,0,...,2024-06-17,3,7,0,6,0,10,3,0,1


In [6]:
# Filter for very high risk group
very_high = data[data["did_not_attend_risk_group_trans"] == 4]

### Scenario 1: sum the total duration per day

In [7]:
# Sum up appointment duration per day
dna_time = very_high.groupby("appt_date")["appt_duration"].sum().reset_index()
dna_time

Unnamed: 0,appt_date,appt_duration
0,2024-05-02,15.0
1,2024-05-08,15.0
2,2024-05-09,15.0
3,2024-05-10,15.0
4,2024-05-14,30.0
...,...,...
153,2025-07-31,15.0
154,2025-08-11,20.0
155,2025-08-19,15.0
156,2025-09-15,15.0


In [8]:
# Multiply by PPV1 for a more accurate estimate
dna_time["appt_duration_est"] = dna_time["appt_duration"] * PPV1
dna_time

Unnamed: 0,appt_date,appt_duration,appt_duration_est
0,2024-05-02,15.0,8.804348
1,2024-05-08,15.0,8.804348
2,2024-05-09,15.0,8.804348
3,2024-05-10,15.0,8.804348
4,2024-05-14,30.0,17.608696
...,...,...,...
153,2025-07-31,15.0,8.804348
154,2025-08-11,20.0,11.739130
155,2025-08-19,15.0,8.804348
156,2025-09-15,15.0,8.804348


### Scenario 2: proportion of duration based on DNA probability

In [9]:
# Get duration based on proportion of DNA probability
very_high["proportion_duration"] = very_high["did_not_attend_risk"] * very_high["appt_duration"]
very_high

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  very_high["proportion_duration"] = very_high["did_not_attend_risk"] * very_high["appt_duration"]


Unnamed: 0,appointment_id,age_deid,bmi_deid,ethnic_category_black,ethnic_category_unknown,gender_male,imd19_quintile,main_spoken_language_other,current_smoker,substance_misuse,...,ethnicity_trans,tfc_name_trans,patient_on_multiple_pathways_trans,appt_month_trans,hospital_service_display_trans,service_mapping_trans,reason_display_trans,appointment_type_trans,did_not_attend_risk_group_trans,proportion_duration
49,17659386,22,27.45,0,0,0,3,0,0,0,...,3,7,0,5,0,10,2,0,4,19.282152
58,17671208,22,27.45,0,0,1,2,0,0,0,...,3,7,0,5,0,10,2,0,4,10.660128
168,17900340,32,21.70,0,0,0,2,1,1,0,...,5,7,1,6,0,10,2,0,4,19.243860
180,17901062,67,39.45,0,1,1,1,1,0,0,...,3,7,0,5,0,10,2,0,4,6.831172
261,18011196,67,27.45,0,0,0,1,0,0,0,...,5,7,0,5,0,10,2,0,4,11.632648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14486,20112448,47,21.70,0,0,0,1,0,1,1,...,5,7,0,3,0,10,2,0,4,12.954382
14991,20141762,27,27.45,1,0,0,1,0,0,0,...,3,7,0,12,0,10,2,0,4,7.246462
14992,20141766,32,27.45,0,0,1,1,0,1,0,...,5,7,0,6,0,10,2,0,4,7.748109
14993,20141781,17,27.45,0,1,1,3,0,0,0,...,3,7,0,7,0,10,2,0,4,6.717366


In [10]:
# Sum up appointment duration per day
proportion_dna_time = very_high.groupby("appt_date")["proportion_duration"].sum().reset_index()
proportion_dna_time

Unnamed: 0,appt_date,proportion_duration
0,2024-05-02,8.616955
1,2024-05-08,7.265516
2,2024-05-09,6.831172
3,2024-05-10,9.235223
4,2024-05-14,19.282152
...,...,...
153,2025-07-31,8.405201
154,2025-08-11,9.423724
155,2025-08-19,10.028464
156,2025-09-15,6.644366
