In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
# Load the historic appointment records (first-prediction extract) from CSV
# and display the resulting frame.
data = pd.read_csv(filepath_or_buffer="../historic_appts_first_pred_trans.csv")
data

Unnamed: 0,appointment_id,age_deid,bmi_deid,ethnicity,ethnic_category_black,ethnic_category_unknown,gender_male,imd19_quintile,main_spoken_language_other,current_smoker,...,prediction_type,case_control,use_case,appt_date,appt_month_trans,hospital_service_display_trans,service_mapping_trans,reason_display_trans,appointment_type_trans,did_not_attend_risk_group_trans
0,10079786,52,27.45,,0,1,1,2,0,0,...,First prediction,1.0,3,2024-03-20,3,15,2,76,0,3
1,10164094,52,27.45,,1,0,1,2,0,1,...,First prediction,1.0,3,2023-11-24,11,71,2,376,0,3
2,10211058,32,39.45,,0,1,0,2,0,0,...,First prediction,1.0,3,2024-01-19,1,71,2,376,0,1
3,10550310,72,39.45,,1,0,1,4,0,0,...,First prediction,1.0,3,2024-01-19,1,71,2,376,0,1
4,10670198,32,39.45,,1,0,0,2,0,1,...,First prediction,1.0,3,2023-12-01,12,71,2,376,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139238,13147714,2,27.45,,0,0,0,1,0,0,...,First prediction,1.0,3,2023-12-29,12,95,8,499,0,2
139239,13257461,67,40.00,,0,0,0,1,0,0,...,First prediction,1.0,3,2024-02-01,2,125,2,694,0,2
139240,13390142,17,27.45,,1,0,1,3,0,0,...,First prediction,1.0,3,2024-03-11,3,95,8,452,0,3
139241,13440971,77,21.70,,0,1,1,2,0,0,...,First prediction,1.0,3,2024-03-20,3,36,2,215,1,2


In [3]:
# Keep only the appointments whose encoded reason is one of the ENT reason codes.
# The list holds every integer from 0 to 28 exactly once (order is irrelevant to isin).
# NOTE(review): this filter only restricts reason_display_trans; no age condition is
# applied here — confirm that these reason codes cover adult appointments only.
ent_codes = [
    6, 23, 1, 8, 17, 20, 21, 22, 0, 13,
    28, 14, 15, 3, 26, 18, 25, 2, 24, 27,
    11, 4, 12, 19, 5, 16, 9, 7, 10,
]
is_ent_reason = data["reason_display_trans"].isin(ent_codes)
data = data[is_ent_reason]
data

Unnamed: 0,appointment_id,age_deid,bmi_deid,ethnicity,ethnic_category_black,ethnic_category_unknown,gender_male,imd19_quintile,main_spoken_language_other,current_smoker,...,prediction_type,case_control,use_case,appt_date,appt_month_trans,hospital_service_display_trans,service_mapping_trans,reason_display_trans,appointment_type_trans,did_not_attend_risk_group_trans
17,12088942,42,27.45,,0,0,0,4,0,1,...,First prediction,1.0,3,2024-02-14,2,3,10,6,0,3
20,12207851,42,27.45,,1,0,1,4,0,0,...,First prediction,1.0,3,2024-03-12,3,3,10,6,0,4
36,12791383,27,21.70,,0,0,0,3,0,0,...,First prediction,1.0,3,2024-01-31,1,3,10,6,0,2
41,12940266,42,27.45,,1,0,0,2,0,0,...,First prediction,1.0,3,2024-04-03,4,3,10,6,0,1
68,13428133,83,21.70,,0,0,0,2,0,0,...,First prediction,1.0,3,2024-04-05,4,3,10,6,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139155,20238919,42,21.70,,0,0,0,2,0,1,...,First prediction,1.0,3,2024-04-23,4,111,9,9,0,1
139227,11138651,47,27.45,,0,0,0,2,0,0,...,First prediction,1.0,3,2024-02-20,2,3,10,6,0,3
139233,12468296,52,27.45,,0,0,0,1,0,1,...,First prediction,1.0,3,2023-11-25,11,3,10,6,0,3
139234,12799747,72,21.70,,0,0,0,2,0,1,...,First prediction,1.0,3,2023-12-08,12,111,9,6,0,2


In [4]:
# Write the ENT-filtered historic appointments (first prediction) to CSV,
# leaving the DataFrame index out of the file.
data.to_csv(path_or_buf="../historic_appts_first_pred_trans_ENT.csv", index=False)

In [5]:
# Total number of appointments = number of rows left after filtering
total_appts = len(data)
total_appts

5793

### Risk category scenario 1
- Predicted no show: very high risk
- Predicted show: high, moderate, low risk

In [6]:
# Scenario 1: collapse the risk groups into a binary no-show prediction.
# Only "very high" risk (4) counts as a predicted no-show.
risk_mapping = {
    1: 0,  # Low risk
    2: 0,  # Moderate risk
    3: 0,  # High risk
    4: 1,  # Very high risk
}

# Work on a copy so `data` stays unchanged, then add the binary prediction column
trans_data1 = data.copy()
trans_data1["pred_no_show"] = trans_data1["did_not_attend_risk_group_trans"].map(risk_mapping)

# Series.map produces NaN for any value that is not a key of risk_mapping;
# fail here with a clear message instead of carrying NaN predictions forward.
assert trans_data1["pred_no_show"].notna().all(), (
    "did_not_attend_risk_group_trans contains values not covered by risk_mapping"
)

# Show the observed outcome next to the risk group and the derived prediction
trans_data1[["no_show", "did_not_attend_risk_group_trans", "pred_no_show"]]

Unnamed: 0,no_show,did_not_attend_risk_group_trans,pred_no_show
17,0.0,3,0
20,0.0,4,1
36,0.0,2,0
41,0.0,1,0
68,0.0,3,0
...,...,...,...
139155,0.0,1,0
139227,0.0,3,0
139233,0.0,3,0
139234,0.0,2,0


In [7]:
# Confusion matrix for scenario 1: observed no-shows (rows) vs predicted no-shows (columns)
cm1 = confusion_matrix(
    y_true=trans_data1["no_show"],
    y_pred=trans_data1["pred_no_show"],
)
cm1

array([[4977,   57],
       [ 678,   81]])

In [8]:
# Inspect the appointments recorded as actual no-shows (no_show == 1)
trans_data1[trans_data1["no_show"] == 1]

Unnamed: 0,appointment_id,age_deid,bmi_deid,ethnicity,ethnic_category_black,ethnic_category_unknown,gender_male,imd19_quintile,main_spoken_language_other,current_smoker,...,case_control,use_case,appt_date,appt_month_trans,hospital_service_display_trans,service_mapping_trans,reason_display_trans,appointment_type_trans,did_not_attend_risk_group_trans,pred_no_show
132,14071855,42,40.00,,0,0,1,3,0,1,...,1.0,3,2023-12-05,12,111,9,6,0,2,0
183,14390588,42,27.45,,0,0,1,3,1,0,...,1.0,3,2024-04-18,4,3,10,6,0,3,0
219,14609191,47,27.45,,0,0,1,1,0,0,...,1.0,3,2024-04-05,4,111,9,6,0,3,0
238,14707395,52,39.45,,0,0,1,2,0,1,...,1.0,3,2024-02-23,2,111,9,6,0,3,0
312,14963998,27,27.45,,0,1,1,2,0,0,...,1.0,3,2024-02-29,2,3,10,6,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137620,20171165,2,27.45,,0,0,1,2,0,0,...,1.0,3,2024-04-15,4,92,8,14,0,1,0
137865,20177551,27,27.45,,0,1,0,1,0,0,...,1.0,3,2024-04-16,4,3,10,4,0,3,0
138451,20201487,2,27.45,,1,0,1,1,0,0,...,1.0,3,2024-04-22,4,3,10,3,0,3,0
138591,20206525,57,21.70,,1,0,0,3,0,1,...,1.0,3,2024-04-23,4,3,10,6,0,1,0


In [9]:
# Positive predictive value (precision) for scenario 1: TP / (TP + FP)
# Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]]
(TN1, FP1), (FN1, TP1) = cm1

PPV1 = TP1 / (FP1 + TP1)
PPV1

0.5869565217391305

In [10]:
# Share of all appointments that were correctly predicted no-shows (TP / total),
# expressed as a fraction rather than a percentage
true_no_show1 = TP1 / total_appts
true_no_show1

0.013982392542723977

### Risk category scenario 2
- Predicted no show: very high, high risk
- Predicted show: moderate, low risk

In [11]:
# Scenario 2: collapse the risk groups into a binary no-show prediction.
# "High" (3) and "very high" (4) risk count as a predicted no-show.
risk_mapping = {
    1: 0,  # Low risk
    2: 0,  # Moderate risk
    3: 1,  # High risk
    4: 1,  # Very high risk
}

# Work on a copy so `data` stays unchanged, then add the binary prediction column
trans_data2 = data.copy()
trans_data2["pred_no_show"] = trans_data2["did_not_attend_risk_group_trans"].map(risk_mapping)

# Series.map produces NaN for any value that is not a key of risk_mapping;
# fail here with a clear message instead of carrying NaN predictions forward.
assert trans_data2["pred_no_show"].notna().all(), (
    "did_not_attend_risk_group_trans contains values not covered by risk_mapping"
)

# Show the observed outcome next to the risk group and the derived prediction
trans_data2[["no_show", "did_not_attend_risk_group_trans", "pred_no_show"]]

Unnamed: 0,no_show,did_not_attend_risk_group_trans,pred_no_show
17,0.0,3,1
20,0.0,4,1
36,0.0,2,0
41,0.0,1,0
68,0.0,3,1
...,...,...,...
139155,0.0,1,0
139227,0.0,3,1
139233,0.0,3,1
139234,0.0,2,0


In [12]:
# Confusion matrix for scenario 2: observed no-shows (rows) vs predicted no-shows (columns)
cm2 = confusion_matrix(
    y_true=trans_data2["no_show"],
    y_pred=trans_data2["pred_no_show"],
)
cm2

array([[4041,  993],
       [ 383,  376]])

In [13]:
# Positive predictive value (precision) for scenario 2: TP / (TP + FP)
# Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]]
(TN2, FP2), (FN2, TP2) = cm2

PPV2 = TP2 / (FP2 + TP2)
PPV2

0.27465303140978814

In [14]:
# Share of all appointments that were correctly predicted no-shows (TP / total),
# expressed as a fraction rather than a percentage
true_no_show2 = TP2 / total_appts
true_no_show2

0.06490592093906439

### Risk category scenario 3
- Predicted no show: very high, high, moderate risk
- Predicted show: low risk

In [15]:
# Scenario 3: collapse the risk groups into a binary no-show prediction.
# "Moderate" (2), "high" (3) and "very high" (4) risk count as a predicted no-show.
risk_mapping = {
    1: 0,  # Low risk
    2: 1,  # Moderate risk
    3: 1,  # High risk
    4: 1,  # Very high risk
}

# Work on a copy so `data` stays unchanged, then add the binary prediction column
trans_data3 = data.copy()
trans_data3["pred_no_show"] = trans_data3["did_not_attend_risk_group_trans"].map(risk_mapping)

# Series.map produces NaN for any value that is not a key of risk_mapping;
# fail here with a clear message instead of carrying NaN predictions forward.
assert trans_data3["pred_no_show"].notna().all(), (
    "did_not_attend_risk_group_trans contains values not covered by risk_mapping"
)

# Show the observed outcome next to the risk group and the derived prediction
trans_data3[["no_show", "did_not_attend_risk_group_trans", "pred_no_show"]]

Unnamed: 0,no_show,did_not_attend_risk_group_trans,pred_no_show
17,0.0,3,1
20,0.0,4,1
36,0.0,2,1
41,0.0,1,0
68,0.0,3,1
...,...,...,...
139155,0.0,1,0
139227,0.0,3,1
139233,0.0,3,1
139234,0.0,2,1


In [16]:
# Confusion matrix for scenario 3: observed no-shows (rows) vs predicted no-shows (columns)
cm3 = confusion_matrix(
    y_true=trans_data3["no_show"],
    y_pred=trans_data3["pred_no_show"],
)
cm3

array([[2547, 2487],
       [ 159,  600]])

In [17]:
# Positive predictive value (precision) for scenario 3: TP / (TP + FP)
# Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]]
(TN3, FP3), (FN3, TP3) = cm3

PPV3 = TP3 / (FP3 + TP3)
PPV3

0.19436345966958213

In [18]:
# Share of all appointments that were correctly predicted no-shows (TP / total),
# expressed as a fraction rather than a percentage
true_no_show3 = TP3 / total_appts
true_no_show3

0.10357327809425168

In [19]:
# Persist the PPV1 and PPV2 values with IPython's %store magic for later access
# in other Jupyter Notebook scripts.
# NOTE(review): PPV3 is computed in this notebook but not stored — confirm this is intentional.
%store PPV1
%store PPV2

Stored 'PPV1' (float64)
Stored 'PPV2' (float64)
