In [33]:
import pandas as pd
from sklearn import preprocessing

In [34]:
# Load the raw appointments CSV into the working frame and preview it
raw_csv_path = '../historic_appts_first_pred_raw.csv'
data = pd.read_csv(raw_csv_path)
data

Unnamed: 0,appointment_id,age_deid,bmi_deid,ethnicity,ethnic_category_black,ethnic_category_unknown,gender_male,imd19_quintile,main_spoken_language_other,current_smoker,...,service_mapping_priority,service_mapping_surgical,weekswaiting,no_show,did_not_attend_risk,did_not_attend_risk_group,prediction_type,case_control,use_case,appt_date
0,10079786,52,27.45,,0,1,1,2,0,0,...,0,0,,0.0,0.264963,High Risk,First prediction,1.0,3,2024-03-20
1,10164094,52,27.45,,1,0,1,2,0,1,...,0,0,,1.0,0.266140,High Risk,First prediction,1.0,3,2023-11-24
2,10211058,32,39.45,,0,1,0,2,0,0,...,0,0,,1.0,0.062844,Low Risk,First prediction,1.0,3,2024-01-19
3,10550310,72,39.45,,1,0,1,4,0,0,...,0,0,,0.0,0.040898,Low Risk,First prediction,1.0,3,2024-01-19
4,10670198,32,39.45,,1,0,0,2,0,1,...,0,0,,1.0,0.389798,High Risk,First prediction,1.0,3,2023-12-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139238,13147714,2,27.45,,0,0,0,1,0,0,...,0,0,,1.0,0.160467,Moderate Risk,First prediction,1.0,3,2023-12-29
139239,13257461,67,40.00,,0,0,0,1,0,0,...,0,0,,0.0,0.124008,Moderate Risk,First prediction,1.0,3,2024-02-01
139240,13390142,17,27.45,,1,0,1,3,0,0,...,0,0,,1.0,0.235887,High Risk,First prediction,1.0,3,2024-03-11
139241,13440971,77,21.70,,0,1,1,2,0,0,...,0,0,,1.0,0.163437,Moderate Risk,First prediction,1.0,3,2024-03-20


In [35]:
# Count missing values per column, i.e. see which columns are available for analysis
missing_counts = data.isnull().sum()
missing_counts

appointment_id                               0
age_deid                                     0
bmi_deid                                     0
ethnicity                               139243
ethnic_category_black                        0
ethnic_category_unknown                      0
gender_male                                  0
imd19_quintile                               0
main_spoken_language_other                   0
current_smoker                               0
substance_misuse                             0
smoking_unknown                              0
tfc_name                                139243
patient_on_multiple_pathways            139243
appt_month                                   0
appt_hour                                    0
appt_duration                                0
hospital_service_display                   397
service_mapping                              0
reason_display                               0
appointment_type                             0
new_appt_flag

In [36]:
# Number of distinct appointment IDs among rows where case_control == 1
is_case = data["case_control"] == 1
data.loc[is_case, "appointment_id"].nunique()

139243

In [37]:
# Mean did_not_attend_risk score, restricted to the "High Risk" and
# "Very High Risk" groups (not the whole dataset)
risk_group = data["did_not_attend_risk_group"]
is_high_or_very_high = (risk_group == "High Risk") | (risk_group == "Very High Risk")
data.loc[is_high_or_very_high, "did_not_attend_risk"].mean()

0.27078719642882415

### Transform date from object to datetime

In [38]:
# Parse appt_date into datetime values and write them back over the original column
appt_date_parsed = pd.to_datetime(data["appt_date"])
data["appt_date"] = appt_date_parsed
data["appt_date"]

0        2024-03-20
1        2023-11-24
2        2024-01-19
3        2024-01-19
4        2023-12-01
            ...    
139238   2023-12-29
139239   2024-02-01
139240   2024-03-11
139241   2024-03-20
139242   2024-01-30
Name: appt_date, Length: 139243, dtype: datetime64[ns]

### Transform categorical data to numeric data

In [39]:
# Print each column's non-null count and dtype before encoding the text columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139243 entries, 0 to 139242
Data columns (total 57 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   appointment_id                        139243 non-null  int64         
 1   age_deid                              139243 non-null  int64         
 2   bmi_deid                              139243 non-null  float64       
 3   ethnicity                             0 non-null       float64       
 4   ethnic_category_black                 139243 non-null  int64         
 5   ethnic_category_unknown               139243 non-null  int64         
 6   gender_male                           139243 non-null  int64         
 7   imd19_quintile                        139243 non-null  int64         
 8   main_spoken_language_other            139243 non-null  int64         
 9   current_smoker                        139243 non-null  int6

In [40]:
# Encode appt_month: full English month name -> calendar month number (1-12)
month_names_in_order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
month_mapping = {
    month_name: month_number
    for month_number, month_name in enumerate(month_names_in_order, start=1)
}

# Translate each row's month name through the lookup into a new column
data["appt_month_trans"] = data["appt_month"].map(month_mapping)

# Show original and encoded values side by side as a sanity check
data[['appt_month', 'appt_month_trans']]


Unnamed: 0,appt_month,appt_month_trans
0,March,3
1,November,11
2,January,1
3,January,1
4,December,12
...,...,...
139238,December,12
139239,February,2
139240,March,3
139241,March,3


In [41]:
# Encode the hospital_service_display column
#
# The missing-value check earlier in this notebook reports 397 nulls in this
# column. Passing them straight to LabelEncoder is fragile: depending on the
# installed scikit-learn version the mix of strings and NaN either raises or is
# silently given its own class. Fill an explicit sentinel first so that missing
# services always get one well-defined, human-readable label.
# NOTE(review): assumes "Missing" is not already a real service name - confirm.
hospital_service_filled = data["hospital_service_display"].fillna("Missing")

# Create a label encoder object
le = preprocessing.LabelEncoder()

# Fit the label encoder and obtain one integer code per row
label = le.fit_transform(hospital_service_filled)

# Add the encoded column to the data
data["hospital_service_display_trans"] = label

In [42]:
# Label-encode service_mapping into integer codes
service_mapping_raw = data["service_mapping"]

# `le` keeps the fitted encoder; `label` holds one integer code per row
le = preprocessing.LabelEncoder()
label = le.fit_transform(service_mapping_raw)

# Store the codes next to the original column
data["service_mapping_trans"] = label

In [43]:
# Label-encode reason_display into integer codes
reason_display_raw = data["reason_display"]

# `le` keeps the fitted encoder; `label` holds one integer code per row
le = preprocessing.LabelEncoder()
label = le.fit_transform(reason_display_raw)

# Store the codes next to the original column
data["reason_display_trans"] = label

In [44]:
# Look up which encoded reason_display values belong to reasons containing
# 'adult ENT' (case-insensitive; rows with a missing reason never match)
ent_mask = data["reason_display"].str.contains("adult ENT", case=False, na=False)
ent_appts = data[ent_mask]

# Print the matching reason names, then display their integer codes
print(ent_appts["reason_display"].unique())
ent_codes = ent_appts["reason_display_trans"].unique()
ent_codes



['Adult ENT F/Up' 'Adult ENT Voice Clinic New' 'Adult ENT Balance F/Up'
 'Adult ENT New' 'Adult ENT Thyroid F/Up' 'Adult ENT Voice Clinic F/Up'
 'Adult ENT Voice Clinic JT F/Up' 'Adult ENT Voice Clinic JT New'
 'Adult ENT BIPP F/Up' 'Adult ENT Post Op F/Up'
 'Adult ENT Voice Post Op F/Up' 'Adult ENT Pre-Assessment F/Up'
 'Adult ENT Results F/Up' 'Adult ENT Emergency Clinic F/Up'
 'Adult ENT Voice Osteopath F/Up' 'Adult ENT Thyroid New'
 'Adult ENT Voice Coach New' 'Adult ENT Balance New'
 'Adult ENT Voice Coach F/Up' 'Adult ENT Voice Osteopath New'
 'Adult ENT Nurse Skin Test F/Up' 'Adult ENT Emergency Clinic New'
 'Adult ENT Nurse Skin Test New' 'Adult ENT Thyroid Post Op F/Up'
 'Adult ENT Emergency Nose Fracture New' 'Adult ENT Telephone F/Up'
 'Adult ENT Nurse Dressing Clinic F/Up' 'Adult ENT Minor Ops F/Up'
 'Adult ENT Nurse Dressing Clinic New']


array([ 6, 23,  1,  8, 17, 20, 21, 22,  0, 13, 28, 14, 15,  3, 26, 18, 25,
        2, 24, 27, 11,  4, 12, 19,  5, 16,  9,  7, 10])

In [45]:
# Label-encode appointment_type into integer codes
appointment_type_raw = data["appointment_type"]

# `le` keeps the fitted encoder; `label` holds one integer code per row
le = preprocessing.LabelEncoder()
label = le.fit_transform(appointment_type_raw)

# Store the codes next to the original column
data["appointment_type_trans"] = label

In [46]:
# Encode did_not_attend_risk_group as an ordinal scale: 1 (Low Risk) up to 4 (Very High Risk)
risk_levels_ascending = ["Low Risk", "Moderate Risk", "High Risk", "Very High Risk"]
risk_mapping = {
    risk_level: rank
    for rank, risk_level in enumerate(risk_levels_ascending, start=1)
}

# Map the risk group labels to their ordinal numbers
data["did_not_attend_risk_group_trans"] = data["did_not_attend_risk_group"].map(risk_mapping)

# Show original and encoded values side by side as a sanity check
data[['did_not_attend_risk_group', 'did_not_attend_risk_group_trans']]

Unnamed: 0,did_not_attend_risk_group,did_not_attend_risk_group_trans
0,High Risk,3
1,High Risk,3
2,Low Risk,1
3,Low Risk,1
4,High Risk,3
...,...,...
139238,Moderate Risk,2
139239,Moderate Risk,2
139240,High Risk,3
139241,Moderate Risk,2


In [47]:
# Re-inspect the frame after encoding to confirm the new *_trans columns are present
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139243 entries, 0 to 139242
Data columns (total 63 columns):
 #   Column                                Non-Null Count   Dtype         
---  ------                                --------------   -----         
 0   appointment_id                        139243 non-null  int64         
 1   age_deid                              139243 non-null  int64         
 2   bmi_deid                              139243 non-null  float64       
 3   ethnicity                             0 non-null       float64       
 4   ethnic_category_black                 139243 non-null  int64         
 5   ethnic_category_unknown               139243 non-null  int64         
 6   gender_male                           139243 non-null  int64         
 7   imd19_quintile                        139243 non-null  int64         
 8   main_spoken_language_other            139243 non-null  int64         
 9   current_smoker                        139243 non-null  int6

### Export the transformed use case 3 data

In [48]:
# Drop the original text columns; each one now has an encoded *_trans counterpart
encoded_source_columns = [
    "appt_month",
    "hospital_service_display",
    "service_mapping",
    "reason_display",
    "appointment_type",
    "did_not_attend_risk_group",
]
data_trans = data.drop(columns=encoded_source_columns)

# Export the transformed use case 3 data to CSV, without the row index
data_trans.to_csv("../historic_appts_first_pred_trans.csv", index=False)