# Libraries

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# Data

In [2]:
df = pd.read_csv("../../../data/delays/raw_data.csv")
df.head()

Unnamed: 0,Record ID,Event Name,Repeat Instrument,Repeat Instance,Date of Visit,Patient Study ID,Date Of Birth,Marital Status,Country District,Physical address (village/ward),...,Vital Status,Date of Death,Cause of Death,Date of Last Follow Up,How was patient reached?,"Other, specify",Follow Up Weight (kg),Performance status,Brachytherapy Dose Received (cGy),EBRT Dose Received (cGy)
0,1,Consult,,,2015-01-28,1.0,1965-01-01,Divorced,Kweneng West District,Letlhakeng(Goomodimo ward),...,,,,,,,,,,
1,1,OTV_1,,,,,,,,,...,,,,,,,,,,2200.0
2,1,OTV_3,,,,,,,,,...,,,,,,,,,600.0,4200.0
3,1,OTV_4,,,,,,,,,...,,,,,,,,,2100.0,5000.0
4,1,3 Month Follow-Up,,,,,,,,,...,,,,,,,,,,


# Cleaning

In [3]:
# print(df[df["Event Name"] == "Consult"].shape)
# print(df[df["Event Name"] == "Consult"]["Record ID"].nunique())
# one row per pt
consult_df = df[df["Event Name"] == "Consult"]
# drop NA columns
consult_df.dropna(how="all", axis=1, inplace=True)
# drop columns not used for analysis
consult_df.drop(columns=["Event Name", "Patient Study ID", "Physical address (village/ward)", "Village",
                         "Consulting clinic", "Cancer Diagnosis Site", 
                         'List all HIV medications taking (choice=lamivudine(3TC))', 
                         'List all HIV medications taking (choice=abacavir(ABC))', 
                         'List all HIV medications taking (choice=amprenavir(APV))', 
                         'List all HIV medications taking (choice=atazanavir(ATV))', 
                         'List all HIV medications taking (choice=ritonavir-boosted atazanavir(ATV/r))', 
                         'List all HIV medications taking (choice=cobicistat(COBI))', 
                         'List all HIV medications taking (choice=stavudine(D4T))', 
                         'List all HIV medications taking (choice=zalcitabine(ddC))', 
                         'List all HIV medications taking (choice=didanosine(DDI))', 
                         'List all HIV medications taking (choice=delavirdine(DLV))', 
                         'List all HIV medications taking (choice=darunavir(DRV))', 
                         'List all HIV medications taking (choice=ritonavir-boosted darunavir(DRV/r))', 
                         'List all HIV medications taking (choice=dolutegravir(DTG))', 
                         'List all HIV medications taking (choice=efavirenz(EFV))', 
                         'List all HIV medications taking (choice=etravirine(ETR))', 
                         'List all HIV medications taking (choice=elvitegravir(EVG))', 
                         'List all HIV medications taking (choice=fosamprenavir(FPV))', 
                         'List all HIV medications taking (choice=ritonavir-boosted fosamprenavir(FPV/r))', 
                         'List all HIV medications taking (choice=emtricitabine(FTC))', 
                         'List all HIV medications taking (choice=azidothymidineglucuronide(GAZT))', 
                         'List all HIV medications taking (choice=indinavir(IDV))', 
                         'List all HIV medications taking (choice=lopinavir(LPV))', 
                         'List all HIV medications taking (choice=ritonavir-boosted lopinavir(LPV/r))', 
                         'List all HIV medications taking (choice=maraviroc(MVC))', 
                         'List all HIV medications taking (choice=nelfinavir(NFV))', 
                         'List all HIV medications taking (choice=nevirapine(NVP))', 
                         'List all HIV medications taking (choice=raltegravir(RAL))', 
                         'List all HIV medications taking (choice=rilpivirine(RPV))', 
                         'List all HIV medications taking (choice=ritonavir(RTV))', 
                         'List all HIV medications taking (choice=saquinavir(SQV))', 
                         'List all HIV medications taking (choice=ritonavir-boosted saquinavir(SQV/r))', 
                         'List all HIV medications taking (choice=enfuvirtide(T20))', 
                         'List all HIV medications taking (choice=tenofovir disoproxil fumarate(TDF))', 
                         'List all HIV medications taking (choice=tipranavir(TPV))', 
                         'List all HIV medications taking (choice=ritonavir-boostedtipranavir(TPV/r))', 
                         'List all HIV medications taking (choice=zidovudine(ZDV))', 
                         'List all HIV medications taking (choice=Atripla)', 
                         'List all HIV medications taking (choice=Alluvia)', 
                         'List all HIV medications taking (choice=Comibivir (CBV))', 
                         'List all HIV medications taking (choice=Truvada)', 
                         'List all HIV medications taking (choice=Dolutegravir (DTG))', 
                         'List all HIV medications taking (choice=Other)', 'Other HIV medication', 'Date',
                        ], inplace=True)

df["Event Name"].unique()
# print(df[df["Is this the last OTV?"] == "Yes"].shape)
# print(df[df["Is this the last OTV?"] == "Yes"]["Record ID"].nunique())
last_otv_df = df[df["Is this the last OTV?"] == "Yes"]
# fix duplicate rows per pt
last_otv_df = last_otv_df.sort_values(["Record ID", "Event Name"]).groupby("Record ID", sort=False).last().reset_index()
# print(last_otv_df.shape)
# print(last_otv_df["Record ID"].nunique())
# drop NA columns
last_otv_df.dropna(how="all", axis=1, inplace=True)
# drop columns not used for analysis
last_otv_df.drop(columns=["Event Name", "Date", "Date.1", 
                          "Did patient finish treatment as prescribed-received all chemo and RT doses as prescribed on time without missing any days?",
                          "If didn't received treatment as prescribed, reason? (choice=Dose of chemo changed)", 
                          "If didn't received treatment as prescribed, reason? (choice=Dose of radiation changed)", 
                          "If didn't received treatment as prescribed, reason? (choice=Missed a chemo dose)", 
                          "If didn't received treatment as prescribed, reason? (choice=Missed radiation)", 
                          "If didn't received treatment as prescribed, reason? (choice=Other)", 
                          'If other  specify'
                         ], inplace=True)
# merge with consult_df
merged_df = consult_df.merge(last_otv_df, on="Record ID", how="outer", suffixes=["_consult", "_last_otv"])


In [4]:
death_df = df[(df["Vital Status"] == "Dead") & (~df["Date of Death"].isnull())][["Record ID", "Event Name", 'Vital Status', 'Date of Death']].drop_duplicates(subset=["Record ID", "Vital Status", "Date of Death"])
# print(death_df.shape)
# print(death_df["Record ID"].nunique())
# confirm these death dates
## pts with different death dates recorded
death_check1_df = death_df[death_df["Record ID"].isin(death_df[death_df["Record ID"].duplicated() == True]["Record ID"].unique())]
## pts missing death date
death_check2_df = df[(df["Vital Status"] == "Dead") & (df["Date of Death"].isnull()) & (~df["Record ID"].isin(death_df["Record ID"].unique()))][["Record ID", "Event Name", 'Vital Status', 'Date of Death']]
# print(death_check2_df.shape)
# print(death_check2_df["Record ID"].nunique())
death_check_df = pd.concat([death_check1_df, death_check2_df], axis=0, ignore_index=True)

# only forward those for analysis -- no difference
# df["Date of Visit"] = pd.to_datetime(df["Date of Visit"])
# pts_before_2020 = df[(df["Event Name"] == "Consult") & (df["Date of Visit"] < "2020-01-01")]["Record ID"].unique().tolist()
# print(death_check_df.shape)
# print(death_check_df["Record ID"].nunique())
# death_check_df_ = death_check_df[death_check_df["Record ID"].isin(pts_before_2020)]
# print(death_check_df_.shape)
# print(death_check_df_["Record ID"].nunique())
# death_check_df.to_csv("../../../data/delays/death_dates.csv", index=False)

# TEMP
death_df = df[df["Vital Status"] == "Dead"][["Record ID", 'Vital Status', 'Date of Death']].drop_duplicates().groupby("Record ID")[["Vital Status", "Date of Death"]].first().reset_index()
print(death_df.shape)
print(death_df["Record ID"].nunique())
##

(570, 3)
570


In [5]:
alive_df = df[(df["Vital Status"] == "Alive") & (~df["Last seen date"].isnull()) & (~df["Record ID"].isin(death_df["Record ID"]))][["Record ID", "Event Name", 'Vital Status', 'Last seen date']].drop_duplicates(subset=["Record ID", "Vital Status", "Last seen date"])
# print(alive_df.shape)
# print(alive_df["Record ID"].nunique())
# confirm these last seen dates
## pts with different last seen dates recorded --> grab the most recent last seen date
alive_check1_df = alive_df[alive_df["Record ID"].isin(alive_df[alive_df["Record ID"].duplicated() == True]["Record ID"].unique())]
## pts missing last seen date
alive_check2_df = df[(df["Vital Status"] == "Alive") & (df["Last seen date"].isnull()) & (~df["Record ID"].isin(alive_df["Record ID"].unique())) & (~df["Record ID"].isin(death_df["Record ID"].unique()))][["Record ID", 'Vital Status', 'Last seen date']].drop_duplicates()
# print(alive_check2_df.shape)
# print(alive_check2_df["Record ID"].nunique())
# alive_check_df = pd.concat([alive_check1_df, alive_check2_df], axis=0, ignore_index=True)

# only forward those for analysis
df["Date of Visit"] = pd.to_datetime(df["Date of Visit"])
pts_before_2020 = df[(df["Event Name"] == "Consult") & (df["Date of Visit"] < "2020-01-01")]["Record ID"].unique().tolist()
# print(alive_check2_df.shape)
# print(alive_check2_df["Record ID"].nunique())
alive_check2_df_ = alive_check2_df[alive_check2_df["Record ID"].isin(pts_before_2020)]
# print(alive_check2_df_.shape)
# print(alive_check2_df_["Record ID"].nunique())
# alive_check2_df_.to_csv("../../../data/delays/last_seen_dates.csv", index=False)

# TEMP
alive_df = df[(df["Vital Status"] == "Alive") & (~df["Record ID"].isin(death_df["Record ID"].unique()))][["Record ID", 'Vital Status', 'Last seen date']].drop_duplicates().groupby("Record ID")[["Vital Status", "Last seen date"]].last().reset_index()
print(alive_df.shape)
print(alive_df["Record ID"].nunique())
##


(748, 3)
748


In [6]:
# combine death data and last seen data with merged data (consult and last OTV data)
merged_df = (merged_df.merge(death_df, how="left", on="Record ID")).merge(alive_df, how="left", on="Record ID")

# merge vital_status column coming from death_df and alive_df
merged_df["Vital Status"] = merged_df["Vital Status_x"].fillna(merged_df["Vital Status_y"])
merged_df.drop(columns=["Vital Status_x", "Vital Status_y"], inplace=True)

# preview merged data
print(merged_df.columns.tolist())
merged_df.head()


['Record ID', 'Date of Visit', 'Date Of Birth', 'Marital Status ', 'Country District', 'FIGO Cervical Stage', 'Pathology', 'Pathology Other', 'Date of Pathology sample taken', 'Have you been screened for cervical cancer?', 'History of Diabetes', 'History of tuberculosis (TB)', 'Date of Pathology Report?', 'Are you HIV positive?', 'CD4 Count Date', 'CD4 Count', 'Are you receiving HIV treatment?', 'How long have you been on ART?', 'Performance Status_consult', 'Creatinine (umo/L) Lab Result _consult', 'Creatinine Lab Date_consult', 'Hemoglobin (g/dl) Lab Result_consult', 'Hemoglobin Lab Date_consult', 'Albumin (g/dL) Lab Result _consult', 'Albumin Lab Date_consult', 'White Blood Count (per10^9) Lab Result_consult', 'White Blood Count Lab Date_consult', 'Neutrophil Count (%,per 10^9) Lab Result_consult', 'Neutrophil Percent (%) Lab Result_consult', 'Neutrophil Count Lab Date_consult', 'Viral Load (copies/ml) Lab Result(NUMBERS ONLY)', 'Viral Load Lab Date_consult', 'Performance Status_las

Unnamed: 0,Record ID,Date of Visit,Date Of Birth,Marital Status,Country District,FIGO Cervical Stage,Pathology,Pathology Other,Date of Pathology sample taken,Have you been screened for cervical cancer?,...,No treatment,Total number of chemo cycles received,Surgery,Response to treatment,What is the EQD2 result?,Brachytherapy Dose Received (cGy),EBRT Dose Received (cGy),Date of Death,Last seen date,Vital Status
0,1,2015-01-28,1965-01-01,Divorced,Kweneng West District,Stage IVB,SCC,,,Yes,...,,,No,Partial response,79.8,2100.0,5000.0,2016-02-06,,Dead
1,3,2015-01-30,1974-09-25,Married,Serowe / Palapye District,Stage IIIB,SCC,,2014-05-30,No,...,,0.0,No,Not recorded,82.0,2400.0,5000.0,,2017-06-15,Alive
2,5,2015-02-03,1966-09-01,Single,South East District,Stage IIIB,SCC,,2014-12-11,No,...,,,No,Complete response,77.8,2100.0,4800.0,2015-08-15,,Dead
3,9,2015-02-09,1950-01-01,Single,Kweneng East District,Stage IIB,SCC,,2014-08-25,No,...,,4.0,No,Not recorded,79.8,2100.0,5000.0,,2019-03-06,Alive
4,11,2015-02-09,1963-06-03,Single,South East District,,SCC,,2014-07-29,Yes,...,,5.0,No,Complete response,74.0,2100.0,4500.0,,2017-10-23,Alive


# Column Renaming

In [7]:
merged_df.rename(columns={
    'Record ID':"patient_id", 'Date of Visit':"enroll_date", 'Date Of Birth':"birth_date", 'Marital Status ':"marital_status", 
    'Country District':"district", 'FIGO Cervical Stage':"pre_trt_stage", 'Pathology':"path_response", 'Pathology Other':"path_response_other", 
    'Date of Pathology sample taken':"path_sample_date", 'Have you been screened for cervical cancer?':"cancer_screen", 'History of Diabetes':"diabetes_hist", 
    'History of tuberculosis (TB)':"tb_history", 'Date of Pathology Report?':"path_report_date", 'Are you HIV positive?':"hiv_status", 
    'CD4 Count Date':"cd4_date", 'CD4 Count':"cd4_count", 'Are you receiving HIV treatment?':"hiv_trt", 'How long have you been on ART?':"art_hisory", 
    'Performance Status_consult':"perform_sts_consult", 'Creatinine (umo/L) Lab Result _consult':"cr_consult", 'Creatinine Lab Date_consult':"cr_consult_date", 
    'Hemoglobin (g/dl) Lab Result_consult':"hb_consult", 'Hemoglobin Lab Date_consult':"hb_consult_date", 
    'Albumin (g/dL) Lab Result _consult':"alb_consult", 'Albumin Lab Date_consult':"alb_consult_date", 
    'White Blood Count (per10^9) Lab Result_consult':"wbc_consult", 'White Blood Count Lab Date_consult':"wbc_consult_date", 
    'Neutrophil Count (%,per 10^9) Lab Result_consult':"anc_count_consult", 'Neutrophil Percent (%) Lab Result_consult':"anc_pct_consult", 
    'Neutrophil Count Lab Date_consult':"anc_consult_date", 'Viral Load (copies/ml) Lab Result(NUMBERS ONLY)':"vl_consult", 
    'Viral Load Lab Date_consult':"vl_consult_date", 'Performance Status_last_otv':"perform_sts_otv", 
    'Creatinine (umo/L) Lab Result _last_otv':"cr_otv", 'Creatinine Lab Date_last_otv':"cr_otv_date", 
    'Hemoglobin (g/dl) Lab Result_last_otv':"hb_otv", 'Hemoglobin Lab Date_last_otv':"hb_otv_date", 'Albumin (g/dL) Lab Result _last_otv':"alb_otv", 
    'Albumin Lab Date_last_otv':"alb_otv_date", 'White Blood Count (per10^9) Lab Result_last_otv':"wbc_otv", 
    'White Blood Count Lab Date_last_otv':"wbc_otv_date", 'Neutrophil Count (%,per 10^9) Lab Result_last_otv':"anc_count_otv", 
    'Neutrophil Percent (%) Lab Result_last_otv':"anc_pct_otv", 'Neutrophil Count Lab Date_last_otv':"anc_otv_date", 
    'Viral Load Lab Date_last_otv':"vl_otv_date", 'Is this the last OTV?':"last_otv", 'End of Treatment FIGO Staging':"post_trt_stage", 
    'End of treatment date ':"trt_end_date", 'Date treatment started ':"trt_start_date", 'Treatment type':"trt_type", 'No treatment':"no_trt", 
    'Total number of chemo cycles received':"total_chemo", 'Surgery':"surgery", 'Response to treatment':"trt_response", 
    'What is the  EQD2 result?':"eqd2", 'Brachytherapy Dose Received (cGy)':"brachy", 'EBRT Dose Received (cGy)':"ebrt", 
    'Date of Death':"death_date", 'Last seen date':"last_seen_date:", 'Vital Status':"vital_status"
    }, inplace=True)

print(merged_df.columns.to_list())

['patient_id', 'enroll_date', 'birth_date', 'marital_status', 'district', 'pre_trt_stage', 'path_response', 'path_response_other', 'path_sample_date', 'cancer_screen', 'diabetes_hist', 'tb_history', 'path_report_date', 'hiv_status', 'cd4_date', 'cd4_count', 'hiv_trt', 'art_hisory', 'perform_sts_consult', 'cr_consult', 'cr_consult_date', 'hb_consult', 'hb_consult_date', 'alb_consult', 'alb_consult_date', 'wbc_consult', 'wbc_consult_date', 'anc_count_consult', 'anc_pct_consult', 'anc_consult_date', 'vl_consult', 'vl_consult_date', 'perform_sts_otv', 'cr_otv', 'cr_otv_date', 'hb_otv', 'hb_otv_date', 'alb_otv', 'alb_otv_date', 'wbc_otv', 'wbc_otv_date', 'anc_count_otv', 'anc_pct_otv', 'anc_otv_date', 'vl_otv_date', 'last_otv', 'post_trt_stage', 'trt_end_date', 'trt_start_date', 'trt_type', 'no_trt', 'total_chemo', 'surgery', 'trt_response', 'eqd2', 'brachy', 'ebrt', 'death_date', 'last_seen_date:', 'vital_status']


# Filter patients for analysis

### Grab patients enrolled between 2015 and 2019
* Earliest enrollment date is in 2015

In [8]:
merged_df["enroll_date"] = pd.to_datetime(merged_df["enroll_date"])
pts_before_2020 = merged_df[merged_df["enroll_date"] < "2020-01-01"]

print(merged_df.shape) # all pts
print(merged_df[merged_df["enroll_date"].isnull()].shape) # no one is missing enrollment date
print(pts_before_2020.shape) # pts enrolled before 2020

(1530, 60)
(0, 60)
(949, 60)


### Grab patients stage IB2-IVB

In [9]:
# pts_before_2020[(pts_before_2020["pre_trt_stage"].isnull()) & (pts_before_2020["post_trt_stage"].isnull())][["patient_id"]].to_csv("../../../data/delays/stage.csv", index=False)

# merge missing stage data (from line above)
pre_post_stage_update_df = pd.read_csv("../../../data/delays/updates_to_data/stage.csv")
pre_post_stage_update_df= pre_post_stage_update_df[(~pre_post_stage_update_df["pre_trt_stage"].isnull()) |
                                                   (~pre_post_stage_update_df["post_trt_stage"].isnull())]
pts_before_2020 = pts_before_2020.merge(pre_post_stage_update_df[["patient_id", "pre_trt_stage", "post_trt_stage"]], how="left", on="patient_id")
pts_before_2020["pre_trt_stage"] = pts_before_2020["pre_trt_stage_x"].fillna(pts_before_2020["pre_trt_stage_y"])
pts_before_2020["post_trt_stage"] = pts_before_2020["post_trt_stage_x"].fillna(pts_before_2020["post_trt_stage_y"])
# pts_before_2020[pts_before_2020["patient_id"] == 5083][["patient_id", "pre_trt_stage", "post_trt_stage"]]
pts_before_2020.drop(columns=["pre_trt_stage_x", "pre_trt_stage_y", "post_trt_stage_x", "post_trt_stage_y"], inplace=True)

pts_pre_2020_local_adv = pts_before_2020[(pts_before_2020["pre_trt_stage"].isin(['Stage IVB', 'Stage IIIB', 'Stage IIB', 'Stage IIA', 
                                                                                 'Stage IVA', 'Stage IIIA', 'Stage IB2', 'Stage IB3', 
                                                                                 'Stage IIIC1'])) |
                                         (pts_before_2020["post_trt_stage"].isin(['Stage IIIB', 'Stage IIIA', 'Stage IIA', 'Stage IVA', 
                                                                                  'Stage IIB', 'Stage IB2', 'Stage IVB', 'Stage IB3',
                                                                                  'Stage IIIC1']))]

print(pts_before_2020.shape) # pts enrolled before 2020
print(pts_before_2020[(pts_before_2020["pre_trt_stage"].isnull()) & (pts_before_2020["post_trt_stage"].isnull())].shape) # pts missing both pre_trt_stage and post_trt_stage
print(pts_pre_2020_local_adv.shape) # pts enrolled before 2020 and staged IB2-IVB either pre or post trt



(949, 60)
(10, 60)
(818, 60)


# Look at distribution of treatment intent (Curative v. Definitive v. Palliative)

In [12]:
# before filtering for CRT pts
# last_otv_df = df[df["Is this the last OTV?"] == "Yes"]
# last_otv_df = last_otv_df.groupby("Record ID")[["Record ID", "Treatment type"]].first()
# print(last_otv_df.shape)
# print(last_otv_df[last_otv_df["Treatment type"].isnull()].shape)
# last_otv_df[last_otv_df["Treatment type"].isnull()]["Record ID"].to_excel("../../data/delays/missing_treatment_type_12-10.xlsx", index=False)

# last_otv_df["Treatment type"].value_counts()

# display(pts_pre_2020_local_adv["trt_type"].value_counts())

# merge missing trt type data (from lines above)
trt_type_update_df = pd.read_csv("../../../data/delays/updates_to_data/add_trt_type.csv")
trt_type_update_df = trt_type_update_df[~trt_type_update_df[" Treatment Type"].isnull()]
# trt_type_update_df.head()
pts_pre_2020_local_adv = pts_pre_2020_local_adv.merge(trt_type_update_df[["Record ID", " Treatment Type"]], how="left", left_on="patient_id", right_on="Record ID")
pts_pre_2020_local_adv["trt_type"] = pts_pre_2020_local_adv["trt_type"].fillna(pts_pre_2020_local_adv[" Treatment Type"])
pts_pre_2020_local_adv[pts_pre_2020_local_adv["patient_id"] == 34][["patient_id", "trt_type"]]


Unnamed: 0,patient_id,trt_type
14,34,Definitive


In [13]:
print(pts_pre_2020_local_adv.shape)
display(pts_pre_2020_local_adv["trt_type"].value_counts())

(818, 62)


Definitive    363
Curative      274
Palliative    119
Name: trt_type, dtype: int64