# Libraries

In [None]:
import pandas as pd
import numpy as np
import datetime

import warnings
# Silence every warning for the rest of the session. This also hides pandas
# warnings (e.g. SettingWithCopyWarning) raised by the cells below.
warnings.filterwarnings('ignore')

# Data

In [None]:
# Load the raw study export (Stata .dta file) into the working frame.
raw_data_path = "../../data/20170213 Cure complete tx-Stata 13.dta"
data = pd.read_stata(raw_data_path)

# Cleaning

In [None]:
# Load the 2020 follow-up sheet (updated follow-up / death dates) and de-duplicate it.
followup_col = "Last follow up date ( as in 2020)"
dates_2020 = pd.read_csv("../../data/2020_followup.csv")

# Rows for these study IDs, kept for inspection only; nothing below reads this frame.
# NOTE(review): presumably these are the patients with repeated rows that the
# positional drop underneath removes -- confirm against the CSV.
flagged_ids = ["BT160", "DP051", "DS165", "KB017", "MR117"]
flagged_rows = dates_2020[dates_2020["Study ID"].isin(flagged_ids)]

# Remove five rows by position, then drop any remaining exact duplicate rows.
rows_to_drop = dates_2020.index[[8, 12, 13, 28, 60]]
dates_2020 = dates_2020.drop(rows_to_drop).drop_duplicates()

# Where the 2020 follow-up date is missing, fall back to the death / last-seen date.
fallback_dates = dates_2020["Date of Death/Last seen in IPMS"]
dates_2020[followup_col] = dates_2020[followup_col].fillna(fallback_dates)

# dates_2020 lacks 39 patients; this sheet holds those 39 patients (without updates).
remaining_39 = pd.read_csv("../../data/extra_2020_followup.csv")

# Source column -> name used in the study data. Only these columns are kept.
followup_renames = {
    "Study ID": "id",
    "Vital Status": "vital_status",
    "Last follow up date ( as in 2020)": "death_followup",
}
combined_182_fp_updates = (
    pd.concat([dates_2020, remaining_39], ignore_index=True)
    .loc[:, list(followup_renames)]
    .rename(columns=followup_renames)
)
# Parse the follow-up / death date text into timestamps.
combined_182_fp_updates["death_followup"] = pd.to_datetime(combined_182_fp_updates["death_followup"])

# Attach vital status and follow-up date by patient id (default inner join:
# only ids present in both frames are kept), then drop exact duplicate rows.
data = data.merge(combined_182_fp_updates, on="id").drop_duplicates()

In [None]:
# Recode text labels as numeric codes, one column at a time.
factor_codes = {
    "xmarital": {
        "divorced/widowed": 2,
        "single": 3,
        "married/partnered": 1,
    },
    "hiv_status": {
        "positive": 1,
        "negative": 0,
        "not tested": np.nan,  # untested patients are treated as missing
    },
    "tumorresponse": {
        "complete": 1,
        "partial": 2,
        "no_tx": 4,
        "to be assessed on followup": 3,
        "not recorded": 5,
    },
}
for column, codes in factor_codes.items():
    for label, code in codes.items():
        data.loc[data[column] == label, column] = code

In [None]:
# Histology dates are stored as day counts from 1960-01-01; convert them to timestamps.
day_count_origin = pd.Timestamp('1960-01-01')
data["date_histology"] = pd.to_datetime(data["date_histology"], unit='D', origin=day_count_origin)

# Radiation start dates are text; a "." entry is treated as missing before parsing.
radiation_start_raw = data["date_radiation_begins"]
radiation_start = pd.to_datetime(radiation_start_raw.mask(radiation_start_raw == "."))
data["date_radiation_begins"] = radiation_start

# Keep the recorded start date untouched in its own column ...
data["actual_treat_start_date"] = radiation_start
# ... and fill missing treatment start dates with pathology date
data["date_radiation_begins"] = radiation_start.fillna(data["date_histology"])

In [None]:
# grab CD4 closest to trt start date
# For each HIV-positive patient, select the CD4 value measured closest in time
# (before or after) to the treatment start date and merge it in as "cd4_final".
cd4_id_columns = ["id", "date_radiation_begins", "date_histology"]
cd4_slots = range(1, 17)  # the source holds measurement slots 1..16

# Source columns come in (value, date) pairs: cd4_<k>, date_<k>_cd4.
cd4_source_columns = []
for k in cd4_slots:
    cd4_source_columns += ["cd4_{}".format(k), "date_{}_cd4".format(k)]

# Rename the date columns to the "<stub><k>" pattern that wide_to_long expects.
# The rename is not in-place, so the slice of `data` is never mutated.
cd4_date_renames = {"date_{}_cd4".format(k): "date_{}".format(k) for k in cd4_slots}
cd4 = data.loc[data["hiv_status"] == 1, cd4_id_columns + cd4_source_columns].rename(columns=cd4_date_renames)

# One row per (patient, slot); slots where both value and date are missing are dropped.
cd4 = pd.wide_to_long(cd4, ["date_", "cd4_"], i=cd4_id_columns, j="key").dropna(how="all").reset_index()
cd4["date_radiation_begins"] = cd4["date_radiation_begins"].fillna(cd4["date_histology"])
cd4["date_difference"] = abs((cd4["date_radiation_begins"] - cd4["date_"]).dt.days)
cd4 = cd4.sort_values(by=["id", "date_difference"])  # undated rows sort last within a patient

# Take the whole closest row that actually has a CD4 value. groupby(...).first()
# would instead pick the first non-null entry of each column independently, so the
# reported date could belong to a different measurement than the reported value.
cd4_to_merge = (
    cd4.dropna(subset=["cd4_"])
    .drop_duplicates(subset="id", keep="first")
    .loc[:, ["id", "cd4_", "date_"]]
    .rename(columns={"cd4_": "cd4_final"})
    .reset_index(drop=True)
)

# merge (left join: patients without a CD4 value keep NaN)
data = data.merge(cd4_to_merge, how="left", on="id")

In [None]:
# Drop one patient, excluded because both the radiation start date and the
# pathology date are missing.
excluded_patient_id = "LM188"
data = data.loc[data["id"].ne(excluded_patient_id)]

In [None]:
# grab VL closest to trt start date
# For each HIV-positive patient, select the viral-load value measured closest in
# time (before or after) to the treatment start date and merge it in as "vl_final".
vl_id_columns = ["id", "date_radiation_begins", "date_histology"]
vl_slots = range(1, 17)  # the source holds measurement slots 1..16

# Source columns come in (value, date) pairs. The first value column is spelled
# "VL_1" in the source data; the remaining ones are "vl_<k>".
vl_source_columns = []
for k in vl_slots:
    value_column = "VL_1" if k == 1 else "vl_{}".format(k)
    vl_source_columns += [value_column, "date_{}_vl".format(k)]

# Rename to the "<stub><k>" pattern that wide_to_long expects. "VL_1" must become
# "vl_1": stub matching is case-sensitive, so without this rename the first
# measurement is never melted into the "vl_" column and can never be selected.
vl_renames = {"date_{}_vl".format(k): "date_{}".format(k) for k in vl_slots}
vl_renames["VL_1"] = "vl_1"
vl = data.loc[data["hiv_status"] == 1, vl_id_columns + vl_source_columns].rename(columns=vl_renames)

# One row per (patient, slot); slots where both value and date are missing are dropped.
vl = pd.wide_to_long(vl, ["date_", "vl_"], i=vl_id_columns, j="key").dropna(how="all").reset_index()
vl["date_radiation_begins"] = vl["date_radiation_begins"].fillna(vl["date_histology"])
vl["date_difference"] = abs((vl["date_radiation_begins"] - vl["date_"]).dt.days)
vl = vl.sort_values(by=["id", "date_difference"])  # undated rows sort last within a patient

# Take the whole closest row that actually has a viral-load value. groupby(...).first()
# would instead pick the first non-null entry of each column independently, so the
# reported date could belong to a different measurement than the reported value.
vl_to_merge = (
    vl.dropna(subset=["vl_"])
    .drop_duplicates(subset="id", keep="first")
    .loc[:, ["id", "vl_", "date_"]]
    .rename(columns={"vl_": "vl_final"})
    .reset_index(drop=True)
)

# merge (left join: patients without a viral-load value keep NaN)
data = data.merge(vl_to_merge, how="left", on="id")

In [None]:
# Columns carried into the analysis dataset. The list order is the column order
# of the resulting frame.
analysis_columns = [
    "id", "hiv_status", "age", "xmarital", "distance",
    "_2_have_you_been_screened_for_c",
    "stage", "xstage",
    "vaginal_hemorrhage", "vaginal_discharge", "pelvic_pain",
    "cr_1st", "hb_1st", "neut_1st", "wbc_1st",
    "performance_baseline",
    "cd4_final", "vl_final", "on_arv",
    "No_chemo_cycles", "Total_received_ebrtfinal", "Total_received_Brachyfinal",
    "EQD2Dosefinal", "Total_received_boostfinal",
    "tx_duration", "tumorresponse",
    "vital_status", "death_followup", "date_radiation_begins",
    "combivir", "neviripine", "efavirenz", "truvada", "aluvia",
    "abacavir", "lamivuidine", "Stavudine", "on_tdf", "atripla",
    "cr_max_during_chemo", "hb_min_during_chemo",
    "wbc_min_during_chemo", "neut_min_during_chemo",
    "xmax_pelvic_pain", "xmax_vaginal_discharge", "xmax_fatigue",
    "xmax_weight_loss", "xmax_nausea", "xmax_vomiting",
    "xmax_urinary_frequency", "xmax_urinary_incontinence",
    "xmax_urinary_urgency", "xmax_diarrhoea", "xmax_dermatitis_radiation",
    "date_histology", "actual_treat_start_date",
]
data = data[analysis_columns]

# Rename columns to the names used by the main dataset (source name -> new name).
column_renames = {
    "id": "patient_id",
    "age": "enroll_age",
    "xmarital": "marital",
    "_2_have_you_been_screened_for_c": "cancer_screening",
    "stage": "init_cancer_stage_exact",
    "xstage": "combined_cancer_stage",
    "cr_1st": "cr_result",
    "hb_1st": "hb_result",
    "neut_1st": "neut_result",
    "wbc_1st": "wbc_result",
    "performance_baseline": "init_performance_status",
    "No_chemo_cycles": "total_chemo_received",
    "Total_received_ebrtfinal": "ebrt_curr_dose",
    "Total_received_Brachyfinal": "brachy_curr_dose",
    "EQD2Dosefinal": "eqd2",
    "Total_received_boostfinal": "ebrt_boost_curr_dose",
    "tx_duration": "treat_duration",
    "tumorresponse": "treat_response",
    "death_followup": "death_date",
    "date_radiation_begins": "treat_start_date",
    "neviripine": "nevirapine",
    "lamivuidine": "lamivudine",
    "Stavudine": "stavudine",
    "on_tdf": "tdf",
    "cr_max_during_chemo": "cr_result_tox",
    "hb_min_during_chemo": "hb_min_result_tox",
    "wbc_min_during_chemo": "wbc_min_result_tox",
    "neut_min_during_chemo": "neut_min_result_tox",
    "xmax_pelvic_pain": "pelvic_pain_tox",
    "xmax_vaginal_discharge": "vaginal_discharge_tox",
    "xmax_fatigue": "fatigue_tox",
    "xmax_weight_loss": "weight_loss_tox",
    "xmax_nausea": "nausea_tox",
    "xmax_vomiting": "vomiting_tox",
    "xmax_urinary_frequency": "urine_freq_tox",
    "xmax_urinary_incontinence": "urine_incontinence_tox",
    "xmax_urinary_urgency": "urine_urge_tox",
    "xmax_diarrhoea": "diarrhea_tox",
    "xmax_dermatitis_radiation": "dermatitis_tox",
    "date_histology": "pathology_date",
}
data = data.rename(columns=column_renames)

# The exact combined stage starts as a copy of the exact initial stage.
data["combined_cancer_stage_exact"] = data["init_cancer_stage_exact"]

# Bring the dose columns to common units by scaling each by 100.
# NOTE(review): presumably a Gy -> cGy conversion -- confirm against the main dataset.
for dose_column in ("ebrt_curr_dose", "ebrt_boost_curr_dose", "brachy_curr_dose"):
    data[dose_column] = data[dose_column] * 100

# Recode vital status: alive -> 0, dead -> 1. "Alive " (trailing space) also
# occurs in the sheet and is treated the same as "Alive".
is_alive = data["vital_status"].isin(["Alive", "Alive "])
data.loc[is_alive, "vital_status"] = 0
is_dead = data["vital_status"] == "Dead"
data.loc[is_dead, "vital_status"] = 1

In [None]:
# Recode the exact stage labels as numeric codes; an empty label becomes missing.
# Both 'iia1' and 'iia2' share code 7.
stage_codes = {
    '': np.nan,
    'ia1': 2,
    'ib1': 5,
    'ib2': 6,
    'iia1': 7,
    'iia2': 7,
    'iib': 8,
    'iiia': 9,
    'iiib': 10,
    'iva': 11,
}
for stage_label, stage_code in stage_codes.items():
    data.loc[data["combined_cancer_stage_exact"] == stage_label, "combined_cancer_stage_exact"] = stage_code

In [None]:
# Derive treatment indicator columns from chemo cycles and radiation dose (eqd2).
chemo_cycles = data["total_chemo_received"]
radiation_dose = data["eqd2"]

no_chemo = (chemo_cycles == 0) | chemo_cycles.isnull()
some_chemo = chemo_cycles > 0
no_radiation = (radiation_dose == 0) | radiation_dose.isnull()
some_radiation = radiation_dose > 0

# Surgery-related indicators are set to 0 for every row.
for surgery_flag in ("surgery", "primary_surgery_chemo", "primary_surgery_crt"):
    data[surgery_flag] = 0

# chemo: 0 for zero cycles, 1 for any cycles, missing when the cycle count is missing.
data["chemo"] = np.nan
data.loc[chemo_cycles == 0, "chemo"] = 0
data.loc[some_chemo, "chemo"] = 1

# crt: 1 only when both chemo and radiation were given, otherwise 0.
data["crt"] = np.nan
data.loc[no_chemo | no_radiation, "crt"] = 0
data.loc[some_chemo & some_radiation, "crt"] = 1

# rt: 0 without radiation, 1 for radiation without chemo.
# NOTE(review): rows with both chemo and radiation keep rt = NaN rather than 0 --
# confirm this matches the coding of the main dataset.
data["rt"] = np.nan
data.loc[no_radiation, "rt"] = 0
data.loc[some_radiation & no_chemo, "rt"] = 1

# treatment: 1 unless every treatment indicator above is 0.
data["treatment"] = 1
untreated = (
    (data["surgery"] == 0)
    & (data["primary_surgery_crt"] == 0)
    & (data["primary_surgery_chemo"] == 0)
    & (data["rt"] == 0)
    & (data["crt"] == 0)
)
data.loc[untreated, "treatment"] = 0

# Save cleaned data

In [None]:
# Write the cleaned dataset to disk, omitting the row index.
cleaned_data_path = "../../data/small_data.csv"
data.to_csv(cleaned_data_path, index=False)