# Libraries

In [None]:
import pandas as pd
import numpy as np
import datetime

import warnings
warnings.filterwarnings('ignore')

# Data

In [None]:
# Read the large and small datasets from CSV into separate frames.
large_data, small_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../../data/large_data.csv", "../../data/small_data.csv")
)

# Combine data

In [None]:
# Combine the large and small datasets into one frame
# Stack the two frames row-wise; ignore_index gives the result a fresh 0..n-1 index.
data = pd.concat((large_data, small_data), axis=0, ignore_index=True)

In [None]:
# Binary "received" indicators derived from dose columns: 0 when the recorded
# dose is exactly 0, 1 when it is positive. Rows matching neither mask (e.g. a
# missing dose) are left as NaN.
for dose_col, flag_col in (
    ("ebrt_curr_dose", "received_ebrt"),
    ("brachy_curr_dose", "received_brachy"),
    ("ebrt_boost_curr_dose", "received_ebrt_boost"),
):
    dose = data[dose_col]
    data.loc[dose == 0, flag_col] = 0
    data.loc[dose > 0, flag_col] = 1

# eqd2 may contain empty strings; blank them to NaN so the column can be cast
# to float before the same 0 / positive rule is applied.
blank_eqd2 = data["eqd2"] == ""
data.loc[blank_eqd2, "eqd2"] = np.nan
data["eqd2"] = data["eqd2"].astype(float)
eqd2 = data["eqd2"]
data.loc[eqd2 == 0, "received_eqd2"] = 0
data.loc[eqd2 > 0, "received_eqd2"] = 1

# Age category from enrollment age: 1 = 21-39, 2 = >39-59, 3 = >59-96.
# Ages outside 21-96 (or missing) get no category and stay NaN.
enroll_age = data["enroll_age"]
age_bands = {
    1: (enroll_age >= 21) & (enroll_age <= 39),
    2: (enroll_age > 39) & (enroll_age <= 59),
    3: (enroll_age > 59) & (enroll_age <= 96),
}
for age_category, in_band in age_bands.items():
    data.loc[in_band, "age_cat"] = age_category

In [None]:
# Normalise blank date strings to NaN, then list the patients missing a death
# date and/or a treatment start date (the treatment-based survival time cannot
# be computed directly for them).
for date_col in ("treat_start_date", "death_date"):
    data.loc[data[date_col] == "", date_col] = np.nan

missing_death_or_start = data["death_date"].isnull() | data["treat_start_date"].isnull()
print(list(data[missing_death_or_start]["patient_id"]))


In [None]:
# Distinct patient counts: overall (large and small data combined), then for
# rows with vital_status == 1 and rows with vital_status == 0.
patient_ids = data["patient_id"]
vital_status = data["vital_status"]

print(len(set(patient_ids)))
for status_code in (1, 0):
    print(len(set(patient_ids[vital_status == status_code])))

In [None]:
# Make sure the chemo count column is numeric (float) for later analysis.
total_chemo = data["total_chemo_received"]
data["total_chemo_received"] = total_chemo.astype(float)

In [None]:
# Survival durations in whole days, measured up to death_date (death/censor).
#
# Each date column is parsed exactly once. All durations are absolute values;
# previously only the primary durations were, while the fallbacks were signed.
death_dt = pd.to_datetime(data["death_date"])
treat_start_dt = pd.to_datetime(data["treat_start_date"])
pathology_dt = pd.to_datetime(data["pathology_date"])
enroll_dt = pd.to_datetime(data["enroll_date"])

days_since_treat_start = (death_dt - treat_start_dt).abs().dt.days
days_since_pathology = (death_dt - pathology_dt).abs().dt.days
days_since_enroll = (death_dt - enroll_dt).abs().dt.days

# time_alive_treat: measured from treatment start; if there is no treatment
# start date, fall back to the pathology date.
data["time_alive_treat"] = days_since_treat_start.fillna(days_since_pathology)

# time_alive_path: measured from the pathology date; if there is no pathology
# date, fall back to the enrollment date.
time_alive_path = days_since_pathology.fillna(days_since_enroll)

# If enrollment happened after treatment start, use the treatment-start based
# duration instead. The comparison is done on parsed datetimes (not the raw
# CSV strings, which would compare lexicographically); rows where either date
# is missing compare False and keep the pathology/enrollment based value.
# The treatment-start duration is held in a local Series rather than written
# to a scratch "time_alive_temp" column.
enrolled_after_treat_start = enroll_dt > treat_start_dt
data["time_alive_path"] = np.where(
    enrolled_after_treat_start, days_since_treat_start, time_alive_path
)

In [None]:
# Flag patients on TLD: either all three component drug columns equal 1, or
# the free-text ARV name is exactly "TLD". Other rows are left unchanged.
# NOTE(review): this reads a `tenofovir` column while the ARV grouping cell
# reads `tdf` — confirm both columns exist and which one is intended.
all_tld_components = (
    (data["tenofovir"] == 1)
    & (data["lamivudine"] == 1)
    & (data["dolutegravir"] == 1)
)
named_tld = data["other_arv_name"] == "TLD"

data.loc[all_tld_components | named_tld, "tld"] = 1

In [None]:
# Binary indicators for symptoms at presentation: code 0 -> 0, any listed
# non-zero code -> 1. Missing values and codes not listed here stay NaN.
# Each entry is (source column, indicator column, codes treated as "present").
symptom_specs = (
    ("vaginal_hemorrhage", "vaginal_hemorrhage_symp", (1, 2)),  # observed codes {0,1,2}
    ("vaginal_discharge", "vaginal_discharge_symp", (1, 2)),    # observed codes {0,1,2}
    ("pelvic_pain", "pelvic_pain_symp", (1, 2, 3)),             # observed codes {0,1,2,3}
)

for source_col, flag_col, present_codes in symptom_specs:
    symptom_code = data[source_col]
    data[flag_col] = np.nan
    data.loc[symptom_code == 0, flag_col] = 0
    for code in present_codes:
        data.loc[symptom_code == code, flag_col] = 1

In [None]:
# ARV class indicators: each class column starts at 0 and is set to 1 when any
# of its member drug/combination columns equals 1.
#
# TODO: confirm which category dolutegravir belongs to; it is not assigned to
# any class here.
# NOTE(review): dolutegravir is generally classed as an integrase inhibitor,
# for which no indicator column is created — confirm whether one is needed.
arv_class_members = {
    "nrti": [
        "abacavir", "azt", "tdf", "lamivudine", "stavudine", "emtricitabine",
        "didanosine", "atripla", "combivir", "truvada", "tld",
    ],
    "nnrti": ["nevirapine", "efavirenz", "atripla"],
    "pi": [
        "atazanavir_boosted", "atazanavir", "amprenavir", "darunavir",
        "darunavir_boosted", "nelfinavir", "tipranavir_boosted", "tipranavir",
    ],
}

for class_col, member_cols in arv_class_members.items():
    data[class_col] = 0
    on_any_member = data[member_cols].eq(1).any(axis=1)
    data.loc[on_any_member, class_col] = 1

In [None]:
# Add toxicity grades based on formulas and ranges (LLNs and ULNs)


def _grade_below_cutoffs(values, cutoffs):
    """Grade a lab value that becomes more toxic as it falls.

    Parameters
    ----------
    values : pd.Series
        Numeric measurements to grade.
    cutoffs : sequence of float
        Strictly descending thresholds. ``cutoffs[0]`` is the lower limit of
        normal; every further threshold marks the start of the next grade.

    Returns
    -------
    pd.Series
        Float grades on the index of ``values``: 0.0 where
        ``values >= cutoffs[0]``, grade ``k`` where
        ``cutoffs[k] <= values < cutoffs[k - 1]``, and ``len(cutoffs)`` where
        ``values < cutoffs[-1]``. Missing measurements stay NaN.
    """
    grades = pd.Series(np.nan, index=values.index)
    grades[values >= cutoffs[0]] = 0.0
    # Cutoffs are descending, so each pass overwrites the previous (milder)
    # grade for the values that also fall below the next threshold.
    for grade, cutoff in enumerate(cutoffs, start=1):
        grades[values < cutoff] = float(grade)
    return grades


# Cr: graded against a fixed upper limit of 97 and against each patient's own
# cr_result (presumably the baseline value — TODO confirm). Assignments run
# from mild to severe, so a later (higher) grade overrides an earlier one.
# NOTE(review): the cr_result-relative grade 3 band is capped at 6.0x, so a
# value above 6.0x cr_result but within the fixed-limit bands keeps its lower
# grade — confirm this matches the intended grading scheme.
CR_ULN = 97
cr_tox = data["cr_result_tox"]
cr_reference = data["cr_result"]

data["cr_tox_grade"] = np.nan
data.loc[cr_tox <= CR_ULN, "cr_tox_grade"] = 0.0
data.loc[(cr_tox > CR_ULN) & (cr_tox <= 1.5 * CR_ULN), "cr_tox_grade"] = 1.0
data.loc[(cr_tox > 1.5 * cr_reference) & (cr_tox <= 3.0 * cr_reference), "cr_tox_grade"] = 2.0
data.loc[(cr_tox > 1.5 * CR_ULN) & (cr_tox <= 3.0 * CR_ULN), "cr_tox_grade"] = 2.0
data.loc[(cr_tox > 3.0 * cr_reference) & (cr_tox <= 6.0 * cr_reference), "cr_tox_grade"] = 3.0
data.loc[(cr_tox > 3.0 * CR_ULN) & (cr_tox <= 6.0 * CR_ULN), "cr_tox_grade"] = 3.0
data.loc[cr_tox > 6.0 * CR_ULN, "cr_tox_grade"] = 4.0

# Hb: grade 0 at >= 12.4, then 1 / 2 / 3 below 12.4 / 10.0 / 8.0.
data["hb_tox_grade"] = _grade_below_cutoffs(data["hb_min_result_tox"], [12.4, 10.0, 8.0])

# ANC: grade 0 at >= 2.0, then 1-4 below 2.0 / 1.5 / 1.0 / 0.5.
data["neut_tox_grade"] = _grade_below_cutoffs(data["neut_min_result_tox"], [2.0, 1.5, 1.0, 0.5])

# WBC: grade 0 at >= 4.0, then 1-4 below 4.0 / 3.0 / 2.0 / 1.0.
data["wbc_tox_grade"] = _grade_below_cutoffs(data["wbc_min_result_tox"], [4.0, 3.0, 2.0, 1.0])

# Albumin: grade 0 at >= 35.0, then 1 / 2 / 3 below 35.0 / 30.0 / 20.0.
# Graded from alb_min_result_tox, matching the other toxicity grades; it was
# previously computed from the alb_result column instead.
data["alb_tox_grade"] = _grade_below_cutoffs(data["alb_min_result_tox"], [35.0, 30.0, 20.0])

In [None]:
# Keep only the columns needed for analysis. The groups below exist purely for
# readability; their concatenation order is the column order of the result.
patient_and_treatment_columns = [
    "patient_id", "enroll_age", "age_cat", "hiv_status", "marital", "distance",
    "cancer_screening", "combined_cancer_stage", "combined_cancer_stage_exact",
    "cr_result", "hb_result", "neut_result", "wbc_result", "alb_result",
    "init_performance_status", "chemo", "total_chemo_received",
    "ebrt_curr_dose", "brachy_curr_dose", "eqd2", "ebrt_boost_curr_dose",
    "treat_duration", "treat_response", "vital_status",
    "death_date", "treat_start_date", "vaginal_hemorrhage", "vaginal_discharge",
    "pelvic_pain", "cd4_final", "vl_final",
]

arv_columns = [
    "nnrti", "nrti", "pi",
    "on_arv", "lamivudine", "abacavir", "amprenavir", "azt",
    "atazanavir", "atazanavir_boosted", "cobicistat", "stavudine",
    "zalcitabine", "didanosine", "delavirdine", "darunavir",
    "darunavir_boosted", "dolutegravir", "efavirenz", "etravirine",
    "elvitegravir", "fosamprenavir", "fosamprenavir_boosted",
    "emtricitabine", "gazt", "indinavir", "lopinavir",
    "lopinavir_boosted", "maraviroc", "nelfinavir", "nevirapine",
    "raltegravir", "rilpivirine", "ritonavir", "saquinavir",
    "saquinavir_boosted", "enfuvirtide", "tdf", "tipranavir",
    "tipranavir_boosted", "zidovudine", "atripla", "aluvia",
    "combivir", "truvada", "tld", "other_arv",
]

toxicity_columns = [
    "cr_result_tox", "cr_tox_grade", "pelvic_pain_tox", "vaginal_discharge_tox",
    "vaginal_hemorrhage_tox", "fatigue_tox", "weight_loss_tox",
    "nausea_tox", "vomiting_tox", "urine_freq_tox", "urine_incontinence_tox",
    "urine_urge_tox", "diarrhea_tox", "dermatitis_tox",
    "hb_min_result_tox", "hb_tox_grade", "wbc_min_result_tox", "wbc_tox_grade",
    "neut_min_result_tox", "neut_tox_grade", "alb_min_result_tox", "alb_tox_grade",
]

derived_and_staging_columns = [
    "vaginal_hemorrhage_symp", "vaginal_discharge_symp", "pelvic_pain_symp",
    "time_alive_treat", "time_alive_path", "received_eqd2",
    "surgery", "surgery_only", "rt", "crt",
    "primary_surgery_chemo", "primary_surgery_crt", "treatment",
    "pathology_date", "actual_treat_start_date",
    "init_cancer_stage", "init_cancer_stage_exact",
    "final_cancer_stage", "final_cancer_stage_exact",
]

analysis_columns = (
    patient_and_treatment_columns
    + arv_columns
    + toxicity_columns
    + derived_and_staging_columns
)
data = data[analysis_columns]


# Save merged data

In [None]:
# Write the merged analysis frame to disk. The DataFrame index is written as
# the first (unnamed) column because to_csv's default index=True is used.
merged_csv_path = "../../data/data.csv"
data.to_csv(merged_csv_path)