In [14]:
import pandas as pd

In [2]:
# Load the patient-level table that every later cell reads and modifies.
data = pd.read_csv(filepath_or_buffer="../data/data.csv")

In [7]:
# Show the chemo, dose, surgery and eqd2 fields for every patient flagged chemo-only.
# NOTE(review): this cell reads the `chemo_only` column, which is assigned in a cell
# further down (`data["chemo_only"] = 0`). On a fresh kernel it only works if the
# loaded CSV already contains that column; confirm the intended cell order.
data.loc[
    data["chemo_only"] == 1,
    [
        "patient_id",
        "total_chemo_received",
        "ebrt_curr_dose",
        "ebrt_boost_curr_dose",
        "brachy_curr_dose",
        "surgery",
        "eqd2",
    ],
]

Unnamed: 0,patient_id,total_chemo_received,ebrt_curr_dose,ebrt_boost_curr_dose,brachy_curr_dose,surgery,eqd2
303,710.0,8.0,0.0,0.0,0.0,0.0,0.0
456,5233.0,6.0,,,,,
460,5251.0,3.0,,,,,
525,5429.0,6.0,,,,,
573,5578.0,1.0,,,,0.0,
604,5654.0,6.0,,,,,
726,6051.0,6.0,,,,0.0,
776,6174.0,6.0,,,,0.0,
789,6234.0,6.0,,,,,
814,6279.0,4.0,,,,0.0,


### Identified errors with patient grouping for treatment

In [3]:
# Surface rows whose treatment-group flags disagree with the recorded treatment data.

# Case 1: flagged as both RT and CRT (original note: 1 patient, should be CRT only).
flagged_rt_and_crt = data["rt"].eq(1) & data["crt"].eq(1)
display(data.loc[
    flagged_rt_and_crt,
    ["patient_id", "rt", "eqd2", "crt", "total_chemo_received", "surgery", "treatment"],
])

# Patients with no treatment recorded at all: eqd2, surgery and chemo are all missing.
nothing_recorded = (
    data["eqd2"].isna()
    & data["surgery"].isna()
    & data["total_chemo_received"].isna()
)
no_trt_recorded = data.loc[nothing_recorded, ["patient_id", "treatment"]]

# Shared filter for cases 2 and 3: treatment == 0 even though the patient is not in
# the "nothing recorded" set.
untreated_with_record = (
    data["treatment"].eq(0)
    & ~data["patient_id"].isin(no_trt_recorded["patient_id"].tolist())
)
review_cols = ["patient_id", "eqd2", "surgery", "total_chemo_received", "treatment"]

# Case 2: positive eqd2 but treatment == 0 (original note: 1 patient, should be RT only
# and treatment).
display(data.loc[untreated_with_record & data["eqd2"].gt(0), review_cols])

# Case 3: chemo received but treatment == 0 (should be chemo only and treatment).
# NOTE(review): the original note says 12 patients; verify against the rows displayed.
display(data.loc[untreated_with_record & data["total_chemo_received"].gt(0), review_cols])

# Case 4: treatment == 1 but every existing group flag is 0 or missing
# (original note: 7 patients, should be surgery+RT).
group_flags = ["surgery_only", "rt", "crt", "primary_surgery_chemo", "primary_surgery_crt"]
treated_but_ungrouped = data["treatment"].eq(1)
for flag in group_flags:
    treated_but_ungrouped = treated_but_ungrouped & (data[flag].eq(0) | data[flag].isna())
display(data.loc[treated_but_ungrouped, ["patient_id", "eqd2", "surgery", "total_chemo_received"]])

Unnamed: 0,patient_id,rt,eqd2,crt,total_chemo_received,surgery,treatment
487,5326.0,1.0,59.1,1.0,1.0,0.0,1


Unnamed: 0,patient_id,eqd2,surgery,total_chemo_received,treatment
378,889.0,47.2,0.0,0.0,0


Unnamed: 0,patient_id,eqd2,surgery,total_chemo_received,treatment
303,710.0,0.0,0.0,8.0,0
456,5233.0,,,6.0,0
460,5251.0,,,3.0,0
525,5429.0,,,6.0,0
573,5578.0,,0.0,1.0,0
604,5654.0,,,6.0,0
726,6051.0,,0.0,6.0,0
776,6174.0,,0.0,6.0,0
789,6234.0,,,6.0,0
814,6279.0,,0.0,4.0,0


Unnamed: 0,patient_id,eqd2,surgery,total_chemo_received
103,238.0,68.3,1.0,0.0
503,5383.0,52.1,1.0,0.0
534,5467.0,90.8,1.0,0.0
634,5746.0,74.0,1.0,0.0
678,5861.0,69.0,1.0,0.0
778,6182.0,58.5,1.0,0.0
832,6331.0,50.6,1.0,0.0


### Correcting errors

In [4]:
# Create (or reset) the two new group indicator columns with 0 for every patient;
# the correction cell then sets them to 1 for the specific patients identified above.
for new_flag in ("chemo_only", "primary_surgery_rt"):
    data[new_flag] = 0

In [5]:
# Apply the manual corrections for the grouping errors identified above.
#
# Patient IDs are written as strings (e.g. '5326.0'), exactly as in the original
# literals. Each ID list is defined once so the statements that use it cannot drift apart.
CRT_ONLY_ID = '5326.0'   # flagged as both RT and CRT -> keep CRT only
RT_ONLY_ID = '889.0'     # eqd2 > 0 but treatment == 0 -> RT only, treated
CHEMO_ONLY_IDS = ['710.0', '5233.0', '5251.0', '5429.0', '5578.0', '5654.0', '6051.0', '6174.0',
                  '6234.0', '6279.0', '6844.0', '6846.0']   # chemo only, treated
SURGERY_RT_IDS = ['238.0', '5383.0', '5467.0', '5746.0', '5861.0', '6182.0', '6331.0']   # surgery + RT

# Compare on a string view of the IDs so the corrections still apply if pandas parsed
# the column as floats; string IDs are left unchanged by the conversion.
patient_ids = data["patient_id"].astype(str)

# A correction whose ID is absent would otherwise be a silent no-op, so report it.
expected_ids = {CRT_ONLY_ID, RT_ONLY_ID, *CHEMO_ONLY_IDS, *SURGERY_RT_IDS}
unmatched_ids = sorted(expected_ids - set(patient_ids))
if unmatched_ids:
    print(f"WARNING: no rows matched patient_id(s): {unmatched_ids}")

# CRT-only patient: clear the RT flag.
data.loc[patient_ids == CRT_ONLY_ID, "rt"] = 0

# RT-only patient: set the RT flag and mark as treated.
is_rt_only = patient_ids == RT_ONLY_ID
data.loc[is_rt_only, "rt"] = 1
data.loc[is_rt_only, "treatment"] = 1

# Chemo-only patients: set the new flag and mark as treated.
is_chemo_only = patient_ids.isin(CHEMO_ONLY_IDS)
data.loc[is_chemo_only, "chemo_only"] = 1
data.loc[is_chemo_only, "treatment"] = 1

# Surgery + RT patients: set the new flag (the identifying filter already required treatment == 1).
data.loc[patient_ids.isin(SURGERY_RT_IDS), "primary_surgery_rt"] = 1

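### Mutual exclusivity check

Patient counts for each treatment-group flag after the corrections. Rows where a flag is missing (NaN) are excluded from that flag's table, so some tables sum to fewer than the total number of rows.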

In [6]:
# (rows, columns) of the table after the corrections; the row count is the reference
# total for the per-flag patient counts in the following cells.
data.shape

(1043, 122)

In [7]:
# Number of non-null patient IDs at each value of `treatment`.
(
    data
    .groupby("treatment")["patient_id"]
    .count()
)

treatment
0     76
1    967
Name: patient_id, dtype: int64

In [8]:
# Number of non-null patient IDs at each value of `surgery_only`.
(
    data
    .groupby("surgery_only")["patient_id"]
    .count()
)

surgery_only
0.0    804
1.0     58
Name: patient_id, dtype: int64

In [9]:
# Number of non-null patient IDs at each value of `rt`.
(
    data
    .groupby("rt")["patient_id"]
    .count()
)

rt
0.0    157
1.0    341
Name: patient_id, dtype: int64

In [10]:
# Number of non-null patient IDs at each value of `crt`.
(
    data
    .groupby("crt")["patient_id"]
    .count()
)

crt
0.0    503
1.0    531
Name: patient_id, dtype: int64

In [11]:
# Number of non-null patient IDs at each value of the newly added `chemo_only` flag.
(
    data
    .groupby("chemo_only")["patient_id"]
    .count()
)

chemo_only
0    1031
1      12
Name: patient_id, dtype: int64

In [12]:
# Number of non-null patient IDs at each value of `primary_surgery_chemo`.
(
    data
    .groupby("primary_surgery_chemo")["patient_id"]
    .count()
)

primary_surgery_chemo
0    1035
1       8
Name: patient_id, dtype: int64

In [13]:
# Number of non-null patient IDs at each value of the newly added `primary_surgery_rt` flag.
(
    data
    .groupby("primary_surgery_rt")["patient_id"]
    .count()
)

primary_surgery_rt
0    1036
1       7
Name: patient_id, dtype: int64

In [14]:
# Number of non-null patient IDs at each value of `primary_surgery_crt`.
(
    data
    .groupby("primary_surgery_crt")["patient_id"]
    .count()
)

primary_surgery_crt
0    1033
1      10
Name: patient_id, dtype: int64

### Save data

In [15]:
# Persist the corrected table without writing the DataFrame index as a column.
# NOTE(review): this writes to the same path the notebook loads from, so the input file
# is overwritten and a later re-run starts from already-corrected data. Confirm this is
# intended, or keep a copy of the raw file.
data.to_csv(path_or_buf="../data/data.csv", index=False)