# Libraries

In [37]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# Data

In [38]:
# Load the raw export. The preview below shows several event rows
# (Consult, OTV_n, follow-ups) for the same Record ID.
RAW_EXPORT_CSV = "../../data/delays/data_12-10.csv"
df = pd.read_csv(RAW_EXPORT_CSV)
df.head()

Unnamed: 0,Record ID,Event Name,Repeat Instrument,Repeat Instance,Date of Visit,Patient Study ID,Date Of Birth,Marital Status,Country District,Physical address (village/ward),...,Vital Status,Date of Death,Cause of Death,Date of Last Follow Up,How was patient reached?,"Other, specify",Follow Up Weight (kg),Performance status,Brachytherapy Dose Received (cGy),EBRT Dose Received (cGy)
0,1,Consult,,,2015-01-28,1.0,1965-01-01,Divorced,Kweneng West District,Letlhakeng(Goomodimo ward),...,,,,,,,,,,
1,1,OTV_1,,,,,,,,,...,,,,,,,,,,2200.0
2,1,OTV_3,,,,,,,,,...,,,,,,,,,600.0,4200.0
3,1,OTV_4,,,,,,,,,...,,,,,,,,,2100.0,5000.0
4,1,3 Month Follow-Up,,,,,,,,,...,,,,,,,,,,


# Grab patients enrolled between 2013 and 2019 (Consult visit before 2020-01-01; the code applies no lower date bound)

In [39]:
# Parse the visit dates once so they can be compared against a cut-off date.
df["Date of Visit"] = pd.to_datetime(df["Date of Visit"])

# A patient is kept when at least one of her Consult rows has a visit date
# strictly before 2020-01-01.
is_consult = df["Event Name"] == "Consult"
visited_before_2020 = df["Date of Visit"] < "2020-01-01"
pts_before_2020 = df.loc[is_consult & visited_before_2020, "Record ID"].unique().tolist()
print(len(pts_before_2020))

# Keep every row (all events) that belongs to those patients.
df = df[df["Record ID"].isin(pts_before_2020)]

949


# Grab patients stage IB2-IVB

In [43]:
# FIGO stages IB2 and above. The same set of stages is applied to the staging
# recorded on the Consult row and to the staging recorded on the last OTV row.
STAGES_IB2_AND_ABOVE = ['Stage IB2', 'Stage IB3', 'Stage IIA', 'Stage IIB',
                        'Stage IIIA', 'Stage IIIB', 'Stage IIIC1',
                        'Stage IVA', 'Stage IVB']

is_consult = df["Event Name"] == "Consult"
is_last_otv = df["Is this the last OTV?"] == "Yes"

# Patients whose initial (Consult) stage qualifies.
pts_init_stage_ib2_above = df.loc[
    is_consult & df["FIGO Cervical Stage"].isin(STAGES_IB2_AND_ABOVE), "Record ID"
].unique().tolist()

# Patients whose end-of-treatment stage (on a row flagged as last OTV) qualifies.
pts_final_stage_ib2_above = df.loc[
    is_last_otv & df["End of Treatment FIGO Staging"].isin(STAGES_IB2_AND_ABOVE), "Record ID"
].unique().tolist()

# A patient is kept when either staging qualifies.
pts_stage_ib2_above = set(pts_init_stage_ib2_above) | set(pts_final_stage_ib2_above)
print(len(pts_stage_ib2_above))
df = df[df["Record ID"].isin(pts_stage_ib2_above)]


816


# Grab patients who received CRT (chemoradiation)

In [52]:
# missing values for chemo or EQD2
# The one-row-per-patient last-OTV summary is built here instead of reading a
# `last_otv_df` left in the kernel by another cell, so this cell also runs in
# top-to-bottom order on a fresh kernel.
chemo_col = "Total number of chemo cycles received"
eqd2_col = "What is the  EQD2 result?"
last_otv_check_df = (
    df[df["Is this the last OTV?"] == "Yes"]
    .groupby("Record ID")[["Record ID", chemo_col, eqd2_col]]
    .first()
)
print(last_otv_check_df[last_otv_check_df[chemo_col].isnull()].shape)
print(last_otv_check_df[last_otv_check_df[eqd2_col].isnull()].shape)


(106, 3)
(69, 3)


In [50]:
chemo_col = "Total number of chemo cycles received"
eqd2_col = "What is the  EQD2 result?"

# Collapse the rows flagged as last OTV to one row per patient. `first()`
# keeps, per column, the first non-null value found within each patient.
last_otv_df = (
    df[df["Is this the last OTV?"] == "Yes"]
    .groupby("Record ID")[["Record ID", chemo_col, eqd2_col]]
    .first()
)
last_otv_df[eqd2_col] = last_otv_df[eqd2_col].astype(float)

# Free-text answers accepted as "received chemotherapy".
# NOTE(review): '8' and '10' are accepted in the treatment-combination counts
# further down but not here — confirm whether they should be.
chemo_received_answers = ['4', '2', '1', '3', '5', '6',
                          '4 cycles neoadjuvant in Serowe', '4 cycles',
                          '7']
received_chemo = last_otv_df[chemo_col].isin(chemo_received_answers)
received_radiation = last_otv_df[eqd2_col] > 0

# CRT = chemotherapy answer in the accepted list AND a positive EQD2.
pts_crt = last_otv_df.loc[received_chemo & received_radiation, "Record ID"].unique().tolist()
print(len(pts_crt))

df = df[df["Record ID"].isin(pts_crt)]

# Look at distribution of treatment intent (Curative v. Definitive v. Palliative)

In [55]:
# before filtering for CRT pts
# NOTE(review): this cell sits below the cell that narrows `df` to the CRT
# patients, so on a top-to-bottom run `df` is already CRT-only here and the
# "before" counts cannot be reproduced — confirm the intended cell order.
# Rows flagged as the last OTV, collapsed to one row per patient below.
last_otv_df = df[df["Is this the last OTV?"] == "Yes"]
last_otv_df = last_otv_df.groupby("Record ID")[["Record ID", "Treatment type"]].first()
# Number of patients, then number of patients with no treatment type recorded.
print(last_otv_df.shape)
print(last_otv_df[last_otv_df["Treatment type"].isnull()].shape)
# last_otv_df[last_otv_df["Treatment type"].isnull()]["Record ID"].to_excel("../../data/delays/missing_treatment_type_12-10.xlsx", index=False)

# Distribution of treatment intent (missing values are not counted).
last_otv_df["Treatment type"].value_counts()

(812, 2)
(81, 2)


Definitive    346
Curative      268
Palliative    117
Name: Treatment type, dtype: int64

In [24]:
# after filtering for CRT pts
last_otv_df = df[df["Is this the last OTV?"] == "Yes"]

# Restrict to the CRT cohort, then keep one row per patient; patients with
# more than one row flagged as last OTV contribute their first non-null value.
in_crt_cohort = last_otv_df["Record ID"].isin(pts_crt)
last_otv_crt_df = (
    last_otv_df[in_crt_cohort]
    .groupby("Record ID")[["Record ID", "Treatment type"]]
    .first()
)
last_otv_crt_df["Treatment type"].value_counts()
## 351 / 367 CRT patients with definitive or curative trt intent
## NOTE(review): the displayed counts give 219 + 127 = 346 curative/definitive;
## 351 also includes the 5 palliative patients — confirm which figure is meant.


Curative      219
Definitive    127
Palliative      5
Name: Treatment type, dtype: int64

# Initial insights

In [5]:
# how many patients total?
print(df["Record ID"].nunique())

## when were they enrolled? (earliest and latest Consult visit date)
consult_visit_dates = pd.to_datetime(df.loc[df["Event Name"] == "Consult", "Date of Visit"])
print(consult_visit_dates.min())
print(consult_visit_dates.max())

# what types of events?
event_names = df["Event Name"].unique()
print(event_names)

# column names
column_names = df.columns.values
print(column_names)

719
2015-01-28 00:00:00
2019-12-18 00:00:00
['Consult' 'OTV_1' 'OTV_3' 'OTV_4' '3 Month Follow-Up'
 '15 Months Follow-Up' 'OTV_2' '21 Months Follow-Up' '27 Months Follow-Up'
 '3 Year Follow-Up' '42 Months Follow-Up' '45 Months Follow-up'
 '4 Year Follow-Up' '54 Months Follow-Up' '5 Year Follow-Up'
 '66 Months Follow-Up' '75 Months Follow-Up' 'OTV_5' 'OTV_6'
 '18 Months Follow-Up' '2 Year Follow-Up' '30 Months Follow-Up'
 '33 Months Follow-Up' '39 Months Follow-Up' '51 Months Follow-Up'
 '57 Months Follow-Up' '87 Months Follow-Up' '78 Months Follow-Up' 'OTV_7'
 '6 Month Follow-Up' '69 Months Follow-Up' '63 Months Follow-up'
 '1 Year Follow-Up' '9 Month Follow-Up' 'End of Treatment'
 '6 Year Follow-Up' 'OTV_8' 'Cancer Care Delay & Factors']
['Record ID' 'Event Name' 'Repeat Instrument' 'Repeat Instance'
 'Date of Visit' 'Patient Study ID' 'Date Of Birth' 'Marital Status '
 'Country District' 'Physical address (village/ward)' 'Village'
 'Consulting clinic' 'Cancer Diagnosis Site' 'FIGO Ce

In [6]:
# peek at one patient: every row recorded for Record ID 9
df.loc[df["Record ID"].eq(9)]

Unnamed: 0,Record ID,Event Name,Repeat Instrument,Repeat Instance,Date of Visit,Patient Study ID,Date Of Birth,Marital Status,Country District,Physical address (village/ward),...,Vital Status,Date of Death,Cause of Death,Date of Last Follow Up,How was patient reached?,"Other, specify",Follow Up Weight (kg),Performance status,Brachytherapy Dose Received (cGy),EBRT Dose Received (cGy)
31,9,Consult,,,2015-02-09,9.0,1950-01-01,Single,Kweneng East District,Lentsweletau(Manato ward),...,,,,,,,,,,
32,9,OTV_3,,,NaT,,,,,,...,,,,,,,,,1400.0,3400.0
33,9,OTV_6,,,NaT,,,,,,...,,,,,,,,,2100.0,5000.0
34,9,15 Months Follow-Up,Follow Up,1.0,NaT,,,,,,...,Alive,,,,Primary contact,,,90: Able to carry on normal activity; minor si...,,
35,9,18 Months Follow-Up,Follow Up,1.0,NaT,,,,,,...,Alive,,,,Primary contact,,,60: Requires occasional assistance; but is abl...,,
36,9,21 Months Follow-Up,Follow Up,1.0,NaT,,,,,,...,Alive,,,,Primary contact,,,90: Able to carry on normal activity; minor si...,,
37,9,21 Months Follow-Up,Follow Up,2.0,NaT,,,,,,...,Alive,,,,Consulting clinic,,,90: Able to carry on normal activity; minor si...,,
38,9,2 Year Follow-Up,Follow Up,1.0,NaT,,,,,,...,Alive,,,,,,,90: Able to carry on normal activity; minor si...,,
39,9,30 Months Follow-Up,Follow Up,1.0,NaT,,,,,,...,Alive,,,,Primary contact,,,60: Requires occasional assistance; but is abl...,,
40,9,33 Months Follow-Up,Follow Up,1.0,NaT,,,,,,...,Alive,,,,Primary contact,,,90: Able to carry on normal activity; minor si...,,


# Split data

In [7]:
## how many received radiation at some point
# A row counts as radiation when its EQD2 result is present and not zero.
eqd2_col = "What is the  EQD2 result?"
has_nonzero_eqd2 = df[eqd2_col].notnull() & (df[eqd2_col] != 0)
radiation_df = df.loc[has_nonzero_eqd2, ["Record ID", "Event Name", eqd2_col]]
print(radiation_df["Record ID"].nunique())

## death data
# One row per patient with a "Dead" vital status. Rows are ordered by
# Record ID and Date of Death; `first()` then keeps, per column, the first
# non-null value within each patient.
death_cols = ["Event Name", "Vital Status", "Date of Death"]
dead_rows = df.loc[df["Vital Status"] == "Dead", ["Record ID"] + death_cols]
death_df = (
    dead_rows
    .sort_values(["Record ID", "Date of Death"])
    .groupby(["Record ID"], sort=False)[death_cols]
    .first()
    .reset_index()
)

## remove radiation subjects whose death is recorded on the End of Treatment event
died_at_eot_ids = death_df.loc[death_df["Event Name"] == "End of Treatment", "Record ID"]
print(died_at_eot_ids.nunique())
radiation_df = radiation_df[~radiation_df["Record ID"].isin(died_at_eot_ids.unique())]

## how many completed trt
# Two sources for the end-of-treatment date of a treated patient:
#   1. the "End of treatment date " column (first distinct non-null value), and
#   2. "Date.2" on her "End of Treatment" event row.
treated_ids = radiation_df["Record ID"].unique()
is_treated = df["Record ID"].isin(treated_ids)

end_trt_df = (
    df.loc[is_treated, ["Record ID", "End of treatment date "]]
    .dropna()
    .drop_duplicates()
    .groupby("Record ID")
    .first()
    .reset_index()
)
end_trt_df2 = df.loc[(df["Event Name"] == "End of Treatment") & is_treated, ["Record ID", "Date.2"]]

# Outer merge so a patient with only one of the two sources is still kept.
end_trt_df = end_trt_df.merge(end_trt_df2, how="outer", on="Record ID")
end_trt_df["End of treatment date "] = pd.to_datetime(end_trt_df["End of treatment date "])
end_trt_df["Date.2"] = pd.to_datetime(end_trt_df["Date.2"])

# Earliest of the two dates. DataFrame.min skips missing values, so a patient
# with only one date keeps it; the built-in min() returns NaT whenever the
# first value of the row is NaT, because every comparison with NaT is False.
end_trt_df["date_min"] = end_trt_df[["End of treatment date ", "Date.2"]].min(axis=1)
end_trt_df = (
    end_trt_df
    .drop(columns=["End of treatment date ", "Date.2"])
    .rename(columns={"date_min": "End of treatment date "})
)

# Radiation rows of patients for whom no end-of-treatment record was found.
treated_without_eot = ~radiation_df["Record ID"].isin(end_trt_df["Record ID"].unique())
display(radiation_df[treated_without_eot])
## checking with Mma. Monare

eot_date = end_trt_df["End of treatment date "]

## how many completed trt by 1/1/22 (end-of-treatment date strictly before 2022-01-01)
eot_b2022 = end_trt_df[eot_date < "2022-01-01"]
print(eot_b2022.shape)
print(eot_b2022["Record ID"].nunique())

## how many completed trt by 1/1/20 (end-of-treatment date strictly before 2020-01-01)
eot_b2020 = end_trt_df[eot_date < "2020-01-01"]
print(eot_b2020.shape)
print(eot_b2020["Record ID"].nunique())

## proceed with these groups: all rows of `df` for each cohort
a1_patient_ids = eot_b2022["Record ID"].unique()
a1_df = df[df["Record ID"].isin(a1_patient_ids)]
a2_patient_ids = eot_b2020["Record ID"].unique()
a2_df = df[df["Record ID"].isin(a2_patient_ids)]

## grab follow up data -- follow-up, drop empty rows, non-dead
FOLLOW_UP_TYPES = ["Office Visit", "Phone Call", "IPMS"]
FOLLOW_UP_DETAIL_COLS = ['Repeat Instrument', 'Repeat Instance', 'Follow Up Type',
                         'Vital Status', 'How was patient reached?',
                         'Follow Up Weight (kg)', 'Performance status']


def _select_follow_up_rows(cohort_df):
    """Return the follow-up rows of `df` for the patients present in `cohort_df`.

    A row is kept when its event name contains "Follow-Up" or "Follow-up",
    it has a repeat instance, its vital status is not "Dead", and its
    follow-up type is one of FOLLOW_UP_TYPES. The raw
    "End of treatment date " column is dropped from the result.
    """
    event_name = df["Event Name"]
    is_follow_up_event = event_name.str.contains("Follow-Up") | event_name.str.contains("Follow-up")
    keep = (
        is_follow_up_event
        & df["Repeat Instance"].notnull()
        & (df["Vital Status"] != "Dead")
        & df["Record ID"].isin(cohort_df["Record ID"].unique())
        & df["Follow Up Type"].isin(FOLLOW_UP_TYPES)
    )
    return df[keep].drop(columns=["End of treatment date "])


def _attach_eot_and_prune(follow_up_df):
    """Attach each patient's end-of-treatment date, then drop empty columns and rows.

    Columns that are entirely missing are removed first; rows are removed
    when every column in FOLLOW_UP_DETAIL_COLS is missing.
    """
    merged = follow_up_df.merge(end_trt_df, how="left", on="Record ID")
    merged = merged.dropna(axis=1, how="all")
    return merged.dropna(axis=0, how="all", subset=FOLLOW_UP_DETAIL_COLS)


a1_follow_up_df = _select_follow_up_rows(a1_df)
print(a1_follow_up_df["Record ID"].nunique())

a2_follow_up_df = _select_follow_up_rows(a2_df)

a1_follow_up_df = _attach_eot_and_prune(a1_follow_up_df)
print(a1_follow_up_df.shape) # some/all patients have more than one row

a2_follow_up_df = _attach_eot_and_prune(a2_follow_up_df)
print(a2_follow_up_df.shape) # some/all patients have more than one row

# demographic data: the Consult rows of patients who finished treatment before 2022.
# `.copy()` gives an independent frame so the columns added below are not
# written onto a slice of `df`.
consult_df = df[(df["Event Name"] == "Consult") &
                (df["Record ID"].isin(eot_b2022["Record ID"].unique()))].copy()

## add enrollment age (years between date of birth and the Consult visit)
# One year is taken as 365.2425 days, the length numpy uses for its "Y" unit;
# an explicit Timedelta is used because recent pandas versions reject the
# ambiguous "Y" unit.
DAYS_PER_YEAR = 365.2425
age_span = pd.to_datetime(consult_df["Date of Visit"]) - pd.to_datetime(consult_df["Date Of Birth"])
consult_df["age"] = age_span / pd.Timedelta(days=DAYS_PER_YEAR)
print(consult_df.shape)
print(consult_df["Record ID"].nunique())

## add column for analysis groups (1 = patient belongs to the cohort, else 0)
consult_df["analysis1"] = consult_df["Record ID"].isin(a1_df["Record ID"].unique()).astype(int)
consult_df["analysis2"] = consult_df["Record ID"].isin(a2_df["Record ID"].unique()).astype(int)



# last OTV data
# Rows flagged as the last OTV, restricted to patients who finished treatment
# before 2022.
last_otv_df = df[(df["Is this the last OTV?"] == "Yes") & 
                 (df["Record ID"].isin(eot_b2022["Record ID"].unique()))]

# Split patients into those with more than one flagged row and those with exactly one.
lotv_dups = last_otv_df[last_otv_df["Record ID"].isin(last_otv_df[last_otv_df["Record ID"].duplicated() == True]["Record ID"].unique().tolist())]
lotv_non_dups = last_otv_df[~last_otv_df["Record ID"].isin(lotv_dups["Record ID"].unique())]

# OTV number = last character of the event name (e.g. "OTV_7" -> 7).
# NOTE(review): only the final character is read, so a two-digit OTV number
# would be truncated — confirm none exist.
lotv_dups["otv_num"] = lotv_dups["Event Name"].str[-1].astype(int)
# NOTE(review): the sort is ascending, so `first()` prefers values from the
# lowest-numbered flagged OTV (and fills gaps from later rows, column by
# column) — confirm that the earliest rather than the latest row is intended.
lotv_dups = lotv_dups.sort_values(["Record ID", "otv_num"]).groupby("Record ID", sort=False).first().reset_index()

# Recombine into one row per patient, using the original column set.
last_otv_df = pd.concat([lotv_non_dups, lotv_dups[lotv_non_dups.columns]], axis=0, ignore_index=True)
print(last_otv_df.shape)
print(last_otv_df["Record ID"].nunique())


964
128


Unnamed: 0,Record ID,Event Name,What is the EQD2 result?
124,19,OTV_7,58.0
6996,5677,OTV_1,12.0


(852, 2)
852
(722, 2)
722
726
(4332, 13)
(3917, 13)
(852, 114)
852
(852, 113)
852


In [None]:
# Sanity check: every patient in the pre-2020 cohort (a2) is also in the
# pre-2022 cohort (a1). This tests membership itself, not the truthiness of
# the Record ID values.
set(a2_df["Record ID"].unique()).issubset(a1_df["Record ID"].unique())

In [None]:
# drop any completely empty columns
consult_df = consult_df.dropna(axis=1, how="all")
print(consult_df.shape) # one patient per row
last_otv_df = last_otv_df.dropna(axis=1, how="all")
print(last_otv_df.shape)


In [None]:
# For each cohort frame: number of distinct patients, then (rows, columns).
for cohort_frame in (a1_df, a2_df, a1_follow_up_df, a2_follow_up_df):
    print(cohort_frame["Record ID"].nunique())
    print(cohort_frame.shape)

# Demographics

In [None]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# age
display(consult_df[consult_df["age"] >= 21]["age"].describe()) # filter error patient

# Age is fractional (years), so the categories are half-open intervals:
# [21, 40) -> "21-39", [40, 60) -> "40-59", [60, inf) -> "60+".
# This leaves no gap between 59 and 60 and keeps a 39.x-year-old in "21-39".
# Ages below 21 (and missing ages) stay uncategorised.
consult_df["age_cat"] = pd.cut(
    consult_df["age"],
    bins=[21, 40, 60, np.inf],
    right=False,
    labels=["21-39", "40-59", "60+"],
).astype(object)
display(consult_df[consult_df["age"] >= 21].groupby("age_cat")["Record ID"].nunique())

# Number of distinct patients per answer, shown one table at a time.
hiv_positive_consults = consult_df[consult_df["Are you HIV positive?"] == "Yes"]
breakdowns = [
    # marital status
    (consult_df, "Marital Status "),
    # district
    (consult_df, "Country District"),
    # village
    (consult_df, "Village"),
    # FIGO cervical stage (for consult event)
    (consult_df.sort_values("FIGO Cervical Stage"), "FIGO Cervical Stage"),
    # cancer screening?
    (consult_df, "Have you been screened for cervical cancer?"),
    # history of diabetes
    (consult_df, "History of Diabetes"),
    # history of TB
    (consult_df, "History of tuberculosis (TB)"),
    # HIV status
    (consult_df, "Are you HIV positive?"),
    ## HIV treatment (for consult event): all patients, then HIV-positive only
    (consult_df, "Are you receiving HIV treatment?"),
    (hiv_positive_consults, "Are you receiving HIV treatment?"),
    # performance status
    (consult_df, "Performance Status"),
]
for frame, column in breakdowns:
    display(frame.groupby(column)["Record ID"].nunique())

## performance status <90
# Performance-status answers grouped into two categories; an answer that is
# not listed here leaves the category missing.
consult_performance_groups = {
    ">=90": [
        "100: Normal no complaints; no evidence of disease",
        "90: Able to carry on normal activity; minor signs or symptoms of disease",
    ],
    "<90": [
        "80: Normal activity with effort; some signs of symptoms of disease",
        "70: Cares for self; unable to carry on normal activity or to do active work",
        "60: Requires occasional assistance; but is able to care for most of his/her personal needs",
        "50: Requires considerable assistance and frequent medical care",
        "30: Severely disabled; hospital admission is indicated although death not imminent",
    ],
}
for category, answers in consult_performance_groups.items():
    consult_df.loc[consult_df["Performance Status"].isin(answers), "performance_status_cat"] = category
display(consult_df.groupby("performance_status_cat")["Record ID"].nunique())

# labs
# Values blanked out per lab column before summarising.
# NOTE(review): presumably placeholder codes for missing/unknown results — confirm.
lab_placeholder_values = {
    "Creatinine (umo/L) Lab Result ": [9999, 8888],
    "Hemoglobin (g/dl) Lab Result": [9999, 8888, 99999],
    "Neutrophil Count (%,per 10^9) Lab Result": [9999, 8888],
    "White Blood Count (per10^9) Lab Result": [9999, 8888],
    "Albumin (g/dL) Lab Result ": [9999, 8888, 88888],
}
for lab_col, placeholders in lab_placeholder_values.items():
    consult_df.loc[consult_df[lab_col].isin(placeholders), lab_col] = np.nan
display(consult_df.describe())

# HIV characteristics
hiv_positive = consult_df[consult_df["Are you HIV positive?"] == "Yes"]

# Values excluded from the CD4 and viral-load summaries.
# NOTE(review): presumably placeholder codes for missing/unknown results — confirm.
HIV_LAB_PLACEHOLDERS = [9999, 8888]

## CD4
# The same exclusion is used for the summary and for the groups, so an 8888
# placeholder can no longer be counted as a CD4 count of ">=500".
# `.copy()` gives an independent frame for the category column added below.
cd4_df = hiv_positive[~hiv_positive["CD4 Count"].isin(HIV_LAB_PLACEHOLDERS)].copy()
display(cd4_df["CD4 Count"].describe())

## CD4 groups
cd4_df.loc[cd4_df["CD4 Count"] < 200, "CD4 Count Cat"] = "<200"
cd4_df.loc[(cd4_df["CD4 Count"] >= 200) & (cd4_df["CD4 Count"] < 350), "CD4 Count Cat"] = ">=200-<350"
cd4_df.loc[(cd4_df["CD4 Count"] >= 350) & (cd4_df["CD4 Count"] < 500), "CD4 Count Cat"] = ">=350-<500"
cd4_df.loc[cd4_df["CD4 Count"] >= 500, "CD4 Count Cat"] = ">=500"
display(cd4_df.groupby("CD4 Count Cat")["Record ID"].nunique())

## VL
# Category is 1 when the viral load is at least 400 copies/ml, otherwise 0
# (a missing viral load also yields 0).
vl_col = "Viral Load (copies/ml) Lab Result(NUMBERS ONLY)"
vl_df = hiv_positive[~hiv_positive[vl_col].isin(HIV_LAB_PLACEHOLDERS)].copy()
vl_df["Viral Load (copies/ml) Lab Result(NUMBERS ONLY) Cat"] = (vl_df[vl_col] >= 400).astype(int)
display(vl_df.groupby("Viral Load (copies/ml) Lab Result(NUMBERS ONLY) Cat")["Record ID"].nunique())


In [None]:
pd.set_option('display.float_format', lambda value: '%.5f' % value)

# Number of distinct patients per answer on the last-OTV record, in this order:
# FIGO stage at end of treatment, no treatment, surgery, treatment type,
# response to treatment, performance status.
last_otv_breakdown_cols = (
    "End of Treatment FIGO Staging",
    "No treatment",
    "Surgery",
    "Treatment type",
    "Response to treatment",
    "Performance Status",
)
for column in last_otv_breakdown_cols:
    display(last_otv_df.groupby(column)["Record ID"].nunique())

## performance status <90
# Performance-status answers grouped into two categories; an answer that is
# not listed here leaves the category missing.
last_otv_performance_groups = {
    ">=90": [
        "100: Normal no complaints; no evidence of disease",
        "90: Able to carry on normal activity; minor signs or symptoms of disease",
    ],
    "<90": [
        "80: Normal activity with effort; some signs of symptoms of disease",
        "70: Cares for self; unable to carry on normal activity or to do active work",
        "60: Requires occasional assistance; but is able to care for most of his/her personal needs",
        "50: Requires considerable assistance and frequent medical care",
        "30: Severely disabled; hospital admission is indicated although death not imminent",
    ],
}
for category, answers in last_otv_performance_groups.items():
    last_otv_df.loc[last_otv_df["Performance Status"].isin(answers), "performance_status_cat"] = category
display(last_otv_df.groupby("performance_status_cat")["Record ID"].nunique())

# treatment
# Each modality is reduced to a yes/no mask on the last-OTV record, defined
# once so every combination below uses exactly the same rule.
chemo_col = "Total number of chemo cycles received"
eqd2_col = "What is the  EQD2 result?"
NO_CHEMO_ANSWERS = ["0", "0000", "None", "no chemo", "nil"]
CHEMO_ANSWERS = ['4', '5', '2', '1', '3', '6', '4 cycles neoadjuvant in Serowe', '4 cycles', '7', '8', '10']

had_surgery = last_otv_df["Surgery"] == "Yes"
no_surgery = (last_otv_df["Surgery"] == "No") | last_otv_df["Surgery"].isnull()
# The accepted answers are passed as a flat list so each answer is matched
# individually.
had_chemo = last_otv_df[chemo_col].isin(CHEMO_ANSWERS)
no_chemo = last_otv_df[chemo_col].isin(NO_CHEMO_ANSWERS) | last_otv_df[chemo_col].isnull()
# Radiation = EQD2 present and not zero.
had_rt = (last_otv_df[eqd2_col] != 0) & last_otv_df[eqd2_col].notnull()
no_rt = (last_otv_df[eqd2_col] == 0) | last_otv_df[eqd2_col].isnull()

# Patient counts are printed in this order, one number per line.
treatment_groups = [
    ("surgery only", had_surgery & no_chemo & no_rt),
    ("chemo only", no_surgery & had_chemo & no_rt),
    ("RT only", no_surgery & no_chemo & had_rt),
    ("CRT", no_surgery & had_chemo & had_rt),
    ("surgery and chemo", had_surgery & had_chemo & no_rt),
    ("surgery and RT", had_surgery & no_chemo & had_rt),
    ("surgery and CRT", had_surgery & had_chemo & had_rt),
]
for _group_label, group_mask in treatment_groups:
    print(last_otv_df.loc[group_mask, "Record ID"].nunique())

# treatment duration: whole days between treatment start and end of treatment
treatment_end = pd.to_datetime(last_otv_df["End of treatment date "])
treatment_start = pd.to_datetime(last_otv_df["Date treatment started "])
(treatment_end - treatment_start).dt.days.describe()


# Focus Points

### Deaths

In [None]:
# Deaths among the patients of the pre-2022 cohort (a1).
a1_deaths = death_df[death_df["Record ID"].isin(a1_df["Record ID"].unique())]
print(a1_deaths.shape)
print(a1_deaths["Record ID"].nunique())

# Number of patients per event on which the death was recorded.
a1_deaths.groupby("Event Name")["Record ID"].nunique()

### Follow-up windows (grouped by follow-up type)

- % who have follow-up within the first 6 months (this includes the 3- and 6-month visits)
- % who have follow-up between 6-12 months (this includes the 9- and 12-month visits)
- % who have follow-up between 12-18 months
- % who have follow-up between 18-24 months
- % who have follow-up between 24-36 months
- % who have follow-up between 36-48 months
- % who have follow-up between 48-60 months

In [None]:
# Months elapsed between end of treatment and the follow-up date ("Date.2").
# One month is taken as 365.2425 / 12 days, the length numpy uses for its "M"
# unit; an explicit Timedelta is used because recent pandas versions reject
# the ambiguous "M" unit.
DAYS_PER_MONTH = 365.2425 / 12
elapsed_since_eot = pd.to_datetime(a1_follow_up_df["Date.2"]) - pd.to_datetime(a1_follow_up_df["End of treatment date "])
a1_follow_up_df["time_from_eot"] = elapsed_since_eot / pd.Timedelta(days=DAYS_PER_MONTH)

# Contacts that count as follow-up. To restrict the windows to office and
# phone contacts only, filter on ["Office Visit", "Phone Call"] instead.
office_phone_ipms_only = a1_follow_up_df[a1_follow_up_df["Follow Up Type"].isin(["Office Visit", "Phone Call", "IPMS"])]

# Scheduled events that make up each follow-up window, in chronological order.
FOLLOW_UP_WINDOW_EVENTS = [
    ["3 Month Follow-Up", "6 Month Follow-Up"],
    ["9 Month Follow-Up", "1 Year Follow-Up"],
    ["15 Months Follow-Up", "18 Months Follow-Up"],
    ["21 Months Follow-Up", "2 Year Follow-Up"],
    ["27 Months Follow-Up", "30 Months Follow-Up", "33 Months Follow-Up", "3 Year Follow-Up"],
    ["39 Months Follow-Up", "42 Months Follow-Up", "45 Months Follow-up", "4 Year Follow-Up"],
    ["51 Months Follow-Up", "54 Months Follow-Up", "57 Months Follow-Up", "5 Year Follow-Up"],
]
w1, w2, w3, w4, w5, w6, w7 = [
    office_phone_ipms_only[office_phone_ipms_only["Event Name"].isin(window_events)]
    for window_events in FOLLOW_UP_WINDOW_EVENTS
]



In [None]:
def run_stats(time_range, window_df, all_pt_df, eot_cutoff_date, death_df, died_before):
    """Print follow-up attendance for one window and return who died during it.

    Parameters
    ----------
    time_range : str
        Label of the window, used only in the printed messages.
    window_df : DataFrame
        Follow-up rows belonging to this window. Reads the columns
        "Record ID", "Event Name", "Follow Up Type", "Date.2" and
        "time_from_eot".
    all_pt_df : DataFrame
        Frame with "Record ID" and "End of treatment date ", used to decide
        who is eligible for the window.
    eot_cutoff_date : str
        A patient is eligible when her end-of-treatment date is strictly
        before this value.
    death_df : DataFrame
        Death records; reads "Record ID" and "Event Name".
    died_before : collection
        Record IDs already counted as dead; they are removed from both
        `all_pt_df` and `window_df` before anything is counted.

    Returns
    -------
    list
        Record IDs from `death_df` whose event is one of the events present
        in the window and who were eligible for the window.
    """
    # remove pts who died already
    alive_pt_df = all_pt_df[~all_pt_df["Record ID"].isin(died_before)]
    alive_window_df = window_df[~window_df["Record ID"].isin(died_before)]

    # Eligible = finished treatment before the cut-off, plus anyone who
    # actually has a follow-up row in this window.
    finished_in_time = alive_pt_df["End of treatment date "] < eot_cutoff_date
    eligible_pts = set(alive_pt_df.loc[finished_in_time, "Record ID"].unique().tolist())
    eligible_pts |= set(alive_window_df["Record ID"].unique().tolist())
    print("Number of patients eligible for follow-up at {}: {}".format(time_range, len(eligible_pts)))
    print("Number of patients with follow-up at {}: {}".format(time_range, alive_window_df["Record ID"].nunique()))

    # Each patient is counted under one contact type only, with priority
    # office visit > phone call > IPMS.
    follow_up_type = alive_window_df["Follow Up Type"]
    record_id = alive_window_df["Record ID"]

    office_df = alive_window_df[follow_up_type == "Office Visit"]
    office_ids = office_df["Record ID"].unique().tolist()
    print("\tOffice visit: {}".format(office_df["Record ID"].nunique()))

    phone_df = alive_window_df[(follow_up_type == "Phone Call") & ~record_id.isin(office_ids)]
    phone_ids = phone_df["Record ID"].unique().tolist()
    print("\tPhone visit: {}".format(phone_df["Record ID"].nunique()))

    ipms_df = alive_window_df[(follow_up_type == "IPMS") &
                              ~(record_id.isin(office_ids) | record_id.isin(phone_ids))]
    print("\tEMR: {}".format(ipms_df["Record ID"].nunique()))

    # Mean time from end of treatment to each patient's first contact,
    # per contact type.
    counted_df = pd.concat([office_df, phone_df, ipms_df], axis=0, ignore_index=True)
    first_contact = (
        counted_df
        .sort_values(["Record ID", "Follow Up Type", "Date.2"])
        .groupby(["Record ID", "Follow Up Type"], sort=False)[["time_from_eot"]]
        .first()
    )
    display(first_contact.groupby("Follow Up Type")["time_from_eot"].mean())

    # Deaths recorded on one of this window's events, among eligible patients.
    recorded_in_window = death_df["Event Name"].isin(alive_window_df["Event Name"].unique())
    was_eligible = death_df["Record ID"].isin(eligible_pts)
    died_during_interval = death_df[recorded_in_window & was_eligible]
    print("Number of patients who died during {} follow-up and will not contribute to future windows: {}".format(time_range, died_during_interval.shape[0]))

    return died_during_interval["Record ID"].tolist()


In [None]:
# Patients whose death is recorded on the End of Treatment event are excluded
# from every follow-up window.
died_before_eot = death_df.loc[death_df["Event Name"] == "End of Treatment", "Record ID"].tolist()
died_during_interval = run_stats("[3mo-6mo]", w1, a1_df, "2022-01-01", death_df, died_before_eot)

# Running set of everyone known to be dead so far.
all_dead = set(died_before_eot) | set(died_during_interval)
print(len(all_dead))

In [None]:
# NOTE(review): the first window is given `a1_df` as the patient pool, while
# this and the later windows are given `a1_follow_up_df` — confirm that
# patients without any follow-up row are meant to be left out of eligibility.
died_during_interval = run_stats("(6mo-12mo]", w2, a1_follow_up_df, "2021-07-01", death_df, all_dead)
all_dead = all_dead.union(died_during_interval)
print(len(all_dead))

In [None]:
# 12-18 month window; deaths found here are added to the running set.
died_during_interval = run_stats("(12mo-18mo]", w3, a1_follow_up_df, "2021-01-01", death_df, all_dead)
all_dead = all_dead.union(died_during_interval)
print(len(all_dead))

In [None]:
# 18-24 month window; deaths found here are added to the running set.
died_during_interval = run_stats("(18mo-24mo]", w4, a1_follow_up_df, "2020-07-01", death_df, all_dead)
all_dead = all_dead.union(died_during_interval)
print(len(all_dead))

In [None]:
# 24-36 month window; deaths found here are added to the running set.
died_during_interval = run_stats("(24mo-36mo]", w5, a1_follow_up_df, "2019-07-01", death_df, all_dead)
all_dead = all_dead.union(died_during_interval)
print(len(all_dead))

In [None]:
# 36-48 month window; deaths found here are added to the running set.
died_during_interval = run_stats("(36mo-48mo]", w6, a1_follow_up_df, "2018-07-01", death_df, all_dead)
all_dead = all_dead.union(died_during_interval)
print(len(all_dead))

In [None]:
# 48-60 month window; deaths found here are added to the running set.
died_during_interval = run_stats("(48mo-60mo]", w7, a1_follow_up_df, "2017-07-01", death_df, all_dead)
all_dead = all_dead.union(died_during_interval)
print(len(all_dead))

### What percentage of patients are coming in every six months for the first 2 years?

#### No follow-up

In [None]:
# Rows of cohort patients who have no row at all in the follow-up frame.
followed_up_ids = a1_follow_up_df["Record ID"].unique().tolist()
no_follow_up = a1_df[~a1_df["Record ID"].isin(followed_up_ids)]
print(no_follow_up.shape)
print(no_follow_up["Record ID"].nunique())
# no follow-up

In [None]:
def criteria_1(visit_list):
    """Return "Y" if every six-month window of the first two years has a visit, else "N".

    visit_list is any container of event names supporting ``in``. A window
    counts as attended when at least one of its two events is present.
    """
    six_month_windows = (
        ("3 Month Follow-Up", "6 Month Follow-Up"),
        ("9 Month Follow-Up", "1 Year Follow-Up"),
        ("15 Months Follow-Up", "18 Months Follow-Up"),
        ("21 Months Follow-Up", "2 Year Follow-Up"),
    )
    attended_every_window = all(
        any(event in visit_list for event in window)
        for window in six_month_windows
    )
    return "Y" if attended_every_window else "N"

def criteria_1a(visit_list):
    """Count how many of the four six-month windows (first two years) have a visit.

    visit_list is any container of event names supporting ``in``. Returns an
    integer from 0 to 4; a window counts once even if both of its events occur.
    """
    six_month_windows = (
        ("3 Month Follow-Up", "6 Month Follow-Up"),
        ("9 Month Follow-Up", "1 Year Follow-Up"),
        ("15 Months Follow-Up", "18 Months Follow-Up"),
        ("21 Months Follow-Up", "2 Year Follow-Up"),
    )
    return sum(
        1
        for window in six_month_windows
        if any(event in visit_list for event in window)
    )
 

In [None]:
# Death rows for analysis-1 patients whose death is recorded on one of the
# follow-up events of the first two years.
died_within_2yr = death_df[
    death_df["Record ID"].isin(a1_df["Record ID"].unique().tolist())
    & death_df["Event Name"].isin([
        "3 Month Follow-Up", "6 Month Follow-Up",
        "9 Month Follow-Up", "1 Year Follow-Up",
        "15 Months Follow-Up", "18 Months Follow-Up",
        "21 Months Follow-Up", "2 Year Follow-Up",
    ])
]
print(died_within_2yr.shape)
print(died_within_2yr["Record ID"].nunique())
display(died_within_2yr["Event Name"].value_counts())
# NOTE(review): 233 is a hard-coded denominator; confirm which count it is and whether it is still current.
display(died_within_2yr["Event Name"].value_counts().div(233).mul(100))
# died within 2 year follow up period


In [None]:
# Flag (1/0) on consult_df marking patients who died within the two-year
# follow-up period; kept for when the data is combined and saved later.
consult_df["died_within_2yr"] = (
    consult_df["Record ID"]
    .isin(died_within_2yr["Record ID"].tolist())
    .astype("int64")
)

consult_df["died_within_2yr"].value_counts()

In [None]:
# Only office visits and phone calls count as the patient "coming in".
office_phone_only = a1_follow_up_df.loc[
    a1_follow_up_df["Follow Up Type"].isin(["Office Visit", "Phone Call"])
]

# One row per patient holding the set of follow-up events they attended.
visit_df = (
    office_phone_only
    .groupby("Record ID")["Event Name"]
    .apply(set)
    .reset_index()
)
visit_df["criteria_1"] = visit_df["Event Name"].apply(criteria_1)
visit_df["criteria_1a"] = visit_df["Event Name"].apply(criteria_1a)
print(visit_df.loc[visit_df["criteria_1a"] > 0, "Record ID"].nunique())
display(visit_df["criteria_1"].value_counts())
display(visit_df["criteria_1a"].value_counts())
# NOTE(review): 852 is a hard-coded denominator; confirm it matches the current cohort size.
display(visit_df["criteria_1a"].value_counts().div(852).mul(100))
# came in every six months for the first 2 years starting from the 6 month follow up

print(
    visit_df.loc[
        visit_df["Record ID"].isin(died_within_2yr["Record ID"].unique().tolist()),
        "Record ID",
    ].nunique()
)
# remaining did not have an office or phone call visit / died before

# IPMS follow-up only
a1_follow_up_df.loc[
    (a1_follow_up_df["Follow Up Type"] == "IPMS")
    & ~a1_follow_up_df["Record ID"].isin(
        visit_df.loc[visit_df["criteria_1a"] > 0, "Record ID"].unique()
    )
    & a1_follow_up_df["Event Name"].isin([
        "3 Month Follow-Up", "6 Month Follow-Up",
        "9 Month Follow-Up", "1 Year Follow-Up",
        "15 Months Follow-Up", "18 Months Follow-Up",
        "21 Months Follow-Up", "2 Year Follow-Up",
    ]),
    "Record ID",
].nunique()
## remaining are lost to follow-up


In [None]:
# "Y"/"N" flag on consult_df for patients meeting criteria_1; kept for when
# the data is combined and saved later.
consult_df["criteria_1"] = [
    "Y" if met else "N"
    for met in consult_df["Record ID"].isin(
        visit_df.loc[visit_df["criteria_1"] == "Y", "Record ID"].tolist()
    )
]

consult_df.loc[consult_df["analysis1"] == 1].groupby("criteria_1")["Record ID"].count()


### What percentage of patients are coming in every year for the next 3 years?

#### No follow-up

In [None]:
# Analysis-2 rows whose Record ID never appears in the analysis-2 follow-up table.
no_follow_up = a2_df.loc[
    ~a2_df["Record ID"].isin(a2_follow_up_df["Record ID"].unique())
]
print(no_follow_up.shape)
print(no_follow_up["Record ID"].nunique())
# no follow-up

In [None]:
def criteria_2(visit_list):
    """Return "Y" if each of years 3, 4 and 5 has at least one follow-up visit, else "N".

    visit_list is any container of event names supporting ``in``. A year
    counts as attended when any one of its four events is present.
    """
    yearly_windows = (
        ("27 Months Follow-Up", "30 Months Follow-Up", "33 Months Follow-Up", "3 Year Follow-Up"),
        # "45 Months Follow-up" is spelled with a lowercase "up" here and in the
        # sibling cells. NOTE(review): presumably matches the source data; verify.
        ("39 Months Follow-Up", "42 Months Follow-Up", "45 Months Follow-up", "4 Year Follow-Up"),
        ("51 Months Follow-Up", "54 Months Follow-Up", "57 Months Follow-Up", "5 Year Follow-Up"),
    )
    attended_every_year = all(
        any(event in visit_list for event in window)
        for window in yearly_windows
    )
    return "Y" if attended_every_year else "N"

def criteria_2a(visit_list):
    """Count how many of years 3, 4 and 5 have at least one follow-up visit.

    visit_list is any container of event names supporting ``in``. Returns an
    integer from 0 to 3; a year counts once however many of its events occur.
    """
    yearly_windows = (
        ("27 Months Follow-Up", "30 Months Follow-Up", "33 Months Follow-Up", "3 Year Follow-Up"),
        # "45 Months Follow-up" is spelled with a lowercase "up" here and in the
        # sibling cells. NOTE(review): presumably matches the source data; verify.
        ("39 Months Follow-Up", "42 Months Follow-Up", "45 Months Follow-up", "4 Year Follow-Up"),
        ("51 Months Follow-Up", "54 Months Follow-Up", "57 Months Follow-Up", "5 Year Follow-Up"),
    )
    return sum(
        1
        for window in yearly_windows
        if any(event in visit_list for event in window)
    )


In [None]:
# Death rows for analysis-2 follow-up patients whose death is recorded on a
# year 3-5 follow-up event, leaving out anyone already counted in died_within_2yr.
died_within_3to5yr = death_df[
    ~death_df["Record ID"].isin(died_within_2yr["Record ID"].unique().tolist())
    & death_df["Record ID"].isin(a2_follow_up_df["Record ID"].unique().tolist())
    & death_df["Event Name"].isin([
        "27 Months Follow-Up", "30 Months Follow-Up", "33 Months Follow-Up", "3 Year Follow-Up",
        "39 Months Follow-Up", "42 Months Follow-Up", "45 Months Follow-up", "4 Year Follow-Up",
        "51 Months Follow-Up", "54 Months Follow-Up", "57 Months Follow-Up", "5 Year Follow-Up",
    ])
]
# (An earlier variant of this filter did not exclude the died_within_2yr patients.)
print(died_within_3to5yr.shape)
print(died_within_3to5yr["Record ID"].nunique())
display(died_within_3to5yr["Event Name"].value_counts())
# NOTE(review): 103 is a hard-coded denominator; confirm which count it is and whether it is still current.
display(died_within_3to5yr["Event Name"].value_counts().div(103).mul(100))
# died within 3 and 5 year follow up period


In [None]:
# Follow-up events that make up years 3 to 5 (same names used by criteria_2 / criteria_2a).
year3_to_5_events = [
    "27 Months Follow-Up", "30 Months Follow-Up", "33 Months Follow-Up", "3 Year Follow-Up",
    "39 Months Follow-Up", "42 Months Follow-Up", "45 Months Follow-up", "4 Year Follow-Up",
    "51 Months Follow-Up", "54 Months Follow-Up", "57 Months Follow-Up", "5 Year Follow-Up",
]

# Only office visits and phone calls count as the patient "coming in".
office_phone_only = a2_follow_up_df[a2_follow_up_df["Follow Up Type"].isin(["Office Visit", "Phone Call"])]

# One row per patient holding the set of follow-up events they attended.
visit_df = office_phone_only.groupby("Record ID")["Event Name"].apply(set).reset_index()
visit_df["criteria_2"] = visit_df["Event Name"].apply(criteria_2)
visit_df["criteria_2a"] = visit_df["Event Name"].apply(criteria_2a)
print(visit_df[visit_df["criteria_2a"] > 0]["Record ID"].nunique())
display(visit_df["criteria_2"].value_counts())
display(visit_df["criteria_2a"].value_counts())
# NOTE(review): 722 is a hard-coded denominator; confirm it matches the current cohort size.
display(visit_df["criteria_2a"].value_counts()/722*100)
# came in every year for the next 3 years

# Printed explicitly: a bare expression in the middle of a cell is not displayed
# (the analysis-1 version of this cell prints the same count).
print(visit_df[(visit_df["Record ID"].isin(died_within_3to5yr["Record ID"].unique().tolist()))]["Record ID"].nunique())
# remaining did not have an office or phone call visit / died before

# IPMS follow-up only: restricted to the year 3-5 events this analysis covers
# (the event list was previously the 3-month to 2-year list from analysis 1).
a2_follow_up_df[(a2_follow_up_df["Follow Up Type"] == "IPMS") &
                (~a2_follow_up_df["Record ID"].isin(visit_df[visit_df["criteria_2a"] > 0]["Record ID"].unique())) &
                (a2_follow_up_df["Event Name"].isin(year3_to_5_events))]["Record ID"].nunique()
## remaining are lost to follow-up


In [None]:
# "Y"/"N" flag on consult_df for patients meeting criteria_2; kept for when
# the data is combined and saved later.
consult_df["criteria_2"] = [
    "Y" if met else "N"
    for met in consult_df["Record ID"].isin(
        visit_df.loc[visit_df["criteria_2"] == "Y", "Record ID"].tolist()
    )
]

consult_df.loc[consult_df["analysis2"] == 1].groupby("criteria_2")["Record ID"].count()

### Patient visit windows split by pre- and post-pandemic

- Pre-pandemic ends on March 31, 2020; post-pandemic starts on April 1, 2020.
- If the visit date is missing, anchor on the end-of-treatment date plus the window's month offset (6, 12, 18, or 24 months).


In [None]:
def run_stats(df, offset):
    """Split one follow-up window into dated and anchored patients and report both.

    Each patient is assigned to the first follow-up type they have, in the
    priority order Office Visit > Phone Call > IPMS, keeping the earliest
    "Date.2" row of that type. Patients with none of the three types are
    "anchored": their "Date.2" is set to the end-of-treatment date plus
    ``offset`` months. Both groups are passed to ``run_stats2`` for the
    pre-/post-pandemic counts.

    Parameters
    ----------
    df : DataFrame
        Rows for one window; needs "Record ID", "Follow Up Type", "Date.2"
        and "End of treatment date " columns. It is not modified.
    offset : int
        Months added to the end-of-treatment date for anchored patients.

    NOTE(review): this reuses the name ``run_stats``, which the earlier
    mortality-window cells call with six positional arguments; re-running
    those cells after this definition has executed will raise a TypeError.
    """
    def _first_contact(rows):
        # Earliest row per patient ("first" takes the first non-null value per column).
        return (
            rows.sort_values(["Record ID", "Date.2"])
            .groupby("Record ID", sort=False)[["Date.2", "Follow Up Type"]]
            .first()
            .reset_index()
        )

    assigned_ids = []  # patients already claimed by a higher-priority follow-up type
    per_type = []
    for follow_up_type in ("Office Visit", "Phone Call", "IPMS"):
        candidates = df[(df["Follow Up Type"] == follow_up_type) &
                        (~df["Record ID"].isin(assigned_ids))]
        first_rows = _first_contact(candidates)
        per_type.append(first_rows)
        assigned_ids.extend(first_rows["Record ID"].unique())

    # Copy before assigning so the new column is written to an independent frame
    # rather than to a filtered slice of the caller's data.
    anchored = df[~df["Record ID"].isin(assigned_ids)].copy()
    anchored["Date.2"] = pd.to_datetime(anchored["End of treatment date "]) + pd.DateOffset(months=offset)

    non_anchored = pd.concat(per_type, axis=0, ignore_index=True)

    print("Non-anchored")
    run_stats2(non_anchored, "N")
    print("Anchored")
    run_stats2(anchored, "Y")
    
def run_stats2(df, anchored="N", post_pandemic_start="2020-04-01"):
    """Print unique-patient counts before and after the pandemic cutoff.

    Parameters
    ----------
    df : DataFrame
        Needs "Record ID" and "Date.2" columns, plus "Follow Up Type" when
        ``anchored == "N"``.
    anchored : str
        "N" additionally displays the per-follow-up-type breakdown for each
        period; any other value skips it.
    post_pandemic_start : str
        First date counted as post-pandemic. Everything strictly before it is
        pre-pandemic, so a timestamp during the last pre-pandemic day (for
        example 2020-03-31 15:30) is still counted instead of falling between
        the two periods.

    Rows with a missing "Date.2" are displayed and belong to neither period.
    """
    undated = df[df["Date.2"].isnull()]
    if len(undated) > 0:
        display(undated)

    # separate pre- and post-pandemic around a single cutoff so the two periods are complementary
    pre_pandemic = df[df["Date.2"] < post_pandemic_start]
    print("Pre-pandemic: {}".format(pre_pandemic["Record ID"].nunique()))
    post_pandemic = df[df["Date.2"] >= post_pandemic_start]
    print("Post-pandemic: {}".format(post_pandemic["Record ID"].nunique()))

    if anchored == "N":
        # pre-pandemic by type
        print("Pre-pandemic by type")
        display(pre_pandemic.groupby("Follow Up Type")["Record ID"].nunique())

        # post-pandemic by type
        print("Post-pandemic by type")
        display(post_pandemic.groupby("Follow Up Type")["Record ID"].nunique())


In [None]:
# Analysis-1 patients with no follow-up rows at all: keep one row each
# (Record ID plus first end-of-treatment date in sort order) so they can be
# anchored on that date later.
a1_add = (
    a1_df.loc[
        ~a1_df["Record ID"].isin(a1_follow_up_df["Record ID"].unique()),
        ["Record ID", "End of treatment date "],
    ]
    .dropna()
    .sort_values(["Record ID", "End of treatment date "])
    .groupby("Record ID", sort=False)
    .first()
    .reset_index()
)
# Every other follow-up column is left empty for these placeholder rows.
a1_add[[
    col for col in a1_follow_up_df.columns
    if col not in ("Record ID", "End of treatment date ")
]] = np.nan
a1_follow_up_df_ = pd.concat([a1_follow_up_df, a1_add], axis=0, ignore_index=True)

print(a1_follow_up_df_.shape)
print(a1_follow_up_df_["Record ID"].nunique())



In [None]:
def _follow_up_window(follow_up, events):
    """Build the per-window frame used for the pre-/post-pandemic split.

    The result holds (a) every row of ``follow_up`` whose "Event Name" is one
    of ``events`` or is missing, and (b) one placeholder row per distinct
    ("Record ID", "End of treatment date ") pair for patients who have no row
    in (a), so they can be anchored on their end-of-treatment date.
    """
    in_window = follow_up[follow_up["Event Name"].isin(events) | follow_up["Event Name"].isnull()]
    placeholders = (
        follow_up.loc[
            ~follow_up["Record ID"].isin(in_window["Record ID"].unique()),
            ["Record ID", "End of treatment date "],
        ]
        .dropna()
        .drop_duplicates()
    )
    # concat aligns on column names and leaves the columns the placeholder rows
    # lack as missing values, so no explicit NaN fill is needed beforehand.
    return pd.concat([in_window, placeholders], axis=0, ignore_index=True)


# One frame per six-month window of the first two years.
w1 = _follow_up_window(a1_follow_up_df_, ["3 Month Follow-Up", "6 Month Follow-Up"])
w2 = _follow_up_window(a1_follow_up_df_, ["9 Month Follow-Up", "1 Year Follow-Up"])
w3 = _follow_up_window(a1_follow_up_df_, ["15 Months Follow-Up", "18 Months Follow-Up"])
w4 = _follow_up_window(a1_follow_up_df_, ["21 Months Follow-Up", "2 Year Follow-Up"])

In [None]:
# Report each window; the second value is the month offset used to anchor
# patients who have no dated follow-up in that window.
for _window, _months in ((w1, 6), (w2, 12), (w3, 18), (w4, 24)):
    run_stats(_window, _months)

# Combine and save data

In [None]:
# Left-join the death rows and then the last-OTV rows onto the consult table by Record ID.
final_df = consult_df.merge(death_df, how="left", on="Record ID")
final_df = final_df.merge(last_otv_df, how="left", on="Record ID")
# Shape versus unique patient count shows whether the joins added extra rows.
print(final_df.shape)
print(final_df["Record ID"].nunique())

In [None]:
# Row counts per analysis1 flag value in the combined table.
final_df.loc[:, "analysis1"].value_counts()

In [None]:
# Row counts per analysis2 flag value in the combined table.
final_df.loc[:, "analysis2"].value_counts()

In [None]:
# Write the combined table to disk without the DataFrame index.
final_df.to_csv(
    path_or_buf="../../data/survivorship_care/analysis_data.csv",
    index=False,
)