In [1]:
import os
import sys
import pandas as pd
import re

In [2]:
# Display the value of every top-level expression in a cell instead of only the
# last one; cells further down that consist of several bare expressions rely on
# this to show all of their results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

We want to investigate whether the alternative treatments to Xtandi/Zytiga (9 listed by Suyin) are strongly correlated with the class label. This is important because these treatments are used as predictors in the dataset.

In [3]:
# Read the two inputs: the flatfile (with the 450 000 patients) and the patient
# list of the final negative and positive cohort.
flatfile_path = r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\PFA\Pfizer_mCRPC_IDP_v2.csv'
cohort_path = r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\Pfizer_mCRPC_Patients.xlsx'
df = pd.read_csv(flatfile_path)
df_label = pd.read_excel(cohort_path)

In [28]:
# Treatment date columns given by Suyin. The first two entries are Xtandi and
# Zytiga; the remaining ones are the alternative treatments. We care about the
# first-date (_1ST_..._DT) columns.
# NOTE(review): 'ORIECHTOMY' and the missing '_OVERALL' on the prostatectomy
# column look unusual — presumably they mirror the flatfile headers exactly;
# confirm before renaming anything.
ls_alt_treatment = [
    'ENZALUTAMIDE_1ST_RX_DT_OVERALL',
    'ABIRATERONE_1ST_RX_DT_OVERALL',
    'OTHER_ADT_1ST_RX_DT_OVERALL',
    'RADIOCHEMO_1ST_RX_DT_OVERALL',
    'CHEMO_1ST_RX_DT_OVERALL',
    'IMMUNO_1ST_RX_DT_OVERALL',
    'PROSTATECTOMY_1ST_DX_DT',
    'ORIECHTOMY_1ST_DX_DT_OVERALL',
    'EBRT_1ST_DX_DT_OVERALL',
]

In [16]:
# Cut the flatfile down to the patient id plus the treatment date columns and
# attach those columns to the final patient list. The left join keeps every row
# of the patient list, so its own columns (which include the index and lookback
# date) are carried along.
ls_alt_treat_with_id = ['PATIENT_ID']
ls_alt_treat_with_id.extend(ls_alt_treatment)
df_alt = df.loc[:, ls_alt_treat_with_id]
df_merged = pd.merge(df_label, df_alt, how='left', on='PATIENT_ID')

In [18]:
# Parse the date columns into datetime objects using the month/day/year format.
# NOTE(review): the columns are selected by position (everything from the fifth
# column of the merged frame onwards), so this depends on the column layout of
# the patient list — confirm if that file changes.
ls_datecols = df_merged.columns[4:]
df_merged[ls_datecols] = df_merged[ls_datecols].apply(pd.to_datetime, format='%m/%d/%Y')

In [19]:
# Create a table that shows the prevalence of each feature: per class label, the
# number of patients with a non-missing first-treatment date.
# GroupBy.count() counts the non-null entries of every column per group and
# leaves the grouping key out of the result, so no apply/lambda over the group
# frames (which would also process the PN_FLAG column itself) is needed.
ls_columns = ['PN_FLAG'] + ls_alt_treatment
df_flag_sum = df_merged[ls_columns].groupby('PN_FLAG').count()
df_flag_sum[ls_columns[1:]]

Unnamed: 0_level_0,ENZALUTAMIDE_1ST_RX_DT_OVERALL,ABIRATERONE_1ST_RX_DT_OVERALL,OTHER_ADT_1ST_RX_DT_OVERALL,RADIOCHEMO_1ST_RX_DT_OVERALL,CHEMO_1ST_RX_DT_OVERALL,IMMUNO_1ST_RX_DT_OVERALL,PROSTATECTOMY_1ST_DX_DT,ORIECHTOMY_1ST_DX_DT_OVERALL,EBRT_1ST_DX_DT_OVERALL
PN_FLAG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
N,5282,5331,46310,1230,8492,2124,6188,920,9898
P,1874,1977,2772,154,490,229,163,47,529


In [20]:
def cross_table_alt_treat(lambda_cond, title, data=None, cols=None):
    """Count, per class label, the rows where a condition holds for any selected column.

    Parameters
    ----------
    lambda_cond : callable
        Applied to each selected column (as a Series); must return a boolean
        Series with one entry per row.
    title : str
        Name of the single column in the returned table.
    data : pandas.DataFrame, optional
        Frame containing 'PN_FLAG' and the columns to test. Defaults to the
        notebook-level ``df_merged``.
    cols : list-like of column labels, optional
        Columns the condition is evaluated on. Defaults to ``ls_datecols[4:]``
        (presumably the alternative-treatment date columns — the slice is
        positional, so verify against the column layout).

    Returns
    -------
    pandas.DataFrame
        Indexed by 'PN_FLAG', with one column ``title`` holding the number of
        rows for which the condition is true in at least one selected column.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if data is None:
        data = df_merged
    if cols is None:
        cols = ls_datecols[4:]

    # One boolean per row: does the condition hold for at least one column?
    hit_any = data[cols].apply(lambda_cond).any(axis=1)
    df_cond = pd.DataFrame({title: hit_any})
    df_cond['PN_FLAG'] = data['PN_FLAG']
    # Summing booleans per class label yields the number of matching rows.
    return df_cond.groupby('PN_FLAG').sum()

In [21]:
# Build one count table per timing of the alternative treatment relative to the
# index date: ever, strictly before, exactly on, and strictly after. Together
# they show the influence of having had the alternative treatment on the
# likelihood of getting X/Z in the 90 day window after the index date, aka the
# class distributions.
s_index_date = df_merged['INDEX_DATE']
df_alt_treatment = cross_table_alt_treat(lambda col: col.notnull(), 'any_alt')
df_before_index = cross_table_alt_treat(lambda col: col < s_index_date, 'before_index')
df_on_index = cross_table_alt_treat(lambda col: col == s_index_date, 'on_index')
df_after_index = cross_table_alt_treat(lambda col: col > s_index_date, 'after_index')

In [22]:
# Put the four count tables side by side, flip them so each condition becomes a
# row, and add the share of positive patients among all patients in that row.
ls_tables = [df_alt_treatment, df_before_index, df_on_index, df_after_index]
df_cross_table = pd.concat(ls_tables, axis=1).T
s_row_total = df_cross_table.sum(axis=1)
df_cross_table['pos/total'] = df_cross_table['P'].div(s_row_total)

In [23]:
# The final table: one row per condition (any / before / on / after the index
# date), the per-class counts as columns, plus the positive share 'pos/total'.
df_cross_table

PN_FLAG,N,P,pos/total
any_alt,52600.0,2873.0,0.051791
before_index,32950.0,2589.0,0.07285
on_index,4879.0,128.0,0.025564
after_index,25485.0,1005.0,0.037939


In [24]:
# Save the cross table as an Excel file; the path is relative, so it is written
# to the current working directory.
df_cross_table.to_excel('cross_table_alt_treatments.xlsx')

In [25]:
# Read the patient list stored in Patients_XZ_on_index.csv (presumably the
# patients that had X/Z on the index date — confirm against the file's origin).
xz_on_index_path = r'F:\Projects\Pfizer_mCRPC\Data\Raw_data\Patients_XZ_on_index.csv'
patients_xz_on_index = pd.read_csv(xz_on_index_path)

In [26]:
# Keep only the merged rows whose patient id appears in the X/Z-on-index list.
mask_xz_on_index = df_merged['PATIENT_ID'].isin(patients_xz_on_index['PATIENT_ID'])
df_patients_xz_on_index = df_merged[mask_xz_on_index]

In [27]:
# For the 139 patients that had X/Z on index: how many have at least one of the
# selected treatment dates exactly on, strictly before, and strictly after their
# index date (three separate counts, shown in that order). Nothing really
# special is happening in terms of alternative treatment for these patients.
df_alt_dates_xz = df_patients_xz_on_index[ls_datecols[4:]]
s_index_date_xz = df_patients_xz_on_index['INDEX_DATE']
df_alt_dates_xz.apply(lambda col: col == s_index_date_xz).any(axis=1).sum()
df_alt_dates_xz.apply(lambda col: col < s_index_date_xz).any(axis=1).sum()
df_alt_dates_xz.apply(lambda col: col > s_index_date_xz).any(axis=1).sum()

6

111

38