In [1]:
import pandas as pd
import re
import sys

In [2]:
# Make IPython display the value of every top-level expression in a cell,
# not only the last one. Cells further down list several bare expressions
# (e.g. two `.columns` lookups in one cell) and rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Helper to normalise strings; used below for category values, dummy column names and the final column names

def clean_string(s):
    """
    Normalise a string so it can be used as a column name or category label.

    The incoming value is transformed so that:
        - all characters are lowercase
        - spaces, round brackets and the characters ; , & : are replaced by
          underscores
        - multiple underscores in a row are replaced by one underscore
          (e.g. ' (' -> '__' -> '_')
        - trailing underscores are removed

    Non-string values (the float NaN pandas uses for missing cells, None,
    numbers, ...) are returned unchanged instead of raising.

    :param s: string to clean, or a non-string value to pass through
    :return: cleaned string, or ``s`` itself when it is not a string
    """
    # Missing cells arrive as float NaN (or None); they have no .lower() and
    # must pass through untouched.
    if not isinstance(s, str):
        return s
    # only lowercase
    s = s.lower()
    # spaces, brackets, ;, ,, & and : replaced by underscores
    s = re.sub(r'[ ();,&:]', '_', s)
    # multiple underscores in a row replaced by one underscore
    s = re.sub(r'_{2,}', '_', s)
    # trailing underscores are removed
    return re.sub(r'_+$', '', s)

In [4]:
# Load the urology EMR extract into df_uro.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_uro = pd.read_csv(r'F:\Projects\Pfizer_mCRPC\Data\pre_modelling\EMR_Urology\Pfizer_mCRPC_UROEMR.csv')

In [5]:
# Show the last rows of the freshly loaded frame as a quick sanity check.
df_uro.tail()

Unnamed: 0,PATIENT_ID,PN_FLAG,LOOKBACK_DATE,INDEX_DATE,DEIDCRPC2FLAG,DEIDCRPC2DATE,DEIDCRPC3FLAG,DEIDCRPC3DATE,DEIDMETASTATICFLAG,DEIDMETASTATICDATE,...,URO_FREE_TESTO_DATE7,URO_TNM_DATE,URO_LATEST_T,URO_LATEST_T_DATE,URO_LATEST_N,URO_LATEST_N_DATE,URO_LATEST_M,URO_LATEST_M_DATE,URO_LATEST_GLEASON,URO_LATEST_GLEASON_DATE
3639,559194684,1,2015-04-07,2017-04-06,1,2015-12-03,1,2015-12-03,0,,...,,,,,,,,,,
3640,212971802,1,2012-03-25,2014-03-25,0,,0,,1,2016-02-11,...,,,,,,,,,,
3641,1627395905,1,2013-07-23,2015-07-23,1,2014-11-06,0,,1,,...,,,,,,,,,,
3642,1273908304,1,2014-11-05,2016-11-04,1,2015-05-14,0,,1,2015-09-09,...,,,,,,,,,,
3643,911918575,1,2013-12-15,2015-12-15,0,,0,,1,,...,,2015-01-01,T3b,2015-01-01,,,,,,


## Remove later metastasis records (keep only the first)

In [6]:
# Column names of metastasis records 2..7: a date and a site column for each,
# in the order MET_DATE<n>, METAC_SITE_NM<n>.
ls_metac_colnames = [
    prefix + str(record_no)
    for record_no in range(2, 8)
    for prefix in ('MET_DATE', 'METAC_SITE_NM')
]

In [7]:
# Drop metastasis records 2..7; MET_DATE1 / METAC_SITE_NM1 are not in the list and stay.
# NOTE: df_uro is rebound, so re-running this cell fails (the columns are already gone).
df_uro = df_uro.drop(columns = ls_metac_colnames)

## Check overlap with Oncology EMR column names 

In [11]:
# Load the oncology EMR extract; in the cells shown it is only used to compare column names.
# NOTE(review): the saved output of this cell is the tail of a warning whose message is
# cut off, presumably a pandas DtypeWarning about mixed-type columns. Confirm, and if so
# pass an explicit dtype= (or low_memory=False).
df_onc = pd.read_csv(r'F:\Projects\Pfizer_mCRPC\Data\pre_modelling\EMR_Oncology\Pfizer_mCRPC_ONCEMR_update.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
# Display both column indexes (oncology first, then urology) to compare naming by eye.
df_onc.columns
df_uro.columns

Index(['PATIENT_ID', 'LOOKBACK_DATE', 'INDEX_DATE', 'ONC_LATEST_STAGE',
       'ONC_LATEST_STAGE_DATE', 'MET_DATE1', 'METAC_SITE_NM1', 'MET_DATE2',
       'METAC_SITE_NM2', 'MET_DATE3',
       ...
       'ONC_TNM_DATE', 'ONC_LATEST_T', 'ONC_LATEST_T_DATE', 'ONC_LATEST_N',
       'ONC_LATEST_N_DATE', 'ONC_LATEST_M', 'ONC_LATEST_M_DATE',
       'ONC_LATEST_GLEASON', 'ONC_LATEST_GLEASON_DATE', 'PN_FLAG'],
      dtype='object', length=186)

Index(['PATIENT_ID', 'PN_FLAG', 'LOOKBACK_DATE', 'INDEX_DATE', 'DEIDCRPC2FLAG',
       'DEIDCRPC2DATE', 'DEIDCRPC3FLAG', 'DEIDCRPC3DATE', 'DEIDMETASTATICFLAG',
       'DEIDMETASTATICDATE', 'MET_DATE1', 'METAC_SITE_NM1', 'URO_PSA1',
       'URO_PSA_DATE1', 'URO_PSA2', 'URO_PSA_DATE2', 'URO_PSA3',
       'URO_PSA_DATE3', 'URO_PSA4', 'URO_PSA_DATE4', 'URO_PSA5',
       'URO_PSA_DATE5', 'URO_PSA6', 'URO_PSA_DATE6', 'URO_PSA7',
       'URO_PSA_DATE7', 'URO_PSA8', 'URO_PSA_DATE8', 'URO_PSA9',
       'URO_PSA_DATE9', 'URO_PSA10', 'URO_PSA_DATE10', 'URO_PSA11',
       'URO_PSA_DATE11', 'URO_PSA12', 'URO_PSA_DATE12', 'URO_PSA13',
       'URO_PSA_DATE13', 'URO_PSA14', 'URO_PSA_DATE14', 'URO_PSA15',
       'URO_PSA_DATE15', 'URO_PSA16', 'URO_PSA_DATE16', 'URO_PSA17',
       'URO_PSA_DATE17', 'URO_TOTAL_TESTO1', 'URO_TOTAL_TESTO_DATE1',
       'URO_TOTAL_TESTO2', 'URO_TOTAL_TESTO_DATE2', 'URO_TOTAL_TESTO3',
       'URO_TOTAL_TESTO_DATE3', 'URO_TOTAL_TESTO4', 'URO_TOTAL_TESTO_DATE4',
       'URO_TO

## Make dummy variables 

In [13]:
# Rename the one remaining metastasis-site column so that the dummy columns built
# from it later get the prefix 'METAC_SITE', intended to match the oncology dummy
# columns (the oncology side is not built in this notebook).
df_uro = df_uro.rename(columns = {'METAC_SITE_NM1': 'METAC_SITE'})

In [14]:
# Find the columns to dummify: non-numeric dtype AND a name that does not end
# in 'DATE' plus optional digits. Four columns are expected, exactly like
# Oncology except for the histology column.
prog = re.compile('DATE[0-9]*$')
ls_not_numeric = [not is_num for is_num in map(pd.api.types.is_numeric_dtype, df_uro.dtypes)]
ls_not_date = [prog.search(colname) is None for colname in df_uro.columns]
ls_both = [all(flags) for flags in zip(ls_not_numeric, ls_not_date)]
# Number of distinct values per selected column
df_uro.loc[:, ls_both].nunique()

METAC_SITE      11
URO_LATEST_T    10
URO_LATEST_N     2
URO_LATEST_M     7
dtype: int64

In [16]:
# Names of the columns selected by the ls_both mask (a pandas Index, despite the ls_ prefix).
ls_other_columns = df_uro.loc[:, ls_both].columns

In [17]:
# Check if we can use clean_string without creating *unwanted* duplicates:
# list the distinct raw values of every column that will be dummified.
# A single dict expression labels each array with its column name and is
# displayed regardless of the ast_node_interactivity setting; a bare
# expression inside a for-loop body is only shown when that setting is "all".
{column: df_uro[column].unique() for column in ls_other_columns}

array(['BONE & CONNECTIVE TISSUE CANCER', 'OTHER CANCER', 'LUNG CANCER',
       'BRAIN & NERVOUS SYSTEM CANCER', 'LIVER CANCER', 'BLADDER CANCER',
       'HEAD & NECK CANCER', 'COLORECTAL CANCER (CRC)', 'KIDNEY CANCER',
       'SKIN CANCER', 'BREAST CANCER'], dtype=object)

array([nan, 'T2c', 'T3a', 'T3b', 'T2b', 'T1c', 'T2a', 'T1', 'T3B', 'T1b',
       'T2C'], dtype=object)

array([nan, 'Nx', 'N0'], dtype=object)

array([nan, 'M1b', 'Mx', 'M1a', 'MX', 'M1B', 'm1c', 'M1c'], dtype=object)

In [134]:
# Normalise the category labels so that case variants seen in the raw values
# (e.g. 'T3b'/'T3B', 'Mx'/'MX') collapse into a single dummy column. Missing
# values (float NaN) are returned unchanged by clean_string.
# Series.map per column replaces DataFrame.applymap, which is deprecated since
# pandas 2.1; the element-wise result is the same.
df_uro[ls_other_columns] = df_uro[ls_other_columns].apply(
    lambda col: col.map(clean_string)
)

In [136]:
# One indicator column per category value; the generated names carry the
# upper-case source column name as prefix, so run them through clean_string too.
df_dummies = pd.get_dummies(df_uro[ls_other_columns]).rename(columns=clean_string)
dummy_colnames = df_dummies.columns.tolist()
df_dummies.head()

Unnamed: 0,metac_site_bladder_cancer,metac_site_bone_connective_tissue_cancer,metac_site_brain_nervous_system_cancer,metac_site_breast_cancer,metac_site_colorectal_cancer_crc,metac_site_head_neck_cancer,metac_site_kidney_cancer,metac_site_liver_cancer,metac_site_lung_cancer,metac_site_other_cancer,...,uro_latest_t_t2b,uro_latest_t_t2c,uro_latest_t_t3a,uro_latest_t_t3b,uro_latest_n_n0,uro_latest_n_nx,uro_latest_m_m1a,uro_latest_m_m1b,uro_latest_m_m1c,uro_latest_m_mx
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [137]:
# MET_DATE1 is dropped because (per the original analysis) it is identical to the index date
df_uro_no_metac = df_uro.drop(labels=['MET_DATE1'], axis='columns')
df_uro_no_metac.head()

Unnamed: 0,PATIENT_ID,PN_FLAG,LOOKBACK_DATE,INDEX_DATE,DEIDCRPC2FLAG,DEIDCRPC2DATE,DEIDCRPC3FLAG,DEIDCRPC3DATE,DEIDMETASTATICFLAG,DEIDMETASTATICDATE,...,URO_FREE_TESTO_DATE7,URO_TNM_DATE,URO_LATEST_T,URO_LATEST_T_DATE,URO_LATEST_N,URO_LATEST_N_DATE,URO_LATEST_M,URO_LATEST_M_DATE,URO_LATEST_GLEASON,URO_LATEST_GLEASON_DATE
0,1563563420,0,2012-05-14,2014-05-14,1,2015-04-20,1,2016-07-20,0,,...,,,,,,,,,,
1,1165766479,0,2012-02-05,2014-02-04,1,2013-01-04,0,,1,,...,,,,,,,,,,
2,91728018,0,2013-01-19,2015-01-19,0,,0,,0,,...,,,,,,,,,,
3,240116111,0,2013-09-29,2015-09-29,0,,0,,0,,...,,2015-08-01,t2c,2015-08-01,,,,,,
4,72619119,0,2012-02-06,2014-02-05,0,,0,,1,,...,,2012-11-01,t3a,2012-11-01,,,,,,


In [138]:
# Drop all columns that are dummified; the indicator columns in df_dummies
# replace them when the frames are concatenated later.
df_uro_no_cat = df_uro_no_metac.drop(columns = ls_other_columns)

In [139]:
# Inspect the last rows to confirm the categorical columns are gone.
df_uro_no_cat.tail()

Unnamed: 0,PATIENT_ID,PN_FLAG,LOOKBACK_DATE,INDEX_DATE,DEIDCRPC2FLAG,DEIDCRPC2DATE,DEIDCRPC3FLAG,DEIDCRPC3DATE,DEIDMETASTATICFLAG,DEIDMETASTATICDATE,...,URO_FREE_TESTO6,URO_FREE_TESTO_DATE6,URO_FREE_TESTO7,URO_FREE_TESTO_DATE7,URO_TNM_DATE,URO_LATEST_T_DATE,URO_LATEST_N_DATE,URO_LATEST_M_DATE,URO_LATEST_GLEASON,URO_LATEST_GLEASON_DATE
3639,559194684,1,2015-04-07,2017-04-06,1,2015-12-03,1,2015-12-03,0,,...,,,,,,,,,,
3640,212971802,1,2012-03-25,2014-03-25,0,,0,,1,2016-02-11,...,,,,,,,,,,
3641,1627395905,1,2013-07-23,2015-07-23,1,2014-11-06,0,,1,,...,,,,,,,,,,
3642,1273908304,1,2014-11-05,2016-11-04,1,2015-05-14,0,,1,2015-09-09,...,,,,,,,,,,
3643,911918575,1,2013-12-15,2015-12-15,0,,0,,1,,...,,,,,2015-01-01,2015-01-01,,,,


In [140]:
# Column-wise concat (axis=1): the remaining original columns followed by the dummy columns.
# Rows are aligned on the index; both frames derive from df_uro, so presumably the
# indexes are identical (verify if rows were filtered in between).
df_total = pd.concat([df_uro_no_cat, df_dummies], axis=1)

In [142]:
# Run every column name through clean_string (the original columns are still
# upper case; the dummy columns are already clean and come out unchanged).
all_colnames = list(map(clean_string, df_total.columns))
df_total.columns = all_colnames
df_total.head()

Unnamed: 0,patient_id,pn_flag,lookback_date,index_date,deidcrpc2flag,deidcrpc2date,deidcrpc3flag,deidcrpc3date,deidmetastaticflag,deidmetastaticdate,...,uro_latest_t_t2b,uro_latest_t_t2c,uro_latest_t_t3a,uro_latest_t_t3b,uro_latest_n_n0,uro_latest_n_nx,uro_latest_m_m1a,uro_latest_m_m1b,uro_latest_m_m1c,uro_latest_m_mx
0,1563563420,0,2012-05-14,2014-05-14,1,2015-04-20,1,2016-07-20,0,,...,0,0,0,0,0,0,0,0,0,0
1,1165766479,0,2012-02-05,2014-02-04,1,2013-01-04,0,,1,,...,0,0,0,0,0,0,0,0,0,0
2,91728018,0,2013-01-19,2015-01-19,0,,0,,0,,...,0,0,0,0,0,0,0,0,0,0
3,240116111,0,2013-09-29,2015-09-29,0,,0,,0,,...,0,1,0,0,0,0,0,0,0,0
4,72619119,0,2012-02-06,2014-02-05,0,,0,,1,,...,0,0,1,0,0,0,0,0,0,0


## Save files 

In [144]:
# Build the output column order: 'pn_flag' goes directly after the first
# column, every other column keeps its relative position.
ls_columns_reordered = df_total.columns.tolist()
del ls_columns_reordered[ls_columns_reordered.index('pn_flag')]
ls_columns_reordered = [ls_columns_reordered[0], 'pn_flag', *ls_columns_reordered[1:]]

In [145]:
# Column names must be unique: selecting df_total[ls_columns_reordered] with a
# repeated name would silently duplicate columns in the saved file.
assert len(set(ls_columns_reordered)) == len(ls_columns_reordered), \
    "duplicate column names after cleaning"
# Number of distinct column names, displayed for reference
len(set(ls_columns_reordered))

105

In [150]:
# Apply the column order held in ls_columns_reordered; df_total is rebound to the re-ordered selection.
df_total = df_total[ls_columns_reordered]

In [151]:
# Final look at the frame before it is written to disk.
df_total.head()

Unnamed: 0,patient_id,pn_flag,lookback_date,index_date,deidcrpc2flag,deidcrpc2date,deidcrpc3flag,deidcrpc3date,deidmetastaticflag,deidmetastaticdate,...,uro_latest_t_t2b,uro_latest_t_t2c,uro_latest_t_t3a,uro_latest_t_t3b,uro_latest_n_n0,uro_latest_n_nx,uro_latest_m_m1a,uro_latest_m_m1b,uro_latest_m_m1c,uro_latest_m_mx
0,1563563420,0,2012-05-14,2014-05-14,1,2015-04-20,1,2016-07-20,0,,...,0,0,0,0,0,0,0,0,0,0
1,1165766479,0,2012-02-05,2014-02-04,1,2013-01-04,0,,1,,...,0,0,0,0,0,0,0,0,0,0
2,91728018,0,2013-01-19,2015-01-19,0,,0,,0,,...,0,0,0,0,0,0,0,0,0,0
3,240116111,0,2013-09-29,2015-09-29,0,,0,,0,,...,0,1,0,0,0,0,0,0,0,0
4,72619119,0,2012-02-06,2014-02-05,0,,0,,1,,...,0,0,1,0,0,0,0,0,0,0


In [158]:
# Write the cleaned frame with dummy columns to CSV; index=False leaves the row index out of the file.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_total.to_csv(r'F:\Projects\Pfizer_mCRPC\Data\pre_modelling\EMR_Urology\01_Urology_EMR_cleaned_with_dummies.csv', index=False)