In [1]:
import pandas as pd
import numpy as np
import re
import sys

sys.path.append('F:\Frederik\pfizer_mcrpc\paqc')

from paqc.utils import config_utils, utils
from paqc.connectors import csv

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Function to clean up newly created column names in the dummies

def clean_string(s):
    """
    Function that cleans up string, typically used for column names. Changes
    the incoming string s so that:
        - only lowercase
        - white spaces replaced by underscores
        - brackets replaced by underscores
        - multiple underscores in a row replaced by one
          underscore (e.g. ' (' -> '__' -> '_'
    :param s: string
    :return: new string
    """
    if isinstance(s, float):
        return s
    # only lowercase
    else:
        s = s.lower()
        # whites spaces, brackets, &, ;, , replaced by underscores
        s = re.sub(r'( )|(\()|(\))|(;)|(,)|(&)|(:)', '_', s)
        # multiple underscores in a row replaced by one underscore
        s = re.sub(r'(_)\1+', r'\1', s)
        # trailing underscores are removed
        s = re.sub(r'[_]+$', '', s)
        return s

In [29]:
dict_config = config_utils.config_open(r'general_config_EMR_combined.yml')[1]

In [30]:
df_EMR = csv.read_csv(dict_config, r'F:\Projects\Pfizer_mCRPC\Data\pre_modelling\EMR_combined\Pfizer_mCRPC_EMR_combined.csv')

  if self.run_code(code, result):


In [31]:
df_EMR.shape
df_EMR.head()

(8524, 186)

Unnamed: 0,deidcrpc2date,deidcrpc2flag,deidcrpc3date,deidcrpc3flag,deidmetastaticdate,deidmetastaticflag,ecog1,ecog10,ecog11,ecog12,...,total_testo_date17,total_testo_date18,total_testo_date2,total_testo_date3,total_testo_date4,total_testo_date5,total_testo_date6,total_testo_date7,total_testo_date8,total_testo_date9
0,2015-04-20,1.0,2016-07-20,1.0,NaT,0.0,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
1,NaT,0.0,NaT,0.0,NaT,0.0,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2016-06-22,1.0,2016-06-22,1.0,NaT,0.0,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,2014-02-14,1.0,NaT,0.0,NaT,1.0,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
4,2017-08-14,1.0,NaT,0.0,NaT,1.0,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [7]:
# Get rid of some columns that have their dates after the index date
ls_wrong_dates = ['deidcrpc2date','deidcrpc2flag','deidcrpc3date','deidcrpc3flag','deidmetastaticdate','deidmetastaticflag', 
                 'latest_stage', 'latest_stage_date']

In [8]:
df_EMR = df_EMR.drop(columns=ls_wrong_dates)

## Make dummies 

In [9]:
df_categorical = df_EMR.select_dtypes(exclude=['float64', 'datetime64', 'int64'])

In [10]:
# Overview of unique values in these categorical columns, we clean these strings 
# before we make them dummies, to avoid that M1B and M1b e.g. become two different
# columns

for column in df_categorical:
    df_categorical[column].unique()

array(['ADENOCARCINOMA, NOS', 'GENITOURINARY: PROSTATE CANCER', nan,
       'ADENOCARCINOMA', 'TRANSITIONAL CELL CARCINOMA',
       'SMALL CELL CARCINOMA', 'SMALL CELL CARCINOMA, NOS', 'SMALL CELL',
       'CARCINOMA, NOS'], dtype=object)

array([nan, 'M1', 'M0', 'M1b', 'M1c', 'M1a', 'Mx', 'MX', 'M1B', 'm1c'], dtype=object)

array([nan, 'N1', 'N0', 'N1a', 'N2', 'N0(i-)', 'N2b', 'N2a', 'N3a', 'N3',
       'N1mi', 'N1b', 'N2c', 'N0(i+)', 'N1c', 'Nx'], dtype=object)

array([nan, 'T2', 'T1c', 'T3b', 'T2c', 'T3', 'T2b', 'T1', 'T4', 'T3a',
       'T0', 'T4a', 'Tis (DCIS)', 'T2a', 'T1a', 'T1b', 'T4b', 'T3c', 'Tis',
       'Tis (LCIS)', 'T3B', 'T2C'], dtype=object)

array(['BONE & CONNECTIVE TISSUE CANCER', 'OTHER CANCER', 'LUNG CANCER',
       'BRAIN & NERVOUS SYSTEM CANCER', 'LIVER CANCER', 'BLADDER CANCER',
       'HEAD & NECK CANCER', 'COLORECTAL CANCER (CRC)', 'SKIN CANCER',
       'BREAST CANCER', 'KIDNEY CANCER'], dtype=object)

In [11]:
# exception, we do not want that N0(i-) and N0(i+) become the same
map_values = {'N0(i-)': 'N0(i_minus)', 'N0(i+)': 'N0(i_plus)'}
df_categorical['latest_n'] = df_categorical['latest_n'].replace(map_values)
df_categorical['latest_n'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([nan, 'N1', 'N0', 'N1a', 'N2', 'N0(i_minus)', 'N2b', 'N2a', 'N3a',
       'N3', 'N1mi', 'N1b', 'N2c', 'N0(i_plus)', 'N1c', 'Nx'], dtype=object)

In [12]:
df_categorical = df_categorical.applymap(clean_string)

In [13]:
df_dummies = pd.get_dummies(df_categorical)

In [14]:
df_not_categorical = df_EMR.select_dtypes(include=['float64', 'datetime64', 'int64'])
df_not_categorical.shape
df_not_categorical.head()

(8524, 173)

Unnamed: 0,ecog1,ecog10,ecog11,ecog12,ecog13,ecog14,ecog15,ecog16,ecog17,ecog18,...,total_testo_date17,total_testo_date18,total_testo_date2,total_testo_date3,total_testo_date4,total_testo_date5,total_testo_date6,total_testo_date7,total_testo_date8,total_testo_date9
0,,,,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
1,,,,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,,,,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,,,,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
4,,,,,,,,,,,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT


In [15]:
df_EMR_with_dummies = pd.concat([df_not_categorical, df_dummies], axis=1)

## Make date difference variables 

In [16]:
df_EMR_with_dummies.dtypes.unique()

array([dtype('float64'), dtype('<M8[ns]'), dtype('int64'), dtype('uint8')], dtype=object)

In [17]:
df_datecols = df_EMR_with_dummies.select_dtypes(include=['<M8[ns]']).drop(columns=['index_date', 'lookback_date'])
df_datecols.shape

(8524, 87)

In [18]:
# Creating date difference variables
df_dd = pd.DataFrame()
for date_col in df_datecols:
    dd_col = date_col + '_index_dd'
    df_dd[dd_col] = (df_EMR['index_date'] - df_datecols[date_col]).dt.days

In [19]:
df_dd.shape
df_dd.head()

(8524, 87)

Unnamed: 0,ecog_date1_index_dd,ecog_date10_index_dd,ecog_date11_index_dd,ecog_date12_index_dd,ecog_date13_index_dd,ecog_date14_index_dd,ecog_date15_index_dd,ecog_date16_index_dd,ecog_date17_index_dd,ecog_date18_index_dd,...,total_testo_date17_index_dd,total_testo_date18_index_dd,total_testo_date2_index_dd,total_testo_date3_index_dd,total_testo_date4_index_dd,total_testo_date5_index_dd,total_testo_date6_index_dd,total_testo_date7_index_dd,total_testo_date8_index_dd,total_testo_date9_index_dd
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [20]:
df_nodates = df_EMR_with_dummies.select_dtypes(exclude=['<M8[ns]'])
df_EMR_dd_dummies = pd.concat([df_dd, df_nodates], axis=1)
df_EMR_dd_dummies.shape
df_EMR_dd_dummies.head()

(8524, 230)

Unnamed: 0,ecog_date1_index_dd,ecog_date10_index_dd,ecog_date11_index_dd,ecog_date12_index_dd,ecog_date13_index_dd,ecog_date14_index_dd,ecog_date15_index_dd,ecog_date16_index_dd,ecog_date17_index_dd,ecog_date18_index_dd,...,metac_site_nm1_bone_connective_tissue_cancer,metac_site_nm1_brain_nervous_system_cancer,metac_site_nm1_breast_cancer,metac_site_nm1_colorectal_cancer_crc,metac_site_nm1_head_neck_cancer,metac_site_nm1_kidney_cancer,metac_site_nm1_liver_cancer,metac_site_nm1_lung_cancer,metac_site_nm1_other_cancer,metac_site_nm1_skin_cancer
0,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
1,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
2,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
3,,,,,,,,,,,...,0,0,0,0,0,0,0,0,1,0
4,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0


## Get rid of sparse data

In [22]:
# Some columns are too spare to calculate bivariate stats out
ss_sparse = (~df_EMR_dd_dummies.isnull()).sum() < 3
ls_sparse_cols = ss_sparse[ss_sparse].index.tolist()

In [23]:
df_EMR_dd_dummies_no_sparse = df_EMR_dd_dummies.drop(columns=ls_sparse_cols)

In [24]:
df_EMR_dd_dummies_no_sparse.head()

Unnamed: 0,ecog_date1_index_dd,ecog_date10_index_dd,ecog_date11_index_dd,ecog_date12_index_dd,ecog_date13_index_dd,ecog_date14_index_dd,ecog_date15_index_dd,ecog_date16_index_dd,ecog_date17_index_dd,ecog_date18_index_dd,...,metac_site_nm1_bone_connective_tissue_cancer,metac_site_nm1_brain_nervous_system_cancer,metac_site_nm1_breast_cancer,metac_site_nm1_colorectal_cancer_crc,metac_site_nm1_head_neck_cancer,metac_site_nm1_kidney_cancer,metac_site_nm1_liver_cancer,metac_site_nm1_lung_cancer,metac_site_nm1_other_cancer,metac_site_nm1_skin_cancer
0,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
1,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
2,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
3,,,,,,,,,,,...,0,0,0,0,0,0,0,0,1,0
4,,,,,,,,,,,...,1,0,0,0,0,0,0,0,0,0


## Save files to CSV

In [25]:
df_EMR_dd_dummies.to_csv(r'F:\Projects\Pfizer_mCRPC\Data\pre_modelling\EMR_combined\02_EMR_combined_with_dd_with_dummies.csv')
df_EMR_dd_dummies_no_sparse.to_csv(r'F:\Projects\Pfizer_mCRPC\Data\pre_modelling\EMR_combined\02_EMR_combined_with_dd_with_dummies_no_sparse.csv')