In [1]:
import pandas as pd
import pipeline as p
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
pd.set_option('mode.chained_assignment', None)

In [3]:
df = p.open_pkl('breast_2000-2015_df.pkl')

In [4]:
# col_descr_dict = p.open_pkl('col_descr_dict.pkl')
# col_lookup = {v:k for k, v in col_descr_dict.items()}

In [4]:
def convert_to_num_nan(df, col, unk):
    '''Converts col(str) in df(DataFrame) to numeric (integer)
    Then converts values equal to 'unk'(str or int) in col to 'None' '''
    df[col] = pd.to_numeric(df[col], errors='coerce')
    return df[col].apply(lambda x: None if x == unk else x)

In [5]:
def drop_unk(df, col, unk):
    return df[col].apply(lambda x: None if x == unk else x)

In [6]:
df['SRV_TIME_MON'] = convert_to_num_nan(df, 'SRV_TIME_MON', 9999)

In [7]:
df['YEAR_DX'] = pd.to_numeric(df['YEAR_DX'])

Calculation of
Survival months = FLOOR((endpoint – date of diagnosis) / days in a month)

So: Drop any record after Dec 2010 (< 60 mo before study cut-off). 

See: https://seer.cancer.gov/survivaltime/
https://seer.cancer.gov/survivaltime/3-fields-survival-time-active.pdf

In [8]:
df2 = df.drop(df[df['YEAR_DX'] > 2010].index)

In [14]:
len(df2)

387762

### Create target classification

In [9]:
df2['TARGET'] = df2['SRV_TIME_MON'].apply(lambda x: 1 if x >= 60 else 0)

In [11]:
df2['TARGET'].value_counts(1)

1    0.790116
0    0.209884
Name: TARGET, dtype: float64

Be aware: target classification is imbalanced!

**Flags**

Survival Time Flag:
- 0  = complete dates, days of survival = 0
- 1 = complete dates, days of survival > 0
- 2 = Incomplete dates, days of follow-up *could* = 0. 
- 3 = Incomplete dates, days of follow-up *cannot* = 0
- 8 = not calc; death certificate / autopsy only

In [10]:
df2['SRV_TIME_MON_FLAG'] = convert_to_num_nan(df2, 'SRV_TIME_MON_FLAG', 9)

In [13]:
df2['SRV_TIME_MON_FLAG'].value_counts()

1    373548
3     11877
8      1741
0       432
2       164
Name: SRV_TIME_MON_FLAG, dtype: int64

Drop any entries with incomplete dates, to be on the safe side

In [11]:
df2 = df2.drop(df2[(df2['SRV_TIME_MON_FLAG'] == 2) | (df2['SRV_TIME_MON_FLAG'] == 3)].index)

In [20]:
len(df2)

375721

### Select & Clean Feature Columns 
- convert to numeric, where possible
- condense classes, where possible
- convert 9, 99, etc to 'n/a'

#### *Sequence number*

Sequence number = # and sequence of tumors *in a given patient*. 

From codebookm pg 14: 
>"The purpose of sequencing based on the patient’s lifetime is to truly identify the patients for survival analysis who only had one malignant primary in their lifetimes...This sequence number counts all tumors that were reportable in the year they were diagnosed even if the tumors occurred before the registry existed, or before the registry participated in the SEER Program")

- Keep only first entry for a given patient. 
- Include sequence number as a feature. Convert to numeric --> treat as ordinal
Distributions of survival months & diagnosis year appear similar between sequence numbers.
My interpretation of code book is that a seq num > 01 indicates that >1 tumor was found at time of diagnosis, so we can keep all numbers.

In [12]:
df3 = df2.drop_duplicates(subset='PUBCSNUM', keep='first')

In [13]:
df3['SEQ_NUM'] = convert_to_num_nan(df3, 'SEQ_NUM', 99)

In [23]:
vcs(df3, 'SEQ_NUM')

0     263627
1      49718
2      38668
3       4416
4        543
5         74
6          8
50         1
20         1
8          1
7          1
Name: SEQ_NUM, dtype: int64


In [24]:
len(df3)

357058

**Primary Site** 

A str-nominal

     C50.0    Nipple
     C50.1    Central portion of breast
     C50.2    Upper-inner quadrant of breast
     C50.3    Lower-inner quadrant of breast
     C50.4    Upper-outer quadrant of breast
     C50.5    Lower-outer quadrant of breast
     C50.6    Axillary tail of breast
     C50.8    Overlapping lesion of breast
     C50.9    Breast, NOS

In [25]:
vcs(df3, 'PRIMSITE')

C504    117692
C508     72253
C509     64066
C502     34583
C505     22786
C501     21370
C503     19362
C500      2780
C506      2166
Name: PRIMSITE, dtype: int64


**Age**

In [14]:
df3['AGE_DX'] = convert_to_num_nan(df3, 'AGE_DX', 999)

**Grade**

Convert to numeric, to treat as ordinal.

9 = not determined, not stated, or n/a. Drop these

In [15]:
df3['GRADE'] = convert_to_num_nan(df3, 'GRADE', 9)

In [28]:
vcs(df3, 'GRADE')

2.0    128271
3.0    105325
1.0     60140
4.0     11068
Name: GRADE, dtype: int64


**Tumor Size**

- EOD10_SZ: only for cases 1998-2003
- CSTUMSIZ: cases 2004 on

Need to combine into 1 feature. Then drop:
999 = unknown for both

- 001-988 Exact size in millimeters
- 989 989 millimeters or larger
- 990 Microscopic focus or foci only; no size of focus is given
- 991 Described as less than 1 cm
- 992 Described as less than 2 cm
- 993 Described as less than 3 cm
- 994 Described as less than 4 cm
- 995 Described as less than 5 cm
- 996-998 Site-specific codes where needed
- 999 Unknown; size not stated; not stated in patient record 
- 888             Not applicable

In [16]:
def get_one_or_other(row, col1, col2, null_str):
    if row[col1] == null_str:
        return row[col2]
    else:
        return row[col1]

In [17]:
ws = df3['EOD10_SZ'].value_counts().index[0]

df3.loc[:,'TUMSIZ'] = df3.apply(lambda row: get_one_or_other(row, 'EOD10_SZ','CSTUMSIZ', ws), axis=1)
df3.loc[:,'TUMSIZ'] = convert_to_num_nan(df3, 'TUMSIZ', 999)

df3.drop(columns = ['EOD10_SZ', 'CSTUMSIZ'], inplace=True)

df3['TUMSIZ'] = drop_unk(df3, 'TUMSIZ', 888)
df3['TUMSIZ'] = drop_unk(df3, 'TUMSIZ', 990)
df3['TUMSIZ'] = drop_unk(df3, 'TUMSIZ', 996)
df3['TUMSIZ'] = drop_unk(df3, 'TUMSIZ', 997)
df3['TUMSIZ'] = drop_unk(df3, 'TUMSIZ', 998)

df3['TUMSIZ'] = df3['TUMSIZ'].apply(lambda x: x-990 if x >= 991 else x)

In [26]:
df3['TUMSIZ'].describe()

count    305725.000000
mean         21.611798
std          23.995470
min           0.000000
25%          10.000000
50%          16.000000
75%          25.000000
max         989.000000
Name: TUMSIZ, dtype: float64

**Positive Nodes**

- EOD10_PN: 1998+ Number of positive regional nodes found to contain metastases.
- CSLYMPHN: 2004+ "Information on involvement". Don't include this one.

Any value > 90 is 'n/a' for our purposes and can be dropped

In [None]:
df3['EOD10_PN'] = convert_to_num_nan(df3, 'EOD10_PN', 99)
df3 = df3.drop(df3[df3['EOD10_PN'] > 90].index)
df3.rename(columns={'EOD10_PN':'POS_NODES'}, inplace=True)

In [23]:
df3['POS_NODES'].describe()

**Surgery or not**

'NO_SURG' column:
- 0 = surgery performed
- other = surgery not performed, and # = reason why
- 8 = unknown if performed
- 9 = unknown, death certficiate only

Create new columns 'SURG' whose values indicate:
- 1 = surgery was performed
- 0 = surgery was not performed

In [27]:
def surg_class(x):
    if x == '0':
        return 1
    elif (x == '8') | (x == '9'):
        return None
    else:
        return 0

In [28]:
df3['SURG'] = df3['NO_SURG'].apply(lambda x: surg_class(x))

In [29]:
df3['SURG'].value_counts(1)

1.0    0.986628
0.0    0.013372
Name: SURG, dtype: float64

**Marital Status**

- 1 Single (never married)
- 2 Married (including common law) 
- 3 Separated
- 4 Divorced
- 5 Widowed
- 6 Unmarried or domestic partner (same sex or opposite sex or unregistered)
- 9 Unknown

Create new column: 'MAR_STAT_MOD'
- 1 = Single
- 2 = Married or domestic partner

In [30]:
def marital_class(x):
    if x == '9':
        return None
    elif (x == '2') | (x == '6'):
        return int(1)
    else:
        return int(0)

In [31]:
# df3['MAR_STAT_MOD'] = convert_to_num_nan(df3, 'MAR_STAT', 9)
df3['MAR_STAT_MOD'] = df3['MAR_STAT'].apply(lambda x: marital_class(x))

**Race**

Convert to:
- 1 = White
- 2 = Black
- 3 = other

Note 99 = Unk

In [9]:
len(df3['RACE1V'].unique())

30

In [12]:
def convert_race(x):
    if (x == '01') | (x == '02'):
        return x
    elif x == '99':
        return None
    else:
        return '3'

In [33]:
df3['RACE_MOD'] = df3['RACE1V'].apply(lambda x: convert_race(x))

In [48]:
len(df3)

269951

**Behavior**

- 2 = noninvasive 
- 3 = invasive

In [35]:
p.vcs(df3, 'BEHO3V')

3    250578
2     19373
Name: BEHO3V, dtype: int64

In [36]:
df3['INVAS'] = df3['BEHO3V'].apply(lambda x: 0 if x == '2' else 1)

**Histology**

In [38]:
vcs(df3, 'HISTO2V')

8500    199068
8520     21879
8522     18553
8480      5333
8501      3311
8010      3057
8000      2624
8211      2614
8140      1804
8201      1721
8510      1596
8530      1373
8230      1149
8503      1022
8401       720
8541       672
8050       615
8521       416
8543       371
8504       281
8200       179
9020       177
8540       121
8481       106
8070       105
8260        93
8490        76
8022        71
8980        71
8560        57
         ...  
8210         2
8814         1
8511         1
8460         1
8004         1
8910         1
8851         1
8901         1
9473         1
8190         1
8894         1
8340         1
8470         1
8542         1
8110         1
8451         1
8100         1
8261         1
8075         1
8052         1
8076         1
8855         1
9170         1
8042         1
9040         1
8400         1
8804         1
8154         1
8900         1
8030         1
Name: HISTO2V, Length: 108, dtype: int64


too many values -- do not include

**Sub-type**

BRST_SUB = Breast Sub-type: Her2+/- and HR+/-
- 1         Her2+/HR+
- 2     Her2+/HR-
- 3       Her2-/HR+
- 4 Triple Negative
- 5         Unknown
- 9 Not 2010+ Breast

HER2 = Derived Her2 Recode: Her2 + or -
- 1 = Pos
- 2 = Neg
- 3 = Borderline
- 4 = Unk
- 9 = Not 2010+ Breast (unclear what this means)

In [51]:
vcs(df3, 'BRST_SUB')

9    243824
3     15868
5      4191
4      2750
1      2289
2      1029
Name: BRST_SUB, dtype: int64


In [52]:
vcs(df3, 'HER2')

9    243824
2     18669
4      3553
1      3334
3       571
Name: HER2, dtype: int64


If I wanted to include cancer subtype information, I'd have to reduce my dataset size by 10-fold:

In [37]:
len(df3[(df3['BRST_SUB'] != '9') & (df3['BRST_SUB'] != '5') & (df3['HER2'] != '9') & (df3['HER2'] != '4')]), len(df3)

(21936, 269951)

Is HER2 redundant with BRST_SUB?

In [26]:
len(df5[(df5['BRST_SUB'] == '1')|(df5['BRST_SUB'] == '2') & 
        (df5['HER2'] == 1)]) == len(df5[(df5['BRST_SUB'] == '1')|(df5['BRST_SUB'] == '2')])

In [None]:
len(df5[(df5['BRST_SUB'] == '3')|(df5['BRST_SUB'] == '4') 
        & (df5['HER2'] == 0)]) == len(df5[(df5['BRST_SUB'] == '3')|(df5['BRST_SUB'] == '4')])

Yes -- can drop HER2 and keep only BRST_SUB

In [38]:
df5 = df3.copy()
df5 = df5.drop(df5[(df5['BRST_SUB'] == '9') | (df5['BRST_SUB'] == '5')].index)

### Save 'cleaned' dataset

In [55]:
# df3 = p.open_pkl('breast_2000-2015_df3.pkl')
p.pkl_this('Data/breast_2000-2015_df_cleaned1.pkl', df3)

### 'Final' list of columns to include in model

In [57]:
cols_to_include = ['MAR_STAT_MOD',
                   'RACE_MOD',
                   'AGE_DX',
                   'GRADE',
                   'TUMSIZ',
                   'SURG',
                   'SEQ_NUM',
                   'PRIMSITE',
                   'POS_NODES',
                  'INVAS'] 

In [58]:
p.pkl_this('Data/model_cols', cols_to_include)


In [59]:
df4 = df3[cols_to_include + ['TARGET']]

In [60]:
df4.dtypes

MAR_STAT_MOD    float64
RACE_MOD         object
AGE_DX          float64
GRADE           float64
TUMSIZ          float64
SURG            float64
SEQ_NUM           int64
PRIMSITE         object
POS_NODES       float64
INVAS             int64
TARGET            int64
dtype: object

In [61]:
df4 = df4.dropna()

In [62]:
len(df4)

222302

In [63]:
p.pkl_this('Data/breast_mvp_01_df.pkl', df4)

**Data frame including Her2 sub-type info**

In [42]:
cols_to_include2 = ['MAR_STAT_MOD',
                   'RACE_MOD',
                   'AGE_DX',
                   'GRADE',
                   'TUMSIZ',
                   'SURG',
                   'SEQ_NUM',
                   'PRIMSITE',
                   'POS_NODES',
                    'HER2',
                    'BRST_SUB',
                   'INVAS']

In [65]:
# p.pkl_this('Data/model_cols_with_subtype.pkl', cols_to_include2)


In [43]:
df6 = df5[cols_to_include2 + ['TARGET']]


In [44]:
df6 = df6.dropna()


In [45]:
df6.head()


Unnamed: 0,MAR_STAT_MOD,RACE_MOD,AGE_DX,GRADE,TUMSIZ,SURG,SEQ_NUM,PRIMSITE,POS_NODES,HER2,BRST_SUB,INVAS,TARGET
589,0.0,3,72.0,3.0,17.0,1.0,0,C504,0.0,0,3,1,1
688,1.0,3,72.0,2.0,15.0,1.0,2,C504,0.0,0,3,1,1
1108,0.0,3,56.0,3.0,24.0,1.0,3,C508,9.0,0,3,1,0
1267,0.0,3,85.0,3.0,8.0,1.0,2,C508,0.0,0,3,1,1
1702,0.0,3,83.0,1.0,11.0,1.0,2,C508,0.0,0,3,1,1


In [75]:
len(df6)


19671

In [46]:
p.pkl_this('Data/breast_mvp_df_with_sub.pkl', df6)
