In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.impute import SimpleImputer

In [2]:
#send data to rename.ipynb to rename all categories from numbers into words
data = pd.read_csv('data/data_lvi.csv')
print(data.shape)
data.head()


(1997, 29)


Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,...,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,Academic/Research Program,Midwest,52.0,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,...,Non-Expansion State,0.0,0.0,0.0,2A,2.0,0.0,1.0,0,0
1,Integrated Network Cancer Program,Midwest,60.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,10.3,0,...,Non-Expansion State,5400.0,0.0,1.0,3C,3.0,1.0,2.0,0,0
2,Integrated Network Cancer Program,Midwest,69.0,Female,White,Non-Hispanic,Medicare/Public,Rural,90.3,0,...,Non-Expansion State,0.0,1.0,0.0,3A,2.0,1.0,2.0,0,0
3,Integrated Network Cancer Program,Midwest,55.0,Male,White,Hispanic,Medicaid,Urban,5.2,0,...,Early Expansion (before 1/2014),9900.0,0.0,1.0,3A,0.0,1.0,1.0,0,0
4,Integrated Network Cancer Program,Midwest,57.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,12.7,0,...,Early Expansion (before 1/2014),0.0,0.0,0.0,2B,3.0,0.0,1.0,0,0


In [3]:
data['APR'].value_counts(normalize=True)

0.0    0.829244
1.0    0.170756
Name: APR, dtype: float64

In [4]:
data['Stage'].value_counts()

2A    741
3A    369
3C    343
1     243
2B    220
3B     81
Name: Stage, dtype: int64

In [5]:
#ordinal encode the 'Stage' column
data.loc[data['Stage'] == '2A', 'Stage'] = 2
data.loc[data['Stage'] == '2B', 'Stage'] = 3
data.loc[data['Stage'] == '2C', 'Stage'] = 4
data.loc[data['Stage'] == '3A', 'Stage'] = 5
data.loc[data['Stage'] == '3B', 'Stage'] = 6
data.loc[data['Stage'] == '3C', 'Stage'] = 7
data['Stage'].value_counts()


2    741
5    369
7    343
1    243
3    220
6     81
Name: Stage, dtype: int64

In [6]:
data['Charlson-Deyo Score'].value_counts()

0            1610
1             245
3 or more      75
2              67
Name: Charlson-Deyo Score, dtype: int64

In [7]:
data.loc[data['Charlson-Deyo Score'] == '0', 'Charlson-Deyo Score'] = 0
data.loc[data['Charlson-Deyo Score'] == '1', 'Charlson-Deyo Score'] = 1
data.loc[data['Charlson-Deyo Score'] == '2', 'Charlson-Deyo Score'] = 2
data.loc[data['Charlson-Deyo Score'] == '3 or more', 'Charlson-Deyo Score'] = 2
data['Charlson-Deyo Score'].value_counts()

0    1610
1     245
2     142
Name: Charlson-Deyo Score, dtype: int64

In [8]:
#sort columns by proportion of missing values
pd.DataFrame(data.isnull().sum()/data.shape[0]).sort_values(by=0, ascending=False).head(20)

Unnamed: 0,0
"Radiation Dose, >30 Gy",0.524787
"Radiation Dose, Rads",0.524787
"Duration of Radiation, Days",0.197296
Median Income Quartile,0.129194
No High School Degree (%),0.127692
Distance From Facility,0.116174
Facility Type,0.023535
Facility Location,0.023535
Spanish Hispanic Origin,0.017026
N Stage,0.015523


In [9]:
num_cols = ['Age', 'Distance From Facility', 'Tumor Size', 'Diagnosis/Radiation Interval, Days', 'Duration of Radiation, Days', 'Radiation Dose, Rads']
cat_cols = [x for x in data.columns if x not in num_cols]
cat_cols.remove('APR')
cat_cols.remove('YEAR_OF_DIAGNOSIS')

In [10]:
#set datatype of categorical columns to object
cat_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
for col in cat_cols:
    data[col] = data[col].astype(str)
for x in cat_cols:
    data[x] = cat_imputer.fit_transform(data[x].values.reshape(-1,1))
num_imputer = SimpleImputer(strategy='median')
for x in num_cols:
    data[x] = num_imputer.fit_transform(data[x].values.reshape(-1,1))
#sort columns by proportion of missing values
pd.DataFrame(data.isnull().sum()/data.shape[0]).sort_values(by=0, ascending=False).head(5)

Unnamed: 0,0
Facility Type,0.0
"Duration of Radiation, Days",0.0
Keratinizing,0.0
"Facility Volume, Quartile",0.0
N Stage,0.0


In [11]:
data.head()

Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,...,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,Academic/Research Program,Midwest,52.0,Female,White,Non-Hispanic,Not Insured,Urban,3.6,0,...,Non-Expansion State,0.0,0.0,0.0,2,2.0,0.0,1.0,0,0
1,Integrated Network Cancer Program,Midwest,60.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,10.3,0,...,Non-Expansion State,5400.0,0.0,1.0,7,3.0,1.0,2.0,0,0
2,Integrated Network Cancer Program,Midwest,69.0,Female,White,Non-Hispanic,Medicare/Public,Rural,90.3,0,...,Non-Expansion State,0.0,1.0,0.0,5,2.0,1.0,2.0,0,0
3,Integrated Network Cancer Program,Midwest,55.0,Male,White,Hispanic,Medicaid,Urban,5.2,0,...,Early Expansion (before 1/2014),9900.0,0.0,1.0,5,0.0,1.0,1.0,0,0
4,Integrated Network Cancer Program,Midwest,57.0,Female,White,Non-Hispanic,Private Insurance or Managed Care,Urban,12.7,0,...,Early Expansion (before 1/2014),0.0,0.0,0.0,3,3.0,0.0,1.0,0,0


In [12]:
pd.crosstab(data['Lymphovascular Invasion'], data['YEAR_OF_DIAGNOSIS'])

YEAR_OF_DIAGNOSIS,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Lymphovascular Invasion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,203,150,217,212,214,198,226,172,48,21
Yes,25,37,26,43,52,38,46,39,19,11


In [13]:
pd.crosstab(data['Lymphovascular Invasion'], data['YEAR_OF_DIAGNOSIS'], normalize='columns')

YEAR_OF_DIAGNOSIS,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Lymphovascular Invasion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
No,0.890351,0.802139,0.893004,0.831373,0.804511,0.838983,0.830882,0.815166,0.716418,0.65625
Yes,0.109649,0.197861,0.106996,0.168627,0.195489,0.161017,0.169118,0.184834,0.283582,0.34375


In [14]:
data.loc[data['Lymphovascular Invasion'] == 'No', 'Lymphovascular Invasion'] = 0
data.loc[data['Lymphovascular Invasion'] == 'Yes', 'Lymphovascular Invasion'] = 2
data.loc[data['Lymphovascular Invasion'] == 'nan', 'Lymphovascular Invasion'] = 1
data['Lymphovascular Invasion'].value_counts()

0    1661
2     336
Name: Lymphovascular Invasion, dtype: int64

In [15]:
data['Median Income Quartile'].value_counts()

>=$46,000            582
$30,000 - $34,999    426
$35,000 - $45,999    390
< $30,000            341
nan                  258
Name: Median Income Quartile, dtype: int64

In [16]:
data.loc[data['Median Income Quartile'] == 'nan', 'Median Income Quartile'] = 0
data.loc[data['Median Income Quartile'] == '< $30,000', 'Median Income Quartile'] = 1
data.loc[data['Median Income Quartile'] == '$30,000 - $34,999', 'Median Income Quartile'] = 2
data.loc[data['Median Income Quartile'] == '$35,000 - $45,999', 'Median Income Quartile'] = 3
data.loc[data['Median Income Quartile'] == '>=$46,000', 'Median Income Quartile'] = 4
data['Median Income Quartile'].value_counts()

4    582
2    426
3    390
1    341
0    258
Name: Median Income Quartile, dtype: int64

In [17]:
data['No High School Degree (%)'].value_counts()

13.0-20.9%    503
7.0-12.9%     451
<7.0%         435
>=21.0%       353
nan           255
Name: No High School Degree (%), dtype: int64

In [18]:
data.loc[data['No High School Degree (%)'] == 'nan', 'No High School Degree (%)'] = 0
data.loc[data['No High School Degree (%)'] == '<7.0%', 'No High School Degree (%)'] = 1
data.loc[data['No High School Degree (%)'] == '7.0-12.9%', 'No High School Degree (%)'] = 2
data.loc[data['No High School Degree (%)'] == '13.0-20.9%', 'No High School Degree (%)'] = 3
data.loc[data['No High School Degree (%)'] == '>=21.0%', 'No High School Degree (%)'] = 4
data['No High School Degree (%)'].value_counts()


3    503
2    451
1    435
4    353
0    255
Name: No High School Degree (%), dtype: int64

In [19]:
no_encode = ['Charlseon-Deyo Score', 'Lymphovascular Invasion', 'Stage', 'YEAR_OF_DIAGNOSIS', 'Median Income Quartile', 'No High School Degree (%)']
for x in no_encode:
    try:
        cat_cols.remove(x)
    except ValueError:
        pass
for x in cat_cols:
    le = LabelEncoder()
    le.fit(data[x])
    keys = le.classes_
    values = le.transform(le.classes_)
    label_dict = dict(zip(keys, values))
    print('col:', x)
    print(label_dict)
    data[x] = le.transform(data[x])

col: Facility Type
{'Academic/Research Program': 0, 'Community Cancer Program': 1, 'Comprehensive Community Cancer Program': 2, 'Integrated Network Cancer Program': 3, 'nan': 4}
col: Facility Location
{'Midwest': 0, 'NE': 1, 'South': 2, 'West': 3, 'nan': 4}
col: Sex
{'Female': 0, 'Male': 1}
col: Race
{'Asian': 0, 'Black': 1, 'Other': 2, 'White': 3, 'nan': 4}
col: Spanish Hispanic Origin
{'Hispanic': 0, 'Non-Hispanic': 1, 'nan': 2}
col: Primary Payor
{'Medicaid': 0, 'Medicare/Public': 1, 'Not Insured': 2, 'Private Insurance or Managed Care': 3, 'nan': 4}
col: Urban/Rural Classification
{'Rural': 0, 'Suburban': 1, 'Urban': 2, 'nan': 3}
col: Charlson-Deyo Score
{'0': 0, '1': 1, '2': 2}
col: Grade
{'Cell type not determined, not stated or not applicable': 0, 'Grade I, Well differentiated': 1, 'Grade II, Moderately differentiated': 2, 'Grade III, Poorly differentiated': 3, 'Grade IV, Undifferentiated or anaplastic': 4, 'nan': 5}
col: RX_SUMM_CHEMO
{'0.0': 0, '1.0': 1, '2.0': 2, '3.0': 3, '8

In [20]:
#apply the standard scaler to the numerical columns
scaler = StandardScaler()
for x in num_cols:
    data[x] = scaler.fit_transform(data[x].values.reshape(-1,1))

In [21]:
print(data.shape[0] * 0.8)
print(data.shape[0] * 0.2)

1597.6000000000001
399.40000000000003


In [22]:
pd.crosstab(data['YEAR_OF_DIAGNOSIS'], data['APR'], normalize='index')

APR,0.0,1.0
YEAR_OF_DIAGNOSIS,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,0.864035,0.135965
2011,0.860963,0.139037
2012,0.90535,0.09465
2013,0.862745,0.137255
2014,0.853383,0.146617
2015,0.851695,0.148305
2016,0.871324,0.128676
2017,0.843602,0.156398
2018,0.223881,0.776119
2019,0.0,1.0


In [23]:
pd.crosstab(data['YEAR_OF_DIAGNOSIS'], data['APR'])

APR,0.0,1.0
YEAR_OF_DIAGNOSIS,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,197,31
2011,161,26
2012,220,23
2013,220,35
2014,227,39
2015,201,35
2016,237,35
2017,178,33
2018,15,52
2019,0,32


In [24]:
data['YEAR_OF_DIAGNOSIS'] = data['YEAR_OF_DIAGNOSIS'].astype(int)

In [25]:
data = data.loc[data['YEAR_OF_DIAGNOSIS'] != 2019]
# data = data.loc[data['YEAR_OF_DIAGNOSIS'] != 2018]

In [37]:
data.head()

Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,...,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,0,0,-0.757684,0,3,1,2,2,-0.21595,0,...,2,-1.922006,0.0,0,2,2,0,0,0,0
1,3,0,-0.036905,0,3,1,3,2,-0.147744,0,...,2,0.583815,0.0,1,7,3,1,1,0,0
2,3,0,0.773971,0,3,1,1,0,0.666651,0,...,2,-1.922006,1.0,0,5,2,1,1,0,0
3,3,0,-0.487392,1,3,0,0,2,-0.199662,0,...,0,2.671998,0.0,1,5,0,1,0,0,0
4,3,0,-0.307197,0,3,1,3,2,-0.123312,0,...,0,-1.922006,0.0,0,3,3,0,0,0,0


In [39]:
data.to_csv('data/data_lvi.csv', index=False)

In [26]:
test = data.loc[data['YEAR_OF_DIAGNOSIS'] >= 2016]
train = data.loc[data['YEAR_OF_DIAGNOSIS'] < 2016]
print(train.shape, test.shape)

(1415, 29) (550, 29)


In [27]:
print(test.shape[0] / data.shape[0])

0.27989821882951654


In [28]:
data.iloc[:,0:15].head()

Unnamed: 0,Facility Type,Facility Location,Age,Sex,Race,Spanish Hispanic Origin,Primary Payor,Urban/Rural Classification,Distance From Facility,Charlson-Deyo Score,YEAR_OF_DIAGNOSIS,Grade,Tumor Size,Lymphovascular Invasion,"Diagnosis/Radiation Interval, Days"
0,0,0,-0.757684,0,3,1,2,2,-0.21595,0,2016,2,-0.144255,0,1.922468
1,3,0,-0.036905,0,3,1,3,2,-0.147744,0,2013,2,0.739418,2,-1.028002
2,3,0,0.773971,0,3,1,1,0,0.666651,0,2016,0,-0.545925,0,-0.897834
3,3,0,-0.487392,1,3,0,0,2,-0.199662,0,2018,0,-1.6706,0,-0.160217
4,3,0,-0.307197,0,3,1,3,2,-0.123312,0,2015,2,0.538583,0,-1.11478


In [29]:
data.iloc[:,15:30].head()

Unnamed: 0,"Duration of Radiation, Days",RX_SUMM_CHEMO,No High School Degree (%),Median Income Quartile,Medicaid Expansion,"Radiation Dose, Rads",APR,"Radiation Dose, >30 Gy",Stage,T Stage,N Stage,"Facility Volume, Quartile",Keratinizing,Basaloid
0,-0.100238,3,4,1,2,-1.922006,0.0,0,2,2,0,0,0,0
1,-0.303256,3,2,3,2,0.583815,0.0,1,7,3,1,1,0,0
2,1.219379,3,3,1,2,-1.922006,1.0,0,5,2,1,1,0,0
3,-0.100238,3,3,2,0,2.671998,0.0,1,5,0,1,0,0,0
4,-2.02891,3,1,4,0,-1.922006,0.0,0,3,3,0,0,0,0


In [36]:
test.to_csv('data/test_lvi.csv', index=False)
train.to_csv('data/train_lvi.csv', index=False)

In [40]:
train['YEAR_OF_DIAGNOSIS'].value_counts()

2014    266
2013    255
2012    243
2015    236
2010    228
2011    187
Name: YEAR_OF_DIAGNOSIS, dtype: int64

In [41]:
test['YEAR_OF_DIAGNOSIS'].value_counts()

2016    272
2017    211
2018     67
Name: YEAR_OF_DIAGNOSIS, dtype: int64

In [42]:
cols = data.columns.to_list()
num_cols = ['Age', 'Distance From Facility', 'Tumor Size', 'Diagnosis/Radiation Interval, Days', 'Duration of Radiation, Days', 'Radiation Dose, Rads']
cat_cols = [x for x in cols if x not in num_cols]

incl = data.columns.to_list()
incl.remove('APR')
cat_cols.remove('APR')

In [None]:
from tableone import TableOne
tableone = TableOne(data, columns=incl, categorical=cat_cols, groupby='APR', pval=True)
tableone.to_excel('results/table1_lvi.xlsx')