In [1]:
# Check CUDA Version
!cat /usr/local/cuda/version.txt;
# !nvcc --version
# !cat /usr/local/cuda/include/cudnn.h # | grep CUDNN_MAJOR -A 2

CUDA Version 10.0.130


In [2]:
# Check your versions (conda env tf2)
!python --version;
!pip freeze | grep tensorflow;
!pip freeze | grep pandas;
!pip freeze | grep numpy;
!pip freeze | grep fastai;

Python 3.7.4
tensorflow==2.0.0
tensorflow-datasets==1.2.0
tensorflow-estimator==2.0.0
tensorflow-gpu==2.0.0
tensorflow-hub==0.6.0
tensorflow-metadata==0.15.0
pandas==0.25.1
numpy==1.17.2


In [3]:
# Check your versions (conda env tf2)
!python --version;
!pip freeze | grep tensorflow;
!pip freeze | grep pandas;
!pip freeze | grep numpy;
!pip freeze | grep fastai;

Python 3.7.4
tensorflow==2.0.0
tensorflow-datasets==1.2.0
tensorflow-estimator==2.0.0
tensorflow-gpu==2.0.0
tensorflow-hub==0.6.0
tensorflow-metadata==0.15.0
pandas==0.25.1
numpy==1.17.2


In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# from fastai.tabular import *
# from fastai.collab import *

from pathlib import Path
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.metrics import confusion_matrix

import datetime
import typing
import numbers
import os
import unittest
import random

In [4]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Ensure training on one GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

Num GPUs Available:  1
1 Physical GPUs, 1 Logical GPU


In [5]:
path = Path('../mydata')
filename='patients_cleaned.csv'

In [6]:
# PARAMETERS

splits = 5 # number of folds for cross validation
recalls = [0.85,0.90,0.95] # sensitivities used for calculating results


# Imported columns from CSV
desired_cols = ['age','sex','Temp','exam_WBC','exam_Plt', 'Opd_Visit_Date',
                'ER', 'Heart Disease', 'CVA', 'CKD', 'Severe Liver Disease', 
                'DM', 'Hypertension', 'Cancer without Metastasis', 'Cancer with Metastasis',
                'lab_result']

# Features used for training + dependent variable
train_cols = ['age','Temp','exam_WBC','exam_Plt','lab_result']


# Features used for creating validation subgroups (includes features from train_cols)
subgroup_cols = ['age','sex','Temp','exam_WBC','exam_Plt', 'week',
                'ER', 'Heart Disease', 'CVA', 'CKD', 'Severe Liver Disease', 
                'DM', 'Hypertension', 'Cancer without Metastasis', 'Cancer with Metastasis',
                'lab_result']

cont_cols = ['age','Temp','exam_WBC','exam_Plt']
cat_cols = []

# Columns to be dropped after creating validation subgroups
drop_cols = list(set(subgroup_cols) - set(train_cols))

### Prep

In [7]:
# IMPORT DF

df = pd.read_csv(path/filename, usecols=desired_cols)
df = df.iloc[np.random.permutation(len(df))] # randomize data

# Convert Opd_Visit_Date to week of year format
if 'week' not in df.columns and 'Opd_Visit_Date' in df.columns:
    week_numbers = [int(datetime.datetime.strptime(d, "%Y/%m/%d").strftime("%U"))+1 for d in df['Opd_Visit_Date']]
    df.insert(0, 'week', week_numbers)
    df.drop(columns=['Opd_Visit_Date'],inplace=True)
    
# Convert 男 and 女 to 0 and 1 in column 'sex'
df_male_indx = df[df['sex']=='男']
df_female_indx = df[df['sex']=='女']

for i in df_male_indx.index.tolist():
    df.at[int(i),'sex'] = 0
for i in df_female_indx.index.tolist():
    df.at[int(i),'sex'] = 1

In [8]:
dataframe = df

# Equal length subsets of original dataframe
len_df = len(dataframe)
cut_indices = [int(i*(1/splits)*len_df) for i in range(0,splits+1)]
cut_indices = zip(cut_indices[:-1], cut_indices[1:])
subsets = [dataframe[i:j] for i,j in cut_indices]

valids = subsets
trains = [pd.concat(subsets[1:], axis=0)]
for n in range(1,splits):
    trains += [pd.concat(subsets[:n]+subsets[n+1:], axis=0)]

In [9]:
# some tests
assert len(valids[0])+len(trains[0])==len(df)
assert list(valids[0].index) == list(valids[0].index)
assert list(valids[1].index) != list(valids[0].index)
assert list(trains[1].index) != list(trains[0].index)

In [10]:
# Separate Dataframe into Subgroups

frames = [] # temporarily stores dataframes
subgroup_dicts = []

for modelnum in range(1,splits+1):

    dataframe = subsets[modelnum-1] # validation set

    # age
    df_age_under_18 = dataframe[dataframe['age']<18]
    df_age_18_to_65 = dataframe[(dataframe['age']>=18) & (dataframe['age']<65)]
    df_age_over_eq_65 = dataframe[dataframe['age']>=65]

    # sex
    df_female = dataframe[dataframe['sex']==1]
    df_male = dataframe[dataframe['sex']==0]

    # week
    df_wks_35 = dataframe[dataframe['week']<=35]
    df_wks_35_to_40 = dataframe[(dataframe['week']>35) & (dataframe['week']<=40)]
    df_wks_over_40 = dataframe[dataframe['week']>40]

    # Temp
    df_temp_over_eq_38 = dataframe[dataframe['Temp']>=38]
    df_temp_under_38 = dataframe[dataframe['Temp']<38]

    # exam_WBC
    df_wbc_low = dataframe[dataframe['exam_WBC']<=3.2]
    df_wbc_normal = dataframe[(dataframe['exam_WBC']>3.2) & (dataframe['exam_WBC']<10)]
    df_wbc_high = dataframe[dataframe['exam_WBC']>=10]

    # exam_Plt
    df_plt_low = dataframe[dataframe['exam_Plt']<100]
    df_plt_high = dataframe[dataframe['exam_Plt']>=100]

    # Comorbidities
    df_heart_disease = dataframe[dataframe['Heart Disease']==True]
    df_cva = dataframe[dataframe['CVA']==True]
    df_ckd = dataframe[dataframe['CKD']==True]
    df_liver = dataframe[dataframe['Severe Liver Disease']==True]
    df_dm = dataframe[dataframe['DM']==True]
    df_hypertension = dataframe[dataframe['Hypertension']==True]


    df_cancer1 = dataframe[(dataframe['Cancer with Metastasis']==True)]
    df_cancer2 = dataframe[(dataframe['Cancer without Metastasis']==True)]
    df_cancer = pd.concat([df_cancer1, df_cancer2], axis=0)

    df_er = dataframe[dataframe['ER']==True]
    
    overall = dataframe

    frame = [df_age_under_18, df_age_18_to_65, df_age_over_eq_65, df_female, df_male, df_wks_35, df_wks_35_to_40, 
          df_wks_over_40, df_temp_over_eq_38, df_temp_under_38, df_wbc_low, df_wbc_normal, df_wbc_high, 
          df_plt_low, df_plt_high, df_heart_disease, df_cva, df_ckd, df_liver, df_dm, df_hypertension, 
          df_cancer, df_er, overall]

    dfs_names = ['df_age_under_18', 'df_age_18_to_65', 'df_age_over_eq_65', 'df_female', 'df_male', 'df_wks_35', 'df_wks_35_to_40', 
          'df_wks_over_40', 'df_temp_over_eq_38', 'df_temp_under_38', 'df_wbc_low', 'df_wbc_normal', 'df_wbc_high', 
          'df_plt_low', 'df_plt_high', 'df_heart_disease', 'df_cva', 'df_ckd', 'df_liver', 'df_dm', 'df_hypertension', 
          'df_cancer', 'df_er','overall']

    subgroup_dict = {name:frame for (name, frame) in zip(dfs_names, frame)}
    
#     dfs = frame # because of older code... (technical debt)

    # Display Subgroup Sizes

#     print("Dataframes (Model "+str(modelnum)+')'," "*(20-len('dataframes (Model n)')),"| Length", ' ',"| Percent Length")
#     print("-"*50)
#     length = 0
#     for i in range(len(dfs)):
#         print(dfs_names[i],' '*(20-len(dfs_names[i])),'|',len(dfs[i]),' '*(7-len(str(len(dfs[i])))),f'| {100*len(dfs[i])/len(dataframe):0.2f}%')
#         length += len(dfs[i])
        
#     print('-'*42)
#     print('valid_df '+str(modelnum),' '*(20-len('valid_df 1')), '|', len(dataframe),' '*2 ,f' | 100%') 
#     print('\n'*2)

    frames += [frame]
    subgroup_dicts += [subgroup_dict]
    
# RESET DFS

dfs = frames


In [11]:
# drop_cols defined in Parameters section

for model_indx in range(len(dfs)):
    dataframes = dfs[model_indx]
    train_df = trains[model_indx]
    valid_df = subsets[model_indx]
    
    
    # Remove columns of unused features in validation subgroups
    for i in range(len(dataframes)):
        if drop_cols[0] in dataframes[i].columns:
            dataframes[i].drop(columns=drop_cols,inplace=True)

    # Remove columns of unused features in training dataset
    if drop_cols[0] in train_df.columns:
        train_df.drop(columns=drop_cols,inplace=True)

    # Remove columns of unused features in full validation dataset
    if drop_cols[0] in valid_df.columns:
        valid_df.drop(columns=drop_cols,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Training

In [12]:
train = trains[0]
valid = valids[0]

In [14]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, target='target'):
    dataframe = dataframe.copy()
    labels = dataframe.pop(target)
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [15]:
train_ds = df_to_dataset(train, target='lab_result')
valid_ds = df_to_dataset(valid, shuffle=False, target='lab_result')

In [None]:
feature_columns = []

# numeric cols
for col in cont_cols:
    feature_columns.append(tf.feature_column.numeric_column(col))

In [28]:

model = keras.Sequential([
    layers.DenseFeatures(feature_columns),
    
    layers.BatchNormalization(),
    layers.Dense(16,activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dense(16,activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dense(16,activation='relu'),
    
    layers.BatchNormalization(),
    layers.Dense(1,activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy', # categorical_crossentropy for multilabel classification
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=valid_ds,
          epochs=15,
          callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fe55fe1f510>

In [23]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_4 (DenseFeatu multiple                  0         
_________________________________________________________________
batch_normalization_7 (Batch multiple                  16        
_________________________________________________________________
dense_13 (Dense)             multiple                  80        
_________________________________________________________________
batch_normalization_8 (Batch multiple                  64        
_________________________________________________________________
dense_14 (Dense)             multiple                  272       
_________________________________________________________________
batch_normalization_9 (Batch multiple                  64        
_________________________________________________________________
dense_15 (Dense)             multiple                 

In [63]:
def make_pred(model, dataframe, target='lab_result'):
    # makes a prediction with given model
    return model.predict(df_to_dataset(dataframe, shuffle=False, target=target))

In [79]:
def make_targ(dataframe, target='lab_result'):
    # returns targets as numpy array for given dataframe
    return dataframe[target].to_numpy()

In [97]:
name = 'df_female'
preds = make_pred(model,df_female)
targets = make_targ(df_female)

In [98]:
min_recall = 0.9

In [99]:
score = preds
fpr, tpr, thresholds = metrics.roc_curve(targets, score, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)

precision, recall, thresholds = metrics.precision_recall_curve(targets, score, pos_label=1)
recall = np.asarray(recall)
idx = (np.abs(recall - min_recall)).argmin() # Find nearest threshold
thresh = thresholds[idx]

predict_label = [1 if s >= thresh else 0 for s in range(len(score))]
conf_mat = confusion_matrix(targets, predict_label)

TN, FP, FN, TP = conf_mat.flatten()

In [100]:
conf_mat

array([[  1, 189],
       [  0, 295]])

In [101]:
name = 'df_female'
PPV = TP / (TP + FP) if (TP+FP != 0) else 0 # positive predict value
NPV = TN / (TN + FN) if (TN+FN != 0) else 0 # negative predict value
F1 = 2*TP / (2*TP + FP + FN) #
accuracy = (TP + TN) / (TP + TN + FP + FN)
sensitivity = TP /(TP + FN) if (TP+FN != 0) else 0
specificity = TN /(TN + FP) if (TN+FP != 0) else 0
odds_ratio = (TP * TN) /(FP * FN) if (FP*FN != 0) else 0
total = FN+FP+TN+TP


result = [name,sensitivity, specificity, accuracy, PPV, NPV, F1, odds_ratio, TN, FP, FN, TP, total]
result_titles = ['Set','sensitivity', 'specificity', 'accuracy', 'PPV', 'NPV', 'F1', 'odds_ratio', 'TN', 'FP', 'FN', 'TP', 'total']
list(zip(result_titles, result))

[('Set', 'df_female'),
 ('sensitivity', 1.0),
 ('specificity', 0.005263157894736842),
 ('accuracy', 0.6103092783505155),
 ('PPV', 0.609504132231405),
 ('NPV', 1.0),
 ('F1', 0.7573812580231065),
 ('odds_ratio', 0),
 ('TN', 1),
 ('FP', 189),
 ('FN', 0),
 ('TP', 295),
 ('total', 485)]