## Playing around with data loading approaches

In [1]:
import glob
import pylab
import random
import numpy as np
import pandas as pd
import pydicom
import tensorflow as tf
tf.enable_eager_execution()

### Step 1 is to quickly make a train/validate set from training csv, would like to also keep in mind the ratio of 1:3 class 1 to 0 and maintain that in both train and validation.

In [2]:
data_path = '/home/keil/data/RSNA-pneumonia/stage_1/'
train_img = 'stage_1_train_images/'
train_csv = 'stage_1_train_labels.csv'
######## TEST DUMMY ########
dummy_path = '/home/keil/data/RSNA-pneumonia/stage_dummy/'
dummy_csv = 'stage_dummy_train_labels.csv'
dummy_train = 'dummy_train.csv'
dummy_val = 'dummy_val.csv'

In [145]:
# load data csv file into pandas, split into two dataframes for Target = 0/1
# Randomly draw 25% from both DFs into two other DFs
# Concat back the old DFs and the two new DFs, calling the first train and new one valid
# save to csv, giving us both train and validate csvs with a 75/25 split and even class balance between both.
# from the EDA notebook:
    # Positive: 8964
    # Negative: 20025
    # Ratio of 1.0 to 3.2 pos to neg

# constants
shuffle_seed = 10 #to argparse later for the random shuffle and draw
pneumonia = 1
valid_percent = 0.25

# create DFs
df = pd.read_csv(dummy_path + dummy_csv)  #df.shape = (28989, 6)
df_1 = df[df['Target'] >= pneumonia].reset_index(drop=True) #shape = (8964, 6)
df_0 = df[df['Target'] < pneumonia].reset_index(drop=True) #shape = (20025, 6)

# Create subsamples of both class DFs with an amount = valid_percent
df_1_valid = df_1.sample(frac=valid_percent,random_state=shuffle_seed).sort_index()
df_0_valid = df_0.sample(frac=valid_percent,random_state=shuffle_seed).sort_index()

# Using the subdsample lets get the symmetric diference or disjoint from the parent sets
class_1_diff = df_1.index.symmetric_difference(df_1_valid.index).tolist()
class_0_diff = df_0.index.symmetric_difference(df_0_valid.index).tolist()

# Create our training DFs based on that subset from above
df_1_train = df_1.iloc[class_1_diff]
df_0_train = df_0.iloc[class_0_diff]

# Check that our subset DFs for train and valid are equal in size to df_1 and df_0
assert df_1_valid.shape[0] + df_1_train.shape[0] == df_1.shape[0]
assert df_0_valid.shape[0] + df_0_train.shape[0] == df_0.shape[0]

#concat DFs
df_train = pd.concat([df_1_train, df_0_train])
df_valid = pd.concat([df_1_valid, df_0_valid])

#check final shapes
assert df_train.shape[0] + df_valid.shape[0] == df.shape[0]

#Write out DFs to CSVs
df_train.to_csv(dummy_path + 'dummy_train.csv',index=False)
df_valid.to_csv(dummy_path + 'dummy_val.csv',index=False)

for _ in [df_train,df_valid]:
    label_bool = _['Target'].tolist()
    data_count = len(label_bool)
    positives = np.sum(label_bool)
    print('Positive: {}\nNegative: {}'.format(positives,(data_count-positives)))
    print('Ratio of {} to {} pos to neg\n'.format(positives/positives,np.round(data_count/positives,3)))


Positive: 6723
Negative: 15019
Ratio of 1.0 to 3.234 pos to neg

Positive: 2241
Negative: 5006
Ratio of 1.0 to 3.234 pos to neg



## Step 2 is to get DICOM images into tf.data.dataset for training...

In [3]:
DATA_PATH = '/home/keil/data/RSNA-pneumonia/stage_1/'
IMG_DIR = 'stage_1_train_images/'
CSV_PATH = DATA_PATH + 'train.csv'

def split_data_labels(csv_path, path):
    """ take CSVs with filepaths/labels and extracts them into parallel lists"""
    filenames = []
    labels = []
    with open(csv_path, 'r') as f:
        next(f)
        for line in f:
            new_line = line.strip().split(',')
            #[0]=patientID (same as DICOM name) [5]=Target
            filenames.append(path + new_line[0]+'.dcm')
            labels.append(int(new_line[5])) #DEBUG float??? was float before
    return filenames,labels

train_imgs, train_labels = split_data_labels(CSV_PATH, DATA_PATH+IMG_DIR)

assert len(train_imgs) == len(train_labels)
print(train_imgs[0])
print(train_labels[0])

/home/keil/data/RSNA-pneumonia/stage_1/stage_1_train_images/00436515-870c-4b36-a041-de91049b9ab4.dcm
1


In [28]:
ds = pydicom.dcmread(train_imgs[0])
image = ds.pixel_array
print(type(image))
image.shape

<class 'numpy.ndarray'>


(1024, 1024)

In [25]:
def build_dataset(data, labels):
    """todo"""
    labels = tf.one_hot(tf.cast(labels, tf.uint8), 1) #cast labels to dim 2 tf obj
#     print(labels)
    #data = pydicom.dcmread(data).pixel_array
    dataset = tf.data.Dataset.from_tensor_slices((data, labels))
#     dataset = dataset.shuffle(len(data))
#     dataset = dataset.repeat()
    dataset = dataset.map(decode)
    dataset = dataset.map(preprocess_img, num_parallel_calls=2)
    # dataset = dataset.map(img_augmentation, num_parallel_calls=2)
#     dataset = dataset.batch(BATCH_SIZE) # (?, x, y) unknown batch size because the last batch will have fewer elements.
#     dataset = dataset.prefetch(PREFETCH_SIZE) #single training step consumes n elements
    print(data[0])
    return dataset

def dicom_to_np(filename):
#     image_string = tf.read_file(filename)
    ds = pydicom.dcmread(filename)
    image = ds.pixel_array
#     print('ji'*50)
#     print(image.shape)
    return image.astype(np.float32)

def decode(filename, label):
    # input = tf.placeholder(tf.float32)
    image_string = tf.read_file(filename)
    return tf.py_func(dicom_to_np, [image_string], tf.float32), label

def preprocess_img(img, label):
    image = tf.image.convert_image_dtype(img, tf.float32) #convert to float values in [0, 1]
#     image = tf.image.resize_images(image, [1024, 1024])
    return image, label

train_dataset = build_dataset(train_imgs, train_labels)


/home/keil/data/RSNA-pneumonia/stage_1/stage_1_train_images/00436515-870c-4b36-a041-de91049b9ab4.dcm


In [26]:
print(train_dataset)

<ParallelMapDataset shapes: (<unknown>, (1,)), types: (tf.float32, tf.float32)>


In [29]:

iterator = train_dataset.make_one_shot_iterator()
for x in iterator:
    print(x)
    break

UnknownError: AttributeError: 'bytes' object has no attribute 'read'
Traceback (most recent call last):

  File "/home/keil/miniconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/script_ops.py", line 158, in __call__
    ret = func(*args)

  File "<ipython-input-25-cdc0b6753722>", line 19, in dicom_to_np
    ds = pydicom.dcmread(filename)

  File "/home/keil/miniconda3/envs/tensorflow/lib/python3.6/site-packages/pydicom/filereader.py", line 886, in dcmread
    force=force, specific_tags=specific_tags)

  File "/home/keil/miniconda3/envs/tensorflow/lib/python3.6/site-packages/pydicom/filereader.py", line 689, in read_partial
    preamble = read_preamble(fileobj, force)

  File "/home/keil/miniconda3/envs/tensorflow/lib/python3.6/site-packages/pydicom/filereader.py", line 627, in read_preamble
    preamble = fp.read(128)

AttributeError: 'bytes' object has no attribute 'read'


	 [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT], token="pyfunc_11"](ReadFile)]] [Op:IteratorGetNextSync]

In [16]:
DenseNet169 = tf.keras.applications.densenet.DenseNet169(include_top=False,
        weights='imagenet',
        input_tensor=None,
        input_shape=(1024, 1024, 3),
        pooling='max',
        classes=2)
last_layer = DenseNet169.output
# print(last_layer)
preds = tf.keras.layers.Dense(1, activation='sigmoid')(last_layer)
model = tf.keras.Model(DenseNet169.input, preds)

# https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
optimizer = tf.train.AdamOptimizer(learning_rate=1,
        beta1=1,
        beta2=1)

optimizer_keras = tf.keras.optimizers.Adam(lr=1,
        beta_1=1,
        beta_2=1,
        decay=0.10)

# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath='./',
        monitor="val_loss",
        verbose=1,
        save_best_only=True)

# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/TensorBoard
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./',
        # histogram_freq=1, #this screwed us over... caused tensorboard callback to fail.. why??? DEBUG !!!!!!
        # batch_size=BATCH_SIZE, # and take this out... and boom.. histogam frequency works. sob
        write_graph=True,
        write_grads=False,
        write_images=True)

print("Compiling Model!")
model.compile(optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'])

print("Beginning to Train Model")
model.fit(train_dataset,
        epochs=1,
        steps_per_epoch=(len(train_labels)//3), #36808 train number
        verbose=1,
        validation_data=None,
        validation_steps=None,  #3197 validation number
        callbacks=None)  #https://towardsdatascience.com/learning-rate-schedules-and-adaptive-learning-rate-methods-for-deep-learning-2c8f433990d1


Compiling Model!
Beginning to Train Model


ValueError: Error when checking target: expected dense_2 to have 2 dimensions, but got array with shape ()

In [4]:
dataset = tf.data.TextLineDataset(data_path+train_csv)

In [14]:
dataset.list_files('train')

<ShuffleDataset shapes: (), types: tf.string>

In [18]:
tmp = dataset.make_one_shot_iterator()
print(type(tmp))

<class 'tensorflow.python.data.ops.iterator_ops.Iterator'>


In [16]:
# sess.run(iterator.initializer)
for x in tmp:
    print(x)
    break

TypeError: 'method' object is not iterable