## Playing around with data loading approaches

In [2]:
import glob
import pylab
import random
import numpy as np
import pandas as pd
import pydicom
import tensorflow as tf

### Step 1 is to quickly make a train/validate set from training csv, would like to also keep in mind the ratio of 1:3 class 1 to 0 and maintain that in both train and validation.

In [19]:
data_path = '/home/keil/data/RSNA-pneumonia/stage_1/'
train_img = 'stage_1_train_images/'
train_csv = 'stage_1_train_labels.csv'
######## TEST DUMMY ########
dummy_path = '/home/keil/data/RSNA-pneumonia/stage_dummy/'
dummy_csv = 'stage_dummy_train_labels.csv'
dummy_train = 'dummy_train.csv'
dummy_val = 'dummy_val.csv'

In [145]:
# load data csv file into pandas, split into two dataframes for Target = 0/1
# Randomly draw 25% from both DFs into two other DFs
# Concat back the old DFs and the two new DFs, calling the first train and new one valid
# save to csv, giving us both train and validate csvs with a 75/25 split and even class balance between both.
# from the EDA notebook:
    # Positive: 8964
    # Negative: 20025
    # Ratio of 1.0 to 3.2 pos to neg

# constants
shuffle_seed = 10 #to argparse later for the random shuffle and draw
pneumonia = 1
valid_percent = 0.25

# create DFs
df = pd.read_csv(dummy_path + dummy_csv)  #df.shape = (28989, 6)
df_1 = df[df['Target'] >= pneumonia].reset_index(drop=True) #shape = (8964, 6)
df_0 = df[df['Target'] < pneumonia].reset_index(drop=True) #shape = (20025, 6)

# Create subsamples of both class DFs with an amount = valid_percent
df_1_valid = df_1.sample(frac=valid_percent,random_state=shuffle_seed).sort_index()
df_0_valid = df_0.sample(frac=valid_percent,random_state=shuffle_seed).sort_index()

# Using the subdsample lets get the symmetric diference or disjoint from the parent sets
class_1_diff = df_1.index.symmetric_difference(df_1_valid.index).tolist()
class_0_diff = df_0.index.symmetric_difference(df_0_valid.index).tolist()

# Create our training DFs based on that subset from above
df_1_train = df_1.iloc[class_1_diff]
df_0_train = df_0.iloc[class_0_diff]

# Check that our subset DFs for train and valid are equal in size to df_1 and df_0
assert df_1_valid.shape[0] + df_1_train.shape[0] == df_1.shape[0]
assert df_0_valid.shape[0] + df_0_train.shape[0] == df_0.shape[0]

#concat DFs
df_train = pd.concat([df_1_train, df_0_train])
df_valid = pd.concat([df_1_valid, df_0_valid])

#check final shapes
assert df_train.shape[0] + df_valid.shape[0] == df.shape[0]

#Write out DFs to CSVs
df_train.to_csv(dummy_path + 'dummy_train.csv',index=False)
df_valid.to_csv(dummy_path + 'dummy_val.csv',index=False)

for _ in [df_train,df_valid]:
    label_bool = _['Target'].tolist()
    data_count = len(label_bool)
    positives = np.sum(label_bool)
    print('Positive: {}\nNegative: {}'.format(positives,(data_count-positives)))
    print('Ratio of {} to {} pos to neg\n'.format(positives/positives,np.round(data_count/positives,3)))


Positive: 6723
Negative: 15019
Ratio of 1.0 to 3.234 pos to neg

Positive: 2241
Negative: 5006
Ratio of 1.0 to 3.234 pos to neg



In [4]:
dataset = tf.data.TextLineDataset(data_path+train_csv)

In [14]:
dataset.list_files('train')

<ShuffleDataset shapes: (), types: tf.string>

In [18]:
tmp = dataset.make_one_shot_iterator()
print(type(tmp))

<class 'tensorflow.python.data.ops.iterator_ops.Iterator'>


In [16]:
# sess.run(iterator.initializer)
for x in tmp:
    print(x)
    break

TypeError: 'method' object is not iterable