In [13]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
%tensorflow_version 2.x

import os
import gc
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf
from progressbar import ProgressBar

In [0]:
root = '/content/drive/Shared drives/IML Project/Project'

In [0]:
data_path = os.path.join(root, 'numpy_data')
tf_data_path = os.path.join(root, 'tfrecords_data')

In [0]:
labels = pd.read_csv(os.path.join(root, 'demographics.csv'))

In [0]:
def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [0]:
write_options = tf.io.TFRecordOptions(compression_type='GZIP',
                                      compression_level=9)

In [0]:
def create_tfrecords(file_name, img_data, demographics):
    # pbar = ProgressBar()
    assert img_data.shape[0] == demographics.shape[0]
    with tf.io.TFRecordWriter(os.path.join(tf_data_path, file_name + '.tfrecords'), options=write_options) as writer:
        for i in np.random.choice(list(range(img_data.shape[0])), replace=False, size=(img_data.shape[0])):
            img_3d = img_data[i, :, :, :, :]
            dem_row = demographics.iloc[i]
            channels, height, width = img_3d.shape[0], img_3d.shape[1], img_3d.shape[2]
            img_raw = img_3d.tostring()
            onehot_label = np.eye(3)[dem_row.diagnosis - 1]
            label_raw = onehot_label.tostring()
            
            example = tf.train.Example(features=tf.train.Features(feature={
                'img_channels': _int64_feature(channels),
                'img_height': _int64_feature(height),
                'img_width': _int64_feature(width),
                'img_raw': _bytes_feature(img_raw),
                'sex': _bytes_feature(dem_row.sex.encode()),
                'age': _float_feature(dem_row.age_at_scan),
                'label': _bytes_feature(label_raw)
            }))
            writer.write(example.SerializeToString())
    writer.close()

In [0]:
def load_datasets(type: str):
    if type not in ['train', 'test', 'valid']: raise Exception('Unsupported dataset type')
    train_valid_test = 0 if type == 'train' else 1 if type == 'valid' else 2
    i = 1
    dataset = np.load(os.path.join(data_path, f'img_array_{type}_6k_{i}.npy'))
    while True:
        try:
            i += 1
            dataset = np.vstack((dataset, np.load(os.path.join(data_path, f'img_array_{type}_6k_{i}.npy'))))
        except FileNotFoundError:
            print(f'Loaded all {type} datasets')
            break
    # dataset = np.expand_dims(dataset, axis=1)
    for n in range(dataset.shape[0]):
        dataset[n, :, :] = dataset[n, :, :] / np.amax(dataset[n, :, :].flatten())
    print(f'Normalized {n+1} images')
    dataset = np.reshape(dataset, (-1, 62, 96, 96, 1))
    return dataset, labels[labels.train_valid_test == train_valid_test]

In [30]:
valid_data, valid_demo = load_datasets('valid')
create_tfrecords('validation', valid_data, valid_demo)
del valid_data, valid_demo
gc.collect()

Loaded all valid datasets
Normalized 26970 images


364

In [31]:
test_data, test_demo = load_datasets('test')
create_tfrecords('test', test_data, test_demo)
del test_data, test_demo
gc.collect()

Loaded all test datasets


  from ipykernel import kernelapp as app


Normalized 29078 images


15

In [32]:
train_data, train_demo = load_datasets('train')
create_tfrecords('train', train_data, train_demo)
del train_data, train_demo
gc.collect()

Loaded all train datasets
Normalized 130758 images


66