# Automatic scoring of X-ray images

## Data augmentation

Import dependencies

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# PIL to open & manipulate images
from PIL import Image, ImageOps, ImageChops

# for messages in loops
from IPython.display import clear_output

# to save arrays
import h5py

# for folder-timestamp
from datetime import datetime

# for train/test split
from sklearn.model_selection import train_test_split
# for one-hot encoding
from sklearn.preprocessing import LabelBinarizer
# for class weights
from sklearn.utils import class_weight
# for model evaluation
from sklearn.metrics import confusion_matrix, classification_report
# for efficient loops
import itertools

# keras
from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.utils.io_utils import HDF5Matrix
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Lambda, Activation
from tensorflow.contrib.keras.python.keras.layers.normalization import BatchNormalization
from tensorflow.contrib.keras.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.contrib.keras.python.keras import callbacks


Define image format & random seeds

In [2]:
# Keras tensors are laid out as (rows, cols, channels)
K.set_image_data_format("channels_last")

# one shared seed for TensorFlow and NumPy so that reruns are repeatable
seed = 1
tf.set_random_seed(seed)
np.random.seed(seed)

In [3]:
# read the complete training image array into memory
with h5py.File(name='/data/joint_scoring/img_train.h5', mode='r') as img_file:
    img_train = img_file['img_train'][...]

# read the matching training labels into memory
with h5py.File(name='/data/joint_scoring/labels_train.h5', mode='r') as labels_file:
    labels_train = labels_file['labels_train'][...]

In [5]:
# Random augmentation applied to every generated image:
#   - rotation by up to +/- 25 degrees
#   - shear intensity of up to 0.1
#   - zoom factor drawn from [0.9, 1.1], i.e. +/- 10 %
# NOTE(review): the lower zoom bound used to be 0. Keras draws the zoom
# factor uniformly from [lower, upper], so values close to 0 produced
# extreme, distorted zoom-ins (and a singular transform at exactly 0).
# A symmetric +/- 10 % range is assumed to be the intent -- confirm.
datagen = ImageDataGenerator(
        rotation_range=25,
        shear_range=0.1,
        zoom_range=[0.9, 1.1])

In [6]:
# count the training images per class; np.unique returns the classes sorted,
# so n_img[k] is the count of the k-th smallest label value
class_values, n_img = np.unique(labels_train, return_counts=True)
print(n_img)

[48820 19676  1638   754   435   975]


In [7]:
# total number of images missing if every class were filled up to 50000
print(np.sum(50000 - n_img))

227702


In [None]:
# per-class shortfall relative to the 50000-image target
tmp = np.subtract(50000, n_img)
# normalise the shortfalls so they can be used as sampling probabilities:
# the fewer images a class has, the more often it is drawn for augmentation
probs = tmp / tmp.sum()
print(probs)

[ 0.00518221  0.13317406  0.21239163  0.2162739   0.21767486  0.21530334]


In [None]:
# Generate augmented batches, drawing the class of each batch according to
# `probs` so that under-represented classes are augmented more often.
img_train_augmented = []
labels_train_augmented = []

# number of batches to generate and images per batch
# (2277 * 100 = 227700 images, just below the 227702 shortfall printed above)
n_augmented = 2277
b_size = 100

# Boolean-masking img_train copies every image of the class, so each class
# subset is built once on first use and then reused, instead of being
# re-copied on every one of the n_augmented iterations.
class_images = {}

for i in range(n_augmented):
    # pick the class to augment; one class index per entry in probs
    lbl = np.random.choice(a=len(probs), p=probs)
    if lbl not in class_images:
        class_images[lbl] = img_train[labels_train == lbl]

    # a fresh flow() is created per draw and only its first batch is used
    batch = next(datagen.flow(class_images[lbl], batch_size=b_size))
    if len(batch) != b_size:
        # a short batch would mis-align images and labels and break the
        # rectangular (n_batches, b_size, ...) stacking below
        raise ValueError(
            "class {} yielded {} images, expected a full batch of {}".format(
                lbl, len(batch), b_size))
    img_train_augmented.append(batch)
    labels_train_augmented.append(np.repeat(lbl, len(batch)))

    # report progress after the batch is done so the last message reads 100 %;
    # wait=True replaces the previous message without flicker
    clear_output(wait=True)
    print("Augmenting data, {0:.2f} % finished".format((i + 1)/n_augmented*100))

# release the cached per-class copies
del class_images

# stack the batches: images -> (n_augmented, b_size, ...), labels -> (n_augmented, b_size)
img_train_augmented = np.array(img_train_augmented)

labels_train_augmented = np.array(labels_train_augmented)

print(img_train.shape)
print(img_train_augmented.shape)
print(labels_train.shape)
print(labels_train_augmented.shape)

Augmenting data, 99.96 % finished


In [11]:
# persist the augmented images (mode 'w' overwrites an existing file)
with h5py.File(name='/data/joint_scoring/img_train_augmented.h5', mode='w') as img_file:
    img_file.create_dataset(name="img_train_augmented", data=img_train_augmented)

# persist the labels belonging to the augmented images
with h5py.File(name='/data/joint_scoring/labels_train_augmented.h5', mode='w') as labels_file:
    labels_file.create_dataset(name="labels_train_augmented", data=labels_train_augmented)

In [4]:
# reload the stored augmented images into memory
with h5py.File(name='/data/joint_scoring/img_train_augmented.h5', mode='r') as img_file:
    img_train_augmented = img_file['img_train_augmented'][...]

# reload the stored labels of the augmented images
with h5py.File(name='/data/joint_scoring/labels_train_augmented.h5', mode='r') as labels_file:
    labels_train_augmented = labels_file['labels_train_augmented'][...]

In [5]:
# Merge the leading (batch, image-in-batch) axes into a single sample axis.
# The arrays are overwritten in place, so the reshape is written with -1 and
# the trailing dimensions only: re-running this cell on already flattened
# arrays is a no-op instead of raising IndexError on shape[4] / shape[1].
img_train_augmented = img_train_augmented.reshape(
    (-1,) + img_train_augmented.shape[-3:])
labels_train_augmented = labels_train_augmented.reshape(-1)

# every augmented image must still have exactly one label
assert len(img_train_augmented) == len(labels_train_augmented)

print(img_train.shape)
print(img_train_augmented.shape)
print(labels_train.shape)
print(labels_train_augmented.shape)

(72298, 150, 150, 1)
(227700, 150, 150, 1)
(72298,)
(227700,)


In [6]:
# append the augmented samples to the original training set
img_train_combined = np.concatenate((img_train, img_train_augmented), axis=0)
labels_train_combined = np.concatenate((labels_train, labels_train_augmented), axis=0)

# shuffle images and labels with one shared permutation so pairs stay aligned
idx = np.random.permutation(img_train_combined.shape[0])
img_train_combined = np.take(img_train_combined, idx, axis=0)
labels_train_combined = np.take(labels_train_combined, idx, axis=0)

In [7]:
# persist the shuffled, combined images (mode 'w' overwrites an existing file)
with h5py.File(name='/data/joint_scoring/img_train_combined.h5', mode='w') as img_file:
    img_file.create_dataset(name="img_train_combined", data=img_train_combined)

# persist the labels in the same shuffled order
with h5py.File(name='/data/joint_scoring/labels_train_combined.h5', mode='w') as labels_file:
    labels_file.create_dataset(name="labels_train_combined", data=labels_train_combined)

In [7]:
# reload the combined labels from disk
with h5py.File(name='/data/joint_scoring/labels_train_combined.h5', mode='r') as labels_file:
    labels_train_combined = labels_file['labels_train_combined'][...]

# one-hot encode the class labels (one indicator column per class)
labels_train_combined_onehot = (
    LabelBinarizer()
    .fit(labels_train_combined)
    .transform(labels_train_combined)
)

# store the one-hot matrix next to the other training arrays
with h5py.File(name='/data/joint_scoring/labels_train_combined_onehot.h5', mode='w') as onehot_file:
    onehot_file.create_dataset(name="labels_train_combined_onehot", data=labels_train_combined_onehot)