# Automatic scoring of X-ray images

## Data augmentation

Import dependencies

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# PIL to open & manipulate images
from PIL import Image, ImageOps, ImageChops

# for messages in loops
from IPython.display import clear_output

# to save arrays
import h5py

# for folder-timestamp
from datetime import datetime

# for train/test split
from sklearn.model_selection import train_test_split
# for one-hot encoding
from sklearn.preprocessing import LabelBinarizer
# for class weights
from sklearn.utils import class_weight
# for model evaluation
from sklearn.metrics import confusion_matrix, classification_report
# for efficient loops
import itertools

# keras
from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.utils.io_utils import HDF5Matrix
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Lambda, Activation
from tensorflow.contrib.keras.python.keras.layers.normalization import BatchNormalization
from tensorflow.contrib.keras.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.contrib.keras.python.keras import callbacks


Define image format & random seeds

In [2]:
# One shared seed for NumPy and TensorFlow so that random draws are repeatable.
seed = 1
np.random.seed(seed)
tf.set_random_seed(seed)
# Image tensors are laid out as (rows, cols, channels).
K.set_image_data_format("channels_last")

In [3]:
# Read the complete training image dataset into memory.
with h5py.File('/data/joint_scoring/img_train_regression.h5', mode='r') as img_file:
    img_train = img_file['img_train_regression'][:]

# Read the complete training label dataset into memory.
with h5py.File('/data/joint_scoring/labels_train_regression.h5', mode='r') as label_file:
    labels_train = label_file['labels_train_regression'][:]

In [4]:
# Random transformations applied to every generated (augmented) image.
#
# zoom_range is passed as explicit [lower, upper] zoom factors. A lower bound
# of 0 lets the sampled factor get arbitrarily close to zero, which squeezes
# the sampled source region to (almost) nothing along that axis and yields
# degenerate images. A range symmetric around 1 keeps the zoom within +/-10 %.
# NOTE(review): 0.9 is assumed to be the intended lower bound - confirm.
datagen = ImageDataGenerator(
        rotation_range=25,
        shear_range=0.1,
        zoom_range=[0.9, 1.1])

In [5]:
def round_x(x, base=5):
    """Round ``x`` to the nearest multiple of ``base`` and return it as an int.

    ``x`` is converted with ``float()`` first, so anything ``float()`` accepts
    is allowed. Ties are resolved by the built-in ``round()``.
    """
    n_steps = round(float(x) / base)
    return int(n_steps * base)
    
# Bucket every training label to the nearest multiple of 5.
# Stored as an ndarray rather than a plain list so that a comparison such as
# `labels_train_rounded == value` is always an element-wise test producing a
# boolean mask, regardless of whether `value` is a Python or a NumPy scalar.
labels_train_rounded = np.array([round_x(x) for x in labels_train])

In [17]:
# Distinct rounded label values and the number of training images in each.
unique_numbers, n_img = np.unique(labels_train_rounded, return_counts=True)
for summary in (unique_numbers, n_img):
    print(summary)

[  0   5  10  15  20  25  30  35  40  45  50  55  60  65  70  75  80  85
  90  95 100]
[48855 11547  5418  1376  1657   414   634   179   469   114   317    59
   236    54   160    25   154    31   125     9   792]


In [13]:
# Net difference between 10000 images per label value and the actual counts
# (label values with more than 10000 images contribute negatively).
print(np.sum(10000 - n_img))

137375


In [16]:
# Shortfall of each label value relative to 10000 images; label values that
# already exceed 10000 are clipped to zero so they are never drawn.
shortfall = np.maximum(10000 - n_img, 0)
# Normalise the shortfalls into sampling probabilities, one per label value.
probs = shortfall / shortfall.sum()
print(probs)
print(len(probs))

[ 0.          0.          0.02577386  0.04851021  0.04692958  0.05392149
  0.05268398  0.05524337  0.05361211  0.05560899  0.05446711  0.05591837
  0.05492274  0.05594649  0.05535024  0.05610962  0.05538399  0.05607587
  0.05554712  0.05619962  0.05179523]
21


In [19]:
# Oversample under-represented label values: every iteration draws a rounded
# label value according to `probs` and generates one randomly transformed
# batch from the training images carrying that rounded label.
img_train_augmented = []
labels_train_augmented = []

# NOTE(review): looks like the printed sum(10000 - n_img) = 137375 rounded up - confirm.
n_augmented = 137400
b_size = 1

# Hoisted out of the loop: convert the rounded labels to an ndarray once, so
# the per-iteration comparison is an explicit element-wise mask instead of a
# list that has to be converted again on every pass.
rounded_labels = np.asarray(labels_train_rounded)

for i in range(n_augmented):
    # wait=True postpones clearing until the next message is available,
    # which avoids flicker in the progress display.
    clear_output(wait=True)
    print("Augmenting data, {0:.2f} % finished".format(i/n_augmented*100))
    lbl = np.random.choice(a=unique_numbers, p=probs)
    # Only the first batch of a fresh generator over this label's images is used.
    batch = next(datagen.flow(img_train[rounded_labels == lbl], batch_size=b_size))
    img_train_augmented.append(batch)
    # The augmented sample is labelled with the rounded value, once per image in the batch.
    labels_train_augmented.append(np.repeat(lbl, b_size))

# Stacking keeps a separate batch axis: images become
# (n_augmented, b_size, rows, cols, channels), labels (n_augmented, b_size).
img_train_augmented = np.array(img_train_augmented)

labels_train_augmented = np.array(labels_train_augmented)

print(img_train.shape)
print(img_train_augmented.shape)
print(labels_train.shape)
print(labels_train_augmented.shape)

Augmenting data, 100.00 % finished
(72625, 150, 150, 1)
(137400, 1, 150, 150, 1)
(72625,)
(137400, 1)


In [20]:
# Persist the augmented images so the augmentation loop need not be re-run.
with h5py.File('/data/joint_scoring/img_train_augmented_regression.h5', mode='w') as img_file:
    img_file.create_dataset("img_train_augmented_regression", data=img_train_augmented)

# Persist the labels belonging to the augmented images.
with h5py.File('/data/joint_scoring/labels_train_augmented_regression.h5', mode='w') as label_file:
    label_file.create_dataset("labels_train_augmented_regression", data=labels_train_augmented)

In [4]:
# Reload the stored augmented images into memory.
with h5py.File('/data/joint_scoring/img_train_augmented_regression.h5', mode='r') as img_file:
    img_train_augmented = img_file['img_train_augmented_regression'][:]

# Reload the stored labels of the augmented images into memory.
with h5py.File('/data/joint_scoring/labels_train_augmented_regression.h5', mode='r') as label_file:
    labels_train_augmented = label_file['labels_train_augmented_regression'][:]

In [21]:
# Merge the per-draw batch axis into the sample axis, so images become
# (n_samples, rows, cols, channels) and labels become (n_samples,).
# Using -1 together with the trailing three image dimensions (instead of
# indexing shape[0]..shape[4]) makes this cell safe to re-run: arrays that are
# already flattened keep their shape rather than raising an IndexError.
img_train_augmented = img_train_augmented.reshape((-1,) + img_train_augmented.shape[-3:])
labels_train_augmented = labels_train_augmented.reshape(-1)

print(img_train.shape)
print(img_train_augmented.shape)
print(labels_train.shape)
print(labels_train_augmented.shape)

(72625, 150, 150, 1)
(137400, 150, 150, 1)
(72625,)
(137400,)


In [22]:
# Append the augmented samples to the original training samples.
img_train_combined = np.concatenate((img_train, img_train_augmented), axis=0)
labels_train_combined = np.concatenate((labels_train, labels_train_augmented), axis=0)

# Apply one shared random ordering to both arrays so that every image keeps
# its own label.
shuffle_order = np.random.permutation(img_train_combined.shape[0])
img_train_combined = img_train_combined[shuffle_order]
labels_train_combined = labels_train_combined[shuffle_order]

In [23]:
# Write the shuffled, combined image array to disk.
with h5py.File('/data/joint_scoring/img_train_combined_regression.h5', mode='w') as img_file:
    img_file.create_dataset("img_train_combined_regression", data=img_train_combined)

# Write the shuffled, combined label array to disk.
with h5py.File('/data/joint_scoring/labels_train_combined_regression.h5', mode='w') as label_file:
    label_file.create_dataset("labels_train_combined_regression", data=labels_train_combined)