# Automatic scoring of x-ray images

## 1. Preprocessing

Import dependencies

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# PIL to open & manipulate images
from PIL import Image, ImageOps, ImageChops

# for messages in loops
from IPython.display import clear_output

# to save arrays
import h5py

# for folder-timestamp
from datetime import datetime

# for train/test split
from sklearn.model_selection import train_test_split
# for one-hot encoding
from sklearn.preprocessing import LabelBinarizer
# for class weights
from sklearn.utils import class_weight
# for model evaluation
from sklearn.metrics import confusion_matrix, classification_report
# for efficient loops
import itertools

# keras
from tensorflow.contrib.keras.python.keras import backend as K
from tensorflow.contrib.keras.python.keras.utils.io_utils import HDF5Matrix
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Lambda, Activation
from tensorflow.contrib.keras.python.keras.layers.normalization import BatchNormalization
from tensorflow.contrib.keras.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.contrib.keras.python.keras import callbacks


Define image format & random seeds

In [2]:
# Seed both numpy and TensorFlow with the same value so that runs are
# repeatable.
seed = 1
tf.set_random_seed(seed)
np.random.seed(seed)

# Keras image tensors are laid out as (rows, cols, channels).
K.set_image_data_format("channels_last")

Read the image, label and patient-ID arrays from disk & add a trailing channel axis to the images

In [3]:
def _read_h5_array(path, key):
    """Open the HDF5 file at `path` read-only and return `hf[key][:]`."""
    with h5py.File(path, 'r') as h5_file:
        # the [:] slice is taken while the file is still open
        return h5_file[key][:]


images = _read_h5_array('/data/joint_scoring/img_array.h5', 'img_array')
labels = _read_h5_array('/data/joint_scoring/lbl_array.h5', 'lbl_array')
patients = _read_h5_array('/data/joint_scoring/pid_array.h5', 'pid_array')

# Keep the first three axes and append a trailing axis of length 1
# (presumably the single grey-scale channel expected by "channels_last").
leading_dims = tuple(images.shape[axis] for axis in range(3))
images = images.reshape(leading_dims + (1,))

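Create train / test / validation split
Each patient is randomly assigned to train / test / validation with probabilities 0.7 / 0.2 / 0.1, and all images of the same patient stay together in that split. Note that the split is grouped by patient only; it is not explicitly stratified by class.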

In [4]:
# Per-split accumulators. They are plain Python lists here; a later cell
# converts them to numpy arrays.
img_train, labels_train, patients_train = [], [], []
img_test, labels_test, patients_test = [], [], []
img_val, labels_val, patients_val = [], [], []

# Position in this tuple == outcome of np.random.choice below:
# 0 -> train, 1 -> test, 2 -> validation.
split_buckets = (
    (img_train, labels_train, patients_train),
    (img_test, labels_test, patients_test),
    (img_val, labels_val, patients_val),
)
split_probabilities = [0.7, 0.2, 0.1]

# Shuffle the three arrays with one shared permutation so that the
# image / label / patient rows stay aligned.
indices = np.random.permutation(images.shape[0])
images = images[indices]
labels = labels[indices]
patients = patients[indices]

# Remembers which split each already-seen patient was assigned to.
# A dict lookup replaces scanning the growing patients_* lists for every
# image, which made the loop quadratic in the number of images.
patient_to_split = {}
n_samples = labels.shape[0]

for i in range(n_samples):

    if (i % 100) == 0:
        clear_output()
        print("Creating test/train/validation split, {0:.2f} % finished".format(i/n_samples*100))

    patient = patients[i]
    # Use a plain Python value as the dict key so that numpy scalars (and
    # length-1 arrays) hash and compare the same way `==` did in the list scan.
    if isinstance(patient, (np.ndarray, np.generic)):
        patient_key = patient.item()
    else:
        patient_key = patient

    if patient_key not in patient_to_split:
        # First image of this patient: draw its split once. All later images
        # of the same patient reuse that decision, so a patient never ends up
        # in more than one split. The draw order is the same as before, so a
        # fixed seed still reproduces the same split.
        patient_to_split[patient_key] = np.random.choice(a=3, p=split_probabilities)

    img_bucket, label_bucket, patient_bucket = split_buckets[patient_to_split[patient_key]]
    img_bucket.append(images[i])
    label_bucket.append(labels[i])
    patient_bucket.append(patient)

Creating test/train/validation split, 99.94 % finished


In [5]:
# Convert the per-split Python lists into numpy arrays so they can be
# written to HDF5.
img_train = np.array(img_train)
labels_train = np.array(labels_train)
patients_train = np.array(patients_train)

img_test = np.array(img_test)
labels_test = np.array(labels_test)
patients_test = np.array(patients_test)

img_val = np.array(img_val)
labels_val = np.array(labels_val)
# Fixed: this used to be built from labels_val, so the validation patient
# ids were silently replaced by the validation labels.
patients_val = np.array(patients_val)

In [6]:
# (file path, dataset name inside the file, array to store).
# Each array goes into its own HDF5 file; files are written in this order.
split_outputs = [
    ('/data/joint_scoring/img_train_regression.h5', "img_train_regression", img_train),
    ('/data/joint_scoring/img_test_regression.h5', "img_test_regression", img_test),
    ('/data/joint_scoring/img_val_regression.h5', "img_val_regression", img_val),

    ('/data/joint_scoring/labels_train_regression.h5', "labels_train_regression", labels_train),
    ('/data/joint_scoring/labels_test_regression.h5', "labels_test_regression", labels_test),
    ('/data/joint_scoring/labels_val_regression.h5', "labels_val_regression", labels_val),

    ('/data/joint_scoring/patients_train_regression.h5', "patients_train_regression", patients_train),
    ('/data/joint_scoring/patients_test_regression.h5', "patients_test_regression", patients_test),
    ('/data/joint_scoring/patients_val_regression.h5', "patients_val_regression", patients_val),
]

for out_path, dataset_name, array in split_outputs:
    # mode 'w' creates the file, replacing any existing one at that path
    with h5py.File(out_path, 'w') as hf:
        hf.create_dataset(dataset_name,  data=array)