In [1]:
import os
import glob
import dicom
import numpy as np
import pandas as pd
from skimage.transform import resize, rescale

In [2]:
# Setup instructions:
# 1. Run scripts/distribute_dataset.sh for the train directory.
# 2. Download min_max_frame_idxs.csv from Slack and put it in the ../input directory.
# 3. Download landmarks_v2.zip from Slack and unpack it to ../input.
# 4. Run all cells.
# The files ../input/X_train.npy, ../input/y_train.npy, ../input/X_valid.npy and ../input/y_valid.npy should appear.

[0m[01;34mlandmarks_v2[0m/                 min_max_frame_idxs.csv  validate-128x128-data.csv
[01;31mlandmarks_v2.zip[0m              [01;36mtrain[0m@                  validate-label.csv
local_test-128x128-data.csv   train-128x128-data.csv  X_train.npy
local_test-label.csv          train.csv               y_train.npy
local_train-128x128-data.csv  train-label.csv
local_train-label.csv         [01;36mvalidate[0m@


In [3]:
# Edge length, in pixels, of the square images produced by crop_resize.
IMG_SIZE = 64
# Number of short-axis (SAX) series every patient is padded or truncated to.
MAX_SAXES = 15

# Per-patient pair of frame indices, indexed by patient id (first column).
# The file is whitespace-separated and has no header row.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
minmax = pd.read_csv("../input/min_max_frame_idxs.csv", sep=r"\s+", index_col=0,
                     names=['min', 'max'])
# Per-patient targets, indexed by patient id; each row unpacks to (systole, diastole).
labels = pd.read_csv("../input/train.csv", index_col=0)

In [4]:
def crop_resize(filename, img_shape=(IMG_SIZE, IMG_SIZE)):
    """
    Read one DICOM frame and return it as a normalised, centre-cropped square.

    The pixel data is divided by the file's ``LargestImagePixelValue``,
    rescaled per axis by ``PixelSpacing``, transposed when it is wider than
    tall, cropped to a centred square and finally resized to ``img_shape``.

    :param filename: path of the DICOM file to read.
    :param img_shape: (rows, cols) of the returned image.
    :return: array of shape ``(1,) + img_shape``; the leading axis lets the
             caller stack several frames with ``np.vstack``.
    """
    dcm = dicom.read_file(filename)
    # Build a concrete list: on Python 3 ``map`` would hand ``rescale`` a lazy
    # iterator instead of a sequence of per-axis factors.
    scale = [float(spacing) for spacing in dcm.PixelSpacing]
    # Builtin ``float`` is exactly what the (now removed) ``np.float`` alias was.
    img = dcm.pixel_array.astype(float) / dcm.LargestImagePixelValue
    # Presumably brings every image to a common physical resolution -- TODO confirm.
    img = rescale(img, scale)

    # Make the first axis the long one so the crop below only trims rows.
    if img.shape[0] < img.shape[1]:
        img = img.T

    # Crop a centred square whose side is the shorter edge.
    short_edge = min(img.shape[:2])
    yy = (img.shape[0] - short_edge) // 2
    xx = (img.shape[1] - short_edge) // 2
    img = img[yy: yy + short_edge, xx: xx + short_edge]

    img = resize(img, img_shape)
    return img[np.newaxis]

def get_good_saxes(patient):
    """
    Return the series names listed in a patient's contour-areas file.

    Reads ``../input/landmarks_v2/<patient>_contour_areas.csv`` and collects
    the first whitespace-separated token of every line, in file order.

    :param patient: integer patient id used to build the file name.
    :return: list of strings, one per line of the file.
    """
    contour_file = "../input/landmarks_v2/%d_contour_areas.csv" % patient
    with open(contour_file, 'r') as handle:
        # Only the leading field of each row is needed; the rest is ignored.
        return [row.split()[0] for row in handle]

def get_patient_slices(patient, min_idx, max_idx, n_frames=30):
    """
    Load six frames per short-axis series for one patient.

    For every series returned by ``get_good_saxes`` the frames at
    ``min_idx - 1, min_idx, min_idx + 1`` followed by
    ``max_idx - 1, max_idx, max_idx + 1`` are loaded with ``crop_resize``.
    Frame numbers are 1-based and wrap around cyclically, so the predecessor
    of frame 1 is frame ``n_frames`` and the successor of ``n_frames`` is 1.

    :param patient: integer patient id (directory name under ../input/train/).
    :param min_idx: 1-based frame number of the first frame of interest.
    :param max_idx: 1-based frame number of the second frame of interest.
    :param n_frames: number of frames in one cycle, used for wrap-around.
    :return: array of shape ``(n_series, 6, rows, cols)``; channels 0-2 are the
             frames around ``min_idx`` and channels 3-5 those around ``max_idx``.
    :raises IndexError: if no DICOM file matches a requested frame number.
    """
    def neighbours(idx):
        # Previous / current / next frame number with cyclic wrap-around.
        prev_idx = idx - 1 if idx > 1 else n_frames
        next_idx = idx + 1 if idx < n_frames else 1
        return [prev_idx, idx, next_idx]

    def frame_file(sax_dir, idx):
        # File names embed the zero-padded frame number; sort so the choice is
        # deterministic should more than one file match the pattern.
        matches = sorted(glob.glob(sax_dir + "/IM-*-%.4d*.dcm" % idx))
        if not matches:
            raise IndexError("no DICOM file for frame %d in %s" % (idx, sax_dir))
        return matches[0]

    # Min-frame neighbours first, then max-frame neighbours.
    frame_idxs = neighbours(min_idx) + neighbours(max_idx)

    sax_slices = []
    for sax in get_good_saxes(patient):
        path = os.path.join('../input/train/', str(patient), 'study', sax)
        frames = [crop_resize(frame_file(path, idx)) for idx in frame_idxs]
        # Each frame is (1, rows, cols): stack to (6, rows, cols), add series axis.
        sax_slices.append(np.vstack(frames)[np.newaxis])
    return np.vstack(sax_slices)

In [5]:
# Build the training / validation arrays and save them under ../input.
# Every patient contributes two samples: channels [:3] of each series paired
# with the systole label and channels [3:] paired with the diastole label.

N_PATIENTS = 500        # patients with a larger id are skipped
VALID_FRACTION = 0.1    # share of patients routed to the validation set
SPLIT_SEED = 0          # fixed seed so the split is identical on every run
split_rng = np.random.RandomState(SPLIT_SEED)

val_images = []
train_images = []
val_y = []
train_y = []

for patient, min_idx, max_idx in minmax.itertuples():
    if patient > N_PATIENTS:
        continue
    if (patient % 50) == 0:
        # Single-argument form prints identically on Python 2 and 3.
        print("%.1f %% " % (100 * float(patient) / N_PATIENTS))
    systole, diastole = labels.loc[patient]
    # The +1 presumably converts 0-based CSV indices to the 1-based frame
    # numbers used in the DICOM file names -- TODO confirm.
    r = get_patient_slices(patient, min_idx + 1, max_idx + 1)
    n_saxes = r.shape[0]
    # Truncate to MAX_SAXES, or pad by repeating series cyclically from the
    # start. Cyclic indexing also copes with fewer than MAX_SAXES / 2 series,
    # where a single round of padding would still leave the array too short.
    r = r[np.arange(MAX_SAXES) % n_saxes]
    assert r.shape[0] == MAX_SAXES

    if split_rng.random_sample() < VALID_FRACTION:
        images, targets = val_images, val_y
    else:
        images, targets = train_images, train_y
    # Flatten (series, channel) into one channel axis: (1, MAX_SAXES * 3, H, W).
    images.append(r[:, :3].reshape(1, -1, IMG_SIZE, IMG_SIZE))
    images.append(r[:, 3:].reshape(1, -1, IMG_SIZE, IMG_SIZE))
    targets.append(systole)
    targets.append(diastole)


X_train = np.vstack(train_images).astype(np.float32)
Y_train = np.array(train_y)
np.save("../input/X_train.npy", X_train)
np.save("../input/y_train.npy", Y_train)

X_valid = np.vstack(val_images).astype(np.float32)
Y_valid = np.array(val_y)
# Save the validation arrays themselves, not the training arrays.
np.save("../input/X_valid.npy", X_valid)
np.save("../input/y_valid.npy", Y_valid)

30.0 % 
40.0 % 
50.0 % 
60.0 % 
70.0 % 
80.0 % 
90.0 % 
100.0 % 
10.0 % 
