In [None]:
import pydicom
import os
import numpy as np
import pandas as pd
from matplotlib import cm
from matplotlib import pyplot as plt
import scipy.misc
from scipy import ndimage
import h5py

import tensorflow as tf
# print(tf.__version__)

# plot the image
%matplotlib inline

# Load information file

In [None]:
path = "C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_perfusion/"

In [None]:
dat = pd.read_table(path + "data/data_to_read_in.csv", sep = ",", encoding = "latin-1", low_memory = False)

In [None]:
dat[:3]

In [None]:
dat = dat.sort_values(["p_id", "modality", "instance_no"])

In [None]:
dat[:3]

In [None]:
# check how duplicated works
# x = pd.DataFrame({"x": [1,1,2,2,3,4,3,4], "y": 1})
# print(x.duplicated("x"))
# print(x.index[x.duplicated("x")])

# p = np.unique(dat.p_id.values)[0]
# pat = dat.loc[dat.p_id == p,:]
# pat_m = pat.loc[pat.modality == np.unique(pat.modality)[0]]
# pat_m.index[pat_m.duplicated("instance_no")]

In [None]:
# Search for duplicated instance numbers.
# A row counts as a duplicate when an earlier row already carries the same
# (patient, modality, instance number) combination; the first occurrence is
# kept. A single vectorised call replaces the nested patient/modality loops
# (which re-filtered the whole table for every patient) and also works when
# the table is empty, where np.hstack on an empty list raises.
is_duplicate = dat.duplicated(subset = ["p_id", "modality", "instance_no"])

# index labels of the duplicated rows as one plain array
idx = dat.index[is_duplicate].to_numpy()

In [None]:
# Keep a reference to the unfiltered table in `dat0`, then remove the rows
# whose index labels were collected in `idx`.
dat0 = dat
dat = dat0.drop(idx, axis = "index")

In [None]:
# Run the duplicate search again to check that all duplicated instance
# numbers are gone. The check is an assertion so that a failed clean-up stops
# the notebook instead of relying on someone reading the displayed array.
idx = dat.index[dat.duplicated(subset = ["p_id", "modality", "instance_no"])].to_numpy()
assert idx.size == 0, "duplicated instance numbers remain at index labels %s" % idx
idx

In [None]:
# Sorting and dropping rows left the index labels out of order and with
# gaps: show the first ten labels, then renumber the rows from 0 (the old
# labels are discarded).
first_labels = dat.index[:10]
print(first_labels)
dat = dat.reset_index(drop = True)
dat.index

In [None]:
# check image dimensions

# Square images? The result is printed explicitly: a bare expression that is
# not the last line of a cell is evaluated but never displayed.
print("all images square:", (dat["columns"] == dat["rows"]).all())

# same number of pixels per modality?
# (distinct column counts and how often each occurs)
for mod_name, mod_rows in dat.groupby("modality"):
    print(mod_name, np.unique(mod_rows["columns"], return_counts = True))

# number of images per modality per patient
# ([1] keeps only the counts, one entry per patient)
for mod_name, mod_rows in dat.groupby("modality"):
    print(mod_name, np.unique(mod_rows["p_id"], return_counts = True)[1])

# Check DICOM data

In [None]:
# consider a dicom as example image
slice_ex = pydicom.read_file(dat.path[100])

In [None]:
slice_ex

In [None]:
# Read a slice from each modality from one patient
dat_pat = dat.loc[dat.p_id == 442968, :] #516341

# position of the example slice within each modality's list of files
i = 10

def read_example_slice(modality):
    """Read the i-th DICOM file listed for `modality` of the example patient.

    Uses the notebook-level `dat_pat` and `i`; replaces seven copy-pasted
    read calls and uses dcmread instead of the legacy read_file alias.
    """
    paths = dat_pat.loc[dat_pat.modality == modality, "path"].values
    return pydicom.dcmread(paths[i])

slice_adc = read_example_slice("ADC")
slice_dwi = read_example_slice("DWI")
slice_cbf = read_example_slice("CBF")
slice_cbv = read_example_slice("CBV")
slice_tmax = read_example_slice("TMAX")
slice_mtt = read_example_slice("MTT")
slice_ttp = read_example_slice("TTP")

In [None]:
# Show the example slice of every modality side by side, one titled panel
# each. A dict keyed by modality name replaces the list with a dummy first
# element that only existed to allow 1-based subplot numbering.
example_slices = {
    "ADC": slice_adc.pixel_array,
    "DWI": slice_dwi.pixel_array,
    "CBF": slice_cbf.pixel_array,
    "CBV": slice_cbv.pixel_array,
    "TMAX": slice_tmax.pixel_array,
    "MTT": slice_mtt.pixel_array,
    "TTP": slice_ttp.pixel_array,
}
fig, axes = plt.subplots(1, len(example_slices), figsize = (15, 15))
for ax, (name, img) in zip(axes, example_slices.items()):
    ax.imshow(img)
    ax.set_title(name)
plt.show()

In [None]:
# slice thickness & pixel spacing: (seems to change with patients...)
# ADC slices are 5mm and each voxel represents 1.8mm
# Every loaded example slice is reported (TTP was missing before) and each
# line is labelled with its modality.
for name, example in [("ADC", slice_adc),
                      ("DWI", slice_dwi),
                      ("CBF", slice_cbf),
                      ("CBV", slice_cbv),
                      ("MTT", slice_mtt),
                      ("TMAX", slice_tmax),
                      ("TTP", slice_ttp)]:
    print(name, example.SliceThickness, example.PixelSpacing)

# Read DICOM data

### Read slices of one patient

In [None]:
# checked how drop_duplicates() works
# dat0 = pd.DataFrame({"name": [1,1,2,2,3,3,4,4,5,5,6,6,7,7], "var": [1,2,3,4,5,6,7,8,9,10,11,12,13,14]})
# dat0.drop_duplicates("name")

In [None]:
# Take the CBF example slice as reference and display the shape of its pixel data.
ref_slice = slice_cbf
np.shape(ref_slice.pixel_array)

In [None]:
modality = "TMAX"
# patient at position 100 of the sorted unique patient ids
p_id = np.unique(dat.p_id)[100]
pat = dat.loc[(dat.p_id == p_id) & (dat.modality == modality)]

# make sure that the images are in the correct order and that there are no duplicated series
# (explicit copy: the new columns below must never be written into, or warn about, a view of `dat`)
pat = pat.sort_values("instance_no").drop_duplicates("instance_no").copy()

# take a reference slice and save information
# (dcmread is the documented pydicom reader; read_file is only its legacy alias)
ref_slice = pydicom.dcmread(pat.path.values[0])
pat["pixel_spacing_x"] = float(ref_slice.PixelSpacing[0])
pat["pixel_spacing_y"] = float(ref_slice.PixelSpacing[1])
pat["pixel_spacing_z"] = float(ref_slice.SliceThickness)

# array to store the images, indexed [row, column, slice, channel];
# np.zeros instead of np.ndarray so that no uninitialised memory can be left in it
X = np.zeros((ref_slice.Rows, ref_slice.Columns, len(pat), ref_slice.pixel_array.shape[2]), dtype = "int")

# load slices
for i, p in enumerate(pat.path.values):
    slice_tmp = pydicom.dcmread(p)
    X[:,:,i] = slice_tmp.pixel_array

In [None]:
def plot_slices(X, pat, plane = ("axial", "coronal", "sagittal"), modality = "perfusion"):
    """Plot a grid of 2D cuts through one image volume.

    Parameters
    ----------
    X : array indexed as X[row, column, slice, channel]
        The volume to display.
    pat : any
        Not used; kept so that existing calls keep working.
    plane : str or sequence of str
        "axial" fixes the slice axis (axis 2), "coronal" fixes axis 0 and
        "sagittal" fixes axis 1. A sequence draws one figure per listed
        plane; the default draws all three. (Before, the list default never
        matched any of the string comparisons, so nothing was plotted.)
    modality : any
        "perfusion" shows all channels of each cut; any other value shows
        channel 0 only.

    Raises
    ------
    ValueError
        If an unknown plane name is given (previously an empty figure was
        produced silently).

    Notes
    -----
    The grid has 6 columns and enough rows to hold every axial slice, with
    at least one row. Axial figures show every slice starting at slice 0
    (slice 0 and the last grid cell used to be skipped). Coronal and
    sagittal figures show at most one cut per grid cell, spread evenly over
    the full extent of the axis that is being cut.
    """
    # axis of X that is held fixed to obtain a cut in the given plane
    cut_axis = {"axial": 2, "coronal": 0, "sagittal": 1}

    planes = [plane] if isinstance(plane, str) else list(plane)
    for name in planes:
        if name not in cut_axis:
            raise ValueError("unknown plane %r; expected one of %s" % (name, sorted(cut_axis)))

    # total figure size (including all subplots)
    nslices = X.shape[2]
    ncols = 6
    nrows = max(1, -(-nslices // ncols))  # ceiling division, never zero rows
    npanels = ncols * nrows
    figsize = (ncols*3, nrows*3)

    for name in planes:
        axis = cut_axis[name]
        n_available = X.shape[axis]

        # evenly spaced cut positions from the first to the last index;
        # for the axial plane n_available <= npanels, i.e. every slice is shown
        positions = np.rint(
            np.linspace(0, n_available - 1, num = min(npanels, n_available))
        ).astype(int)

        # axial cuts keep matplotlib's default aspect, the other planes are
        # stretched to fill their panel (as before)
        aspect = None if name == "axial" else "auto"

        fig = plt.figure(figsize = figsize)
        for panel, pos in enumerate(positions, start = 1):
            img = np.take(X, pos, axis = axis)
            if modality != "perfusion":
                img = img[..., 0]
            ax = fig.add_subplot(nrows, ncols, panel)
            ax.imshow(img, aspect = aspect)
        plt.show()

In [None]:
plot_slices(X, pat, "axial")

In [None]:
plot_slices(X, pat, "coronal")

In [None]:
plot_slices(X, pat, "sagittal")

In [None]:
# remove the bar and the name
plt.imshow(X[:40,:50,0])

In [None]:
modality = "TMAX"
# patient at position 100 of the sorted unique patient ids
p_id = np.unique(dat.p_id)[100]
pat = dat.loc[(dat.p_id == p_id) & (dat.modality == modality)]

# make sure that the images are in the correct order and that there are no duplicated series
# (explicit copy: the new columns below must never be written into, or warn about, a view of `dat`)
pat = pat.sort_values("instance_no").drop_duplicates("instance_no").copy()

# take a reference slice and save information
# (dcmread is the documented pydicom reader; read_file is only its legacy alias)
ref_slice = pydicom.dcmread(pat.path.values[0])
pat["pixel_spacing_x"] = float(ref_slice.PixelSpacing[0])
pat["pixel_spacing_y"] = float(ref_slice.PixelSpacing[1])
pat["pixel_spacing_z"] = float(ref_slice.SliceThickness)

# array to store the images, indexed [row, column, slice, channel];
# np.zeros instead of np.ndarray so that no uninitialised memory can be left in it
X = np.zeros((ref_slice.Rows, ref_slice.Columns, len(pat), ref_slice.pixel_array.shape[2]), dtype = "int")

# the modality is fixed for the whole cell, so decide once whether to mask
remove_overlay = modality in ["CBF", "CBV", "TMAX", "MTT", "TTP"]

# load slices
for i, p in enumerate(pat.path.values):
    slice_tmp = pydicom.dcmread(p)
    X[:,:,i] = slice_tmp.pixel_array

    # remove modality name and color bar for OLEA images
    if remove_overlay:
        X[:40,:50,i] = 0
        X[:,200:,i] = 0

In [None]:
ref_slice.pixel_array.shape

In [None]:
plot_slices(X, pat, "axial")

In [None]:
# read one patient
def load_slices(dat, modality, p_id):
    """Load all slices of one modality of one patient into a 4D array.

    Parameters
    ----------
    dat : pandas.DataFrame
        Information table; the columns p_id, modality, instance_no and path
        are read.
    modality : str
        "DWI" and "ADC" are stored with a single channel. Every other
        modality is stored with as many channels as the third axis of the
        reference slice's pixel data.
    p_id
        Patient id to select in `dat.p_id`.

    Returns
    -------
    X : numpy array of dtype "int", indexed [row, column, slice, channel]
        Slices ordered by instance number. For the OLEA maps (CBF, CBV,
        TMAX, MTT, TTP) the regions [:40, :50] and [:, 200:] are set to zero
        to remove the modality name and colour bar.
    pat : pandas.DataFrame
        The selected rows (sorted, one per instance number) with the added
        columns pixel_spacing_x/y/z taken from the first slice. It is an
        independent copy; `dat` is not modified.

    Raises
    ------
    IndexError
        If `dat` lists no slice for this patient and modality (same
        exception type as before, now with an explanatory message).
    """
    diffusion_maps = ["DWI", "ADC"]
    olea_maps = ["CBF", "CBV", "TMAX", "MTT", "TTP"]

    pat = dat.loc[(dat.p_id == p_id) & (dat.modality == modality)]

    # make sure that the images are in the correct order and that there are no duplicated series
    pat = pat.sort_values("instance_no").drop_duplicates("instance_no").copy()
    if len(pat) == 0:
        raise IndexError("no %s slices listed for patient %r" % (modality, p_id))

    # take a reference slice and save information
    # (dcmread is the documented pydicom reader; read_file is only its legacy alias)
    ref_slice = pydicom.dcmread(pat.path.values[0])
    pat["pixel_spacing_x"] = float(ref_slice.PixelSpacing[0])
    pat["pixel_spacing_y"] = float(ref_slice.PixelSpacing[1])
    pat["pixel_spacing_z"] = float(ref_slice.SliceThickness)

    # array to store the images; zero-initialised so that the result can
    # never contain uninitialised memory
    is_diffusion = modality in diffusion_maps
    n_channels = 1 if is_diffusion else ref_slice.pixel_array.shape[2]
    X = np.zeros((ref_slice.Rows, ref_slice.Columns, len(pat), n_channels), dtype = "int")

    # load slices
    for i, p in enumerate(pat.path.values):
        pixels = pydicom.dcmread(p).pixel_array
        if is_diffusion:
            X[:,:,i,0] = pixels
        else:
            # Pixel data is copied for every non-diffusion modality. Before,
            # a modality outside both lists got an array that was allocated
            # but never filled.
            X[:,:,i,:] = pixels

            # remove modality name and color bar for OLEA images
            if modality in olea_maps:
                X[:40,:50,i,:] = 0
                X[:,200:,i,:] = 0

    return X, pat

In [None]:
X, pat = load_slices(dat, "ADC", np.unique(dat.p_id)[10])

In [None]:
plot_slices(X, pat, "coronal")

In [None]:
pat.head(3)

In [None]:
print(np.min(X), np.max(X))

In [None]:
# scale data
dim = (128, 128, 64, 3)
scaling_factor = [dim[0]/X.shape[0], dim[1]/X.shape[1], dim[2]/X.shape[2], dim[3]/X.shape[3]]
X_scaled = ndimage.zoom(X, scaling_factor, order = 1) # order = 1: linear interpolation

In [None]:
print(X_scaled.shape, np.min(X_scaled), np.max(X_scaled))

In [None]:
plot_slices(X_scaled, pat, "axial")

In [None]:
# dat_cbf = pd.DataFrame(index=range(1), columns=pat.columns)
# dat_cbf

### Read all patients from one modality

In [None]:
# Read the first two patients of one modality (prototype of read_patient below).
p_ids = np.unique(dat.p_id)[:2]
n = len(p_ids)
mod = "CBF"
dim = (128, 128, 64, 3)
X_mod = np.zeros((n,) + dim, dtype = "uint8")

# One information row per patient. The rows are collected in a list and
# combined once at the end: DataFrame.append no longer exists in pandas 2,
# and the cell no longer depends on a `pat` variable left over from an
# earlier cell.
first_rows = []
for i, p_id in enumerate(p_ids):
    X_tmp, pat_tmp = load_slices(dat, mod, p_id)
    # zoom factor per axis: target length / current length
    scaling_factor = [target / current for target, current in zip(dim, X_tmp.shape)]
    X_scaled = ndimage.zoom(X_tmp, scaling_factor, order = 1)
    X_mod[i,:,:,:,:] = X_scaled
    first_rows.append(pat_tmp.iloc[[0]]) # we only need one row because we consider each patient now

# row i of dat_mod describes X_mod[i]
dat_mod = pd.concat(first_rows, ignore_index = True)

In [None]:
dat_mod

In [None]:
plot_slices(X_mod[0], dat_mod, "axial")

In [None]:
def read_patient(dat, modality, spatial_dim = (128, 128, 64)):
    """Load one modality for every patient in `dat` and resample it to a common size.

    Parameters
    ----------
    dat : pandas.DataFrame
        Information table as expected by load_slices.
    modality : str
        Modality to read. "DWI" and "ADC" are stored with 1 channel, every
        other modality with 3 channels.
    spatial_dim : tuple of 3 ints, optional
        Target size (rows, columns, slices) of each resampled volume. The
        default is the size that used to be hard-coded.

    Returns
    -------
    X_mod : numpy array of dtype uint16, shape (n_patients, *spatial_dim, n_channels)
        One linearly resampled volume per patient, in the order of the
        sorted unique patient ids.
        NOTE(review): values are cast to uint16 on assignment; this assumes
        the pixel values fit that range - confirm.
    dat_mod : pandas.DataFrame
        First information row of each patient as returned by load_slices.
        Row i (index label i, counting from 0) describes X_mod[i].
        The placeholder-row construction used before left labels 1..n.

    Notes
    -----
    Every patient in `dat` is read, so load_slices fails for a patient that
    has no slices of this modality.
    """
    p_ids = np.unique(dat.p_id)

    n_channels = 1 if modality in ["DWI", "ADC"] else 3
    dim = tuple(spatial_dim) + (n_channels,)
    X_mod = np.zeros((len(p_ids),) + dim, dtype = "uint16")

    # One information row per patient, combined once at the end.
    # DataFrame.append no longer exists in pandas 2, and the column layout
    # no longer comes from a notebook-level `pat` variable.
    first_rows = []
    for i, p_id in enumerate(p_ids):
        X_tmp, pat_tmp = load_slices(dat, modality, p_id)
        # zoom factor per axis: target length / current length
        scaling_factor = [target / current for target, current in zip(dim, X_tmp.shape)]
        X_mod[i,:,:,:,:] = ndimage.zoom(X_tmp, scaling_factor, order = 1)
        first_rows.append(pat_tmp.iloc[[0]]) # we only need one row because we consider each patient now

    if first_rows:
        dat_mod = pd.concat(first_rows, ignore_index = True)
    else:
        # no patients at all: empty table with the columns of `dat`
        dat_mod = dat.iloc[:0].copy()

    return X_mod, dat_mod

In [None]:
# read patients: perfusion maps
X_cbf, dat_cbf = read_patient(dat, "CBF")
X_cbv, dat_cbv = read_patient(dat, "CBV")
X_mtt, dat_mtt = read_patient(dat, "MTT")
X_ttp, dat_ttp = read_patient(dat, "TTP")
X_tmax, dat_tmax = read_patient(dat, "TMAX")

In [None]:
# read patients: diffusion maps
X_dwi, dat_dwi = read_patient(dat, "DWI")
X_adc, dat_adc = read_patient(dat, "ADC")

In [None]:
plot_slices(X_ttp[10], dat_ttp.iloc[10,:], "axial")

In [None]:
plot_slices(X_dwi[10], dat_dwi.iloc[10,:], "axial", modality = 0)

In [None]:
# write to HDF5
# The file is addressed through the project `path` defined at the top of the
# notebook instead of repeating the absolute folder (same location).
# Image matrices, one dataset per modality, written in this order:
volumes = {
    "X_dwi": X_dwi,
    "X_adc": X_adc,
    "X_cbf": X_cbf,
    "X_cbv": X_cbv,
    "X_mtt": X_mtt,
    "X_ttp": X_ttp,
    "X_tmax": X_tmax,
}
with h5py.File(path + "data/data_bern_25_11_2020.h5", "w") as h5:
    for name, volume in volumes.items():
        h5.create_dataset(name, data=volume)

In [None]:
# Write the per-patient information table of every modality next to the HDF5
# file. The file names are built from the project `path` defined at the top
# of the notebook (same locations as the literal paths used before).
tables = {
    "dwi": dat_dwi,
    "adc": dat_adc,
    "cbf": dat_cbf,
    "cbv": dat_cbv,
    "mtt": dat_mtt,
    "ttp": dat_ttp,
    "tmax": dat_tmax,
}
for name, table in tables.items():
    table.to_csv(path + "data/data_bern_25_11_2020_" + name + ".csv", index = False)

# Old code

In [None]:
# # Read the images into an array
# # all perfusion images are of size 256x256x3
# # ADC/DWI images vary
# # Resize all images to 256x256x3
# 
# def read_images(dat, image_size):
#     # define the array to store the images
#     X = np.zeros((len(dat.path), image_size, image_size, 3), dtype=np.int16) 
#     for i, (_, row) in enumerate(dat.iterrows()): # iterate over the rows and i = row value not index
#         print(i)
#         img_in = pydicom.read_file(row.path)
#         # make sure that all images have equal size
#         img = scipy.misc.imresize(img_in.pixel_array, (image_size, image_size), interp = 'cubic')
#         
#         # For the DWI's and ADC's we repeat the image in the three colour channels
#         if row.modality == "ADC" or row.modality == "DWI":
#             X[i, :, :, 0] = img
#             X[i, :, :, 1] = img
#             X[i, :, :, 2] = img
#         else:
#             X[i, :, :, :] = img
#     return(X)

In [None]:
# X = read_images(dat, 256)

In [None]:
# # to get the spacing in the three dimensions
# def get_spacing(dat):
#     x = []
#     y = []
#     z = []
#     # define the array to store the images
#     for i, (_, row) in enumerate(dat.iterrows()): # iterate over the rows and i = row value not index
#         img_in = pydicom.read_file(row.path)
#         x.append(img_in.PixelSpacing[0])
#         y.append(img_in.PixelSpacing[1])
#         z.append(img_in.SliceThickness)
#     return(x, y, z)
# 
# x, y, z = get_spacing(dat)

In [None]:
# dat["spacing_x"] = x
# dat["spacing_y"] = y
# dat["spacing_z"] = z

In [None]:
# dat.to_csv(path + "data/data_march20_information.csv", index = False)

In [None]:
# import h5py
# 
# # Encode strings to save
# def encode_data(string):
#     encoded_string = [n.encode("UTF-8", "ignore") for n in string]
#     return(encoded_string)
# 
# # write to HDF5
# with h5py.File("C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_perfusion/data/data_march_20.h5", "w") as h5:
#     # Image matrices
#     h5.create_dataset("X", data=X)
#     # Path: Then we can merge the data and the labels later again
#     h5.create_dataset("path", data=encode_data(dat.path.get_values()))

# Consider the images

In [None]:
# read the stored image matrices back from HDF5
# ([:] copies each dataset into memory before the file is closed)
with h5py.File("C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_perfusion/data/data_bern_25_11_2020.h5", "r") as h5:
    X_cbf, X_tmax, X_dwi = [h5[name][:] for name in ("X_cbf", "X_tmax", "X_dwi")]

In [None]:
# per-patient information tables that belong to the volumes loaded above
dat_cbf = pd.read_csv(filepath_or_buffer = "C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_perfusion/data/data_bern_25_11_2020_cbf.csv")
dat_tmax = pd.read_csv(filepath_or_buffer = "C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_perfusion/data/data_bern_25_11_2020_tmax.csv")
dat_dwi = pd.read_csv(filepath_or_buffer = "C:/Users/hezo/Dropbox/PhD/Stroke/Stroke_perfusion/data/data_bern_25_11_2020_dwi.csv")

In [None]:
plot_slices(X_dwi[20], dat_dwi.iloc[20,:], "axial", modality = 0)

In [None]:
plot_slices(X_cbf[20], dat_cbf.iloc[20,:], "axial")

In [None]:
plot_slices(X_tmax[20], dat_tmax.iloc[20,:], "axial")