In [2]:
import glob
import os
import pickle
import random
import re

import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import pydicom
import matplotlib.pyplot as plt

from sklearn.decomposition import IncrementalPCA, MiniBatchNMF
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

import data_splitter as ds
import data_loader as dl
import feature_extraction as fex
import run_model as rm

In [3]:
# Build the project's DataSplitter. The output printed by this cell lists the
# patient/image counts for a train/test split and then a train/calibration
# split — presumably emitted because verbose=True (confirm in data_splitter).
splitter = ds.DataSplitter(verbose=True)

Total patient_id in training set:  9530
Total patient_id in test set:  2383
Total image_id in training set:  43767
Total image_id in test set:  10939
Total patient_id in training set:  7624
Total patient_id in calibration set:  1906
Total image_id in training set:  35003
Total image_id in calibration set:  8764


## Image Resolution
- 1024 x 1024
- 512 x 512
- 256 x 256
- 1 channel

## Normalization
- [-1, 1]
- [0, 1]

For PCA and NMF, flatten each image into a 1-D vector.

For the pre-trained CNN, duplicate the single-channel array to 3 channels.

In [None]:
# Flatten the per-patient image-id collections of the training split into a
# single list of image ids (the training split here is what remains after the
# calibration set was separated out).
train_img_ids = [
    img_id
    for _patient_id, patient_img_ids in splitter.trainset.items()
    for img_id in patient_img_ids
]
print(len(train_img_ids))

In [None]:
# Load train.csv (path relative to the notebook's working directory) into a
# DataFrame.
# NOTE(review): `metadata` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
metadata = pd.read_csv('train.csv')

In [None]:
# --- Export settings: 256 x 256, single channel, pixel values scaled to [-1, 1] ---

# Directory in which the original DICOM files are stored.
BASEPATH = ''

# Per the original author's note, this batch size loses 3 samples.
# NOTE(review): presumably the generator drops the final partial batch —
# confirm in dl.DataGenerator.
BATCH_SIZE = 5000
NORMALIZATION = (-1, 1)

# The PCA experiments suggested that image resolution has no visible effect on
# the number of components, so the lowest resolution is used to keep the
# training cost down.
IMG_SIZE = (256, 256, 1)

RANDOM_STATE = 42

# Generator over the training split; shuffling is off and no feature extractor
# is attached.
train_gen = dl.DataGenerator(
    list_IDs=train_img_ids,
    labels=splitter.labels,
    patient_img_dict=splitter.trainset,
    basepath=BASEPATH,
    batch_size=BATCH_SIZE,
    img_size=IMG_SIZE,
    shuffle=False,
    normalize=NORMALIZATION,
    feature_extractor=None,
    verbose=True,
)

# Number of batches the generator will yield.
print(len(train_gen))

In [None]:
# Pull every batch out of `train_gen` and stack them into one feature array `X`
# and one label array `y`.
#
# The batches are gathered in lists and concatenated ONCE at the end. Calling
# np.concatenate inside the loop would re-copy everything accumulated so far on
# every iteration, which is quadratic in the number of batches.
X_batches = []
y_batches = []
n_samples = 0

for i, (batch_X, batch_y) in enumerate(train_gen):
    print('Processing batch ', i)
    X_batches.append(batch_X)
    y_batches.append(batch_y)
    n_samples += len(batch_X)
    print('---samples collected so far = ', n_samples)

# Fail with a clear message instead of a NameError on X / y further down.
if not X_batches:
    raise ValueError('train_gen yielded no batches; check BATCH_SIZE and the list of image ids')

X = np.concatenate(X_batches, axis=0)
y = np.concatenate(y_batches, axis=0)

# Drop the temporary lists so they do not keep a second copy of the data alive
# in the kernel namespace.
del X_batches, y_batches

print('Finished!')
print('---Final X.shape = ', X.shape)
print('---Final y.shape = ', y.shape)

In [None]:
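# Disabled on purpose: uncomment to save the 256 x 256, [-1, 1]-normalized
# arrays built in the previous cell.
# with open('../X_256_norm11.npy','wb') as f:
#     np.save(f, X)

# with open('../y_256_norm11.npy','wb') as f:
#     np.save(f, y)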

In [None]:
# --- Export settings: 256 x 256, single channel, pixel values scaled to [0, 1] ---

# Directory in which the original DICOM files are stored.
BASEPATH = ''

# Per the original author's note, this batch size loses 3 samples.
# NOTE(review): presumably the generator drops the final partial batch —
# confirm in dl.DataGenerator.
BATCH_SIZE = 5000
NORMALIZATION = (0, 1)

# The PCA experiments suggested that image resolution has no visible effect on
# the number of components, so the lowest resolution is used to keep the
# training cost down.
IMG_SIZE = (256, 256, 1)

RANDOM_STATE = 42

# Generator over the training split; shuffling is off and no feature extractor
# is attached.
train_gen = dl.DataGenerator(
    list_IDs=train_img_ids,
    labels=splitter.labels,
    patient_img_dict=splitter.trainset,
    basepath=BASEPATH,
    batch_size=BATCH_SIZE,
    img_size=IMG_SIZE,
    n_classes=2,
    shuffle=False,
    normalize=NORMALIZATION,
    feature_extractor=None,
    verbose=True,
)

# Number of batches the generator will yield.
print(len(train_gen))

In [None]:
# Pull every batch out of `train_gen` and stack them into one feature array `X`
# and one label array `y`.
#
# The batches are gathered in lists and concatenated ONCE at the end. Calling
# np.concatenate inside the loop would re-copy everything accumulated so far on
# every iteration, which is quadratic in the number of batches.
X_batches = []
y_batches = []
n_samples = 0

for i, (batch_X, batch_y) in enumerate(train_gen):
    print('Processing batch ', i)
    X_batches.append(batch_X)
    y_batches.append(batch_y)
    n_samples += len(batch_X)
    print('---samples collected so far = ', n_samples)

# Fail with a clear message instead of a NameError on X / y further down.
if not X_batches:
    raise ValueError('train_gen yielded no batches; check BATCH_SIZE and the list of image ids')

X = np.concatenate(X_batches, axis=0)
y = np.concatenate(y_batches, axis=0)

# Drop the temporary lists so they do not keep a second copy of the data alive
# in the kernel namespace.
del X_batches, y_batches

print('Finished!')
print('---Final X.shape = ', X.shape)
print('---Final y.shape = ', y.shape)

In [None]:
# Persist the stacked features and labels (256 x 256, [0, 1] normalization).
# Both paths already end in ".npy", so np.save writes to exactly these files.
np.save('../X_256_norm01.npy', X)
np.save('../y_256_norm01.npy', y)

In [None]:
# --- Export settings: 512 x 512, single channel, pixel values scaled to [0, 1] ---

# Directory in which the original DICOM files are stored.
BASEPATH = ''

# Per the original author's note, this batch size loses 3 samples.
# NOTE(review): presumably the generator drops the final partial batch —
# confirm in dl.DataGenerator.
BATCH_SIZE = 5000
NORMALIZATION = (0, 1)

# The original note here said that, since PCA showed no effect of resolution on
# the number of components, the lowest resolution would be used.
# NOTE(review): this cell nevertheless exports the 512 x 512 variant — confirm
# which resolution is intended.
IMG_SIZE = (512, 512, 1)

RANDOM_STATE = 42

# Generator over the training split; shuffling is off and no feature extractor
# is attached.
train_gen = dl.DataGenerator(
    list_IDs=train_img_ids,
    labels=splitter.labels,
    patient_img_dict=splitter.trainset,
    basepath=BASEPATH,
    batch_size=BATCH_SIZE,
    img_size=IMG_SIZE,
    shuffle=False,
    normalize=NORMALIZATION,
    feature_extractor=None,
    verbose=True,
)

# Number of batches the generator will yield.
print(len(train_gen))

In [None]:
# Pull every batch out of `train_gen` and stack them into one feature array `X`
# and one label array `y`.
#
# The batches are gathered in lists and concatenated ONCE at the end. Calling
# np.concatenate inside the loop would re-copy everything accumulated so far on
# every iteration, which is quadratic in the number of batches.
X_batches = []
y_batches = []
n_samples = 0

for i, (batch_X, batch_y) in enumerate(train_gen):
    print('Processing batch ', i)
    X_batches.append(batch_X)
    y_batches.append(batch_y)
    n_samples += len(batch_X)
    print('---samples collected so far = ', n_samples)

# Fail with a clear message instead of a NameError on X / y further down.
if not X_batches:
    raise ValueError('train_gen yielded no batches; check BATCH_SIZE and the list of image ids')

X = np.concatenate(X_batches, axis=0)
y = np.concatenate(y_batches, axis=0)

# Drop the temporary lists so they do not keep a second copy of the data alive
# in the kernel namespace.
del X_batches, y_batches

print('Finished!')
print('---Final X.shape = ', X.shape)
print('---Final y.shape = ', y.shape)

In [None]:
# Persist the stacked features and labels (512 x 512, [0, 1] normalization).
# Both paths already end in ".npy", so np.save writes to exactly these files.
np.save('../X_512_norm01.npy', X)
np.save('../y_512_norm01.npy', y)

# Preprocess each image and save it as its own .npy file

In [None]:
# Preprocess every training image at 1024 x 1024 and store it as
# <SAVEPATH>/<image_id>.npy. Images whose output file already exists are
# skipped, so the cell can be re-run to resume an interrupted export.

BASEPATH = ''  # directory in which the original DICOM files are stored
SAVEPATH = ''  # directory that receives the preprocessed images ('' = current working directory)

IMG_SIZE = (1024, 1024, 1)
NORMALIZE = (0, 1)

# Reverse lookup image id -> patient id, built once. Scanning every patient for
# every image would be quadratic. setdefault keeps the FIRST patient that lists
# an image, matching a front-to-back scan of the split.
patient_of_image = {}
for patient_id, patient_img_ids in splitter.trainset.items():
    for img_id in patient_img_ids:
        patient_of_image.setdefault(img_id, patient_id)

for img_id in train_img_ids:

    print('Image ID: ', img_id)
    patient_id = patient_of_image[img_id]

    # os.path.join keeps an empty base directory relative instead of silently
    # pointing at the filesystem root.
    input_path = os.path.join(BASEPATH, str(patient_id))
    output_path = os.path.join(SAVEPATH, str(img_id) + '.npy')

    if os.path.exists(output_path):
        print('---File already exist')

    else:
        print('---Processing {}.dcm'.format(img_id))
        img = dl.read_and_preprocess(input_path, img_id, img_size=IMG_SIZE, normalize=NORMALIZE)
        print('---Saving...')
        # Write to a temporary name and rename afterwards: an interrupted save
        # must not leave a truncated .npy that the exists-check above would
        # treat as finished on the next run.
        tmp_path = output_path + '.tmp'
        with open(tmp_path, 'wb') as f:
            np.save(f, img)
        os.replace(tmp_path, output_path)
        print('---Done!')

In [None]:
# Marker cell: prints a confirmation once execution reaches this point, i.e.
# after the cells above have run.
print('Finished!')

In [None]:
# Sanity check: count the per-image arrays found in ../train.
# (glob, os and re are imported in the notebook's import cell.)
path = "../train/*.npy"

# An exported image is named "<numeric image id>.npy". Matching on the basename
# with an anchored pattern works with any path separator and skips .npy files
# that are not per-image exports, instead of crashing on a failed match.
array_files = []
for filepath in glob.glob(path):
    match = re.fullmatch(r'([0-9]+)\.npy', os.path.basename(filepath))
    if match is not None:
        array_files.append(int(match.group(1)))

print(len(array_files))

In [5]:
# Flatten the per-patient image-id collections of the calibration split into a
# single list of image ids.
calibrate_img_ids = [
    img_id
    for _patient_id, patient_img_ids in splitter.calibset.items()
    for img_id in patient_img_ids
]
print(len(calibrate_img_ids))

8764


In [None]:
# Preprocess every calibration image at 1024 x 1024 and store it as
# <SAVEPATH>/<image_id>.npy. Images whose output file already exists are
# skipped, so the cell can be re-run to resume an interrupted export.

BASEPATH = '../train_images'  # directory holding the DICOM folders, one per patient id
SAVEPATH = ''  # directory that receives the preprocessed images ('' = current working directory)

IMG_SIZE = (1024, 1024, 1)
NORMALIZE = (0, 1)

# Reverse lookup image id -> patient id, built once. Scanning every patient for
# every image would be quadratic. setdefault keeps the FIRST patient that lists
# an image, matching a front-to-back scan of the split.
patient_of_image = {}
for patient_id, patient_img_ids in splitter.calibset.items():
    for img_id in patient_img_ids:
        patient_of_image.setdefault(img_id, patient_id)

for img_id in calibrate_img_ids:

    print('Image ID: ', img_id)
    patient_id = patient_of_image[img_id]

    # os.path.join keeps an empty base directory relative instead of silently
    # pointing at the filesystem root.
    input_path = os.path.join(BASEPATH, str(patient_id))
    output_path = os.path.join(SAVEPATH, str(img_id) + '.npy')

    if os.path.exists(output_path):
        print('---File already exist')

    else:
        print('---Processing {}.dcm'.format(img_id))
        img = dl.read_and_preprocess(input_path, img_id, img_size=IMG_SIZE, normalize=NORMALIZE)
        print('---Saving...')
        # Write to a temporary name and rename afterwards: an interrupted save
        # must not leave a truncated .npy that the exists-check above would
        # treat as finished on the next run.
        tmp_path = output_path + '.tmp'
        with open(tmp_path, 'wb') as f:
            np.save(f, img)
        os.replace(tmp_path, output_path)
        print('---Done!')

In [None]:
# Marker cell: prints a confirmation once execution reaches this point, i.e.
# after the calibration-set cells above have run.
print('Finished! (calibration set)')

In [7]:
# Sanity check: count the per-image arrays found in the parent directory.
# (glob, os and re are imported in the notebook's import cell.)
path = "../*.npy"

# An exported image is named "<numeric image id>.npy". Matching on the basename
# with an anchored pattern works with any path separator and skips other .npy
# files in the same directory (e.g. the stacked X_*/y_* arrays saved to ../ by
# earlier cells), instead of crashing on a failed match.
array_files = []
for filepath in glob.glob(path):
    match = re.fullmatch(r'([0-9]+)\.npy', os.path.basename(filepath))
    if match is not None:
        array_files.append(int(match.group(1)))

print(len(array_files))

8764


In [8]:
# Flatten the per-patient image-id collections of the test split into a single
# list of image ids.
test_img_ids = [
    img_id
    for _patient_id, patient_img_ids in splitter.test.items()
    for img_id in patient_img_ids
]
print(len(test_img_ids))

10939


In [10]:
# Preprocess every test-split image at 1024 x 1024 and store it as
# <SAVEPATH>/<image_id>.npy. Images whose output file already exists are
# skipped, so the cell can be re-run to resume an interrupted export.

BASEPATH = '../train_images'  # directory holding the DICOM folders, one per patient id
SAVEPATH = ''  # directory that receives the preprocessed images ('' = current working directory)

IMG_SIZE = (1024, 1024, 1)
NORMALIZE = (0, 1)

# Reverse lookup image id -> patient id, built once. Scanning every patient for
# every image would be quadratic. setdefault keeps the FIRST patient that lists
# an image, matching a front-to-back scan of the split.
patient_of_image = {}
for patient_id, patient_img_ids in splitter.test.items():
    for img_id in patient_img_ids:
        patient_of_image.setdefault(img_id, patient_id)

for img_id in test_img_ids:

    print('Image ID: ', img_id)
    patient_id = patient_of_image[img_id]

    # os.path.join keeps an empty base directory relative instead of silently
    # pointing at the filesystem root.
    input_path = os.path.join(BASEPATH, str(patient_id))
    output_path = os.path.join(SAVEPATH, str(img_id) + '.npy')

    if os.path.exists(output_path):
        print('---File already exist')

    else:
        print('---Processing {}.dcm'.format(img_id))
        img = dl.read_and_preprocess(input_path, img_id, img_size=IMG_SIZE, normalize=NORMALIZE)
        print('---Saving...')
        # Write to a temporary name and rename afterwards: an interrupted save
        # must not leave a truncated .npy that the exists-check above would
        # treat as finished on the next run.
        tmp_path = output_path + '.tmp'
        with open(tmp_path, 'wb') as f:
            np.save(f, img)
        os.replace(tmp_path, output_path)
        print('---Done!')

Image ID:  905437475
---Processing 905437475.dcm
---Saving...
---Done!
Image ID:  1878852029
---Processing 1878852029.dcm
---Saving...
---Done!
Image ID:  1829474195
---Processing 1829474195.dcm
---Saving...
---Done!
Image ID:  1870285784
---Processing 1870285784.dcm
---Saving...
---Done!
Image ID:  398969865
---Processing 398969865.dcm
---Saving...
---Done!
Image ID:  2114167175
---Processing 2114167175.dcm
---Saving...
---Done!
Image ID:  133155485
---Processing 133155485.dcm
---Saving...
---Done!
Image ID:  1750012010
---Processing 1750012010.dcm
---Saving...
---Done!
Image ID:  84884968
---Processing 84884968.dcm
---Saving...
---Done!
Image ID:  164811751
---Processing 164811751.dcm
---Saving...
---Done!
Image ID:  950620623
---Processing 950620623.dcm
---Saving...
---Done!
Image ID:  951735282
---Processing 951735282.dcm
---Saving...
---Done!
Image ID:  300225852
---Processing 300225852.dcm
---Saving...
---Done!
Image ID:  1628548011
---Processing 1628548011.dcm
---Saving...
---Do

In [11]:
# Marker cell: prints a confirmation once execution reaches this point, i.e.
# after the test-set cells above have run.
print('Finished! (test set)')

Finished! (test set)


In [12]:
# Sanity check: count the per-image arrays found in ../test.
# (glob, os and re are imported in the notebook's import cell.)
path = "../test/*.npy"

# An exported image is named "<numeric image id>.npy". Matching on the basename
# with an anchored pattern works with any path separator and skips .npy files
# that are not per-image exports, instead of crashing on a failed match.
array_files = []
for filepath in glob.glob(path):
    match = re.fullmatch(r'([0-9]+)\.npy', os.path.basename(filepath))
    if match is not None:
        array_files.append(int(match.group(1)))

print(len(array_files))

10939
