In [1]:
import tensorflow as tf
import scipy.io 
import matplotlib.pyplot as plt
import cv2
import keras
from glob import glob
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import pandas as pd
import cv2

from sklearn.model_selection import KFold
# from keras.preprocessing.image import ImageDataGenerator

# import keras_metrics

from keras.applications import mobilenet, resnet50 #, vgg16, inception_v3, resnet50, 
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, History

from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

import logging
# logging.getLogger().setLevel(logging.DEBUG)


# from scikitplot.metrics import plot_roc_curve

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Show the installed Keras version so the cell output records what this notebook ran against.
keras.__version__

'2.1.5'

## Params

In [3]:
# --- Dataset location --------------------------------------------------------
# Root folder of the dataset (one sub-folder per case).
# Alternative location used on another machine:
#   '/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018'
all_data_dir = 'E:\\Work/PathoBarIlan/Shlomi2018/'
is_relative_path_csv = False  # read by create_csv_for_folder when listing files
seed = 4221                   # random_state for the K-fold split and DataFrame shuffling

# --- Class name markers ------------------------------------------------------
# pos_name_init is matched against file paths to assign label 1;
# neg_name_init is not referenced by the cells visible in this notebook.
pos_name_init, neg_name_init = 'Cancer', 'Normal'

# --- Input modality ----------------------------------------------------------
use_rgb = False  # True=rgb, False=spectral
file_ext = '.png' if use_rgb else '.npy'

# --- Sliding-window cropping -------------------------------------------------
window_size, shift = (200, 200), (100, 100)

In [4]:
# Network input shape: one crop of `window_size` with 3 channels for RGB input
# or 40 channels for spectral input.
w, h = window_size
n_channels = 3 if use_rgb else 40
input_shape = (w, h, n_channels)
batch_size = 16

## utils

In [5]:
def read_slide(path):
    """Load one slide from a MATLAB ``.mat`` file.

    Args:
        path: path of the ``.mat`` file, passed to ``scipy.io.loadmat``.

    Returns:
        Tuple ``(spectral, rgb)`` holding the file's ``"Spec"`` and ``"Section"``
        variables, in that order.

    Raises:
        KeyError: if the file lacks a ``"Spec"`` or ``"Section"`` variable.
    """
    mat = scipy.io.loadmat(path)
    return mat["Spec"], mat["Section"]

In [6]:
def create_batch_of_crops_from_slide(img, window_size, shift, vis_flag=False):
    """Cut an image into overlapping sliding-window crops.

    Args:
        img: array indexed as ``img[row, column, channel]``.
        window_size: crop extent; each crop spans ``window_size[0]`` rows and
            ``window_size[1]`` columns.
        shift: stride between crops; ``shift[0]`` is the step along columns and
            ``shift[1]`` the step along rows.
        vis_flag: when True, display the crops with ``visualize_batch_of_crops``.

    Returns:
        List of ``img`` slices of shape ``(window_size[0], window_size[1], C)``.
        The outer loop walks column positions and the inner loop row positions,
        so crop ``i*n_rows + j`` sits at column step ``i`` and row step ``j``.
        The list is empty when the window does not fit inside the image.
    """
    win_rows, win_cols = window_size[0], window_size[1]
    col_step, row_step = shift[0], shift[1]

    # Count only positions where a complete window fits. Each image axis is
    # paired with the window extent actually used for slicing along that axis,
    # otherwise non-square windows yield truncated or missing crops.
    # max(..., 0) covers images smaller than the window.
    n_iter_x = max((img.shape[1] - win_cols)//col_step + 1, 0)
    n_iter_y = max((img.shape[0] - win_rows)//row_step + 1, 0)

    crops = []
    for i in range(n_iter_x):
        col0 = i*col_step
        for j in range(n_iter_y):
            row0 = j*row_step
            crops.append(img[row0:row0+win_rows, col0:col0+win_cols, :])

    # Nothing to draw when no crop fits.
    if vis_flag and crops:
        visualize_batch_of_crops(crops, n_iter_y, n_iter_x)
    return crops

In [7]:
def visualize_batch_of_crops(crops, n_iter_y, n_iter_x):
    """Show crops as a gap-free grid of ``n_iter_y`` rows by ``n_iter_x`` columns.

    Args:
        crops: sequence of images accepted by ``Axes.imshow``; the crop drawn at
            grid row ``j``, column ``i`` is ``crops[i*n_iter_y + j]``.
        n_iter_y: number of grid rows.
        n_iter_x: number of grid columns.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the axes[j, i] indexing below is always valid.
    fig, axes = plt.subplots(n_iter_y, n_iter_x, figsize=(5, 5), squeeze=False,
                             gridspec_kw={'wspace': 0, 'hspace': 0})

    for i in range(n_iter_x):
        for j in range(n_iter_y):
            ax = axes[j, i]
            ax.imshow(crops[i*n_iter_y + j])
            ax.axis('off')
            ax.set_aspect('equal')
    plt.show()

In [8]:
def create_crops_from_fileslist(fileslist, window_size, shift):
    """Crop every slide in `fileslist` and save the crops next to the source file.

    Each file is read with ``read_slide``, and both returned arrays are cropped
    with ``create_batch_of_crops_from_slide``. The crops go into a folder named
    after the file (extension removed) plus the suffix
    ``_win{w0}-{w1}_shift{s0}-{s1}``. Crop ``idx`` is stored as ``{idx:05}.png``
    (RGB crop, via PIL) and ``{idx:05}.npy`` (spectral crop).

    Args:
        fileslist: iterable of slide file paths.
        window_size: forwarded to ``create_batch_of_crops_from_slide``.
        shift: forwarded to ``create_batch_of_crops_from_slide``.

    Returns:
        None.
    """
    dir_suffix = '_win{}-{}_shift{}-{}'.format(window_size[0], window_size[1], shift[0], shift[1])

    for file in tqdm(fileslist):
        spectral, rgb = read_slide(file)
        spectral_crops = create_batch_of_crops_from_slide(spectral, window_size=window_size, shift=shift)
        rgb_crops = create_batch_of_crops_from_slide(rgb, window_size=window_size, shift=shift)

        # Strip only the trailing extension. str.replace('.mat', ...) would also
        # rewrite a '.mat' occurring elsewhere in the path, and would leave the
        # path unchanged for a differently-cased extension such as '.MAT'.
        save_dir = os.path.splitext(file)[0] + dir_suffix
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_dir, exist_ok=True)

        for idx, (im_np, spec_np) in enumerate(zip(rgb_crops, spectral_crops)):
            crop_stem = os.path.join(save_dir, '{:05}'.format(idx))
            Image.fromarray(im_np).save(crop_stem + '.png')
            np.save(crop_stem + '.npy', spec_np)


In [9]:
def create_crops_from_dir(dir_path, window_size, shift):
    """Create and save crops for every ``.mat`` file directly inside `dir_path`.

    Prints a progress line, globs ``dir_path + '/*.mat'`` and hands the matches
    to ``create_crops_from_fileslist``.
    """
    print('Saving crops for slides in dir: {}'.format(dir_path))
    mat_pattern = dir_path + '/*.mat'
    create_crops_from_fileslist(glob(mat_pattern), window_size, shift)

In [10]:
def create_csv_for_folder(data_dir, ext):
    """List the crop files of one slide folder together with a binary label.

    Searches ``data_dir/*/*.<ext>`` and skips every path containing ``"Mixed"``.
    A file gets label 1 when its path contains the module-level
    ``pos_name_init``, otherwise 0. Despite the name, nothing is written to
    disk; the table is only returned.

    Reads the module-level settings ``all_data_dir``, ``is_relative_path_csv``
    and ``pos_name_init``.

    Args:
        data_dir: slide folder whose sub-folders hold the crop files.
        ext: file extension, with or without the leading dot.

    Returns:
        pandas.DataFrame with columns ``filename`` and ``label``.
    """
    # startswith()/endswith() are safe on empty strings, unlike ext[0] / s[-1].
    if ext.startswith('.'):
        ext = ext[1:]

    files = glob(os.path.join(data_dir, '*', '*.{}'.format(ext)))
    files = [file for file in files if "Mixed" not in file]

    # Prefix replaced by '/' in every path: the dataset root when relative
    # paths are requested, otherwise '/' itself (which leaves paths unchanged).
    delete_folder = all_data_dir if is_relative_path_csv else '/'
    if not delete_folder.endswith('/'):
        delete_folder += '/'
    files = [file.replace(delete_folder, '/') for file in files]

    labels = [1 if pos_name_init in file else 0 for file in files]

    data_df = pd.DataFrame(columns=['filename', 'label'])
    data_df['filename'] = files
    data_df['label'] = labels
    return data_df

In [11]:
# One folder per case. glob() gives no ordering guarantee, so sort the result:
# the K-fold split below indexes into this list, and a stable order keeps the
# folds identical across runs and machines.
slides = sorted(glob(os.path.join(all_data_dir, "*/")))
slides

['E:\\Work/PathoBarIlan/Shlomi2018\\Case10\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case11\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case12\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case14\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case16\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case16b\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case17\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case18\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case19484\\',
 'E:\\Work/PathoBarIlan/Shlomi2018\\Case8\\']

In [12]:
# Slide-level 5-fold split. The held-out slides of each fold are divided
# between validation (first half) and test (remainder).
skf = KFold(n_splits=5, shuffle=True, random_state=seed)

train_slides_all = []
test_slides_all = []
val_slides_all = []

slide_ids = np.arange(len(slides))
for train_index, test_index in skf.split(slide_ids):
    print("TRAIN:", train_index, "TEST:", test_index)
    if len(test_index) < 2:
        # One slide cannot serve as both validation and test set.
        raise ValueError(
            "Each fold needs at least 2 held-out slides, got {}".format(len(test_index)))
    # With exactly 2 held-out slides this gives [test_index[0]] / [test_index[1]];
    # with more, no held-out slide is silently discarded.
    n_val = len(test_index)//2
    train_slides_all.append(train_index)
    val_slides_all.append(list(test_index[:n_val]))
    test_slides_all.append(list(test_index[n_val:]))

TRAIN: [2 3 4 5 6 7 8 9] TEST: [0 1]
TRAIN: [0 1 2 3 5 7 8 9] TEST: [4 6]
TRAIN: [0 1 3 4 5 6 7 8] TEST: [2 9]
TRAIN: [0 1 2 4 6 7 8 9] TEST: [3 5]
TRAIN: [0 1 2 3 4 5 6 9] TEST: [7 8]


In [13]:
i = 3  # take one of the K-Folds

# Slide indices of the chosen fold, one list/array per split.
train_index, val_index, test_index = train_slides_all[i], val_slides_all[i], test_slides_all[i]

train_index, val_index, test_index

(array([0, 1, 2, 4, 6, 7, 8, 9]), [3], [5])

In [14]:
def get_dfs_for_indices(slides, index_list):
    """Build one shuffled file/label table for the slides selected by `index_list`.

    Calls ``create_csv_for_folder`` (with the module-level ``file_ext``) for each
    selected slide folder, concatenates the results with a fresh index, and
    returns the rows shuffled with ``random_state=seed``.
    """
    selected_dirs = np.array(slides)[index_list]
    per_slide = [create_csv_for_folder(slide_dir, file_ext) for slide_dir in selected_dirs]
    merged = pd.concat(per_slide, ignore_index=True)
    # Sampling the full fraction is a seeded shuffle of all rows.
    return merged.sample(frac=1, random_state=seed)

In [15]:
# File/label tables for the three splits (built in the order train, test, val).
df_train, df_test, df_val = (
    get_dfs_for_indices(slides, split_index)
    for split_index in (train_index, test_index, val_index)
)

In [16]:
# Widen displayed columns so long file paths are not truncated.
pd.set_option('display.max_colwidth', 150)

In [17]:
# Sanity check: for the row index and then the column labels of df_train,
# print the total count followed by the number of distinct values.
for axis_labels in (df_train.index.values, df_train.columns.values):
    print(len(axis_labels))
    print(len(set(axis_labels)))

2991
2991
2
2


In [18]:
# Every split must contain exactly two distinct label values.
assert all(len(set(split_df.label.values)) == 2 for split_df in (df_train, df_val, df_test))

In [19]:
# NOTE(review): this cell shrinks the training table to a handful of rows and
# then points the validation and test tables at the very same rows. Every
# validation/test number computed afterwards is therefore measured on training
# data. Presumably this is a "can the model overfit a tiny set" check; confirm
# before trusting any metric below.
overfit_sanity_check = True  # set to False to keep the real train/val/test splits
overfit_n_samples = 35
if overfit_sanity_check:
    df_train = df_train[:overfit_n_samples]
    df_test = df_train
    df_val = df_train

In [20]:
# Number of whole batches in each split (rows beyond the last full batch are not counted).
n_batches_train, n_batches_test, n_batches_val = (
    split_df.shape[0]//batch_size for split_df in (df_train, df_test, df_val)
)

In [21]:
def sample_norm(X):
    """Per-sample normalisation hook; currently returns `X` unchanged."""
    return X


def generator_from_df(df, batch_size, shuffle=True):
    """Endlessly yield ``(X, Y)`` batches for the files listed in `df`.

    Each pass over `df` produces ``len(df) // batch_size`` batches; rows that do
    not fill a complete batch are skipped. Files are loaded with ``load_img``
    when the module-level ``use_rgb`` is true and with ``np.load`` otherwise,
    then passed through ``sample_norm``.

    Args:
        df: DataFrame with a ``filename`` and a ``label`` column.
        batch_size: number of samples per batch.
        shuffle: when True, rows are re-shuffled (unseeded) before every pass;
            when False, batches follow the row order of `df`.

    Yields:
        Tuple ``(X, Y)``: ``X`` is the stacked samples of the batch, ``Y`` the
        labels one-hot encoded over 2 classes.

    Raises:
        ValueError: on the first ``next()`` if `df` has fewer than `batch_size`
            rows.
    """
    n_batches = df.shape[0]//batch_size
    if n_batches == 0:
        # Without this guard the `while True` loop below would spin forever
        # without ever yielding a batch.
        raise ValueError(
            "generator_from_df needs at least batch_size={} rows, got {}".format(
                batch_size, df.shape[0]))

    while True:
        if shuffle:
            df_tmp = df.copy().sample(frac=1)  # frac=1 is same as shuffling df.
        else:
            df_tmp = df

        for i in range(n_batches):
            sub = df_tmp.iloc[batch_size*i:batch_size*(i+1)]
            if use_rgb:
                X = [sample_norm(img_to_array(load_img(f, target_size=input_shape))) for f in sub.filename]
            else:
                X = [sample_norm(np.load(f)) for f in sub.filename]

            logging.debug(f"from file {sub.iloc[0].filename}\nto file {sub.iloc[-1].filename}")

            X = np.stack(X)
            Y = sub.label.values
            Y = to_categorical(Y, num_classes=2)
            # Simple model, one input, one output.

            yield X, Y

**Note:** `generator_from_df` is adapted from this gist:
https://gist.github.com/timehaven/257eef5b0e2d9e2625a9eb812ca2226b#file-akmtdfgen-py

In [22]:
# Training and validation batches are re-shuffled on every pass;
# test batches follow the row order of df_test.
train_generator = generator_from_df(df_train, batch_size=batch_size, shuffle=True)
val_generator = generator_from_df(df_val, batch_size=batch_size, shuffle=True)
test_generator = generator_from_df(df_test, batch_size=batch_size, shuffle=False)

In [23]:
# MobileNet trained from scratch (weights=None) with a 2-class top, using the
# notebook-level `input_shape`.
# Alternative backbone:
#   resnet50.ResNet50(include_top=True, weights=None, input_shape=input_shape, classes=2)
mobilenet_model = mobilenet.MobileNet(
    include_top=True,
    weights=None,
    input_shape=input_shape,
    classes=2,
    dropout=0.2,
)

  str(input_shape[-1]) + ' input channels.')


In [24]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
mobilenet_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200, 200, 40)      0         
_________________________________________________________________
conv1_pad (ZeroPadding2D)    (None, 202, 202, 40)      0         
_________________________________________________________________
conv1 (Conv2D)               (None, 100, 100, 32)      11520     
_________________________________________________________________
conv1_bn (BatchNormalization (None, 100, 100, 32)      128       
_________________________________________________________________
conv1_relu (Activation)      (None, 100, 100, 32)      0         
_________________________________________________________________
conv_pad_1 (ZeroPadding2D)   (None, 102, 102, 32)      0         
_________________________________________________________________
conv_dw_1 (DepthwiseConv2D)  (None, 100, 100, 32)      288       
__________

_________________________________________________________________
conv_pad_9 (ZeroPadding2D)   (None, 15, 15, 512)       0         
_________________________________________________________________
conv_dw_9 (DepthwiseConv2D)  (None, 13, 13, 512)       4608      
_________________________________________________________________
conv_dw_9_bn (BatchNormaliza (None, 13, 13, 512)       2048      
_________________________________________________________________
conv_dw_9_relu (Activation)  (None, 13, 13, 512)       0         
_________________________________________________________________
conv_pw_9 (Conv2D)           (None, 13, 13, 512)       262144    
_________________________________________________________________
conv_pw_9_bn (BatchNormaliza (None, 13, 13, 512)       2048      
_________________________________________________________________
conv_pw_9_relu (Activation)  (None, 13, 13, 512)       0         
_________________________________________________________________
conv_pad_1

In [25]:
optimizer = Adam(lr=1e-3) # 1e-3
mobilenet_model.compile(loss="binary_crossentropy", optimizer=optimizer) #  binary_crossentropy , categorical_crossentropy

# Multiply the learning rate by 0.3 after 4 epochs without val_loss improvement, down to 1e-6.
lrReduce = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=4, verbose=1, min_lr=1e-6)

# The checkpoint paths below live in "my_models/". Create the folder up front
# so the first checkpoint write does not fail on a fresh checkout.
os.makedirs("my_models", exist_ok=True)

# Keep a checkpoint only when the monitored value improves (save_best_only=True);
# the file name records the modality, the epoch and the validation loss.
if use_rgb:
    chkpnt = ModelCheckpoint("my_models/model_rgb_weights_epoch{epoch:02d}-val_loss{val_loss:.3f}.hdf5", save_best_only=True)
else:
    chkpnt = ModelCheckpoint("my_models/model_spec_weights_epoch{epoch:02d}-val_loss{val_loss:.3f}.hdf5", save_best_only=True)
num_of_epochs = 100

In [None]:
# Steps per epoch = number of whole batches in each table.
STEP_SIZE_TRAIN, STEP_SIZE_VALID = len(df_train)//batch_size, len(df_val)//batch_size

history = mobilenet_model.fit_generator(
    generator=train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=val_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=num_of_epochs,
    callbacks=[lrReduce, chkpnt],
    shuffle=False,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100

Epoch 00047: ReduceLROnPlateau reducing learning rate to 2.700000040931627e-05.
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100

Epoch 00051: ReduceLROnPlateau reducing learning rate to 8.100000013655517e-06.
Epoch 52/100
Epoch 53/

In [None]:
# Evaluate on the test table. The previous call used `xxx` / `yyy`, which are
# not defined anywhere in this notebook. A fresh unshuffled generator is built
# here so the result does not depend on how many batches earlier cells have
# already pulled from a shared generator.
STEP_SIZE_TEST = len(df_test)//batch_size
mobilenet_model.evaluate_generator(
    generator_from_df(df_test, batch_size, shuffle=False),
    steps=STEP_SIZE_TEST,
)

In [None]:
# Predict class scores for the test table. The previous call used `xx`, which
# is not defined anywhere in this notebook. A fresh unshuffled generator makes
# row k of y_pred correspond to row k of df_test.
y_pred = mobilenet_model.predict_generator(
    generator_from_df(df_test, batch_size, shuffle=False),
    steps=len(df_test)//batch_size,
)
y_pred

In [None]:
# Predicted class next to the true label. The previous version read `y_train`,
# which is not defined anywhere in this notebook. Labels come from df_test,
# truncated to the number of predictions (only whole batches are predicted).
y_pred.argmax(axis=1), df_test['label'].values[:len(y_pred)].astype(int)

In [None]:
# True labels for the predicted rows (as many labels as there are predictions).
y_test = df_test['label'].values[:len(y_pred)]

In [None]:
# `plot_roc_curve` is never imported in this notebook (its scikitplot import is
# commented out), so the ROC curve is drawn with scikit-learn and matplotlib.
from sklearn.metrics import auc, roc_curve

# Column 1 of y_pred is the score of label 1 (labels are one-hot encoded over 2 classes).
fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1])

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC (AUC = {:.3f})'.format(auc(fpr, tpr)))
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False positive rate', ylabel='True positive rate', title='ROC curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Collapse the per-class scores to the index of the highest-scoring class.
# Not re-runnable: after this cell y_pred is 1-D, so running it a second time fails.
y_pred = y_pred.argmax(axis=1)
y_pred.shape

In [None]:
# Sum of the predicted class indices, i.e. how many samples were predicted as class 1.
y_pred.sum()

In [None]:
# Sum of the true labels, i.e. how many samples carry label 1.
y_test.sum()

In [None]:
# Accuracy: fraction of predictions equal to the true label. The mean divides
# by the actual number of compared samples instead of a hard-coded 220.
(y_pred == y_test).mean()