In [None]:
import os
from glob import glob

import cv2
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io
import tensorflow
from PIL import Image
from tqdm import tqdm

from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import KFold
# from keras.preprocessing.image import ImageDataGenerator

from keras.applications import mobilenet, resnet50 #, vgg16, inception_v3, resnet50, 
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, History

from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

# from scikitplot.metrics import plot_roc_curve

## Params

In [None]:
# Root folder of the dataset. Set the PATHOBARILAN_DATA_DIR environment variable
# to point somewhere else without editing the notebook; the fallback is the
# original hard-coded location.
all_data_dir = os.environ.get(
    'PATHOBARILAN_DATA_DIR',
    '/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018',
)
# True: create_csv_for_folder strips all_data_dir from the listed file names.
is_relative_path_csv = False
# Seed shared by the K-Fold split and the DataFrame shuffles.
seed = 4221

# Substrings looked for in file paths to tell positive from negative samples.
pos_name_init = 'Cancer'
neg_name_init = 'Normal'

use_rgb = True  # True=rgb, False=spectral
# Crops are stored as .png (rgb) and .npy (spectral) side by side.
file_ext = '.png' if use_rgb else '.npy'

# Sliding-window geometry used when cutting slides into crops, in pixels.
window_size = (200, 200)
shift = (100, 100)

## utils

In [None]:
def read_slide(path):
    """Load one slide stored as a MATLAB ``.mat`` file.

    Args:
        path: path of the ``.mat`` file, passed to ``scipy.io.loadmat``.

    Returns:
        ``(spectral, rgb)``: the arrays stored under the ``"Spec"`` and
        ``"Section"`` keys of the file, in that order.

    Raises:
        KeyError: if either key is missing from the file.
    """
    mat = scipy.io.loadmat(path)
    return mat["Spec"], mat["Section"]

In [None]:
def create_batch_of_crops_from_slide(img, window_size, shift, vis_flag=False):
    """Cut an image into a grid of (possibly overlapping) fixed-size crops.

    ``window_size`` and ``shift`` are both ``(x, y)`` pairs: element 0 applies
    to the column axis (``img.shape[1]``) and element 1 to the row axis
    (``img.shape[0]``). The previous implementation counted grid positions
    with that convention but sliced with the two window extents swapped, so
    non-square windows produced wrongly shaped (and silently truncated) crops.
    Square windows behave exactly as before.

    Args:
        img: array indexed as ``[row, column, channel]``.
        window_size: ``(width, height)`` of each crop.
        shift: ``(x_step, y_step)`` between consecutive crops.
        vis_flag: if True, display the crops with ``visualize_batch_of_crops``.

    Returns:
        List of crops ordered column by column: for each x position, all
        y positions top to bottom. Empty when the image is smaller than the
        window along either axis.
    """
    win_w, win_h = window_size[0], window_size[1]
    step_x, step_y = shift[0], shift[1]

    # Number of window positions that fit completely inside the image.
    n_iter_x = (img.shape[1] - win_w) // step_x + 1
    n_iter_y = (img.shape[0] - win_h) // step_y + 1

    crops = []
    for i in range(n_iter_x):
        col = i * step_x
        for j in range(n_iter_y):
            row = j * step_y
            crops.append(img[row:row + win_h, col:col + win_w, :])

    if vis_flag:
        visualize_batch_of_crops(crops, n_iter_y, n_iter_x)
    return crops

In [None]:
def visualize_batch_of_crops(crops, n_iter_y, n_iter_x):
    """Show a list of crops as a gap-free grid of images.

    Args:
        crops: list of image arrays ordered column by column, i.e. the crop
            for grid cell (row ``j``, column ``i``) is ``crops[i*n_iter_y + j]``.
        n_iter_y: number of grid rows.
        n_iter_x: number of grid columns.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column; without it `axes[j, i]` fails for 1xN, Nx1 and 1x1 grids.
    fig, axes = plt.subplots(n_iter_y, n_iter_x, figsize=(5, 5), squeeze=False,
                             gridspec_kw={'wspace': 0, 'hspace': 0})

    for i in range(n_iter_x):
        for j in range(n_iter_y):
            ax = axes[j, i]
            ax.imshow(crops[i * n_iter_y + j])
            ax.axis('off')
            ax.set_aspect('equal')
    plt.show()

In [None]:
def create_crops_from_fileslist(fileslist, window_size, shift):
    """Cut every slide in ``fileslist`` into crops and save them to disk.

    For a slide ``<name>.mat`` the crops are written to a sibling folder
    ``<name>_win{w}-{h}_shift{x}-{y}``. Each crop is saved twice under the
    same zero-padded index: the rgb crop as ``.png`` and the spectral crop
    as ``.npy``.

    Args:
        fileslist: iterable of slide file paths readable by ``read_slide``.
        window_size: ``(w, h)`` crop size forwarded to the cropping helper.
        shift: ``(x, y)`` step forwarded to the cropping helper.
    """
    dir_suffix = '_win{}-{}_shift{}-{}'.format(window_size[0], window_size[1], shift[0], shift[1])

    for file in tqdm(fileslist):
        spectral, rgb = read_slide(file)
        spectral_crops = create_batch_of_crops_from_slide(spectral, window_size=window_size, shift=shift)
        rgb_crops = create_batch_of_crops_from_slide(rgb, window_size=window_size, shift=shift)

        # Strip only the file extension. str.replace('.mat', ...) would also
        # rewrite a '.mat' that appears earlier in the path.
        save_dir = os.path.splitext(file)[0] + dir_suffix
        os.makedirs(save_dir, exist_ok=True)

        for idx, (im_np, spec_np) in enumerate(zip(rgb_crops, spectral_crops)):
            Image.fromarray(im_np).save(os.path.join(save_dir, '{:05}.png'.format(idx)))
            np.save(os.path.join(save_dir, '{:05}.npy'.format(idx)), spec_np)


In [None]:
def create_crops_from_dir(dir_path, window_size, shift):
    """Create and save crops for every ``.mat`` slide directly inside ``dir_path``."""
    print('Saving crops for slides in dir: {}'.format(dir_path))
    mat_files = glob(dir_path + '/*.mat')
    create_crops_from_fileslist(fileslist=mat_files, window_size=window_size, shift=shift)

In [None]:
def create_csv_for_folder(data_dir, ext):
    """List the crop files of one slide folder together with binary labels.

    Files are collected from ``<data_dir>/*/*.<ext>``; any path containing
    ``"Mixed"`` is dropped. A file is labelled 1 when ``pos_name_init``
    occurs anywhere in its (possibly shortened) path, otherwise 0.

    Args:
        data_dir: folder whose immediate sub-folders hold the crop files.
        ext: file extension, with or without the leading dot.

    Returns:
        DataFrame with columns ``filename`` and ``label``, one row per file,
        ordered by file name.
    """
    if ext.startswith('.'):
        ext = ext[1:]

    # glob returns matches in arbitrary, filesystem-dependent order. Sorting
    # makes the seeded shuffles applied later reproducible across machines.
    files = sorted(glob(os.path.join(data_dir, '*', '*.{}'.format(ext))))
    files = [file for file in files if "Mixed" not in file]

    # With is_relative_path_csv the dataset root is collapsed to a single '/';
    # otherwise the replacement below ('/' -> '/') leaves paths untouched.
    delete_folder = all_data_dir if is_relative_path_csv else '/'
    if not delete_folder.endswith('/'):
        delete_folder += '/'
    files = [file.replace(delete_folder, '/') for file in files]

    labels = [1 if pos_name_init in file else 0 for file in files]
    return pd.DataFrame({'filename': files, 'label': labels}, columns=['filename', 'label'])

In [None]:
# One entry per slide folder directly under the dataset root.
# Sorted because glob order is arbitrary: the K-Fold split below works on
# positions in this list, so an unstable order would make the seeded split
# select different slides from run to run.
slides = sorted(glob(os.path.join(all_data_dir, "*/")))
slides

In [None]:
# 5-fold split over slide positions (indices into `slides`), shuffled with the
# notebook seed.
skf = KFold(n_splits=5, shuffle=True, random_state=seed)

train_slides_all, test_slides_all, val_slides_all = [], [], []

for train_index, test_index in skf.split(np.arange(len(slides))):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_slides_all.append(train_index)
    # Only the first two held-out slides of each fold are used: the first for
    # validation, the second for testing. Any further ones are ignored.
    val_slides_all.append([test_index[0]])
    test_slides_all.append([test_index[1]])

In [None]:
# Index of the K-Fold split used for the rest of the notebook.
i = 3

train_index, val_index, test_index = (
    train_slides_all[i],
    val_slides_all[i],
    test_slides_all[i],
)

train_index, val_index, test_index

In [None]:
def get_dfs_for_indices(slides, index_list):
    """Build one shuffled filename/label table for the selected slides.

    Args:
        slides: sequence of slide folder paths.
        index_list: positions in ``slides`` to include.

    Returns:
        The per-slide tables from ``create_csv_for_folder`` concatenated and
        then shuffled with the notebook ``seed``.
    """
    chosen_dirs = np.array(slides)[index_list]
    per_slide = [create_csv_for_folder(slide_dir, file_ext) for slide_dir in chosen_dirs]
    merged = pd.concat(per_slide, ignore_index=True)
    # frac=1 draws every row once, i.e. a full shuffle.
    return merged.sample(frac=1, random_state=seed)

In [None]:
# Filename/label tables for the three splits of the selected fold.
df_train = get_dfs_for_indices(slides=slides, index_list=train_index)
df_test = get_dfs_for_indices(slides=slides, index_list=test_index)
df_val = get_dfs_for_indices(slides=slides, index_list=val_index)

In [None]:
# Show long file paths in full when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 150)

In [None]:
# Sanity check: total vs. distinct row labels, then total vs. distinct column names.
print(len(df_train.index))
print(len(set(df_train.index.values)))

print(len(df_train.columns))
print(len(set(df_train.columns.values)))

In [None]:
# Every split must contain exactly two distinct label values.
assert all(len(set(split.label.values)) == 2 for split in (df_train, df_val, df_test))

#### `generator_from_df`, copied from:
https://gist.github.com/timehaven/257eef5b0e2d9e2625a9eb812ca2226b#file-akmtdfgen-py

In [None]:
def sample_norm(X):
    """Per-sample normalisation hook; currently returns ``X`` unchanged.

    Two normalisations tried earlier are disabled: min-max scaling shifted
    to the range (-0.5, 0.5), and division by 255.
    """
    return X

def generator_from_df(df, batch_size, shuffle=True):
    """Endlessly yield ``(X, Y)`` mini-batches read from the files listed in ``df``.

    Each pass over ``df`` yields ``len(df) // batch_size`` full batches; the
    remaining rows of that pass are skipped. Inputs are loaded as images when
    the global ``use_rgb`` is set and with ``np.load`` otherwise, and are
    passed through ``sample_norm``.

    Args:
        df: DataFrame with a ``filename`` and a ``label`` column.
        batch_size: number of rows per yielded batch.
        shuffle: reshuffle the rows before every pass.

    Yields:
        ``X``: the stacked inputs of one batch.
        ``Y``: the batch labels one-hot encoded over 2 classes.

    Raises:
        ValueError: if ``df`` has fewer rows than ``batch_size``. The previous
            version looped forever without yielding in that case, which hangs
            any consumer such as ``fit_generator``.
    """
    nbatches = df.shape[0] // batch_size
    if nbatches == 0:
        raise ValueError(
            "generator_from_df needs at least batch_size={} rows, got {}".format(batch_size, df.shape[0])
        )

    while True:
        if shuffle:
            df = df.sample(frac=1)  # frac=1 is same as shuffling df.

        # Mini-batches within one pass over the data.
        for start in range(0, nbatches * batch_size, batch_size):
            sub = df.iloc[start:start + batch_size]
            if use_rgb:
                # load_img only uses the first two entries of target_size, so
                # pass the spatial part of input_shape explicitly.
                X = [sample_norm(img_to_array(load_img(f, target_size=input_shape[:2]))) for f in sub.filename]
            else:
                X = [sample_norm(np.load(f)) for f in sub.filename]
            X = np.stack(X)
            Y = to_categorical(sub.label.values, num_classes=2)
            yield X, Y

In [None]:
# Model input shape: crop size plus 3 channels for rgb or 40 for spectral data.
w, h = window_size
input_shape = (w, h, 3) if use_rgb else (w, h, 40)
batch_size = 16

In [None]:
# Keep only the first `batch_size` rows (one batch) of the validation and test tables.
df_val = df_val.iloc[:batch_size]
df_test = df_test.iloc[:batch_size]

In [None]:
# Debug switch: when True, validation and test reuse the training table so you
# can check that the model is able to overfit. It must stay False for real runs:
# otherwise val_loss, the LR schedule and the "best" checkpoint are all measured
# on training data and say nothing about generalisation.
overfit_sanity_check = False
if overfit_sanity_check:
    df_val = df_train
    df_test = df_train

In [None]:
# Train/validation batches are reshuffled on every pass; test batches keep the
# row order of df_test.
train_generator = generator_from_df(df_train, batch_size, shuffle=True)
val_generator = generator_from_df(df_val, batch_size, shuffle=True)
test_generator = generator_from_df(df_test, batch_size, shuffle=False)

In [None]:
# Pull a single batch from the training generator and inspect it.
xx, yy = next(train_generator)
print(xx.shape, "\n", yy)

In [None]:
# MobileNet with its classification head, no pretrained weights, 2 output classes.
# (resnet50.ResNet50 accepts the same arguments if a ResNet is wanted instead.)
mobilenet_model = mobilenet.MobileNet(
    include_top=True,
    weights=None,
    input_shape=input_shape,
    classes=2,
)

In [None]:
# Print the model's layer-by-layer summary.
mobilenet_model.summary()

In [None]:
optimizer = Adam(lr=1e-3)
mobilenet_model.compile(loss="binary_crossentropy", optimizer=optimizer)

# Multiply the learning rate by 0.3 after 4 epochs without val_loss improvement,
# never going below 1e-6.
lrReduce = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=4, verbose=1, min_lr=1e-6)

# ModelCheckpoint is pointed at a path inside my_models/ and nothing else in the
# notebook creates that folder, so make sure it exists before training starts
# rather than failing when the first checkpoint is written.
os.makedirs("my_models", exist_ok=True)
if use_rgb:
    chkpnt_path = "my_models/model_rgb_weights_epoch{epoch:02d}-val_loss{val_loss:.3f}.hdf5"
else:
    chkpnt_path = "my_models/model_spec_weights_epoch{epoch:02d}-val_loss{val_loss:.3f}.hdf5"
# save_best_only: write a file only when the monitored value improves.
chkpnt = ModelCheckpoint(chkpnt_path, save_best_only=True)

num_of_epochs = 100

In [None]:
# Number of full batches per pass over each table.
STEP_SIZE_TRAIN = len(df_train) // batch_size
STEP_SIZE_VALID = len(df_val) // batch_size

history = mobilenet_model.fit_generator(
    generator=train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=val_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=num_of_epochs,
    callbacks=[lrReduce, chkpnt],
    shuffle=False,
)

In [None]:
def plot_otsu_triplet(org, threshed):
    """Plot an image, its grayscale version and its thresholded mask side by side.

    Args:
        org: colour image accepted by ``cv2.cvtColor``.
        threshed: thresholded single-channel image shown in the third panel.
    """
    # NOTE(review): BGR channel order is assumed here, matching
    # get_otsu_treshed_img; confirm against how the images are loaded.
    gray = cv2.cvtColor(org, cv2.COLOR_BGR2GRAY)  # was cv2.BGR2GRAY, which does not exist

    plt.figure(figsize=(15, 8))
    plt.subplot(131)
    plt.imshow(org)
    plt.subplot(132)
    plt.imshow(gray, cmap="gray")
    plt.subplot(133)
    plt.imshow(threshed, cmap="gray")  # was the undefined name `thr`

In [None]:
def get_otsu_treshed_img(img, i):
    """Return the Otsu-thresholded mask of image ``i`` from a batch.

    Args:
        img: indexable batch of colour images; ``img[i]`` is cast to uint8.
        i: index of the image to threshold. It was previously overwritten
            with a hard-coded 10, so the argument had no effect.

    Returns:
        Binary (0/255) mask produced by Otsu thresholding of the blurred
        grayscale image.
    """
    # Presumably guards against inputs scaled to [0, 1], which would collapse
    # to zeros in the uint8 cast below.
    assert img.max() > 1
    gray = cv2.cvtColor((img[i]).astype(np.uint8), cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # With THRESH_OTSU the threshold value is chosen automatically; 0 is a placeholder.
    ret, thr = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thr

In [None]:
# Loss on the single batch (xx, yy) drawn from train_generator earlier.
# The previous call referenced `xxx` / `yyy`, which are never defined.
mobilenet_model.evaluate(xx, yy)

In [None]:
# y_pred = mobilenet_model.predict_generator(train_generator, steps=1)
# Predict on xx, the batch drawn from train_generator earlier; one row of
# 2 class outputs per sample.
y_pred = mobilenet_model.predict(xx)
y_pred

In [None]:
# Predicted class next to the true class of the same batch. yy is one-hot, so
# its second column is the positive-class label. (`y_train` was never defined.)
y_pred.argmax(axis=1), yy[:, 1].astype(int)

In [None]:
# Ground-truth labels for y_pred. y_pred is computed from the batch xx, so the
# matching labels are the one-hot yy of that same batch. The first rows of
# df_test are not aligned with it, because train_generator reshuffles its rows.
y_test = yy[:, 1].astype(int)

In [None]:
# ROC curve of the positive class. `plot_roc_curve` was never imported (its
# scikitplot import is commented out), so the curve is built with sklearn.
# y_pred must still hold the 2-column class outputs here.
fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1])

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="ROC (AUC = {:.3f})".format(auc(fpr, tpr)))
ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="chance")
ax.set(xlabel="False positive rate", ylabel="True positive rate", title="ROC curve")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Collapse the per-class outputs to the index of the highest-scoring class.
# Not idempotent: y_pred is overwritten, so this cell must not be run twice.
y_pred = y_pred.argmax(axis=1)
y_pred.shape

In [None]:
# Sum of the predicted class indices, i.e. how many samples were predicted as class 1.
y_pred.sum()

In [None]:

# Sum of the ground-truth labels in y_test.
y_test.sum()

In [None]:
# Accuracy: fraction of predictions equal to the label. The mean divides by the
# actual number of samples instead of the hard-coded 220.
(y_pred == y_test).mean()