In [None]:
import scipy.io 
import matplotlib.pyplot as plt
import cv2
import keras
from glob import glob
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import pandas as pd

from keras.applications import mobilenet #, vgg16, inception_v3, resnet50, 
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

## Params

In [None]:
# Folder holding the raw .mat slides of the case being processed.
data_dir = 'E:/Datasets/PathoBarIlan/Case8'

# Name prefixes that mark a slide/folder as positive or negative.
pos_name_init = 'Cancer'
neg_name_init = 'Normal'

# Raw slide files, grouped by the prefix of their file name.
normal_paths = glob(data_dir + '/Normal*.mat')
cancer_paths = glob(data_dir + '/Cancer*.mat')
mixed_paths = glob(data_dir + '/Mixed*.mat')

# Modality selector: True = RGB crops (.png), False = spectral crops (.npy).
use_rgb = False
file_ext = '.png' if use_rgb else '.npy'

# Crop window size and the step between consecutive windows.
window_size = (200, 200)
shift = (100, 100)

## Utilities

In [None]:
def read_slide(path):
    """Load one slide from a MATLAB ``.mat`` file.

    Parameters
    ----------
    path : str
        Path of the ``.mat`` file, read with ``scipy.io.loadmat``.

    Returns
    -------
    tuple
        ``(spectral, rgb)``: the file's ``"Spec"`` variable followed by its
        ``"Section"`` variable.

    Raises
    ------
    KeyError
        If the file has no ``"Spec"`` or no ``"Section"`` variable.
    """
    mat = scipy.io.loadmat(path)
    return mat["Spec"], mat["Section"]

In [None]:
def create_batch_of_crops_from_slide(img, window_size, shift, vis_flag=False):
    """Cut a slide into sliding-window crops.

    ``window_size`` and ``shift`` are ``(x, y)`` pairs: element 0 applies to
    the column axis (``img.shape[1]``) and element 1 to the row axis
    (``img.shape[0]``). Only windows that fit entirely inside the image are
    produced, so an image smaller than the window yields an empty list.

    Parameters
    ----------
    img : array
        Image indexed as ``img[row, column, channel]`` (three axes required).
    window_size : sequence of two ints
        Crop extent along columns and rows, in that order.
    shift : sequence of two ints
        Step between consecutive windows along columns and rows.
    vis_flag : bool, optional
        When True, the crops are also shown with ``visualize_batch_of_crops``.

    Returns
    -------
    list
        Slices of ``img``, ordered column offset by column offset: all row
        positions for the first column offset come first, then the next.
    """
    win_w, win_h = window_size[0], window_size[1]
    step_x, step_y = shift[0], shift[1]

    n_iter_x = (img.shape[1] - win_w) // step_x + 1
    n_iter_y = (img.shape[0] - win_h) // step_y + 1

    crops = []
    for i in range(n_iter_x):
        col = i * step_x
        for j in range(n_iter_y):
            row = j * step_y
            # Rows take the y extent and columns the x extent, matching the
            # axes used for n_iter_y / n_iter_x above (the two were swapped,
            # which only went unnoticed for square windows).
            crops.append(img[row:row + win_h, col:col + win_w, :])
    if vis_flag:
        visualize_batch_of_crops(crops, n_iter_y, n_iter_x)
    return crops

In [None]:
def visualize_batch_of_crops(crops, n_iter_y, n_iter_x):
    """Show the crops of one slide as a gap-free grid of images.

    Parameters
    ----------
    crops : sequence
        Crops ordered column by column, i.e. the crop for grid cell
        ``(row j, column i)`` sits at index ``i * n_iter_y + j``.
    n_iter_y : int
        Number of grid rows.
    n_iter_x : int
        Number of grid columns.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        the grid has no rows or no columns.
    """
    if n_iter_y < 1 or n_iter_x < 1:
        # An empty grid has nothing to draw.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so the axes[j, i] lookup below always works.
    fig, axes = plt.subplots(n_iter_y, n_iter_x, figsize=(5, 5), squeeze=False,
                             gridspec_kw={'wspace': 0, 'hspace': 0})

    for i in range(n_iter_x):
        for j in range(n_iter_y):
            ax = axes[j, i]
            ax.imshow(crops[i*n_iter_y + j])
            ax.axis('off')
            ax.set_aspect('equal')
    plt.show()

In [None]:
def create_crops_from_fileslist(fileslist, window_size, shift):
    """Crop every slide in ``fileslist`` and write the crops to disk.

    For each slide file a folder named
    ``<path without extension>_win{w0}-{w1}_shift{s0}-{s1}`` is created next
    to it (if missing). Crop number ``idx`` is stored there twice: the RGB
    crop as ``{idx:05}.png`` and the spectral crop as ``{idx:05}.npy``.

    Parameters
    ----------
    fileslist : iterable of str
        Paths of the ``.mat`` slides, each readable by ``read_slide``.
    window_size, shift : sequence of two ints
        Forwarded unchanged to ``create_batch_of_crops_from_slide``.

    Returns
    -------
    None
        Crops are only written to disk; nothing is accumulated in memory
        across slides.
    """
    # Identical for every slide, so build it once.
    suffix = '_win{}-{}_shift{}-{}'.format(window_size[0], window_size[1], shift[0], shift[1])

    for file in tqdm(fileslist):
        spectral, rgb = read_slide(file)
        spectral_crops = create_batch_of_crops_from_slide(spectral, window_size=window_size, shift=shift)
        rgb_crops = create_batch_of_crops_from_slide(rgb, window_size=window_size, shift=shift)

        # Strip only the trailing extension: str.replace('.mat', ...) would
        # also rewrite a '.mat' occurring earlier in the path and would leave
        # a differently-cased extension in place.
        save_dir = os.path.splitext(file)[0] + suffix
        os.makedirs(save_dir, exist_ok=True)

        # RGB and spectral crops are paired by position; both crop lists come
        # from the same window grid.
        for idx, (im_np, spec_np) in enumerate(zip(rgb_crops, spectral_crops)):
            stem = os.path.join(save_dir, '{:05}'.format(idx))
            Image.fromarray(im_np).save(stem + '.png')
            np.save(stem + '.npy', spec_np)

In [None]:
def create_crops_from_dir(dir_path, window_size, shift):
    """Crop and save every ``.mat`` slide found directly inside ``dir_path``.

    Prints a status line, collects the files matching ``dir_path/*.mat`` and
    hands them to ``create_crops_from_fileslist``. Nothing is returned.
    """
    print('Saving crops for slides in dir: {}'.format(dir_path))
    mat_pattern = dir_path + '/*.mat'
    slide_files = glob(mat_pattern)
    create_crops_from_fileslist(slide_files, window_size, shift)

In [None]:
def create_csv_for_folder(data_dir, ext, pos_name=None):
    """Write a ``filename,label`` CSV indexing the files under ``data_dir``.

    Every file matching ``data_dir/*/*.<ext>`` gets one row. ``filename`` is
    the path with the ``data_dir`` prefix removed (it keeps its leading
    separator); ``label`` is 1 when ``pos_name`` occurs in that relative path
    and 0 otherwise. The CSV is saved as ``data_dir/<name of data_dir>.csv``.

    Parameters
    ----------
    data_dir : str
        Folder whose immediate sub-folders hold the files. A trailing path
        separator is tolerated.
    ext : str
        File extension, with or without the leading dot.
    pos_name : str, optional
        Substring that marks a positive file. Defaults to the notebook-level
        ``pos_name_init``.

    Returns
    -------
    None
        The CSV is written to disk and a confirmation line is printed.
    """
    if pos_name is None:
        pos_name = pos_name_init
    # startswith is safe for an empty extension, unlike ext[0].
    if ext.startswith('.'):
        ext = ext[1:]

    # Ignore trailing separators so that neither the CSV name nor the
    # stripped prefix depends on how the caller spelled the folder.
    trimmed_dir = data_dir.rstrip(os.sep + (os.altsep or ''))

    # Sorted so the row order does not depend on directory listing order.
    files = sorted(glob(os.path.join(data_dir, '*', '*.{}'.format(ext))))

    # Drop the data_dir prefix before labelling, so a positive marker inside
    # data_dir itself cannot label every file as positive.
    init_len = len(trimmed_dir)
    files = [file[init_len:] for file in files]
    labels = [1 if pos_name in file else 0 for file in files]

    data_df = pd.DataFrame({'filename': files, 'label': labels}, columns=['filename', 'label'])
    data_df.to_csv(os.path.join(data_dir, os.path.basename(trimmed_dir)+'.csv'), index=False)
    print('Created CSV successfully for folder {}'.format(data_dir))

## Test and visualization

## Prepare data

##### old prepare data

## Build and train model

############

In [None]:
# Sub-folders directly under data_dir; the pattern's trailing "/" makes glob
# match directories only.
folders = glob(data_dir+"/*/")

In [None]:
# Split the crop folders by class. The class prefix is looked up in the
# folder's own name only, so a parent directory that happens to contain the
# prefix cannot pull every folder into that class. The prefixes come from the
# params cell instead of being repeated here as literals.
pos_folders = [i for i in folders if pos_name_init in os.path.basename(os.path.normpath(i))]
neg_folders = [i for i in folders if neg_name_init in os.path.basename(os.path.normpath(i))]

# Number of crop files of the selected modality (file_ext) in each folder.
# `folders` is globbed with a trailing "/", so folder + "*" matches inside it.
pos_crops_files = [len(glob(folder+"*"+file_ext)) for folder in pos_folders]
neg_crops_files = [len(glob(folder+"*"+file_ext)) for folder in neg_folders]

In [None]:
# Display the per-folder crop counts of both classes.
pos_crops_files, neg_crops_files

In [None]:
# Total number of crop files per class, summed over all of its folders.
tot_pos = np.sum(pos_crops_files)
tot_neg = np.sum(neg_crops_files)

In [None]:
# Share of each class's crop files used to size the eval split below.
# Despite the name this is a fraction (0.1 = 10%), not a percentage.
eval_min_percentage = 0.1

In [None]:
# Crop-file counts per class that correspond to eval_min_percentage
# (truncated towards zero by int()); printed for inspection.
eval_num_pos = int(eval_min_percentage*tot_pos)
eval_num_neg = int(eval_min_percentage*tot_neg)

print(eval_num_pos, eval_num_neg)

In [None]:
def _shuffled_folder_table(folder_paths, file_counts):
    """Return a (folder, n_files) table with its rows in a seeded random order."""
    table = pd.DataFrame({"folder": folder_paths, "n_files": file_counts})
    shuffled = table.sample(len(folder_paths), random_state=0)
    return shuffled.reset_index(drop=True)


# One shuffled table per class; random_state=0 keeps the order reproducible.
pos_folders_rand = _shuffled_folder_table(pos_folders, pos_crops_files)
neg_folders_rand = _shuffled_folder_table(neg_folders, neg_crops_files)

In [None]:
# Display the shuffled table of positive-class folders.
pos_folders_rand

In [None]:

# Placeholder only: rebound to the result of get_part_files(...) in a later cell.
eval_files_neg = []

In [None]:
def get_part_files(all_folders_df, frac, ext=None):
    """Take whole folders off the top of a folder table until enough files are collected.

    Folders are consumed from the first row onwards; every file in a consumed
    folder that matches ``ext`` is added to the result, so the result can
    exceed the requested share.

    NOTE: ``all_folders_df`` is modified in place. Consumed rows are removed
    and the index is reset, so consecutive calls on the same table hand out
    disjoint folders. ``frac`` is relative to the files still listed in the
    table when the call is made.

    Parameters
    ----------
    all_folders_df : pandas.DataFrame
        Table with a ``folder`` column (folder paths) and an ``n_files``
        column (file count per folder).
    frac : float
        Share of the table's current ``n_files`` total to collect at least.
    ext : str, optional
        File extension (with leading dot) to collect. Defaults to the
        notebook-level ``file_ext``.

    Returns
    -------
    list of str
        Paths of the collected files.
    """
    if ext is None:
        ext = file_ext
    tot_files = all_folders_df.n_files.sum()
    min_num_files = int(frac*tot_files)
    chosen_files_list = []
    # Also stop once the table is empty: if n_files overstates what is on
    # disk the target is unreachable and iloc[0] would raise IndexError.
    while len(chosen_files_list) < min_num_files and len(all_folders_df) > 0:
        folder = all_folders_df.iloc[0].folder
        # Drop by the first row's actual label rather than assuming it is 0.
        all_folders_df.drop(all_folders_df.index[0], inplace=True)
        all_folders_df.reset_index(inplace=True, drop=True)
        chosen_files_list += glob(folder+"/*"+ext)
    return chosen_files_list

# Build the eval / test / train splits folder by folder. Statement order
# matters: get_part_files removes the folders it hands out from the table it
# is given, so each later call only sees what is left. The shape printouts
# show the positive table shrinking after each step.
print(pos_folders_rand.shape)
# Eval split: whole folders covering at least 10% of the files in each table.
eval_files_pos = get_part_files(pos_folders_rand, 0.1)
eval_files_neg = get_part_files(neg_folders_rand, 0.1)
print(pos_folders_rand.shape)

# Test split: at least 10% of the files still listed after the eval split
# (the fraction is applied to the remaining table, not the original total).
test_files_pos = get_part_files(pos_folders_rand, 0.1)
test_files_neg = get_part_files(neg_folders_rand, 0.1)

print(pos_folders_rand.shape)

# Train split: frac=1 requests all files that are still listed.
train_files_pos = get_part_files(pos_folders_rand, 1)
train_files_neg = get_part_files(neg_folders_rand, 1)

print(pos_folders_rand.shape)

In [None]:
# Display the eval and test split sizes per class.
len(eval_files_pos), len(eval_files_neg), len(test_files_pos), len(test_files_neg)

In [None]:
def get_df_pos_neg_files(list_neg, list_pos):
    """Build a labelled file table from a negative and a positive file list.

    The result has the columns ``filepath`` and ``label``. Negative files
    come first with label ``False``, followed by the positive files with
    label ``True``.
    """
    filepaths = list_neg + list_pos
    labels = [False for _ in list_neg] + [True for _ in list_pos]
    return pd.DataFrame({"filepath": filepaths, "label": labels})

In [None]:
# One labelled file table per split (negatives first, then positives).
df_eval = get_df_pos_neg_files(eval_files_neg, eval_files_pos)
df_train = get_df_pos_neg_files(train_files_neg, train_files_pos)
df_test = get_df_pos_neg_files(test_files_neg, test_files_pos)

In [None]:
# Write each split table into data_dir as a CSV without the index column.
for csv_suffix, split_df in (('/train.csv', df_train),
                             ('/test.csv', df_test),
                             ('/eval.csv', df_eval)):
    split_df.to_csv(data_dir + csv_suffix, index=False)