In [1]:
import tensorflow

In [2]:
import scipy.io 
import matplotlib.pyplot as plt
import cv2
import keras
from glob import glob
import numpy as np
from tqdm import tqdm
import os
from PIL import Image
import pandas as pd

from keras.applications import mobilenet #, vgg16, inception_v3, resnet50, 
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint

Using TensorFlow backend.


## Params

In [3]:
# normal_paths = glob('E:/Datasets/PathoBarIlan/Case8/Normal*.mat')
# cancer_paths = glob('E:/Datasets/PathoBarIlan/Case8/Cancer*.mat')
# mixed_paths = glob('E:/Datasets/PathoBarIlan/Case8/Mixed*.mat')

# data_dir = 'E:/Datasets/PathoBarIlan/Case8'
# all_data_dir = 'E:/Datasets/PathoBarIlan/Shlomi2018/'
all_data_dir = '/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018'
is_relative_path_csv = False


pos_name_init = 'Cancer'
neg_name_init = 'Normal'

use_rgb = True # True=rgb, False=spectral
if use_rgb:
    file_ext = '.png'
else:
    file_ext = '.npy'
    
window_size = (200, 200)
shift = (100, 100)

## utils

In [4]:
def read_slide(path):
    mat = scipy.io.loadmat(path)
    spectral = mat["Spec"]
    rgb = mat["Section"]
    shape = rgb.shape
    
    return spectral, rgb

In [5]:
def create_batch_of_crops_from_slide(img, window_size, shift, vis_flag=False):
    crops = []

    n_iter_x = (img.shape[1]-window_size[0])//shift[0] + 1

    n_iter_y = (img.shape[0]-window_size[1])//shift[1] + 1

#     n_iter_x, n_iter_y

    for i in range(n_iter_x):
        for j in range(n_iter_y):
            init_y = i*shift[0]
            init_x = j*shift[1]
        
            crops.append(img[init_x:init_x+window_size[0], init_y:init_y+window_size[1], :])
    if vis_flag:
        visualize_batch_of_crops(crops, n_iter_y, n_iter_x)
    return crops

In [6]:
def visualize_batch_of_crops(crops, n_iter_y, n_iter_x):
    fig, axes = plt.subplots(n_iter_y, n_iter_x, figsize=(5, 5), gridspec_kw = {'wspace':0, 'hspace':0})

    for i in range(n_iter_x):
        for j in range(n_iter_y):
            axes[j, i].imshow(crops[i*n_iter_y + j])
            axes[j, i].axis('off')
            axes[j, i].set_aspect('equal')
    plt.show()

In [7]:
def create_crops_from_fileslist(fileslist, window_size, shift):
    rgb_crops = []
    spectral_crops = []
    labels = []

    for file in tqdm(fileslist):
#         file_name = os.path.basename(file)
#         print('Saving crops for file {} ...'.format(file_name))
#         print(file)
        spectral, rgb = read_slide(file)
        spectral_crops = create_batch_of_crops_from_slide(spectral, window_size=window_size, shift=shift)
        rgb_crops = create_batch_of_crops_from_slide(rgb, window_size=window_size, shift=shift)
#         if pos_name_init in file_name:
#             labels += [True]*len(added_rgb_crops)
#         elif neg_name_init in file_name:
#             labels += [False]*len(added_rgb_crops)
#         else:
#             raise ValueError('File {} is not in the right format ({}-pos, {}-neg)'.format(file_name, pos_name_init, neg_name_init))
#         print(labels)
        save_dir = file.replace('.mat', '_win{}-{}_shift{}-{}'.format(window_size[0], window_size[1], shift[0], shift[1]))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
#         np.save(save_dir+'/Spectral_crops.npy', spectral_crops)
#         np.save(file.replace('.mat', '_RGB_win{}-{}_shift{}-{}.npy'.format(window_size[0], window_size[1], shift[0], shift[1])), rgb_crops)
        for idx, (im_np, spec_np) in enumerate(zip(rgb_crops, spectral_crops)):
            im = Image.fromarray(im_np)
            im.save(os.path.join(save_dir, '{:05}.png'.format(idx)))
            np.save(os.path.join(save_dir, '{:05}.npy'.format(idx)), spec_np)
    
#     out_labels = to_categorical(labels)
#     out_sepc = np.stack(spectral_crops)
#     out_rgb = np.stack(rgb_crops)
    
    
#     return out_sepc, out_rgb, out_labels

In [8]:
def create_crops_from_dir(dir_path, window_size, shift):
    print('Saving crops for slides in dir: {}'.format(dir_path))
    fileslist = glob(dir_path + '/*.mat')
    create_crops_from_fileslist(fileslist, window_size, shift)
#     return spectral_crops, rgb_crops, labels

In [9]:
def create_csv_for_folder(data_dir, ext):
    if ext[0] == '.':
        ext = ext[1:]
    data_df = pd.DataFrame(columns=['filename', 'label'])
    files = glob(os.path.join(data_dir,'*', '*.{}'.format(ext)))
    files = [file for file in files if "Mixed" not in file]
#     print(data_dir+'/*/*.{}'.format(ext))
    
    init_len = len(data_dir)
    delete_folder = all_data_dir
    if not is_relative_path_csv:
        delete_folder = '/'
    if not delete_folder[-1] == '/':
        delete_folder += '/'
    files = [file.replace(delete_folder, '/') for file in files]
#     print(files)
    labels = [1 if pos_name_init in file else 0 for file in files]
#     print(labels)
    data_df['filename'] = files
    data_df['label'] = labels
#     data_df.to_csv(os.path.join(data_dir, os.path.basename(data_dir)+'.csv'), index=False)
#     print('Created CSV successfully for folder {}'.format(data_dir))
    
    return data_df    

In [10]:
slides = glob(os.path.join(all_data_dir, "*/"))

In [11]:
from sklearn.model_selection import KFold
from keras.preprocessing.image import ImageDataGenerator

skf = KFold(n_splits=5, random_state=None, shuffle=True)

train_slides_all = []
test_slides_all = []
val_slides_all = []

for train_index, test_index in skf.split(np.arange(len(slides)).T, np.arange(len(slides)).T):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_slides_all.append(train_index)
    val_slides_all.append([test_index[0]])
    test_slides_all.append([test_index[1]])

TRAIN: [0 1 2 3 4 5 7 9] TEST: [6 8]
TRAIN: [0 1 2 3 5 6 8 9] TEST: [4 7]
TRAIN: [0 2 3 4 5 6 7 8] TEST: [1 9]
TRAIN: [0 1 4 5 6 7 8 9] TEST: [2 3]
TRAIN: [1 2 3 4 6 7 8 9] TEST: [0 5]


In [12]:
i = 3

train_index = train_slides_all[i]
val_index = val_slides_all[i]
test_index = test_slides_all[i]

train_index, val_index, test_index

(array([0, 1, 4, 5, 6, 7, 8, 9]), [2], [3])

In [13]:
def get_dfs_for_indices(slides, index_list):
    dfs = []

    for slide in np.array(slides)[index_list]:
        data_dir = slide
        dfs.append(create_csv_for_folder(data_dir, file_ext))
    return pd.concat(dfs, ignore_index=True)

In [14]:
df_train = get_dfs_for_indices(slides, train_index)
df_test = get_dfs_for_indices(slides, test_index)
df_val = get_dfs_for_indices(slides, val_index)

df_train.to_csv(os.path.join(all_data_dir, 'train_files.csv'), index=False)
df_val.to_csv(os.path.join(all_data_dir, 'val_files.csv'), index=False)
df_test.to_csv(os.path.join(all_data_dir, 'test_files.csv'), index=False)

In [15]:
pd.options.display.max_colwidth = 150

In [16]:
print(len(df_train.index.values))
print(len(set(df_train.index.values)))

print(len(df_train.columns.values))
print(len(set(df_train.columns.values)))

2225
2225
2
2


In [17]:
df_train.head()

Unnamed: 0,filename,label
0,/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018/Case10/Cancer_Case10a1_win200-200_shift100-100/00000.png,1
1,/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018/Case10/Cancer_Case10c1_win200-200_shift100-100/00000.png,1
2,/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018/Case10/Cancer_Case10c1_win200-200_shift100-100/00001.png,1
3,/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018/Case10/Cancer_Case10c1_win200-200_shift100-100/00002.png,1
4,/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018/Case10/Cancer_Case10c1_win200-200_shift100-100/00003.png,1


In [18]:
# df_train = df_train.set_index('filename')

In [19]:
# df_train.head()

In [20]:
# df_train = df_train.reset_index()

In [21]:
# df_train.head()

In [22]:
# df_train.ndim

'00000.png'

In [26]:
datagen=ImageDataGenerator() # rescale=1./255
directory = '/media/leetwito/DATA/Datasets/PathoBarIlan/Shlomi2018'
# df_train = df_train.set_index('filename')
directory = None
# directory = '/'
# directory = all_data_dir
train_generator=datagen.flow_from_dataframe(dataframe=df_train, directory=directory, x_col="filename", y_col="label", has_ext=True, class_mode="binary", batch_size=4) # , interpolation='bilinear'

AttributeError: 'NoneType' object has no attribute 'rfind'

## test and vis

## prepare data

In [None]:
create_crops_from_dir(data_dir, window_size=window_size, shift=shift)

In [None]:
dirs = glob('E:/Datasets/PathoBarIlan/Shlomi2018/*')
for data_dir in dirs:
    create_crops_from_dir(data_dir, window_size=window_size, shift=shift)

##### old prepare data

## build and train model

############

In [None]:
folders = glob(data_dir+"/*/")

In [None]:
pos_folders = [i for i in folders if "Cancer" in i]
neg_folders = [i for i in folders if "Normal" in i]

pos_crops_files = []
for folder in pos_folders:
#     print(folder)
    pos_crops_files.append(len(glob(folder+"*"+file_ext)))
    
neg_crops_files = []
for folder in neg_folders:
#     print(folder)
    neg_crops_files.append(len(glob(folder+"*"+file_ext)))

In [None]:
pos_crops_files, neg_crops_files

In [None]:
tot_pos = np.sum(pos_crops_files)
tot_neg = np.sum(neg_crops_files)

In [None]:
eval_min_percentage = 0.1

In [None]:
eval_num_pos = int(eval_min_percentage*tot_pos)
eval_num_neg = int(eval_min_percentage*tot_neg)

print(eval_num_pos, eval_num_neg)

In [None]:
pos_folders_rand = pd.DataFrame({"folder":pos_folders, "n_files":pos_crops_files}).sample(len(pos_folders), random_state=0).reset_index(drop=True)
neg_folders_rand = pd.DataFrame({"folder":neg_folders, "n_files":neg_crops_files}).sample(len(neg_folders), random_state=0).reset_index(drop=True)

In [None]:
pos_folders_rand

In [None]:

eval_files_neg = []

In [None]:
def get_part_files(all_folders_df, frac):
    tot_files = all_folders_df.n_files.sum()
    min_num_files = int(frac*tot_files)
    chosen_files_list= []
    while len(chosen_files_list) < min_num_files:
        folder = all_folders_df.iloc[0].folder
        all_folders_df.drop(0, inplace=True)
        all_folders_df.reset_index(inplace=True, drop=True)
        chosen_files_list += glob(folder+"/*"+file_ext)
    return chosen_files_list

print(pos_folders_rand.shape)
eval_files_pos = get_part_files(pos_folders_rand, 0.1)
eval_files_neg = get_part_files(neg_folders_rand, 0.1)
print(pos_folders_rand.shape)

test_files_pos = get_part_files(pos_folders_rand, 0.1)
test_files_neg = get_part_files(neg_folders_rand, 0.1)

print(pos_folders_rand.shape)

train_files_pos = get_part_files(pos_folders_rand, 1)
train_files_neg = get_part_files(neg_folders_rand, 1)

print(pos_folders_rand.shape)

In [None]:
len(eval_files_pos), len(eval_files_neg), len(test_files_pos), len(test_files_neg)

In [None]:
def get_df_pos_neg_files(list_neg, list_pos):
    return pd.DataFrame({"filepath":list_neg+list_pos, "label":[False]*len(list_neg)+[True]*len(list_pos)})

In [None]:
df_eval = get_df_pos_neg_files(eval_files_neg, eval_files_pos)
df_train = get_df_pos_neg_files(train_files_neg, train_files_pos)
df_test = get_df_pos_neg_files(test_files_neg, test_files_pos)

In [None]:
df_train.to_csv(data_dir+'/train.csv', index=False)
df_test.to_csv(data_dir+'/test.csv', index=False)
df_eval.to_csv(data_dir+'/eval.csv', index=False)