In [1]:
##################################################################################################
# Patch-based Classification of Breast Cancer Histology Images using CNNs
# LE48: MiniProject
# Jan Ondras (jo356), Trinity College
# 2017/2018
##################################################################################################
######################################################################################
# From non stained images, create patches from training data (train & validation folds), DONE
# ONLY FOR BEST PATCH GENERATION METHOD
######################################################################################

path_prefix = './../Dataset/ICIAR2018_BACH_Challenge/' #'/media/jo/86011c15-cf21-41e3-9ac2-f1045d4c589a/'

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import cv2
import glob
import os
import tifffile as tiff

stain_normalized = False
if stain_normalized:
    path = path_prefix + 'Photos_SN/'
else:
    path = path_prefix + 'Photos/'

# Image size (assume same for all images)
img_width =  2048
img_height = 1536
pix_scale = 0.42 # micrometers
    
# Number of examples per class (train, test) assuming balanced set
N_imgs_per_class = {}

###################################################################################### TO SET
# Create patches
patch_width = 512
patch_height = patch_width
patch_stride = 256
######################################################################################
data_type = '_' + str(patch_width) + '_' + str(patch_stride)
print data_type

N_patches_x = (((img_width - patch_width) / patch_stride) + 1)
N_patches_y = (((img_height - patch_height) / patch_stride) + 1)
N_patches_per_img = N_patches_x * N_patches_y
print "Number of patches per image: ", N_patches_per_img

# Load dataset
classes = ['Normal', 'Benign', 'InSitu', 'Invasive'] # correspond to labels 0,1,2,3 in this order
imgs = {}
imgs['train'] = {}
imgs['validation'] = {}
# if not os.path.exists('./../Dataset/patches/'):
os.mkdir(path + 'patches' + data_type + '/')
if not os.path.exists(path + 'patches' + data_type + '/train/'):
    os.mkdir(path + 'patches' + data_type + '/train/')
if not os.path.exists(path + 'patches' + data_type + '/validation/'):
    os.mkdir(path + 'patches' + data_type + '/validation/')

B_mean = 0.
G_mean = 0.
R_mean = 0.

for c in classes:
    if not os.path.exists(path + 'patches' + data_type + '/train/' + c):
        os.mkdir(path + 'patches' + data_type + '/train/' + c)
    if not os.path.exists(path + 'patches' + data_type + '/validation/' + c):
        os.mkdir(path + 'patches' + data_type + '/validation/' + c)
    # Train-validation split
    imgs['train'][c], imgs['validation'][c] = train_test_split(
        glob.glob(path + c + '/*.tif'), random_state=32, train_size=0.75)
    print "Class ",c, " train: ", len(imgs['train'][c]), ", validation: ", len(imgs['validation'][c])
    # Generate patches
    for set_type in ['train', 'validation']:
        print set_type
        for img_name in imgs[set_type][c]:
#             img = cv2.imread(img_name)
            img = tiff.imread(img_name)
            
            patch_ID = 0
            for i in range(N_patches_x):
                for j in range(N_patches_y):
                    patch = img[j*patch_stride:j*patch_stride + patch_height, i*patch_stride:i*patch_stride + patch_width, :]
#                     cv2.imwrite(path + 'patches/' + set_type + '/' + c + '/' + img_name.split('/')[-1][:-4] + '_{:04d}.tif'.format(patch_ID), 
#                                patch)
                    tiff.imsave(path + 'patches' + data_type + '/' + set_type + '/' + c + '/' + img_name.split('/')[-1][:-4] + '_{:04d}.tif'.format(patch_ID), 
                               patch)
                    patch_ID += 1
        
            if set_type == 'train':
#                         B_mean += np.mean(patch[:,:,0])
#                         G_mean += np.mean(patch[:,:,1])
#                         R_mean += np.mean(patch[:,:,2])
                B_mean += np.mean(img[:,:,2])
                G_mean += np.mean(img[:,:,1])
                R_mean += np.mean(img[:,:,0])
                    
#                     if set_type == 'train':
# #                         B_mean += np.mean(patch[:,:,0])
# #                         G_mean += np.mean(patch[:,:,1])
# #                         R_mean += np.mean(patch[:,:,2])
#                         B_mean += np.mean(patch[:,:,2])
#                         G_mean += np.mean(patch[:,:,1])
#                         R_mean += np.mean(patch[:,:,0])
                
N_train_imgs_total = len(imgs['train']['Normal']) + len(imgs['train']['Benign']) + len(imgs['train']['InSitu']) + len(imgs['train']['Invasive'])
N_val_imgs_total = len(imgs['validation']['Normal']) + len(imgs['validation']['Benign']) + len(imgs['validation']['InSitu']) + len(imgs['validation']['Invasive'])
B_mean /=  (N_patches_per_img * N_train_imgs_total )
G_mean /=  (N_patches_per_img * N_train_imgs_total )
R_mean /=  (N_patches_per_img * N_train_imgs_total )

print "RGB means: ", R_mean, ",", G_mean, ",", B_mean
print "Total patches: ", N_patches_per_img * (N_train_imgs_total + N_val_imgs_total)
print "Training patches: ", N_patches_per_img * N_train_imgs_total
print "Validation patches: ", N_patches_per_img * N_val_imgs_total

_512_256
Number of patches per image:  35
Class  Normal  train:  75 , validation:  25
train




validation
Class  Benign  train:  75 , validation:  25
train
validation
Class  InSitu  train:  75 , validation:  25
train
validation
Class  Invasive  train:  75 , validation:  25
train
validation
RGB means:  45354.9621101 , 39148.4015205 , 53026.7132272
Total patches:  14000
Training patches:  10500
Validation patches:  3500


In [None]:
just a small mistake in RGBs calculated above; only divide by 35