# Image cropping

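Run this fragment with different values to generate different datasets (run the cells under Setup and Scripts first, since they define `crop_images` and the constants it reads):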

In [None]:
# Directory searched by crop_images for
# 'SegmentationClassVisualization_filtered/*/*.jpg' and the matching
# image, PNG label and .npy label files.
input_dir = '/home/rafael/deep_learning_sedimentos/tagged_data'
# Base name of the directory the crops are written to.
output_dir = '/home/rafael/deep_learning_sedimentos/cropped_data_filtered'
# Side length of the square crop window, in array elements.
size = 300
# Stride between consecutive windows along both axes.
step = 40
crop_images(input_dir, output_dir, size, step)

Run this fragment to create train/test splits:

In [None]:
# Directory holding one sub-directory of '*.jpg' crops per class; a 'splits'
# sub-directory is created inside it.
# NOTE(review): presumably the directory produced by crop_images for
# size=300, step=40 - confirm it matches the name actually created.
splits_output_dir = '/home/rafael/deep_learning_sedimentos/cropped_data_filtered 300x300 40'
train_test_splits(splits_output_dir)

## Setup

Define the thresholds and other constants below:

In [None]:
# Maps the integer values found in the label arrays to class names. The names
# are also used as output sub-directory names and as keys of class_threshold.
label_classes = {
    1: 'packing voids',
    2: 'vesicles',
    3: 'channels',
    4: 'chambers',
    5: 'vughs',
    6: 'planes'
}
# A window with no relevant class is labelled 'background' only when the
# fraction of its elements carrying any label above is below this value.
bw_thresh = 0.05
# NOTE(review): not referenced by the code visible in this notebook; the
# per-class values in class_threshold are the ones actually used - confirm
# whether this is still needed.
class_thresh = 0.15
# Per-class minimum fraction of the window's central core that must carry the
# class label for the class to be considered relevant (see get_relevant_class).
class_threshold = {
    'packing voids': 0.8,
    'vesicles': 0.05,
    'channels': 0.15,
    'chambers': 0.15,
    'vughs': 0.15,
    'planes': 0.05
}

## Scripts

In [None]:
import argparse
import glob
import math
import os
import os.path as osp
import sys
from shutil import copyfile

import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


def save_window(output_dir, output_file, img_window, label_window):
    """Write an image crop and its label crop next to each other on disk.

    The image crop is written to ``output_dir/output_file``; the label crop
    is written to the same path with the file extension replaced by ``.png``.

    Args:
        output_dir: Directory the files are written under.
        output_file: File name of the image crop, optionally prefixed with
            sub-directories relative to ``output_dir``.
        img_window: Image crop handed to ``cv2.imwrite``.
        label_window: Label crop handed to ``cv2.imwrite``.
    """
    image_path = osp.join(output_dir, output_file)
    # Swap only the extension. A plain str.replace('jpg', 'png') would also
    # rewrite any 'jpg' that appears in a directory name or file stem, and
    # would leave the path unchanged (overwriting the image with the label)
    # when the name contains no 'jpg' at all.
    label_path = osp.splitext(image_path)[0] + '.png'
    cv2.imwrite(image_path, img_window)
    cv2.imwrite(label_path, label_window)

    
def get_relevant_class(window):
    """Choose the class name that best describes a label window.

    Only the central core of the window (the middle third along both axes,
    with the third computed from the first axis) votes for a class. A class
    qualifies when its element count in the core exceeds its fraction in
    ``class_threshold``; among qualifying classes the one with the largest
    core count wins, the earliest one in ``label_classes`` on ties.

    If no class qualifies, the window is called ``'background'`` provided the
    labelled elements of the whole window make up less than ``bw_thresh`` of
    it; otherwise ``None`` is returned.

    Args:
        window: Array of integer class labels (indexed like a 2-D array).

    Returns:
        A class name from ``label_classes``, ``'background'``, or ``None``.
    """
    third = int(window.shape[0] / 3)
    core = window[third:third * 2, third:third * 2]

    best_name = None
    best_count = 0
    labelled_total = 0

    for label_id, label_name in label_classes.items():
        core_count = (core == label_id).sum()
        # Labelled elements are tallied over the full window, not the core.
        labelled_total = (window == label_id).sum() + labelled_total
        if core_count <= core.size * class_threshold[label_name]:
            continue
        if core_count > best_count:
            best_name = label_name
            best_count = core_count

    if best_name is not None:
        return best_name
    if labelled_total < window.size * bw_thresh:
        return 'background'
    return None
    

def crop_images(input_dir, output_dir, size, step):
    """Slide a square window over every tagged image and save labelled crops.

    For each ``SegmentationClassVisualization_filtered/*/*.jpg`` file under
    ``input_dir`` the companion files are derived from its path: the label
    array (``SegmentationClass/.../*.npy``), the label image
    (``SegmentationClassPNG/.../*.png``) and the photo
    (``JPEGImages/.../*.jpg``). Every window for which
    ``get_relevant_class`` returns a class is written with ``save_window``
    into a sub-directory named after that class.

    Crops go to ``"<output_dir> <size>x<size> <step>"``; if that directory
    already exists, ``(1)``, ``(2)``, ... is appended until a free name is
    found.

    Args:
        input_dir: Root directory of the tagged data.
        output_dir: Base name of the output directory (see above).
        size: Side length of the square window.
        step: Stride between consecutive windows along both axes.
    """
    base_output_dir = '{0} {1}x{1} {2}'.format(output_dir, size, step)
    # Start from the suffixed name. Testing the bare `output_dir` first meant
    # the "<dir> <size>x<size> <step>" directory itself was never created.
    output_dir = base_output_dir
    attempt = 1
    while osp.exists(output_dir):
        print('Output directory already exists:', output_dir)
        output_dir = '{0}({1})'.format(base_output_dir, attempt)
        attempt += 1
    os.makedirs(output_dir)

    # Index 0 counts 'background'; index i + 1 counts the i-th label class.
    counter_names = ['background'] + list(label_classes.values())
    class_counters = [0] * len(counter_names)
    for class_name in counter_names:
        os.makedirs(osp.join(output_dir, class_name), exist_ok=True)

    pattern = osp.join(input_dir, 'SegmentationClassVisualization_filtered/*/*.jpg')
    for class_file in glob.glob(pattern):
        print('Opening:' + class_file)
        # Work on the extension-less path so that only the real extension is
        # swapped, never a 'jpg'/'png' substring elsewhere in the path.
        stem = osp.splitext(class_file)[0]
        seg_stem = stem.replace('ClassVisualization_filtered', 'ClassPNG')
        npy_file = stem.replace('ClassVisualization_filtered', 'Class') + '.npy'
        seg_file = seg_stem + '.png'
        img_file = seg_stem.replace('SegmentationClassPNG', 'JPEGImages') + '.jpg'

        # Check all three files before loading anything.
        missing = next(
            (path for path in (img_file, seg_file, npy_file) if not osp.isfile(path)),
            None)
        if missing is not None:
            print('File not found: ' + missing)
            continue

        img = cv2.imread(img_file)
        seg_img = cv2.imread(seg_file)
        # cv2.imread returns None instead of raising on unreadable files.
        if img is None or seg_img is None:
            print('Could not read image: ' + (img_file if img is None else seg_file))
            continue
        img_array = np.load(npy_file)

        base_name = osp.splitext(osp.basename(seg_file))[0]
        saved = 0
        # "+ 1" keeps the window that ends exactly on the image border; without
        # it an image of exactly `size` elements would produce no window.
        for x in range(0, img_array.shape[0] - size + 1, step):
            for y in range(0, img_array.shape[1] - size + 1, step):
                window = img_array[x:x + size, y:y + size]
                relevant_class = get_relevant_class(window)
                if not relevant_class:
                    continue
                saved += 1
                output_file = '{0}_{1}.jpg'.format(base_name, saved)
                img_window = img[x:x + size, y:y + size]
                label_window = seg_img[x:x + size, y:y + size]
                save_window(output_dir, osp.join(relevant_class, output_file),
                            img_window, label_window)
                class_counters[counter_names.index(relevant_class)] += 1

        print('Saved:', saved, 'windows')
    print(class_counters)


def train_test_splits(output_dir, train_total=1500, test_total=1500, random_state=None):
    """Copy a random train/test selection of each class into ``splits/``.

    For ``'background'`` and every class in ``label_classes``, the ``*.jpg``
    files in ``output_dir/<class>`` are split with
    ``sklearn.model_selection.train_test_split`` and copied to
    ``output_dir/splits/train/<class>`` and ``output_dir/splits/test/<class>``.
    Nothing is done if ``output_dir/splits`` already exists.

    Args:
        output_dir: Directory containing one sub-directory of crops per class.
        train_total: Number of files copied to the train split, per class.
        test_total: Maximum number of files copied to the test split, per
            class; fewer are used when the class does not have enough files.
        random_state: Optional seed forwarded to ``train_test_split`` to make
            the selection reproducible.

    Raises:
        ValueError: If a class has no more than ``train_total`` files, i.e.
            nothing would be left for its test split.
    """
    splits_dir = osp.join(output_dir, 'splits')
    if osp.exists(splits_dir):
        print('splits directory already exist')
        return

    class_names = ['background'] + list(label_classes.values())

    # Validate every class before touching the disk. Failing half-way would
    # leave a partial 'splits' directory that makes every later call a no-op.
    files_by_class = {}
    for class_name in class_names:
        # Sorted so that a fixed random_state gives the same split each run.
        class_files = sorted(glob.glob(osp.join(output_dir, class_name, '*.jpg')))
        if len(class_files) <= train_total:
            raise ValueError(
                "Class '{0}' has {1} files; more than train_total={2} are needed "
                "to leave at least one file for the test split".format(
                    class_name, len(class_files), train_total))
        files_by_class[class_name] = class_files

    print('Creating Splits Dir: ' + splits_dir)
    for class_name in class_names:
        os.makedirs(osp.join(splits_dir, 'test', class_name))
        os.makedirs(osp.join(splits_dir, 'train', class_name))

        class_files = files_by_class[class_name]
        X_train, X_test = train_test_split(
            class_files,
            train_size=train_total,
            test_size=min(len(class_files) - train_total, test_total),
            random_state=random_state)

        print('Class: ' + class_name)
        for title, subset, selected in (('Train split:', 'train', X_train),
                                        ('Test split:', 'test', X_test)):
            print(title)
            for src in selected:
                new_file_path = osp.join(splits_dir, subset, class_name, osp.basename(src))
                print('Copying: {0} -> {1}'.format(src, new_file_path))
                copyfile(src, new_file_path)

In [None]:
def kfold(data_path, splits_path, n_folds, max_class_size=1500):
    """Distribute the class images of ``data_path`` over k cross-validation folds.

    Up to ``max_class_size`` randomly chosen ``*.jpg`` files are taken from
    each class sub-directory of ``data_path``. The pooled files are split
    with ``sklearn.model_selection.KFold`` (shuffled, ``random_state=0``) and
    copied to ``splits_path/fold_<k>/training_set/<class>`` and
    ``splits_path/fold_<k>/validation_set/<class>`` for ``k = 1..n_folds``.

    Note that the per-class sub-sampling uses NumPy's global random state, so
    it is only reproducible if the caller seeds ``np.random`` beforehand.

    Args:
        data_path: Directory containing one sub-directory of images per class.
        splits_path: Directory under which the ``fold_<k>`` trees are created.
        n_folds: Number of folds.
        max_class_size: Maximum number of files used per class.

    Raises:
        ValueError: If fewer than ``n_folds`` files were found in total.
        FileExistsError: If a fold directory already exists.
    """
    class_label = {
        0: 'background',
        1: 'packing voids',
        2: 'vesicles',
        3: 'channels',
        4: 'chambers',
        5: 'vughs',
        6: 'planes'
    }

    # Collect into plain lists and convert once. Growing float np.empty(0)
    # arrays with np.append relied on implicit number-to-string promotion for
    # the paths and turned the integer labels into floats.
    file_paths = []
    labels = []
    for class_num, class_name in class_label.items():
        class_files = glob.glob(osp.join(data_path, class_name, '*.jpg'))
        np.random.shuffle(class_files)
        selected = class_files[:max_class_size]
        file_paths.extend(selected)
        labels.extend([class_num] * len(selected))

    # Fail before creating any directory; KFold would reject this input anyway.
    if len(file_paths) < n_folds:
        raise ValueError(
            'Found {0} files under {1}; at least n_folds={2} are required'.format(
                len(file_paths), data_path, n_folds))

    X = np.array(file_paths)
    Y = np.array(labels, dtype=int)

    kf = KFold(n_splits=n_folds, random_state=0, shuffle=True)
    subsets = ('training_set', 'validation_set')

    # Create the fold directories.
    for fold in range(1, n_folds + 1):
        for class_name in class_label.values():
            for subset in subsets:
                os.makedirs('{0}/fold_{1}/{2}/{3}'.format(
                    splits_path, fold, subset, class_name))

    # Copy the files into the folds.
    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        for subset, indices in zip(subsets, (train_index, test_index)):
            for path, label in zip(X[indices], Y[indices]):
                copyfile(path, '{0}/fold_{1}/{2}/{3}/{4}'.format(
                    splits_path, fold, subset, class_label[int(label)],
                    osp.basename(path)))

In [None]:
# Build 5 cross-validation folds from the train split; the first path is read
# (one sub-directory per class), the second is where the fold_<k> directories
# are created.
kfold('/home/rafael/deep_learning_sedimentos/cropped_data_filtered 300x300 40/splits/train',
      '/home/rafael/deep_learning_sedimentos/cropped_data_filtered 300x300 40/splits/folds', 5)