In [1]:
import os
import numpy as np
from numpy import random

from random import sample

import pandas as pd 

import skimage
from skimage import io
from skimage.feature import daisy, hog, ORB, local_binary_pattern, SIFT
from skimage.color import label2rgb, rgb2gray
from skimage.transform import resize, rotate, downscale_local_mean

from scipy import ndimage as ndi

from skimage.util import img_as_float
from skimage.filters import gabor_kernel
from skimage.filters import threshold_niblack
from skimage.morphology import convex_hull_image
from skimage.measure import find_contours
from skimage import exposure

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from tqdm.notebook import tqdm

import pickle

import gc

from joblib import Parallel, delayed, parallel_backend, cpu_count
import psutil

from platform import python_version

import multiprocessing as mp
from more_itertools import chunked

import gabor_filters
from  gabor_filters import gabor_filter
from  gabor_filters import gabor_filter_response

import importlib
importlib.reload(gabor_filters)
importlib.reload(gabor_filters.gabor_filter)
importlib.reload(gabor_filters.gabor_filter_response)

from gabor_filters.gabor_filter import GaborFilterBank as gbb
from gabor_filters.gabor_filter_response import GaborFilteredResponseBank as gbfrb






In [3]:
# Record the Python and scikit-image versions used for this run (one per line).
print(python_version(), skimage.__version__, sep="\n")

3.9.16
0.19.3


In [2]:
def crop_image(image, padding=10):
    """Crop a tall grayscale image to the bounding box of its main object.

    Images with fewer than 2000 rows are returned unchanged. Taller images are
    binarized with a Niblack adaptive threshold, the convex hull of the binary
    mask is computed, and the image is cropped to the bounding box of the
    longest hull contour, enlarged by ``padding`` pixels and clipped to the
    image borders.

    Parameters
    ----------
    image : 2-D ndarray
        Grayscale image.
    padding : int, optional
        Margin (in pixels) kept around the bounding box. Defaults to 10.

    Returns
    -------
    ndarray
        The cropped view of ``image``; ``image`` itself when it is shorter than
        2000 rows or when no contour can be found.
    """
    img_height = image.shape[0]
    if img_height < 2000:
        return image

    # Adaptive thresholding.
    # NOTE(review): the mask keeps pixels brighter than the local threshold --
    # confirm this matches the object/background polarity of the scans.
    thresh_niblack = threshold_niblack(image, window_size=25, k=0.8)
    binary_niblack = image > thresh_niblack

    # The mask is zero-padded before the hull is computed, so every coordinate
    # derived from the hull lives in the padded frame.
    hull_pad = 3
    chull = convex_hull_image(
        np.pad(binary_niblack, hull_pad, 'constant', constant_values=0))

    # Find the contours of the hull; nothing to crop to if there are none.
    contours = find_contours(chull, 0.5)
    if not contours:
        return image

    # The longest contour is assumed to outline the main object.
    largest_contour = max(contours, key=len)

    # Bounding box of that contour, shifted back from the padded frame to the
    # coordinates of the original image (previously the crop was displaced by
    # hull_pad pixels towards the bottom-right).
    min_row, min_col = np.min(largest_contour, axis=0) - hull_pad
    max_row, max_col = np.max(largest_contour, axis=0) - hull_pad

    # Enlarge the box by the margin and clip it to the image.
    crop_min_row = int(max(min_row - padding, 0))
    crop_min_col = int(max(min_col - padding, 0))
    crop_max_row = int(min(max_row + padding, image.shape[0]))
    crop_max_col = int(min(max_col + padding, image.shape[1]))

    return image[crop_min_row:crop_max_row, crop_min_col:crop_max_col]

def image_generator(filepaths):
    """Lazily yield every file in ``filepaths`` loaded as a grayscale image.

    Each image is read with ``io.imread(..., as_gray=True)`` only when the
    consumer asks for it, so at most one batch of images is held by the caller.
    """
    yield from (io.imread(path, as_gray=True) for path in filepaths)

def extract_texture_features(image):
    """Compute Gabor texture features for one grayscale image.

    The image is cropped with ``crop_image``, filtered with a Gabor filter bank
    sized to the cropped image, and the mean and standard deviation of the
    magnitude of every filtered response are returned.

    Returns
    -------
    ndarray of shape (2 * n_responses,)
        Interleaved as ``[mean_0, std_0, mean_1, std_1, ...]``.
    """
    img = crop_image(image)

    # Gabor filter bank parameters
    fmax = 0.327     # maximum frequency
    k = np.sqrt(2)   # frequency ratio between consecutive filter frequencies
    p = 0.5          # crossing point between two consecutive filters
    u = 6            # number of frequencies
    v = 8            # number of orientations
    gamma = 0.5      # smoothing parameter
    eta = 0.5        # smoothing parameter
    row, col = img.shape[0], img.shape[1]   # filters are built at the image size

    bank = gbb().create_a_set_of_gabor_filters(fmax, k, p, u, v, row, col, gamma, eta)

    # Filter the image with the bank, then stack the responses along axis 2
    responses = gbfrb().create_a_set_of_Gabor_filtered_responses(img, bank)
    response_stack = gbfrb().convert_a_set_Gabor_filtered_responses_to_ndarray(responses)

    # Two statistics per response: mean and std of its magnitude
    n_responses = response_stack.shape[2]
    textureFeatures = np.zeros(n_responses * 2)
    for i in range(n_responses):
        magnitude = np.abs(response_stack[:, :, i])
        textureFeatures[2 * i] = np.mean(magnitude)
        textureFeatures[2 * i + 1] = np.std(magnitude)

    # Release the large intermediates before returning
    del response_stack, responses, bank
    gc.collect()

    return textureFeatures

## 4.2. main()

### 4.2.1. For fold 1
#### 1. Read path of fold 1 file

In [4]:
# Load the fold-1 training and validation tables (image paths, labels, short file names, class ids).
dfFoldTraining_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-training-fold_1.csv')
dfFoldValidation_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-validation-fold_1.csv')

In [5]:
# Preview the first rows and the (rows, columns) shape of the training table.
display(dfFoldTraining_1.head(5), dfFoldTraining_1.shape)

Unnamed: 0,filenames,labels,short_filenames,cls
0,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0001-aggregates.png,0
1,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002.png,0
2,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0003-aggregates.png,0
3,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004-aggregates.png,0
4,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004.png,0


(44099, 4)

#### 2. Extracting gabor feature for the training set

In [6]:
# Inputs and settings for the parallel Gabor extraction over the training fold.
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_train_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_train_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))


Extract Gabor features:   0%|          | 0/44099 [00:00<?, ?it/s]

In [7]:
# Persist the extracted training features with IPython's %store so they survive a kernel restart.
%store gabor_train_list

Stored 'gabor_train_list' (list)


The ‘%store’ command saves the specified variable. Now if we restart the Jupyter Notebook we can recover the variable using the ‘%store -r’ command:

In [6]:
# Restore gabor_train_list saved by `%store` in an earlier session (not needed when it is still in memory).
%store -r gabor_train_list

In [8]:
# Sanity check: one row per training image, one column per Gabor feature.
np.asarray(gabor_train_list).shape

(44099, 96)

In [9]:
# Fit the per-feature standardizer on this fold's training Gabor features only;
# the fitted scaler is reused below for the validation and test features.
std_scale_train = preprocessing.StandardScaler()
std_scale_train.fit(gabor_train_list)

In [9]:
# Standardize the training features with the scaler fitted on them.
X_gabor_std_train = std_scale_train.transform(gabor_train_list)
X_gabor_train_dff = pd.DataFrame(X_gabor_std_train)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_train_df = pd.concat(
    [dfFoldTraining_1["short_filenames"].to_frame(), X_gabor_train_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_train_df.columns = pd.RangeIndex(len(X_gabor_train_df.columns))

display(X_gabor_train_df.head(5), X_gabor_train_df.shape)

# Written without header and without index column.
X_gabor_train_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96-train-fold_1.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0001-aggregates.png,1.136312,-0.120867,1.151158,-0.114632,1.209658,0.191188,1.300448,0.745374,1.352405,...,-0.742402,-1.194496,-0.721887,-1.174843,-0.721007,-1.183975,-0.732128,-1.221598,-0.744075,-1.200731
1,0002.png,0.232402,-0.38877,0.24582,-0.432895,0.242654,-0.45255,0.217668,-0.430446,0.192616,...,0.22055,-0.009426,-0.510338,-0.27412,-0.623958,-0.310437,-0.607235,-0.319016,-0.582036,-0.387699
2,0003-aggregates.png,1.147477,0.747343,1.118156,0.435737,1.116818,0.32163,1.195342,0.754692,1.287895,...,-0.756436,-0.912781,-0.721888,-0.821082,-0.754551,-0.978059,-0.771848,-1.06533,-0.774409,-1.01704
3,0004-aggregates.png,-0.492552,0.76512,-0.528705,0.749952,-0.585569,0.758288,-0.627541,0.739659,-0.632524,...,-1.101012,-0.334478,-1.115239,-0.354489,-0.666226,-0.337154,1.038831,0.43092,2.54254,2.356331
4,0004.png,0.659027,-0.766207,0.69782,-0.732164,0.744554,-0.68764,0.749926,-0.64043,0.692348,...,0.158381,-0.28388,-0.428167,-0.801556,-0.787669,-0.932667,-0.801749,-0.928621,-0.763943,-0.889699


(44099, 97)

#### 3. For validation set

In [10]:
# Inputs and settings for the parallel Gabor extraction over the validation fold.
filepaths = dfFoldValidation_1['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_validation_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_validation_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))

Extract Gabor features:   0%|          | 0/14700 [00:00<?, ?it/s]

In [11]:
# Standardize the validation features with std_scale_train, the scaler fitted above on the
# training features (gabor_train_list); it is not re-fitted on the validation set.
X_gabor_std_validation = std_scale_train.transform(gabor_validation_list)

In [12]:
# Wrap the standardized validation features in a DataFrame.
X_gabor_validation_dff = pd.DataFrame(X_gabor_std_validation)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_validation_df = pd.concat(
    [dfFoldValidation_1["short_filenames"].to_frame(), X_gabor_validation_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_validation_df.columns = pd.RangeIndex(len(X_gabor_validation_df.columns))

display(X_gabor_validation_df.head(5), X_gabor_validation_df.shape)

# Written without header and without index column.
X_gabor_validation_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96_std-validation-fold_1.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0002-aggregates.png,-1.124807,0.84644,-1.138088,0.866766,-1.14904,0.865567,-1.170127,0.882358,-1.165774,...,-0.018641,0.365488,0.46232,0.610102,0.639254,0.854176,0.087541,0.243078,0.7945,0.85687
1,0006-aggregates.png,0.58394,-0.551095,0.573031,-0.542108,0.564279,-0.574851,0.556822,-0.644346,0.56466,...,-0.534824,-0.458842,-0.467645,-0.587663,-0.184629,-0.352692,0.289746,0.295499,0.325833,0.408612
2,001-aggregates-jo_700_05.png,-0.452301,0.901055,-0.440633,0.929559,-0.418416,0.959422,-0.391965,0.993847,-0.382175,...,0.383506,1.067756,0.131348,0.300736,-0.340842,0.092228,-0.424322,0.215652,-0.45019,0.159132
3,0010-aggregates.png,0.553547,-0.551249,0.559949,-0.522796,0.566758,-0.47437,0.565353,-0.468133,0.55187,...,-0.208128,-0.346751,-0.063954,-0.113626,-0.114338,-0.329164,-0.289741,-0.405385,-0.460871,-0.544875
4,0011.png,0.878954,-1.234502,0.870033,-1.165677,0.867735,-1.162138,0.873726,-1.118144,0.914043,...,-0.775883,-1.183196,-0.431509,-0.765326,0.178111,-0.42552,-0.282132,-0.658629,-0.743051,-1.142364


(14700, 97)

#### 4. For test set

<u><b> Remarks :</b></u> We use 4-fold cross-validation, and the features are standardized with a scaler fitted on each fold's training set.
So, for the test set, we also need to compute each kind of feature once per fold, i.e. 4 sets of test features (one per fold).

In [13]:
# Load the table describing the test images (same file for every fold).
dfTest = pd.read_csv('..//_inputs//_images_Zooscan//ZooScan-test_img.csv')

In [14]:
# Inputs and settings for the parallel Gabor extraction over the test set.
# NOTE(review): these raw features do not depend on the fold (same CSV, same extractor);
# only the scaler differs, so they could be computed once and cached on disk.
filepaths = dfTest['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_test_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_test_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))

Extract Gabor features:   0%|          | 0/6907 [00:00<?, ?it/s]

In [15]:
# Standardize the test features with std_scale_train, the scaler fitted above on the
# training features (gabor_train_list); it is not fitted on the test set.
X_gabor_std_test = std_scale_train.transform(gabor_test_list)

In [16]:
# Wrap the standardized test features in a DataFrame.
X_gabor_test_dff = pd.DataFrame(X_gabor_std_test)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_test_df = pd.concat(
    [dfTest["short_filenames"].to_frame(), X_gabor_test_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_test_df.columns = pd.RangeIndex(len(X_gabor_test_df.columns))

display(X_gabor_test_df.head(5), X_gabor_test_df.shape)

# Written without header and without index column.
X_gabor_test_df.to_csv("..//_inputs//_image_features//new//X-gabor_96_std-test-fold_1.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0007-aggregates_002.png,0.697323,-0.808015,0.695685,-0.813086,0.693752,-0.834384,0.683842,-0.861493,0.679107,...,-0.232577,-0.321633,-0.253101,-0.322692,-0.281572,-0.410231,-0.389857,-0.496872,-0.442787,-0.573586
1,0009-aggregates_001.png,1.071954,-1.023312,1.050105,-1.159215,1.0299,-1.328477,1.038799,-1.405424,1.051676,...,-0.654648,-1.13313,-0.63469,-1.126482,-0.661392,-1.095521,-0.674366,-1.084152,-0.634005,-1.1247
2,0012-aggregates_002.png,1.178067,-1.612691,1.175463,-1.60994,1.168327,-1.605932,1.153877,-1.553238,1.146037,...,-0.569331,-1.130898,-0.609587,-1.272601,-0.602741,-1.309572,-0.58447,-1.288972,-0.586553,-1.250737
3,002-aggregates_001.png,0.427048,-0.522704,0.205709,-0.306068,0.020965,-0.079924,-0.031241,-0.103162,-0.010247,...,-1.046277,-0.396232,-0.878527,-0.259588,-0.60644,-0.084636,-0.335342,0.174256,-0.090707,0.027752
4,002-aggregates_007.png,-0.449403,1.010885,-0.441765,1.021145,-0.439187,1.020487,-0.44765,1.00068,-0.452431,...,1.371099,1.642061,0.863731,1.27636,0.259719,0.622112,-0.047238,0.272648,-0.30848,-0.130846


(6907, 97)

### 4.2.2. For fold 2
#### 1. Read path of fold 2 file

In [4]:
# Load the fold-2 training and validation tables.
# NOTE: the `_1` suffix of the variable names is a leftover from the fold-1 cells;
# from here on these frames hold the fold-2 data.
dfFoldTraining_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-training-fold_2.csv')
dfFoldValidation_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-validation-fold_2.csv')

In [5]:
# Preview the first rows and the (rows, columns) shape of the training table.
display(dfFoldTraining_1.head(5), dfFoldTraining_1.shape)

Unnamed: 0,filenames,labels,short_filenames,cls
0,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0001-aggregates.png,0
1,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002-aggregates.png,0
2,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004-aggregates.png,0
3,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004.png,0
4,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0005-aggregates.png,0


(44099, 4)

#### 2. Extracting gabor feature for the training set

In [6]:
# Inputs and settings for the parallel Gabor extraction over the training fold.
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_train_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_train_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))


Extract Gabor features:   0%|          | 0/44099 [00:00<?, ?it/s]

In [7]:
# Cache the raw (not yet standardized) fold-2 training features on disk so the
# next cell can reload them without repeating the extraction.
with open("..//_inputs//_image_features//new//Gabor_train_list_fold2.pkl", "wb") as f:
    pickle.dump(gabor_train_list, f)

In [8]:
# Reload the cached fold-2 Gabor descriptors.
# NOTE: pickle.load can execute arbitrary code; only load cache files you produced yourself.
with open("..//_inputs//_image_features//new//Gabor_train_list_fold2.pkl", "rb") as f:
    gabor_train_list = pickle.load(f)

In [9]:
# Sanity check: one row per training image, one column per Gabor feature.
np.asarray(gabor_train_list).shape

(44099, 96)

In [10]:
# Fit the per-feature standardizer on this fold's training Gabor features only;
# the fitted scaler is reused below for the validation and test features.
std_scale_train = preprocessing.StandardScaler()
std_scale_train.fit(gabor_train_list)

In [11]:
# Standardize the training features with the scaler fitted on them.
X_gabor_std_train = std_scale_train.transform(gabor_train_list)
X_gabor_train_dff = pd.DataFrame(X_gabor_std_train)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_train_df = pd.concat(
    [dfFoldTraining_1["short_filenames"].to_frame(), X_gabor_train_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_train_df.columns = pd.RangeIndex(len(X_gabor_train_df.columns))

display(X_gabor_train_df.head(5), X_gabor_train_df.shape)

# Written without header and without index column.
X_gabor_train_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96-train-fold_2.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0001-aggregates.png,1.129691,-0.124047,1.144618,-0.117983,1.202701,0.185961,1.292399,0.737027,1.343451,...,-0.739288,-1.188459,-0.719762,-1.169782,-0.719358,-1.180457,-0.730345,-1.218094,-0.743436,-1.201409
1,0002-aggregates.png,-1.107306,0.838356,-1.120737,0.858442,-1.131818,0.856913,-1.152594,0.87332,-1.148105,...,-0.022725,0.358513,0.452581,0.601725,0.629564,0.846648,0.084184,0.237671,0.793228,0.854149
2,0004-aggregates.png,-0.481796,0.757448,-0.517714,0.74222,-0.574123,0.750179,-0.615626,0.731341,-0.620493,...,-1.094331,-0.335614,-1.109173,-0.355605,-0.665033,-0.338225,1.02951,0.424369,2.539099,2.352121
3,0004.png,0.657498,-0.766117,0.696011,-0.732387,0.742365,-0.688402,0.747577,-0.64178,0.690372,...,0.152536,-0.285438,-0.428984,-0.799305,-0.785464,-0.930511,-0.799529,-0.926899,-0.763279,-0.890685
4,0005-aggregates.png,-0.072821,0.330003,-0.03339,0.298638,-0.018677,0.305432,-0.045096,0.346729,-0.07143,...,0.322328,0.42091,0.342029,0.239057,0.105608,-0.218088,-0.329904,-0.195986,-0.79639,-0.282771


(44099, 97)

#### 3. For validation set

In [12]:
# Inputs and settings for the parallel Gabor extraction over the validation fold.
filepaths = dfFoldValidation_1['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_validation_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_validation_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))

Extract Gabor features:   0%|          | 0/14700 [00:00<?, ?it/s]

In [13]:
# Standardize the validation features with std_scale_train, the scaler fitted above on the
# training features (gabor_train_list); it is not re-fitted on the validation set.
X_gabor_std_validation = std_scale_train.transform(gabor_validation_list)

In [14]:
# Wrap the standardized validation features in a DataFrame.
X_gabor_validation_dff = pd.DataFrame(X_gabor_std_validation)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_validation_df = pd.concat(
    [dfFoldValidation_1["short_filenames"].to_frame(), X_gabor_validation_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_validation_df.columns = pd.RangeIndex(len(X_gabor_validation_df.columns))

display(X_gabor_validation_df.head(5), X_gabor_validation_df.shape)

# Written without header and without index column.
X_gabor_validation_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96_std-validation-fold_2.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0002.png,0.235424,-0.390593,0.248728,-0.434634,0.24561,-0.454507,0.22083,-0.432857,0.195924,...,0.214087,-0.013273,-0.510332,-0.27584,-0.623117,-0.311653,-0.606235,-0.321004,-0.581598,-0.389185
1,0003-aggregates.png,1.140738,0.739761,1.11196,0.429596,1.110813,0.31574,1.188381,0.746298,1.279623,...,-0.753183,-0.909093,-0.719762,-0.818685,-0.752622,-0.975657,-0.769816,-1.062776,-0.773732,-1.0179
2,0008-aggregates.png,0.353132,-0.010357,0.308424,0.099207,0.290567,0.198189,0.328398,0.21677,0.362437,...,-0.435581,-0.482271,-0.137701,0.018327,-0.406588,-0.675384,-0.192342,-0.277602,0.084087,0.675424
3,001-aggregates_003.png,0.6767,0.432526,0.597933,0.203781,0.510506,0.088347,0.54129,0.287999,0.583936,...,-0.362058,-0.132771,-0.393057,-0.106532,-0.489456,-0.411545,-0.547886,-0.60241,-0.158576,-0.237548
4,001-aggregates_006.png,0.664364,-0.650944,0.657205,-0.714348,0.657228,-0.829398,0.667014,-0.868372,0.673468,...,-0.635805,-0.718787,-0.550992,-0.604219,-0.433179,-0.636334,-0.357542,-0.516403,-0.335287,-0.521678


(14700, 97)

#### 4. For test set

<u><b> Remarks :</b></u> We use 4-fold cross-validation, and the features are standardized with a scaler fitted on each fold's training set.
So, for the test set, we also need to compute each kind of feature once per fold, i.e. 4 sets of test features (one per fold).

In [15]:
# Load the table describing the test images (same file for every fold).
dfTest = pd.read_csv('..//_inputs//_images_Zooscan//ZooScan-test_img.csv')

In [16]:
# Inputs and settings for the parallel Gabor extraction over the test set.
# NOTE(review): these raw features do not depend on the fold (same CSV, same extractor);
# only the scaler differs, so they could be computed once and cached on disk.
filepaths = dfTest['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_test_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_test_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))

Extract Gabor features:   0%|          | 0/6907 [00:00<?, ?it/s]

In [17]:
# Standardize the test features with std_scale_train, the scaler fitted above on the
# training features (gabor_train_list); it is not fitted on the test set.
X_gabor_std_test = std_scale_train.transform(gabor_test_list)

In [18]:
# Wrap the standardized test features in a DataFrame.
X_gabor_test_dff = pd.DataFrame(X_gabor_std_test)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_test_df = pd.concat(
    [dfTest["short_filenames"].to_frame(), X_gabor_test_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_test_df.columns = pd.RangeIndex(len(X_gabor_test_df.columns))

display(X_gabor_test_df.head(5), X_gabor_test_df.shape)

# Written without header and without index column.
X_gabor_test_df.to_csv("..//_inputs//_image_features//new//X-gabor_96_std-test-fold_2.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0007-aggregates_002.png,0.695385,-0.807712,0.693898,-0.812899,0.692084,-0.834401,0.682178,-0.861727,0.677271,...,-0.234533,-0.322877,-0.255672,-0.324047,-0.283586,-0.410907,-0.39022,-0.497777,-0.442522,-0.574887
1,0009-aggregates_001.png,1.06602,-1.021918,1.044619,-1.157275,1.024786,-1.325983,1.033459,-1.402912,1.045902,...,-0.652407,-1.127605,-0.633438,-1.121785,-0.660239,-1.092482,-0.672945,-1.081484,-0.633503,-1.125453
2,0012-aggregates_002.png,1.171001,-1.60831,1.168669,-1.605716,1.161794,-1.602028,1.147346,-1.54998,1.139265,...,-0.567939,-1.125391,-0.608587,-1.266804,-0.602077,-1.305374,-0.583612,-1.285057,-0.58611,-1.251365
3,002-aggregates_001.png,0.427994,-0.523848,0.209036,-0.30845,0.026193,-0.083774,-0.025501,-0.107225,-0.004795,...,-1.040141,-0.396853,-0.874832,-0.261417,-0.605745,-0.087076,-0.336047,0.169268,-0.090879,0.025854
4,002-aggregates_007.png,-0.439108,1.001968,-0.431681,1.012039,-0.429242,1.011046,-0.437598,0.991044,-0.442304,...,1.353194,1.62444,0.849971,1.262967,0.253193,0.615841,-0.04975,0.267061,-0.308382,-0.132586


(6907, 97)

### 4.2.3. For fold 3
#### 1. Read path of fold 3 file

In [4]:
# Load the fold-3 training and validation tables.
# NOTE: the `_1` suffix of the variable names is a leftover from the fold-1 cells;
# from here on these frames hold the fold-3 data.
dfFoldTraining_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-training-fold_3.csv')
dfFoldValidation_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-validation-fold_3.csv')

In [5]:
# Preview the first rows and the (rows, columns) shape of the training table.
display(dfFoldTraining_1.head(5), dfFoldTraining_1.shape)

Unnamed: 0,filenames,labels,short_filenames,cls
0,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0001-aggregates.png,0
1,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002-aggregates.png,0
2,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002.png,0
3,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0003-aggregates.png,0
4,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004-aggregates.png,0


(44099, 4)

#### 2. Extracting gabor feature for the training set

In [6]:
# Inputs and settings for the parallel Gabor extraction over the training fold.
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_train_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_train_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))


Extract Gabor features:   0%|          | 0/44099 [00:00<?, ?it/s]

In [None]:
# Cache the raw (not yet standardized) fold-3 training features on disk so the
# next cell can reload them without repeating the extraction.
with open("..//_inputs//_image_features//new//Gabor_train_list_fold3.pkl", "wb") as f:
    pickle.dump(gabor_train_list, f)

Stored 'gabor_train_list' (list)


In [6]:
# Reload the cached fold-3 Gabor descriptors.
# NOTE: pickle.load can execute arbitrary code; only load cache files you produced yourself.
with open("..//_inputs//_image_features//new//Gabor_train_list_fold3.pkl", "rb") as f:
    gabor_train_list = pickle.load(f)

In [7]:
# Sanity check: one row per training image, one column per Gabor feature.
np.asarray(gabor_train_list).shape

(44099, 96)

In [8]:
# Fit the per-feature standardizer on this fold's training Gabor features only;
# the fitted scaler is reused below for the validation and test features.
std_scale_train = preprocessing.StandardScaler()
std_scale_train.fit(gabor_train_list)

In [9]:
# Standardize the training features with the scaler fitted on them.
X_gabor_std_train = std_scale_train.transform(gabor_train_list)
X_gabor_train_dff = pd.DataFrame(X_gabor_std_train)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_train_df = pd.concat(
    [dfFoldTraining_1["short_filenames"].to_frame(), X_gabor_train_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_train_df.columns = pd.RangeIndex(len(X_gabor_train_df.columns))

display(X_gabor_train_df.head(5), X_gabor_train_df.shape)

# Written without header and without index column.
X_gabor_train_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96-train-fold_3.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0001-aggregates.png,1.136499,-0.123087,1.151453,-0.117025,1.209768,0.188961,1.300024,0.74359,1.351618,...,-0.740267,-1.191119,-0.722403,-1.172184,-0.72172,-1.18236,-0.730698,-1.220511,-0.741962,-1.203153
1,0002-aggregates.png,-1.119299,0.845292,-1.132872,0.865554,-1.143915,0.86422,-1.164898,0.880722,-1.160588,...,-0.02434,0.357026,0.45182,0.601334,0.632453,0.847677,0.087613,0.241048,0.798278,0.858533
2,0002.png,0.234716,-0.391287,0.248062,-0.435671,0.24482,-0.455618,0.219721,-0.433499,0.19458,...,0.212262,-0.015043,-0.512637,-0.277227,-0.625105,-0.312299,-0.606012,-0.31985,-0.579747,-0.388508
3,0003-aggregates.png,1.147638,0.746085,1.118522,0.434006,1.117125,0.319573,1.195158,0.752918,1.287261,...,-0.754149,-0.911542,-0.722403,-0.820688,-0.755114,-0.977263,-0.770352,-1.064576,-0.772329,-1.019098
4,0004-aggregates.png,-0.488532,0.763882,-0.524799,0.748599,-0.581642,0.756802,-0.623553,0.737869,-0.628603,...,-1.094995,-0.337628,-1.112438,-0.357082,-0.667184,-0.33891,1.037327,0.42849,2.548211,2.36097


(44099, 97)

#### 3. For validation set

In [10]:
# Inputs and settings for the parallel Gabor extraction over the validation fold.
filepaths = dfFoldValidation_1['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_validation_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_validation_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))

Extract Gabor features:   0%|          | 0/14700 [00:00<?, ?it/s]

In [11]:
# Standardize the validation features with std_scale_train, the scaler fitted above on the
# training features (gabor_train_list); it is not re-fitted on the validation set.
X_gabor_std_validation = std_scale_train.transform(gabor_validation_list)

In [12]:
# Wrap the standardized validation features in a DataFrame.
X_gabor_validation_dff = pd.DataFrame(X_gabor_std_validation)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_validation_df = pd.concat(
    [dfFoldValidation_1["short_filenames"].to_frame(), X_gabor_validation_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_validation_df.columns = pd.RangeIndex(len(X_gabor_validation_df.columns))

display(X_gabor_validation_df.head(5), X_gabor_validation_df.shape)

# Written without header and without index column.
X_gabor_validation_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96_std-validation-fold_3.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0005-aggregates.png,-0.076119,0.333783,-0.03642,0.302222,-0.021637,0.3092,-0.048373,0.350888,-0.07499,...,0.320407,0.41947,0.341091,0.238255,0.106458,-0.218599,-0.328398,-0.194336,-0.795039,-0.281777
1,0006-aggregates_001.png,-0.637663,1.129403,-0.620245,1.101025,-0.689904,1.10544,-0.79326,1.111437,-0.856808,...,-0.5726,-0.280456,-1.076342,-0.295284,-1.218008,-0.258623,-1.059924,-0.281536,0.104893,-0.245942
2,0009.png,0.925995,-1.09423,0.925407,-1.063686,0.915952,-1.04097,0.896001,-1.047408,0.883062,...,-0.448715,-0.782371,-0.437279,-0.772212,-0.500647,-0.807524,-0.577732,-0.935605,-0.57859,-0.954273
3,001-aggregates.png,0.134888,-0.013452,0.126208,-0.033604,0.126023,-0.05889,0.133902,-0.080717,0.15485,...,-0.342136,-0.023882,-0.360772,-0.047173,-0.316156,-0.235785,-0.175846,-0.171197,-0.089496,-0.126099
4,0012-aggregates_001.png,0.518896,-0.60586,0.516323,-0.589178,0.513976,-0.566422,0.508072,-0.553143,0.506644,...,-0.467731,-0.435345,-0.342203,-0.295193,-0.177829,-0.075404,-0.07122,0.085973,-0.076943,0.178533


(14700, 97)

#### 4. For test set

<u><b> Remarks :</b></u> We use 4-fold cross-validation, and the features are standardized with a scaler fitted on each fold's training set.
So, for the test set, we also need to compute each kind of feature once per fold, i.e. 4 sets of test features (one per fold).

In [13]:
# Load the table describing the test images (same file for every fold).
dfTest = pd.read_csv('..//_inputs//_images_Zooscan//ZooScan-test_img.csv')

In [14]:
# Inputs and settings for the parallel Gabor extraction over the test set.
# NOTE(review): these raw features do not depend on the fold (same CSV, same extractor);
# only the scaler differs, so they could be computed once and cached on disk.
filepaths = dfTest['filenames']
n_files = len(filepaths)
batch_size = 32      # images handed to the worker pool per round
num_processes = 8    # joblib worker count

gabor_test_list = []

# One worker pool is reused for all batches. Images are read lazily in this
# process and each batch is fanned out to the workers; results keep input order.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for batch in chunked(image_generator(filepaths), batch_size):
        gabor_test_list += parallel(
            delayed(extract_texture_features)(img) for img in batch
        )
        gc.collect()
        pbar.update(len(batch))

Extract Gabor features:   0%|          | 0/6907 [00:00<?, ?it/s]

In [15]:
# Standardize the test features with std_scale_train, the scaler fitted above on the
# training features (gabor_train_list); it is not fitted on the test set.
X_gabor_std_test = std_scale_train.transform(gabor_test_list)

In [16]:
# Wrap the standardized test features in a DataFrame.
X_gabor_test_dff = pd.DataFrame(X_gabor_std_test)

# Put the short file name in front of the feature columns (rows are matched on the index).
X_gabor_test_df = pd.concat(
    [dfTest["short_filenames"].to_frame(), X_gabor_test_dff],
    axis=1,
)
# Replace the mixed column labels by plain positions 0..n-1.
X_gabor_test_df.columns = pd.RangeIndex(len(X_gabor_test_df.columns))

display(X_gabor_test_df.head(5), X_gabor_test_df.shape)

# Written without header and without index column.
X_gabor_test_df.to_csv("..//_inputs//_image_features//new//X-gabor_96_std-test-fold_3.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0007-aggregates_002.png,0.698543,-0.810997,0.696959,-0.81632,0.694959,-0.83795,0.684829,-0.865011,0.679917,...,-0.23596,-0.32488,-0.257569,-0.325488,-0.284252,-0.411696,-0.388994,-0.497327,-0.440348,-0.574763
1,0009-aggregates_001.png,1.072293,-1.026533,1.050617,-1.162866,1.030392,-1.332687,1.038973,-1.40953,1.051603,...,-0.653463,-1.130219,-0.63594,-1.124132,-0.662371,-1.094257,-0.673032,-1.083358,-0.631773,-1.126971
2,0012-aggregates_002.png,1.178156,-1.616565,1.175706,-1.614133,1.168525,-1.610505,1.153788,-1.557503,1.145739,...,-0.569069,-1.128004,-0.611049,-1.269316,-0.603983,-1.307457,-0.583284,-1.287742,-0.584269,-1.253258
3,002-aggregates_001.png,0.428904,-0.52537,0.208037,-0.308692,0.023602,-0.082505,-0.028618,-0.105862,-0.007802,...,-1.040853,-0.398913,-0.877721,-0.262788,-0.607665,-0.087397,-0.334569,0.172372,-0.087886,0.027769
4,002-aggregates_007.png,-0.445485,1.00992,-0.438046,1.020119,-0.435571,1.019343,-0.444074,0.999171,-0.448937,...,1.350358,1.623912,0.849847,1.263326,0.254617,0.616536,-0.046943,0.270555,-0.305895,-0.131144


(6907, 97)

### 4.2.2. For fold 4
#### 1. Read path of fold 4 file

In [4]:
# Load the CSV tables describing the fold-4 training and validation splits.
# NOTE(review): the variable names keep a `_1` suffix although the files read
# here are the fold-4 ones.
dfFoldTraining_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-training-fold_4.csv')
dfFoldValidation_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-validation-fold_4.csv')

In [5]:
# Preview the first 5 rows and the (rows, columns) shape of the fold-4 training table.
display(dfFoldTraining_1.head(5), dfFoldTraining_1.shape)

Unnamed: 0,filenames,labels,short_filenames,cls
0,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002-aggregates.png,0
1,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002.png,0
2,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0003-aggregates.png,0
3,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0005-aggregates.png,0
4,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0006-aggregates.png,0


(44100, 4)

#### 2. Extracting Gabor features for the training set

In [6]:
# File paths of the fold-4 training images.
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)

# Images are handled in chunks of `batch_size`; each chunk is dispatched to a
# joblib pool created with n_jobs=`num_processes`.
batch_size, num_processes = 32, 8

# Collects one extract_texture_features() result per image.
gabor_train_list = []

with tqdm(total=n_files, desc="Extract Gabor features") as progress, \
        Parallel(n_jobs=num_processes) as workers:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        # Run the feature extractor on every image of the current chunk and
        # append the results straight to the output list.
        gabor_train_list.extend(
            workers(delayed(extract_texture_features)(image) for image in image_batch)
        )

        # Force a garbage-collection pass before the next chunk is loaded.
        gc.collect()

        progress.update(len(image_batch))


Extract Gabor features:   0%|          | 0/44100 [00:00<?, ?it/s]

In [7]:
# Persist the training features (not yet standardized) so the extraction
# above does not have to be repeated.
gabor_train_pkl_path = "..//_inputs//_image_features//new//Gabor_train_list_fold4.pkl"
with open(gabor_train_pkl_path, "wb") as pkl_out:
    pickle.dump(gabor_train_list, pkl_out)
    

In [8]:
# Load the Gabor descriptors of the fold-4 training set back from the pickle file.
gabor_train_pkl_path = "..//_inputs//_image_features//new//Gabor_train_list_fold4.pkl"
with open(gabor_train_pkl_path, "rb") as pkl_in:
    gabor_train_list = pickle.load(pkl_in)

In [9]:
# Sanity check: shape of the training feature list once converted to an array.
np.asarray(gabor_train_list).shape

(44100, 96)

In [10]:
# Fit the standardization scaler on the fold-4 training Gabor features.
# This fitted scaler is the one later applied (transform only) to the
# training, validation and test features of this fold.
std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)

In [11]:
# Standardize the training features with the fitted scaler and wrap them in a
# DataFrame (one column per feature).
X_gabor_std_train = std_scale_train.transform(gabor_train_list)
X_gabor_train_dff = pd.DataFrame(X_gabor_std_train)

# Prepend the image file names to the features, then relabel every column
# 0..n-1 so that the file-name column becomes column 0.
X_gabor_train_df = pd.concat(
    [dfFoldTraining_1["short_filenames"].to_frame(), X_gabor_train_dff],
    axis=1,
)
X_gabor_train_df.columns = pd.RangeIndex(X_gabor_train_df.shape[1])

display(X_gabor_train_df.head(5), X_gabor_train_df.shape)

# Save to file, without a header row and without the index column.
X_gabor_train_df.to_csv(
    "..//_inputs//_image_features//new//X-gabor_std_96-train-fold_4.csv",
    header=False,
    index=False,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0002-aggregates.png,-1.115479,0.842552,-1.12892,0.862641,-1.139676,0.860536,-1.159971,0.876193,-1.1549,...,-0.019768,0.359975,0.457658,0.605825,0.634008,0.85054,0.082487,0.236214,0.78541,0.849124
1,0002.png,0.236209,-0.39244,0.249696,-0.436585,0.246613,-0.456674,0.221599,-0.43505,0.19645,...,0.217272,-0.013184,-0.508229,-0.274586,-0.623509,-0.312289,-0.607384,-0.322761,-0.583182,-0.391903
2,0003-aggregates.png,1.147561,0.743473,1.118695,0.431756,1.117382,0.316973,1.194888,0.748679,1.286053,...,-0.750928,-0.91231,-0.718306,-0.819191,-0.753513,-0.978888,-0.770836,-1.064933,-0.774445,-1.019452
3,0005-aggregates.png,-0.074092,0.3317,-0.034308,0.300174,-0.019374,0.306621,-0.045905,0.347559,-0.07236,...,0.325617,0.422602,0.346764,0.241981,0.10803,-0.218358,-0.331272,-0.197676,-0.797,-0.285687
4,0006-aggregates.png,0.586317,-0.554736,0.575655,-0.545762,0.566989,-0.57889,0.559232,-0.648695,0.566566,...,-0.531309,-0.460496,-0.465833,-0.586778,-0.186161,-0.354479,0.283265,0.288344,0.319448,0.402142


(44100, 97)

##### 4.2. For validation set

In [12]:
# File paths of the fold-4 validation images.
filepaths = dfFoldValidation_1['filenames']
n_files = len(filepaths)

# Images are handled in chunks of `batch_size`; each chunk is dispatched to a
# joblib pool created with n_jobs=`num_processes`.
batch_size, num_processes = 32, 8

# Collects one extract_texture_features() result per image.
gabor_validation_list = []

with tqdm(total=n_files, desc="Extract Gabor features") as progress, \
        Parallel(n_jobs=num_processes) as workers:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        # Run the feature extractor on every image of the current chunk and
        # append the results straight to the output list.
        gabor_validation_list.extend(
            workers(delayed(extract_texture_features)(image) for image in image_batch)
        )

        # Force a garbage-collection pass before the next chunk is loaded.
        gc.collect()

        progress.update(len(image_batch))

Extract Gabor features:   0%|          | 0/14699 [00:00<?, ?it/s]

In [13]:
# Standardize the validation features with the scaler fitted above on the
# training features:
#   std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)
# Only transform() is called here; the scaler is not re-fitted.
X_gabor_std_validation = std_scale_train.transform(gabor_validation_list)

In [14]:
# Standardized validation features as a DataFrame (one column per feature).
X_gabor_validation_dff = pd.DataFrame(X_gabor_std_validation)

# Prepend the image file names to the features, then relabel every column
# 0..n-1 so that the file-name column becomes column 0.
X_gabor_validation_df = pd.concat(
    [dfFoldValidation_1["short_filenames"].to_frame(), X_gabor_validation_dff],
    axis=1,
)
X_gabor_validation_df.columns = pd.RangeIndex(X_gabor_validation_df.shape[1])

display(X_gabor_validation_df.head(5), X_gabor_validation_df.shape)

# Written without a header row and without the index column.
# NOTE(review): this file name contains "std" twice, unlike the training file
# ("X-gabor_std_96-train-fold_4.csv"); kept unchanged because downstream
# readers may rely on the exact name -- confirm.
X_gabor_validation_df.to_csv(
    "..//_inputs//_image_features//new//X-gabor_std_96_std-validation-fold_4.csv",
    header=False,
    index=False,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0001-aggregates.png,1.136441,-0.124583,1.151572,-0.118428,1.209862,0.186622,1.299523,0.739372,1.350229,...,-0.73702,-1.192707,-0.718306,-1.171427,-0.720121,-1.184489,-0.731396,-1.220334,-0.744286,-1.20262
1,0004-aggregates.png,-0.485796,0.761247,-0.521868,0.745867,-0.578393,0.753331,-0.619818,0.733664,-0.624414,...,-1.092405,-0.336714,-1.10892,-0.354609,-0.665586,-0.338965,1.027063,0.423013,2.523363,2.344317
2,0004.png,0.661098,-0.76981,0.699967,-0.735754,0.746564,-0.691601,0.75147,-0.644784,0.693592,...,0.155662,-0.286354,-0.42663,-0.799749,-0.786482,-0.933566,-0.800525,-0.928982,-0.76404,-0.892473
3,0005-aggregates_002.png,0.635414,-0.416083,0.618059,-0.459558,0.591306,-0.475861,0.586353,-0.450764,0.592728,...,-0.341038,-0.388944,-0.237894,-0.226638,-0.365213,-0.420843,-0.478767,-0.548001,-0.416913,-0.459305
4,0005.png,0.354226,-0.525202,0.361422,-0.516236,0.365611,-0.504469,0.356946,-0.522286,0.34442,...,0.78987,0.695135,0.436935,0.007374,-0.422071,-0.531919,-0.773307,-0.608597,-0.631503,-0.403276


(14699, 97)

##### 4.3. For test set

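<u><b>Remarks:</b></u> We use 4-fold cross-validation, and the features of each fold are standardized with the scaler fitted on that fold's training set. The test-set features therefore also have to be computed for each fold.
So, for the test set, we extract 4 sets of features in total, one per fold.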

In [15]:
# Load the CSV table describing the test images; its `filenames` and
# `short_filenames` columns are used by the following cells.
dfTest = pd.read_csv('..//_inputs//_images_Zooscan//ZooScan-test_img.csv')

In [16]:
# File paths of the test images.
filepaths = dfTest['filenames']
n_files = len(filepaths)

# Images are handled in chunks of `batch_size`; each chunk is dispatched to a
# joblib pool created with n_jobs=`num_processes`.
batch_size, num_processes = 32, 8

# Collects one extract_texture_features() result per image.
gabor_test_list = []

with tqdm(total=n_files, desc="Extract Gabor features") as progress, \
        Parallel(n_jobs=num_processes) as workers:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        # Run the feature extractor on every image of the current chunk and
        # append the results straight to the output list.
        gabor_test_list.extend(
            workers(delayed(extract_texture_features)(image) for image in image_batch)
        )

        # Force a garbage-collection pass before the next chunk is loaded.
        gc.collect()

        progress.update(len(image_batch))

Extract Gabor features:   0%|          | 0/6907 [00:00<?, ?it/s]

In [17]:
# Standardize the test features with the fold-4 scaler fitted above on the
# TRAINING features:
#   std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)
# Only transform() is called here; the scaler is not re-fitted on the test set.
X_gabor_std_test = std_scale_train.transform(gabor_test_list)

In [18]:
# Standardized test features as a DataFrame (one column per feature).
X_gabor_test_dff = pd.DataFrame(X_gabor_std_test)

# Prepend the image file names to the features, then relabel every column
# 0..n-1 so that the file-name column becomes column 0.
X_gabor_test_df = pd.concat(
    [dfTest["short_filenames"].to_frame(), X_gabor_test_dff],
    axis=1,
)
X_gabor_test_df.columns = pd.RangeIndex(X_gabor_test_df.shape[1])

display(X_gabor_test_df.head(5), X_gabor_test_df.shape)

# Written without a header row and without the index column.
X_gabor_test_df.to_csv(
    "..//_inputs//_image_features//new//X-gabor_96_std-test-fold_4.csv",
    header=False,
    index=False,
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0007-aggregates_002.png,0.699238,-0.811611,0.69784,-0.816649,0.695959,-0.838244,0.685683,-0.865584,0.68042,...,-0.23178,-0.32393,-0.252783,-0.322949,-0.282667,-0.41193,-0.391541,-0.499629,-0.444737,-0.57726
1,0009-aggregates_001.png,1.072345,-1.02687,1.050904,-1.162663,1.030802,-1.331996,1.039048,-1.408869,1.051059,...,-0.650055,-1.131628,-0.631715,-1.123274,-0.660774,-1.09617,-0.674042,-1.083651,-0.634851,-1.126806
2,0012-aggregates_002.png,1.178027,-1.616145,1.175783,-1.613238,1.168691,-1.60926,1.15361,-1.556507,1.14493,...,-0.565506,-1.129406,-0.606787,-1.268764,-0.602387,-1.309894,-0.58478,-1.287334,-0.587673,-1.252484
3,002-aggregates_001.png,0.430063,-0.52635,0.209738,-0.309801,0.025785,-0.084303,-0.026193,-0.108155,-0.005362,...,-1.038163,-0.398179,-0.873855,-0.260117,-0.60607,-0.086834,-0.33741,0.167774,-0.094689,0.022366
4,002-aggregates_007.png,-0.442823,1.006969,-0.435261,1.016969,-0.432579,1.01535,-0.440734,0.994374,-0.445254,...,1.357474,1.630573,0.856275,1.269211,0.256184,0.618831,-0.05134,0.26562,-0.311205,-0.13578


(6907, 97)