In [1]:
import os
import numpy as np
from numpy import random

from random import sample

import pandas as pd 

import skimage
from skimage import io
from skimage.feature import daisy, hog, ORB, local_binary_pattern, SIFT
from skimage.color import label2rgb, rgb2gray
from skimage.transform import resize, rotate, downscale_local_mean

from scipy import ndimage as ndi

from skimage.util import img_as_float
from skimage.filters import gabor_kernel
from skimage.filters import threshold_niblack
from skimage.morphology import convex_hull_image
from skimage.measure import find_contours
from skimage import exposure

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

from tqdm.notebook import tqdm

import gc

from joblib import Parallel, delayed, parallel_backend, cpu_count
import psutil

from platform import python_version

import multiprocessing as mp
from more_itertools import chunked

import gabor_filters
from  gabor_filters import gabor_filter
from  gabor_filters import gabor_filter_response

import importlib
importlib.reload(gabor_filters)
importlib.reload(gabor_filters.gabor_filter)
importlib.reload(gabor_filters.gabor_filter_response)

from gabor_filters.gabor_filter import GaborFilterBank as gbb
from gabor_filters.gabor_filter_response import GaborFilteredResponseBank as gbfrb






In [2]:
# Record the Python and scikit-image versions this notebook was run with.
for version in (python_version(), skimage.__version__):
    print(version)

3.9.16
0.19.3


In [3]:
def crop_image(image):
    """Crop a tall image to the bounding box of its main object.

    Images with fewer than 2000 rows are returned unchanged.  Taller images
    are binarised with a Niblack adaptive threshold, the convex hull of the
    foreground is computed and the image is cropped to the bounding box of
    the hull's longest contour, enlarged by a 10-pixel margin and clipped to
    the image borders.

    Parameters
    ----------
    image : 2-D array indexed as ``image[row, col]``.

    Returns
    -------
    ``image`` itself when it is shorter than 2000 rows or when no contour
    could be found; otherwise the cropped slice of ``image``.
    """
    min_height_to_crop = 2000
    hull_pad = 3   # width of the background border added before the hull
    margin = 10    # extra pixels kept around the bounding box

    if image.shape[0] < min_height_to_crop:
        return image

    # adaptive thresholding
    foreground = image > threshold_niblack(image, window_size=25, k=0.8)

    # convex hull of the foreground, computed on a background-padded copy
    hull = convex_hull_image(
        np.pad(foreground, hull_pad, 'constant', constant_values=0))

    contours = find_contours(hull, 0.5)
    if not contours:
        # Nothing was segmented: there is no box to crop to, keep the image.
        return image

    # The longest contour is assumed to outline the main object.
    main_contour = max(contours, key=len)

    # Contour coordinates refer to the padded array; shift them back into the
    # coordinate frame of `image` before using them as slice bounds.
    top, left = np.min(main_contour, axis=0) - hull_pad
    bottom, right = np.max(main_contour, axis=0) - hull_pad

    row_start = int(max(top - margin, 0))
    col_start = int(max(left - margin, 0))
    row_stop = int(min(bottom + margin, image.shape[0]))
    col_stop = int(min(right + margin, image.shape[1]))

    return image[row_start:row_stop, col_start:col_stop]

def image_generator(filepaths):
    """Lazily yield every path in ``filepaths`` read with ``io.imread(..., as_gray=True)``."""
    yield from (io.imread(path, as_gray=True) for path in filepaths)

def extract_texture_features(image):
    """Compute Gabor texture features for one image.

    The image goes through ``crop_image``, a Gabor filter bank sized to the
    cropped image is created, the image is filtered with it, and the mean and
    standard deviation of each response magnitude are collected.

    Returns a 1-D float array laid out as ``[mean_0, std_0, mean_1, std_1, ...]``
    (two values per filter response).
    """
    img = crop_image(image)

    # Gabor filter-bank settings
    max_frequency = 0.327          # maximum frequency
    frequency_ratio = np.sqrt(2)   # factor for selecting filter frequencies
    crossing_point = 0.5           # crossing point between two consecutive filters
    n_frequencies = 6              # number of frequencies
    n_orientations = 8             # number of orientations
    gamma = 0.5                    # smoothing parameter
    eta = 0.5                      # smoothing parameter
    n_rows, n_cols = img.shape[0], img.shape[1]

    filter_bank = gbb().create_a_set_of_gabor_filters(
        max_frequency, frequency_ratio, crossing_point,
        n_frequencies, n_orientations, n_rows, n_cols, gamma, eta)

    # Filter the image with the whole bank
    responses = gbfrb().create_a_set_of_Gabor_filtered_responses(img, filter_bank)

    # Responses gathered in one array, indexed by response along the last axis
    response_stack = gbfrb().convert_a_set_Gabor_filtered_responses_to_ndarray(responses)

    n_responses = response_stack.shape[2]
    features = np.zeros(n_responses * 2)
    for pos in range(n_responses):
        magnitude = np.abs(response_stack[:, :, pos])
        features[2 * pos] = np.mean(magnitude)
        features[2 * pos + 1] = np.std(magnitude)

    # Release the large intermediates explicitly before returning.
    del response_stack, responses, filter_bank
    gc.collect()

    return features

## 4.2. main()

### 4.2.1. For fold 1
#### 1. Read path of fold 1 file

In [4]:
# Load the fold-1 file lists; the 'filenames' and 'short_filenames' columns are read by the cells below.
dfFoldTraining_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-training-fold_1.csv')
dfFoldValidation_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-validation-fold_1.csv')

In [5]:
display(dfFoldTraining_1.head(5), dfFoldTraining_1.shape)

Unnamed: 0,filenames,labels,short_filenames,cls
0,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0001-aggregates.png,0
1,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002.png,0
2,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0003-aggregates.png,0
3,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004-aggregates.png,0
4,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004.png,0


(44099, 4)

#### 2. Extracting gabor feature for the training set

In [6]:
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 8    # joblib workers

gabor_train_list = []

# Read the images batch by batch and reuse one worker pool for all batches.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        gabor_train_list += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))


Extract Gabor features:   0%|          | 0/44099 [00:00<?, ?it/s]

In [7]:
%store gabor_train_list

Stored 'gabor_train_list' (list)


The ‘%store’ command saves the specified variable. Now if we restart the Jupyter Notebook we can recover the variable using the ‘%store -r’ command:

In [6]:
%store -r gabor_train_list

In [8]:
np.asarray(gabor_train_list).shape

(44099, 96)

In [9]:
# Gabor filter for train set --- standardization 
std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)

In [9]:
# Standardize the training features, put the short file name in front of each
# row and save the table without header or index.
X_gabor_std_train = std_scale_train.transform(gabor_train_list)
X_gabor_train_dff = pd.DataFrame(X_gabor_std_train)

X_gabor_train_df = pd.concat(
    [dfFoldTraining_1[["short_filenames"]], X_gabor_train_dff], axis=1)
# Positional column labels: 0 is the file name, the rest are the features.
X_gabor_train_df.columns = pd.RangeIndex(X_gabor_train_df.shape[1])

display(X_gabor_train_df.head(5), X_gabor_train_df.shape)

X_gabor_train_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96-train-fold_1.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0001-aggregates.png,1.136312,-0.120867,1.151158,-0.114632,1.209658,0.191188,1.300448,0.745374,1.352405,...,-0.742402,-1.194496,-0.721887,-1.174843,-0.721007,-1.183975,-0.732128,-1.221598,-0.744075,-1.200731
1,0002.png,0.232402,-0.38877,0.24582,-0.432895,0.242654,-0.45255,0.217668,-0.430446,0.192616,...,0.22055,-0.009426,-0.510338,-0.27412,-0.623958,-0.310437,-0.607235,-0.319016,-0.582036,-0.387699
2,0003-aggregates.png,1.147477,0.747343,1.118156,0.435737,1.116818,0.32163,1.195342,0.754692,1.287895,...,-0.756436,-0.912781,-0.721888,-0.821082,-0.754551,-0.978059,-0.771848,-1.06533,-0.774409,-1.01704
3,0004-aggregates.png,-0.492552,0.76512,-0.528705,0.749952,-0.585569,0.758288,-0.627541,0.739659,-0.632524,...,-1.101012,-0.334478,-1.115239,-0.354489,-0.666226,-0.337154,1.038831,0.43092,2.54254,2.356331
4,0004.png,0.659027,-0.766207,0.69782,-0.732164,0.744554,-0.68764,0.749926,-0.64043,0.692348,...,0.158381,-0.28388,-0.428167,-0.801556,-0.787669,-0.932667,-0.801749,-0.928621,-0.763943,-0.889699


(44099, 97)

#### 3. Extracting Gabor features for the validation set

In [10]:
filepaths = dfFoldValidation_1['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 8    # joblib workers

gabor_validation_list = []

# Read the images batch by batch and reuse one worker pool for all batches.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        gabor_validation_list += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))

Extract Gabor features:   0%|          | 0/14700 [00:00<?, ?it/s]

In [11]:
# standard deviation normalization using above std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)
X_gabor_std_validation = std_scale_train.transform(gabor_validation_list)

In [12]:
# Put the short file name in front of each standardized validation row and
# save the table without header or index.
X_gabor_validation_dff = pd.DataFrame(X_gabor_std_validation)

X_gabor_validation_df = pd.concat(
    [dfFoldValidation_1[["short_filenames"]], X_gabor_validation_dff], axis=1)
# Positional column labels: 0 is the file name, the rest are the features.
X_gabor_validation_df.columns = pd.RangeIndex(X_gabor_validation_df.shape[1])

display(X_gabor_validation_df.head(5), X_gabor_validation_df.shape)

X_gabor_validation_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96_std-validation-fold_1.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0002-aggregates.png,-1.124807,0.84644,-1.138088,0.866766,-1.14904,0.865567,-1.170127,0.882358,-1.165774,...,-0.018641,0.365488,0.46232,0.610102,0.639254,0.854176,0.087541,0.243078,0.7945,0.85687
1,0006-aggregates.png,0.58394,-0.551095,0.573031,-0.542108,0.564279,-0.574851,0.556822,-0.644346,0.56466,...,-0.534824,-0.458842,-0.467645,-0.587663,-0.184629,-0.352692,0.289746,0.295499,0.325833,0.408612
2,001-aggregates-jo_700_05.png,-0.452301,0.901055,-0.440633,0.929559,-0.418416,0.959422,-0.391965,0.993847,-0.382175,...,0.383506,1.067756,0.131348,0.300736,-0.340842,0.092228,-0.424322,0.215652,-0.45019,0.159132
3,0010-aggregates.png,0.553547,-0.551249,0.559949,-0.522796,0.566758,-0.47437,0.565353,-0.468133,0.55187,...,-0.208128,-0.346751,-0.063954,-0.113626,-0.114338,-0.329164,-0.289741,-0.405385,-0.460871,-0.544875
4,0011.png,0.878954,-1.234502,0.870033,-1.165677,0.867735,-1.162138,0.873726,-1.118144,0.914043,...,-0.775883,-1.183196,-0.431509,-0.765326,0.178111,-0.42552,-0.282132,-0.658629,-0.743051,-1.142364


(14700, 97)

#### 4. Extracting Gabor features for the test set

<u><b>Remarks:</b></u> We use 4-fold cross-validation. The features are standardized with a scaler fitted on each fold's training set, so the test-set features must also be computed once per fold.
So, for the test set, we extract 4 sets of features, one per fold.

In [13]:
dfTest = pd.read_csv('..//_inputs//_images_Zooscan//ZooScan-test_img.csv')

In [14]:
filepaths = dfTest['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 8    # joblib workers

gabor_test_list = []

# Read the images batch by batch and reuse one worker pool for all batches.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        gabor_test_list += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))

Extract Gabor features:   0%|          | 0/6907 [00:00<?, ?it/s]

In [15]:
# Standardize the test features with the scaler that was fitted on the training features
# (std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)); it is only applied here, not re-fitted.
X_gabor_std_test = std_scale_train.transform(gabor_test_list)

In [16]:
# Put the short file name in front of each standardized test row and save the
# table without header or index.
X_gabor_test_dff = pd.DataFrame(X_gabor_std_test)

X_gabor_test_df = pd.concat(
    [dfTest[["short_filenames"]], X_gabor_test_dff], axis=1)
# Positional column labels: 0 is the file name, the rest are the features.
X_gabor_test_df.columns = pd.RangeIndex(X_gabor_test_df.shape[1])

display(X_gabor_test_df.head(5), X_gabor_test_df.shape)

X_gabor_test_df.to_csv("..//_inputs//_image_features//new//X-gabor_96_std-test-fold_1.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0007-aggregates_002.png,0.697323,-0.808015,0.695685,-0.813086,0.693752,-0.834384,0.683842,-0.861493,0.679107,...,-0.232577,-0.321633,-0.253101,-0.322692,-0.281572,-0.410231,-0.389857,-0.496872,-0.442787,-0.573586
1,0009-aggregates_001.png,1.071954,-1.023312,1.050105,-1.159215,1.0299,-1.328477,1.038799,-1.405424,1.051676,...,-0.654648,-1.13313,-0.63469,-1.126482,-0.661392,-1.095521,-0.674366,-1.084152,-0.634005,-1.1247
2,0012-aggregates_002.png,1.178067,-1.612691,1.175463,-1.60994,1.168327,-1.605932,1.153877,-1.553238,1.146037,...,-0.569331,-1.130898,-0.609587,-1.272601,-0.602741,-1.309572,-0.58447,-1.288972,-0.586553,-1.250737
3,002-aggregates_001.png,0.427048,-0.522704,0.205709,-0.306068,0.020965,-0.079924,-0.031241,-0.103162,-0.010247,...,-1.046277,-0.396232,-0.878527,-0.259588,-0.60644,-0.084636,-0.335342,0.174256,-0.090707,0.027752
4,002-aggregates_007.png,-0.449403,1.010885,-0.441765,1.021145,-0.439187,1.020487,-0.44765,1.00068,-0.452431,...,1.371099,1.642061,0.863731,1.27636,0.259719,0.622112,-0.047238,0.272648,-0.30848,-0.130846


(6907, 97)

### 4.2.2. For fold 2
#### 1. Read path of fold 2 file

In [17]:
# Load the fold-2 file lists.
# NOTE(review): the variable names still end in `_1` although they now hold fold-2 data;
# the cells below read these exact names, so they are left unchanged here.
dfFoldTraining_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-training-fold_2.csv')
dfFoldValidation_1 = pd.read_csv('..//_inputs//_images_Zooscan//_Zooscan-validation-fold_2.csv')

In [18]:
display(dfFoldTraining_1.head(5), dfFoldTraining_1.shape)

Unnamed: 0,filenames,labels,short_filenames,cls
0,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0001-aggregates.png,0
1,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0002-aggregates.png,0
2,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004-aggregates.png,0
3,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0004.png,0
4,..//_inputs//_images_Zooscan//_training//aggre...,aggregats_debris,0005-aggregates.png,0


(44099, 4)

#### 2. Extracting gabor feature for the training set

In [19]:
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 8    # joblib workers

gabor_train_list = []

# Read the images batch by batch and reuse one worker pool for all batches.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        gabor_train_list += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))


Extract Gabor features:   0%|          | 0/44099 [00:00<?, ?it/s]

In [None]:
%store gabor_train_list

Stored 'gabor_train_list' (list)


The ‘%store’ command saves the specified variable. Now if we restart the Jupyter Notebook we can recover the variable using the ‘%store -r’ command:

In [None]:
%store -r gabor_train_list

In [None]:
np.asarray(gabor_train_list).shape

(44099, 96)

In [None]:
# Gabor filter for train set --- standardization 
std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)

In [None]:
# Save in file
# Standardize the training features of this fold, put the short file name in
# front of each row and save the table without header or index.
X_gabor_std_train = std_scale_train.transform(gabor_train_list)
X_gabor_train_dff = pd.DataFrame(data = X_gabor_std_train)
X_gabor_train_df = pd.DataFrame(data = dfFoldTraining_1["short_filenames"])

X_gabor_train_df = pd.concat([X_gabor_train_df,X_gabor_train_dff], axis=1)
# Positional column labels: 0 is the file name, the rest are the features.
X_gabor_train_df.columns = pd.RangeIndex(X_gabor_train_df.columns.size)

display(X_gabor_train_df.head(5), X_gabor_train_df.shape)

# This is the fold-2 section: write to the fold-2 file.  The cell used to reuse
# the fold-1 file name, which overwrote the fold-1 training features.
X_gabor_train_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96-train-fold_2.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0001-aggregates.png,1.136312,-0.120867,1.151158,-0.114632,1.209658,0.191188,1.300448,0.745374,1.352405,...,-0.742402,-1.194496,-0.721887,-1.174843,-0.721007,-1.183975,-0.732128,-1.221598,-0.744075,-1.200731
1,0002.png,0.232402,-0.38877,0.24582,-0.432895,0.242654,-0.45255,0.217668,-0.430446,0.192616,...,0.22055,-0.009426,-0.510338,-0.27412,-0.623958,-0.310437,-0.607235,-0.319016,-0.582036,-0.387699
2,0003-aggregates.png,1.147477,0.747343,1.118156,0.435737,1.116818,0.32163,1.195342,0.754692,1.287895,...,-0.756436,-0.912781,-0.721888,-0.821082,-0.754551,-0.978059,-0.771848,-1.06533,-0.774409,-1.01704
3,0004-aggregates.png,-0.492552,0.76512,-0.528705,0.749952,-0.585569,0.758288,-0.627541,0.739659,-0.632524,...,-1.101012,-0.334478,-1.115239,-0.354489,-0.666226,-0.337154,1.038831,0.43092,2.54254,2.356331
4,0004.png,0.659027,-0.766207,0.69782,-0.732164,0.744554,-0.68764,0.749926,-0.64043,0.692348,...,0.158381,-0.28388,-0.428167,-0.801556,-0.787669,-0.932667,-0.801749,-0.928621,-0.763943,-0.889699


(44099, 97)

#### 3. Extracting Gabor features for the validation set

In [None]:
filepaths = dfFoldValidation_1['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 8    # joblib workers

gabor_validation_list = []

# Read the images batch by batch and reuse one worker pool for all batches.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        gabor_validation_list += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))

Extract Gabor features:   0%|          | 0/14700 [00:00<?, ?it/s]

In [None]:
# standard deviation normalization using above std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)
X_gabor_std_validation = std_scale_train.transform(gabor_validation_list)

In [None]:
# Put the short file name in front of each standardized validation row and
# save the table without header or index.
X_gabor_validation_dff = pd.DataFrame(data = X_gabor_std_validation)
X_gabor_validation_df = pd.DataFrame(data = dfFoldValidation_1["short_filenames"])

X_gabor_validation_df = pd.concat([X_gabor_validation_df,X_gabor_validation_dff], axis=1)
# Positional column labels: 0 is the file name, the rest are the features.
X_gabor_validation_df.columns = pd.RangeIndex(X_gabor_validation_df.columns.size)

display(X_gabor_validation_df.head(5), X_gabor_validation_df.shape)

# This is the fold-2 section: write to the fold-2 file.  The cell used to reuse
# the fold-1 file name, which overwrote the fold-1 validation features.
X_gabor_validation_df.to_csv("..//_inputs//_image_features//new//X-gabor_std_96_std-validation-fold_2.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0002-aggregates.png,-1.124807,0.84644,-1.138088,0.866766,-1.14904,0.865567,-1.170127,0.882358,-1.165774,...,-0.018641,0.365488,0.46232,0.610102,0.639254,0.854176,0.087541,0.243078,0.7945,0.85687
1,0006-aggregates.png,0.58394,-0.551095,0.573031,-0.542108,0.564279,-0.574851,0.556822,-0.644346,0.56466,...,-0.534824,-0.458842,-0.467645,-0.587663,-0.184629,-0.352692,0.289746,0.295499,0.325833,0.408612
2,001-aggregates-jo_700_05.png,-0.452301,0.901055,-0.440633,0.929559,-0.418416,0.959422,-0.391965,0.993847,-0.382175,...,0.383506,1.067756,0.131348,0.300736,-0.340842,0.092228,-0.424322,0.215652,-0.45019,0.159132
3,0010-aggregates.png,0.553547,-0.551249,0.559949,-0.522796,0.566758,-0.47437,0.565353,-0.468133,0.55187,...,-0.208128,-0.346751,-0.063954,-0.113626,-0.114338,-0.329164,-0.289741,-0.405385,-0.460871,-0.544875
4,0011.png,0.878954,-1.234502,0.870033,-1.165677,0.867735,-1.162138,0.873726,-1.118144,0.914043,...,-0.775883,-1.183196,-0.431509,-0.765326,0.178111,-0.42552,-0.282132,-0.658629,-0.743051,-1.142364


(14700, 97)

#### 4. Extracting Gabor features for the test set

<u><b>Remarks:</b></u> We use 4-fold cross-validation. The features are standardized with a scaler fitted on each fold's training set, so the test-set features must also be computed once per fold.
So, for the test set, we extract 4 sets of features, one per fold.

In [None]:
dfTest = pd.read_csv('..//_inputs//_images_Zooscan//ZooScan-test_img.csv')

In [None]:
filepaths = dfTest['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 8    # joblib workers

gabor_test_list = []

# Read the images batch by batch and reuse one worker pool for all batches.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for image_batch in chunked(image_generator(filepaths), batch_size):
        gabor_test_list += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))

Extract Gabor features:   0%|          | 0/6907 [00:00<?, ?it/s]

In [None]:
# Standardize the test features with the scaler that was fitted on the training features
# (std_scale_train = preprocessing.StandardScaler().fit(gabor_train_list)); it is only applied here, not re-fitted.
X_gabor_std_test = std_scale_train.transform(gabor_test_list)

In [None]:
# Put the short file name in front of each standardized test row and save the
# table without header or index.
X_gabor_test_dff = pd.DataFrame(data = X_gabor_std_test)
X_gabor_test_df = pd.DataFrame(data = dfTest["short_filenames"])

X_gabor_test_df = pd.concat([X_gabor_test_df,X_gabor_test_dff], axis=1)
# Positional column labels: 0 is the file name, the rest are the features.
X_gabor_test_df.columns = pd.RangeIndex(X_gabor_test_df.columns.size)

display(X_gabor_test_df.head(5), X_gabor_test_df.shape)

# This is the fold-2 section: write to the fold-2 file.  The cell used to reuse
# the fold-1 file name, which overwrote the fold-1 test features.
X_gabor_test_df.to_csv("..//_inputs//_image_features//new//X-gabor_96_std-test-fold_2.csv", header=False, index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0007-aggregates_002.png,0.697323,-0.808015,0.695685,-0.813086,0.693752,-0.834384,0.683842,-0.861493,0.679107,...,-0.232577,-0.321633,-0.253101,-0.322692,-0.281572,-0.410231,-0.389857,-0.496872,-0.442787,-0.573586
1,0009-aggregates_001.png,1.071954,-1.023312,1.050105,-1.159215,1.0299,-1.328477,1.038799,-1.405424,1.051676,...,-0.654648,-1.13313,-0.63469,-1.126482,-0.661392,-1.095521,-0.674366,-1.084152,-0.634005,-1.1247
2,0012-aggregates_002.png,1.178067,-1.612691,1.175463,-1.60994,1.168327,-1.605932,1.153877,-1.553238,1.146037,...,-0.569331,-1.130898,-0.609587,-1.272601,-0.602741,-1.309572,-0.58447,-1.288972,-0.586553,-1.250737
3,002-aggregates_001.png,0.427048,-0.522704,0.205709,-0.306068,0.020965,-0.079924,-0.031241,-0.103162,-0.010247,...,-1.046277,-0.396232,-0.878527,-0.259588,-0.60644,-0.084636,-0.335342,0.174256,-0.090707,0.027752
4,002-aggregates_007.png,-0.449403,1.010885,-0.441765,1.021145,-0.439187,1.020487,-0.44765,1.00068,-0.452431,...,1.371099,1.642061,0.863731,1.27636,0.259719,0.622112,-0.047238,0.272648,-0.30848,-0.130846


(6907, 97)

In [12]:
# Free the intermediate feature lists if they are still present.  Neither name
# is created anywhere in this notebook, so a plain `del` raises NameError on a
# fresh kernel; removing them conditionally keeps the cell re-runnable.
for _stale_name in ('list_gabor_train2', 'list_gabor_train'):
    globals().pop(_stale_name, None)

gc.collect()

471

In [None]:
# Split the training file list into consecutive pieces of at most `n` rows.
n = 2000

list_dfFoldTraining_1_chunked = [
    dfFoldTraining_1.iloc[start:start + n]
    for start in range(0, len(dfFoldTraining_1), n)
]

display(len(list_dfFoldTraining_1_chunked))

In [None]:
with Parallel(n_jobs=2) as parallel:
    pass  # TODO: unfinished experiment; the cell had no body and therefore did not parse

In [None]:
# Extract the features chunk by chunk and stack all chunks at the end.
# Previously `gabor_list_train` was overwritten on every iteration, so only
# the features of the last chunk were kept.
gabor_list_train = None
gabor_chunk_arrays = []

pbar = tqdm(list_dfFoldTraining_1_chunked)

for i, dfFoldTraining_1_chunked in enumerate(pbar):
    pbar.set_description(f'Processing the chunked data {i+1}')

    # NOTE(review): `extract_gabor` is not defined anywhere in the visible
    # notebook; confirm which extraction routine was meant here.
    gabor_list_train_chunked = extract_gabor(dfFoldTraining_1_chunked)

    gabor_chunk_arrays.append(np.vstack(gabor_list_train_chunked))

    del  gabor_list_train_chunked
    gc.collect()

# Stays None when there were no chunks, as before.
if gabor_chunk_arrays:
    gabor_list_train = np.vstack(gabor_chunk_arrays)

In [None]:
# create a standard deviation normalization for later uses
# NOTE(review): `HOG_list` is not defined anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel; it looks like a leftover from a HOG-feature notebook. Confirm the intended input.
train_std_norm = StandardScaler().fit(HOG_list)

In [None]:
# Standard deviation normalization
# NOTE(review): depends on `HOG_list`, which the visible notebook never defines. Confirm the intended input.
HOG_list_std = train_std_norm.transform(HOG_list)

In [None]:
display(HOG_list_std)

In [None]:
print('Total HOG features:',(HOG_list_std.shape))

# 1. Test the joblib with batch size

In [4]:
import multiprocessing as mp
from more_itertools import chunked

def extract_gabor_from_filepath(img):
    """Compute Gabor texture features for an already loaded image.

    Prints the resident memory of the current process, builds a Gabor filter
    bank sized to ``img``, filters the image with it and returns the mean and
    standard deviation of each response magnitude as a 1-D array laid out as
    ``[mean_0, std_0, mean_1, std_1, ...]``.  Despite the name, the argument
    is the image array, not a path.
    """
    process = psutil.Process(mp.current_process().pid)
    print(f"Worker memory usage: {process.memory_info().rss / 1024 / 1024} MB")

    # Gabor filter-bank settings
    max_frequency = 0.327          # maximum frequency
    frequency_ratio = np.sqrt(2)   # factor for selecting filter frequencies
    crossing_point = 0.5           # crossing point between two consecutive filters
    n_frequencies = 6              # number of frequencies
    n_orientations = 8             # number of orientations
    gamma = 0.5                    # smoothing parameter
    eta = 0.5                      # smoothing parameter
    n_rows, n_cols = img.shape[0], img.shape[1]

    filter_bank = gbb().create_a_set_of_gabor_filters(
        max_frequency, frequency_ratio, crossing_point,
        n_frequencies, n_orientations, n_rows, n_cols, gamma, eta)

    # Filter the image with the whole bank
    responses = gbfrb().create_a_set_of_Gabor_filtered_responses(img, filter_bank)

    # Responses gathered in one array, indexed by response along the last axis
    response_stack = gbfrb().convert_a_set_Gabor_filtered_responses_to_ndarray(responses)

    n_responses = response_stack.shape[2]
    features = np.zeros(n_responses * 2)
    for pos in range(n_responses):
        magnitude = np.abs(response_stack[:, :, pos])
        features[2 * pos] = np.mean(magnitude)
        features[2 * pos + 1] = np.std(magnitude)

    return features

def image_generator(filepaths):
    """Lazily yield every path in ``filepaths`` read with ``io.imread(..., as_gray=True)``."""
    yield from (io.imread(path, as_gray=True) for path in filepaths)


# def extract_texture_features(img):
#     process = psutil.Process(mp.current_process().pid)
#     print(f"Worker memory usage: {process.memory_info().rss / 1024 / 1024} MB")
#     # your existing code here
#     textureFeatures = extract_gabor_from_filepath(img)
#     return textureFeatures


In [5]:
mp.cpu_count()

12

In [None]:
filepaths = dfFoldTraining_1['filenames']
batch_size = 100

# Leave four cores free but always keep at least one worker: on machines with
# four cores or fewer the subtraction gives 0 (rejected by joblib) or a
# negative value (which joblib interprets relative to the CPU count).
num_processes = max(1, mp.cpu_count() - 4)

# Ceiling division so the final, partial batch is counted in the progress bar.
n_batches = -(-len(filepaths) // batch_size)

results = []
with Parallel(n_jobs=num_processes) as parallel:
    for batch in tqdm(chunked(image_generator(filepaths), batch_size), total=n_batches):

        batch_results = parallel(
            delayed(extract_gabor_from_filepath)(img) for img in batch)

        results.extend(batch_results)

# 2. Test with multipleprocessing


In [None]:
import multiprocessing as mp
from more_itertools import chunked

from skimage.filters import threshold_niblack
from skimage.morphology import convex_hull_image
from skimage.measure import find_contours

def crop_image(image):
    """Crop a tall image to the bounding box of its main object.

    Images with fewer than 2000 rows are returned unchanged.  Taller images
    are binarised with a Niblack adaptive threshold, the convex hull of the
    foreground is computed and the image is cropped to the bounding box of
    the hull's longest contour, enlarged by a 10-pixel margin and clipped to
    the image borders.

    Parameters
    ----------
    image : 2-D array indexed as ``image[row, col]``.

    Returns
    -------
    ``image`` itself when it is shorter than 2000 rows or when no contour
    could be found; otherwise the cropped slice of ``image``.
    """
    min_height_to_crop = 2000
    hull_pad = 3   # width of the background border added before the hull
    margin = 10    # extra pixels kept around the bounding box

    if image.shape[0] < min_height_to_crop:
        return image

    # adaptive thresholding
    foreground = image > threshold_niblack(image, window_size=25, k=0.8)

    # convex hull of the foreground, computed on a background-padded copy
    hull = convex_hull_image(
        np.pad(foreground, hull_pad, 'constant', constant_values=0))

    contours = find_contours(hull, 0.5)
    if not contours:
        # Nothing was segmented: there is no box to crop to, keep the image.
        return image

    # The longest contour is assumed to outline the main object.
    main_contour = max(contours, key=len)

    # Contour coordinates refer to the padded array; shift them back into the
    # coordinate frame of `image` before using them as slice bounds.
    top, left = np.min(main_contour, axis=0) - hull_pad
    bottom, right = np.max(main_contour, axis=0) - hull_pad

    row_start = int(max(top - margin, 0))
    col_start = int(max(left - margin, 0))
    row_stop = int(min(bottom + margin, image.shape[0]))
    col_stop = int(min(right + margin, image.shape[1]))

    return image[row_start:row_stop, col_start:col_stop]


def image_generator(filepaths):
    """Lazily yield every path in ``filepaths`` read with ``io.imread(..., as_gray=True)``."""
    yield from (io.imread(path, as_gray=True) for path in filepaths)

def extract_texture_features(image):
    """Compute Gabor texture features for one image.

    The image goes through ``crop_image``, a Gabor filter bank sized to the
    cropped image is created, the image is filtered with it, and the mean and
    standard deviation of each response magnitude are collected.

    Returns a 1-D float array laid out as ``[mean_0, std_0, mean_1, std_1, ...]``
    (two values per filter response).
    """
    img = crop_image(image)

    # Gabor filter-bank settings
    max_frequency = 0.327          # maximum frequency
    frequency_ratio = np.sqrt(2)   # factor for selecting filter frequencies
    crossing_point = 0.5           # crossing point between two consecutive filters
    n_frequencies = 6              # number of frequencies
    n_orientations = 8             # number of orientations
    gamma = 0.5                    # smoothing parameter
    eta = 0.5                      # smoothing parameter
    n_rows, n_cols = img.shape[0], img.shape[1]

    filter_bank = gbb().create_a_set_of_gabor_filters(
        max_frequency, frequency_ratio, crossing_point,
        n_frequencies, n_orientations, n_rows, n_cols, gamma, eta)

    # Filter the image with the whole bank
    responses = gbfrb().create_a_set_of_Gabor_filtered_responses(img, filter_bank)

    # Responses gathered in one array, indexed by response along the last axis
    response_stack = gbfrb().convert_a_set_Gabor_filtered_responses_to_ndarray(responses)

    n_responses = response_stack.shape[2]
    features = np.zeros(n_responses * 2)
    for pos in range(n_responses):
        magnitude = np.abs(response_stack[:, :, pos])
        features[2 * pos] = np.mean(magnitude)
        features[2 * pos + 1] = np.std(magnitude)

    return features

In [None]:
filepaths = dfFoldTraining_1['filenames']

n_files = len(filepaths)

batch_size = 32

# Leave ten cores free but never request fewer than one worker: mp.Pool
# raises ValueError when `processes` is below 1.
num_processes = max(1, mp.cpu_count() - 10) # number of workers

# imap hands tasks to the workers in groups of `chunk_size`.  A chunk as large
# as the batch sends the whole batch to a single worker, so split each batch
# evenly across the workers instead.
chunk_size = max(1, batch_size // num_processes)

results = []

# NOTE(review): mp.Pool pickles the worker function by reference; with a
# "spawn" start method, functions defined in notebook cells may not be
# importable by the workers. Confirm on the target platform.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar:

    with mp.Pool(processes=num_processes) as pool:

        for batch in chunked(image_generator(filepaths), batch_size):

            batch_results = list(pool.imap(extract_texture_features, batch, chunksize=chunk_size))

            results.extend(batch_results)

            del batch_results

            gc.collect()

            pbar.update(len(batch))

        # pool.close()
        # pool.join()

# 3. Test with joblib

In [6]:
import multiprocessing as mp
from more_itertools import chunked

from skimage.filters import threshold_niblack
from skimage.morphology import convex_hull_image
from skimage.measure import find_contours

def crop_image(image):
    """Crop a tall image to the bounding box of its main object.

    Images with fewer than 2000 rows are returned unchanged.  Taller images
    are binarised with a Niblack adaptive threshold, the convex hull of the
    foreground is computed and the image is cropped to the bounding box of
    the hull's longest contour, enlarged by a 10-pixel margin and clipped to
    the image borders.

    Parameters
    ----------
    image : 2-D array indexed as ``image[row, col]``.

    Returns
    -------
    ``image`` itself when it is shorter than 2000 rows or when no contour
    could be found; otherwise the cropped slice of ``image``.
    """
    min_height_to_crop = 2000
    hull_pad = 3   # width of the background border added before the hull
    margin = 10    # extra pixels kept around the bounding box

    if image.shape[0] < min_height_to_crop:
        return image

    # adaptive thresholding
    foreground = image > threshold_niblack(image, window_size=25, k=0.8)

    # convex hull of the foreground, computed on a background-padded copy
    hull = convex_hull_image(
        np.pad(foreground, hull_pad, 'constant', constant_values=0))

    contours = find_contours(hull, 0.5)
    if not contours:
        # Nothing was segmented: there is no box to crop to, keep the image.
        return image

    # The longest contour is assumed to outline the main object.
    main_contour = max(contours, key=len)

    # Contour coordinates refer to the padded array; shift them back into the
    # coordinate frame of `image` before using them as slice bounds.
    top, left = np.min(main_contour, axis=0) - hull_pad
    bottom, right = np.max(main_contour, axis=0) - hull_pad

    row_start = int(max(top - margin, 0))
    col_start = int(max(left - margin, 0))
    row_stop = int(min(bottom + margin, image.shape[0]))
    col_stop = int(min(right + margin, image.shape[1]))

    return image[row_start:row_stop, col_start:col_stop]

def image_generator(filepaths):
    """Lazily yield every path in ``filepaths`` read with ``io.imread(..., as_gray=True)``."""
    yield from (io.imread(path, as_gray=True) for path in filepaths)

def extract_texture_features(image):
    """Compute Gabor texture features for one image.

    The image goes through ``crop_image``, a Gabor filter bank sized to the
    cropped image is created, the image is filtered with it, and the mean and
    standard deviation of each response magnitude are collected.

    Returns a 1-D float array laid out as ``[mean_0, std_0, mean_1, std_1, ...]``
    (two values per filter response).
    """
    img = crop_image(image)

    # Gabor filter-bank settings
    max_frequency = 0.327          # maximum frequency
    frequency_ratio = np.sqrt(2)   # factor for selecting filter frequencies
    crossing_point = 0.5           # crossing point between two consecutive filters
    n_frequencies = 6              # number of frequencies
    n_orientations = 8             # number of orientations
    gamma = 0.5                    # smoothing parameter
    eta = 0.5                      # smoothing parameter
    n_rows, n_cols = img.shape[0], img.shape[1]

    filter_bank = gbb().create_a_set_of_gabor_filters(
        max_frequency, frequency_ratio, crossing_point,
        n_frequencies, n_orientations, n_rows, n_cols, gamma, eta)

    # Filter the image with the whole bank
    responses = gbfrb().create_a_set_of_Gabor_filtered_responses(img, filter_bank)

    # Responses gathered in one array, indexed by response along the last axis
    response_stack = gbfrb().convert_a_set_Gabor_filtered_responses_to_ndarray(responses)

    n_responses = response_stack.shape[2]
    features = np.zeros(n_responses * 2)
    for pos in range(n_responses):
        magnitude = np.abs(response_stack[:, :, pos])
        features[2 * pos] = np.mean(magnitude)
        features[2 * pos + 1] = np.std(magnitude)

    # Release the large intermediates explicitly before returning.
    del response_stack, responses, filter_bank
    gc.collect()

    return features


In [7]:
filepaths = dfFoldTraining_1['filenames']
n_files = len(filepaths)

batch_size = 32      # images read and dispatched per round
num_processes = 4    # joblib workers

results = []

i = 0
# Read the images batch by batch, announce each batch and reuse one worker pool.
with tqdm(total=n_files, desc="Extract Gabor features") as pbar, \
        Parallel(n_jobs=num_processes) as parallel:
    for i, image_batch in enumerate(chunked(image_generator(filepaths), batch_size), start=1):
        print('Working with batch: ',i)
        results += parallel(
            delayed(extract_texture_features)(image) for image in image_batch)
        gc.collect()
        pbar.update(len(image_batch))


Extract Gabor features:   0%|          | 0/44099 [00:00<?, ?it/s]

Working with batch:  1
Working with batch:  2
Working with batch:  3
Working with batch:  4
Working with batch:  5
Working with batch:  6
Working with batch:  7
Working with batch:  8
Working with batch:  9
Working with batch:  10
Working with batch:  11
Working with batch:  12
Working with batch:  13
Working with batch:  14
Working with batch:  15
Working with batch:  16
Working with batch:  17
Working with batch:  18
Working with batch:  19
Working with batch:  20
Working with batch:  21
Working with batch:  22
Working with batch:  23
Working with batch:  24
Working with batch:  25
Working with batch:  26
Working with batch:  27
Working with batch:  28
Working with batch:  29
Working with batch:  30
Working with batch:  31
Working with batch:  32
Working with batch:  33
Working with batch:  34
Working with batch:  35
Working with batch:  36
Working with batch:  37
Working with batch:  38
Working with batch:  39
Working with batch:  40
Working with batch:  41
Working with batch:  42
W

In [10]:
%store results

Stored 'results' (list)


In [None]:
pip install tables


(I'd rather comment than offer this as an actual answer, but I need more reputation to comment.)

You can store most data-like variables in a systematic way. What I usually do is store all dataframes, arrays, etc. in pandas.HDFStore. At the beginning of the notebook, declare

backup = pd.HDFStore('backup.h5')
and then store any new variables as you produce them

backup['var1'] = var1
At the end, probably a good idea to do

backup.close()
before turning off the server. The next time you want to continue with the notebook:

backup = pd.HDFStore('backup.h5')
var1 = backup['var1']
Truth be told, I'd prefer built-in functionality in the IPython notebook, too. You can't save everything this way (e.g. objects, connections), and it's hard to keep the notebook organized with so much boilerplate code.