In [1]:
import os
import shutil

# Dataset root, the raw per-fold class folders, and the merged per-class targets
base_path = "C-NMC_Leukemia"
training_data_path = os.path.join(base_path, "training_data")

# class sub-folders, relative to training_data_path
all_folders = [
    "fold_0/all",
    "fold_1/all",
    "fold_2/all",
]
hem_folders = [
    "fold_0/hem",
    "fold_1/hem",
    "fold_2/hem",
]

# directories that will receive the images of each class from all folds
new_all_path, new_hem_path = (
    os.path.join(training_data_path, merged_name)
    for merged_name in ("new_all", "new_hem")
)

In [2]:
# make sure both merged class directories exist (no error if they already do)
for merged_dir in (new_all_path, new_hem_path):
    os.makedirs(merged_dir, exist_ok=True)

In [3]:
# move every file from the per-fold 'all' folders into the 'new_all' directory
for folder_path in all_folders:
    folder = os.path.join(training_data_path, folder_path)
    # A fold that has already been emptied and deleted (e.g. when the notebook
    # is re-run) has nothing left to move; skip it instead of letting
    # os.listdir raise FileNotFoundError.
    if not os.path.isdir(folder):
        continue
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        shutil.move(file_path, new_all_path)

print("Image files moved successfully.")

Image files moved successfully.


In [4]:
# move every file from the per-fold 'hem' folders into the 'new_hem' directory
for folder_path in hem_folders:
    folder = os.path.join(training_data_path, folder_path)
    # A fold that has already been emptied and deleted (e.g. when the notebook
    # is re-run) has nothing left to move; skip it instead of letting
    # os.listdir raise FileNotFoundError.
    if not os.path.isdir(folder):
        continue
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        shutil.move(file_path, new_hem_path)

print("Image files moved successfully.")

Image files moved successfully.


In [5]:
# remove the fold directories once their images have been moved out
# NOTE: rmtree deletes each fold directory together with anything still inside it
for fold_name in ("fold_0", "fold_1", "fold_2"):
    fold_dir = os.path.join(training_data_path, fold_name)
    # skip folds that are already gone so the cell can be re-run safely
    if os.path.isdir(fold_dir):
        shutil.rmtree(fold_dir)

# Load images into arrays

In [6]:
# read all images from disk and store them in an array
import numpy as np
from skimage import io
from PIL import Image

# Load every .bmp file in new_all_path as a 150x150 RGB array.
# The listing is sorted because os.listdir() returns names in arbitrary order,
# which would make the seeded train/test split performed later irreproducible.
all_in_list = []
all_images_paths = sorted(os.listdir(new_all_path))
for image_name in all_images_paths:
    # splitext copes with names that have no dot or several dots, where
    # split('.')[1] would raise IndexError or look at the wrong component;
    # the comparison is case-insensitive so '.BMP' files are picked up as well
    if os.path.splitext(image_name)[1].lower() != '.bmp':
        continue
    image = io.imread(os.path.join(new_all_path, image_name))
    image = Image.fromarray(image, 'RGB')
    image = image.resize((150, 150))  # 150x150
    all_in_list.append(np.array(image))
all_in_array = np.array(all_in_list)

# the list is no longer needed once the stacked array exists
del all_in_list

In [7]:
# Load every .bmp file in new_hem_path as a 150x150 RGB array.
# The listing is sorted because os.listdir() returns names in arbitrary order,
# which would make the seeded train/test split performed later irreproducible.
hem_in_list = []
hem_images_paths = sorted(os.listdir(new_hem_path))
for image_name in hem_images_paths:
    # splitext copes with names that have no dot or several dots, where
    # split('.')[1] would raise IndexError or look at the wrong component;
    # the comparison is case-insensitive so '.BMP' files are picked up as well
    if os.path.splitext(image_name)[1].lower() != '.bmp':
        continue
    image = io.imread(os.path.join(new_hem_path, image_name))
    image = Image.fromarray(image, 'RGB')
    image = image.resize((150, 150))  # 150x150
    hem_in_list.append(np.array(image))
hem_in_array = np.array(hem_in_list)

# the list is no longer needed once the stacked array exists
del hem_in_list

# Crop all the images

In [8]:
from skimage.transform import resize
import cv2 as cv

def crop_imgs(images_array):
    """Crop every image to its thresholded foreground and resize it to 125x125.

    For each image an Otsu threshold is applied to a grayscale copy, the
    bounding box of the above-threshold pixels is located (ignoring the bottom
    three rows), and the image is cropped to that box before being resized.
    An image without any above-threshold pixel is resized uncropped.

    Parameters
    ----------
    images_array : iterable of numpy.ndarray
        Images to crop. Assumed to be 8-bit, 3-channel RGB arrays, which is
        what the loading cells of this notebook produce -- TODO confirm for
        any other caller.

    Returns
    -------
    numpy.ndarray
        The resized crops, stacked in input order.
    """
    cropped_images_in_list = []
    for each_image in images_array:
        # the arrays come from PIL in RGB channel order, so use the RGB
        # conversion code (the BGR code swaps the red/blue luminance weights)
        gray = cv.cvtColor(each_image, cv.COLOR_RGB2GRAY)
        thresh = cv.threshold(gray, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
        hh, ww = thresh.shape
        # blank the bottom 3 rows so they never contribute to the bounding box
        thresh[hh-3:hh, 0:ww] = 0
        white = np.where(thresh == 255)  # (row indices, column indices) of foreground pixels
        if white[0].size == 0:
            # nothing above the threshold: np.min/np.max would raise on the
            # empty index arrays, so keep the whole image
            crop = each_image
        else:
            xmin, xmax = np.min(white[1]), np.max(white[1])
            ymin, ymax = np.min(white[0]), np.max(white[0])
            # slice ends are exclusive: xmax+1 keeps the last foreground column
            # (xmin:xmax dropped it and gave an empty crop when xmin == xmax);
            # ymax+3 keeps row ymax plus the two rows below it
            crop = each_image[ymin:ymax+3, xmin:xmax+1]
        resized_img = resize(crop, (125, 125), anti_aliasing=True)
        cropped_images_in_list.append(resized_img)

    cropped_images_in_array = np.array(cropped_images_in_list)
    del cropped_images_in_list
    return cropped_images_in_array

In [9]:
# crop and resize both classes with the same routine ('all' first, then 'hem')
cropped_all_in_array, cropped_hem_in_array = (
    crop_imgs(class_images) for class_images in (all_in_array, hem_in_array)
)

In [10]:
# the uncropped 150x150 arrays are not used again; drop the references
del all_in_array, hem_in_array

# Preparing Images for Augmentation

## Split the image arrays into training and testing splits

In [12]:
from sklearn.model_selection import train_test_split

# hold out 10% of the hem crops and 20% of the all crops for testing (fixed seed)
x_train_hem, x_test_hem = train_test_split(
    cropped_hem_in_array,
    random_state=42,
    test_size=0.1,
)
x_train_all, x_test_all = train_test_split(
    cropped_all_in_array,
    random_state=42,
    test_size=0.2,
)

In [14]:
# the unsplit crop arrays are not used again; drop the references
del cropped_hem_in_array, cropped_all_in_array

## Perform augmentation on training hem images

In [15]:
# create an ImageDataGenerator object for data augmentation
from keras.preprocessing.image import ImageDataGenerator

# Augmentation recipe: random rotation, shifts, zoom, shear and flips on both axes;
# areas exposed by a transform are filled using the 'reflect' mode
datagen = ImageDataGenerator(
    fill_mode='reflect',
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=35,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    shear_range=0.1,
)

In [16]:
# generate one augmented copy of every training hem image and store them in a numpy array
# random_transform draws its parameters from NumPy's global random state, so
# seed it (same seed as the split and shuffle) to make the augmented set reproducible
np.random.seed(42)
augmented_x_train_hem = np.array([datagen.random_transform(img) for img in x_train_hem])

In [17]:
# append the augmented copies after the original hem training images
# NOTE: this rebinds x_train_hem, so running the cell twice appends them twice
x_train_hem = np.concatenate([x_train_hem, augmented_x_train_hem])

In [18]:
# the augmented copies now live inside x_train_hem; drop the separate reference
del augmented_x_train_hem

In [19]:
# per-class training set sizes after augmenting the hem images: (hem, all)
len(x_train_hem), len(x_train_all)

(6100, 5817)

## Generate labels for images

In [20]:
# training labels: hem images get 0, all images get 1
y_train_hem = np.zeros(len(x_train_hem), dtype=int)
y_train_all = np.ones(len(x_train_all), dtype=int)

In [21]:
# test labels follow the same encoding: hem images get 0, all images get 1
y_test_hem = np.zeros(len(x_test_hem), dtype=int)
y_test_all = np.ones(len(x_test_all), dtype=int)

# Preparing data for model

In [22]:
# show the per-class training counts (hem, all) once more before the classes are merged
len(x_train_hem), len(x_train_all)

(6100, 5817)

In [23]:
# stack hem first, then all, for both images and labels so the two stay aligned
x_train = np.concatenate([x_train_hem, x_train_all])
y_train = np.concatenate([y_train_hem, y_train_all])

In [24]:
# the per-class training pieces are now part of x_train / y_train
del x_train_hem, x_train_all, y_train_hem, y_train_all

In [25]:
# stack hem first, then all, for both images and labels so the two stay aligned
x_test = np.concatenate([x_test_hem, x_test_all])
y_test = np.concatenate([y_test_hem, y_test_all])

In [26]:
# the per-class test pieces are now part of x_test / y_test
del x_test_hem, x_test_all, y_test_hem, y_test_all

In [27]:
# sanity check of the class boundary before shuffling: the labels are still
# ordered (hem zeros first, then ones), so the last hem label and the first
# all label should display as (0, 1).
# The boundary is derived from the labels instead of hard-coding 6099/6100,
# which only holds for one particular dataset size.
hem_count = int(np.count_nonzero(y_train == 0))
y_train[hem_count - 1], y_train[hem_count]

(0, 1)

In [44]:
from sklearn.utils import shuffle

# shuffle images and labels with one call so each image keeps its label;
# random_state fixes the permutation
# NOTE: the inputs are rebound, so re-running this cell permutes the data again
x_train, y_train = shuffle(x_train, y_train, random_state=42)

## Making sure all the images are scaled

In [33]:
# stop at the first training image whose maximum exceeds 1
# (only the upper bound is inspected; negative values would go unnoticed)
if any(np.max(train_image) > 1 for train_image in x_train):
    print("Found an image with pixel values greater than 1.")
else:
    print("All images have pixel values between 0 and 1.")

All images have pixel values between 0 and 1.


In [34]:
# stop at the first test image whose maximum exceeds 1
# (only the upper bound is inspected; negative values would go unnoticed)
if any(np.max(test_image) > 1 for test_image in x_test):
    print("Found an image with pixel values greater than 1.")
else:
    print("All images have pixel values between 0 and 1.")

All images have pixel values between 0 and 1.


In [45]:
# shape of a single training image; this is the input shape the VGG16 model is built with
x_train[0].shape

(125, 125, 3)

# Transfer Learning for Feature Extraction

In [46]:
from tensorflow.keras.applications.vgg16 import VGG16

# VGG16 with ImageNet weights and without its top classifier layers,
# built for the 125x125 3-channel crops
VGG_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(125, 125, 3),
)

In [47]:
# mark every VGG16 layer as non-trainable, then print the architecture
for vgg_layer in VGG_model.layers:
    vgg_layer.trainable = False

VGG_model.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 125, 125, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 125, 125, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 125, 125, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 62, 62, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 62, 62, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 62, 62, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 31, 31, 128)       0     

In [48]:
# Run the VGG16 base on each split and flatten every output to one feature row per image.
# NOTE(review): the images are passed as-is (the scaling checks report values
# no greater than 1), without vgg16.preprocess_input -- confirm this is intended.
x_train_feats = VGG_model.predict(x_train)
x_train_features = x_train_feats.reshape(len(x_train_feats), -1)

x_test_feats = VGG_model.predict(x_test)
x_test_features = x_test_feats.reshape(len(x_test_feats), -1)



# Save extracted features in pandas dataframe

In [49]:
import pandas as pd

# wrap each flattened feature matrix in a DataFrame (train first, then test)
train_df, test_df = (
    pd.DataFrame(feature_matrix)
    for feature_matrix in (x_train_features, x_test_features)
)

In [51]:
# store the class labels in a 'label' column on both frames
for frame, labels in ((train_df, y_train), (test_df, y_test)):
    frame['label'] = labels

# Save dataframes to disk

In [55]:
import pickle

# write each DataFrame to its own pickle file in the working directory
for pickle_name, frame in (('train_data.pickle', train_df), ('test_data.pickle', test_df)):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(frame, handle)

In [60]:
# class balance of the training set (hem = 0, all = 1)
train_df['label'].value_counts()

0    6100
1    5817
Name: label, dtype: int64

In [61]:
# class balance of the test set (hem = 0, all = 1)
test_df['label'].value_counts()

1    1455
0     339
Name: label, dtype: int64