<h1 style="color: #00BFFF;">00 | Setup</h1>

In [1]:
# 📚 Basic libraries
import os # file managment
import numpy as np # image array manipulation
import pandas as pd # data manipulation
import matplotlib.pyplot as plt #  plots and visualizations

# 🛠️ Tools
import warnings # who likes warnings?
import shutil # High-level file operations
import random # to generate random samples

# 🌐 Computer Vision - Data Augmentation
import cv2 # computer vision library, in this case to check white values
from tensorflow.keras.preprocessing.image import ImageDataGenerator # real-time data augmentation
from tensorflow.keras.utils import img_to_array, array_to_img, load_img # saving augmented Data

In [2]:
# ⚙️ Settings
warnings.filterwarnings('ignore') # ignore warnings (note: this silences every warning category)

# Fixed seeds so the shuffled test/train/validation splits and the random
# augmentation parameters can be reproduced on a fresh kernel.
SEED = 42
random.seed(SEED) # drives the random.shuffle calls used for the splits
np.random.seed(SEED) # NumPy global RNG — presumably what ImageDataGenerator samples from; TODO confirm

In [29]:
# 🎯 Specific functions
def move_files(original_dir, target_dir, ratio): # moving unique files to test folder. just run it once
    """Move a random share of the images in `original_dir` into `target_dir`.

    Args:
        original_dir: Folder whose '.png' / '.jpg' files are the candidates.
        target_dir: Destination folder; it is created when it does not exist yet.
        ratio: Fraction of the candidate files to move (truncated to whole files).

    Moving is destructive: every call takes another `ratio` share of whatever is
    still left in `original_dir`, so call it once per folder.
    """
    # shutil.move renames the source to `target_dir` itself when that path does not
    # exist, which would turn an image into a file named like the folder.
    os.makedirs(target_dir, exist_ok=True)

    # Sorted first: os.listdir order is arbitrary, so a seeded shuffle is only
    # repeatable when it starts from a stable order.
    filenames = sorted(
        os.path.join(original_dir, f)
        for f in os.listdir(original_dir)
        if f.endswith(('.png', '.jpg'))
    )
    random.shuffle(filenames) # Shuffle the file names
    num_files_to_move = int(len(filenames) * ratio) # number of files to move

    for filename in filenames[:num_files_to_move]: # Move the files
        shutil.move(filename, target_dir)
        
def augment_images(data_dir, datagen, num_images=2):
    """Generate `num_images` augmented copies of every '.jpg' in `data_dir`.

    Args:
        data_dir: Folder that is scanned for source images and also receives the output.
        datagen: Object exposing a Keras-style `flow(...)` (an ImageDataGenerator here).
        num_images: Augmented copies to generate per source image; values <= 0
            generate nothing.

    The output is written by `datagen.flow` into `data_dir` as PNG files prefixed
    with 'aug'. Only '.jpg' files are picked up as sources, so those PNG outputs
    are not used as inputs when the function runs again.
    """
    if num_images <= 0: # nothing requested: the flow must not be advanced at all
        return

    # os.listdir is evaluated once, so files written during the loop are not revisited
    for filename in os.listdir(data_dir):
        if not filename.endswith(".jpg"):
            continue

        img_path = os.path.join(data_dir, filename)
        img_array = img_to_array(load_img(img_path)) # converting the image to numpy array
        img_array = img_array.reshape((1,) + img_array.shape) # prepend a batch axis of size 1

        # Generate new images: each step of the flow yields (and saves) one batch of one image
        flow = datagen.flow(img_array, batch_size=1, save_to_dir=data_dir, save_prefix='aug', save_format='png')
        for _ in range(num_images):
            next(flow)

<h1 style="color: #00BFFF;">01 | Data Extraction</h1>

In [4]:
# Project root: defaults to the original local checkout, but can be redirected with the
# CELLS_AT_WORK_ROOT environment variable so the notebook also runs on other machines.
project_root = os.environ.get(
    'CELLS_AT_WORK_ROOT',
    'C:\\Users\\apisi\\01. IronData\\01. GitHub\\03. Projects\\08_cells_at_work',
)
data_path = os.path.join(project_root, '01_data')

# 'efussions' (sic) — the string must match the folder name on disk
effusions = os.path.join(data_path, '04_efussions_wellgen')

<h1 style="color: #00BFFF;">02 | Data Cleaning</h1>

<h3 style="color: #008080;">Train, Validation & Test directories</h3>

In [5]:
# One folder per split, all created inside `effusions`
train_dir, val_dir, test_dir = (
    os.path.join(effusions, split_name)
    for split_name in ("01_train", "02_validation", "03_test")
)

# exist_ok=True: folders left over from an earlier run are kept as they are
for _split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(_split_dir, exist_ok=True)

<h3 style="color: #008080;">Diagnoses (labels) directories for Train, Validation & Test</h3>

In [6]:
# Every split gets one folder per diagnosis label
_labels = ("Positive", "Negative")

train_positive, train_negative = (os.path.join(train_dir, label) for label in _labels)
val_positive, val_negative = (os.path.join(val_dir, label) for label in _labels)
test_positive, test_negative = (os.path.join(test_dir, label) for label in _labels)

# Create them all; exist_ok=True leaves already existing folders untouched
for _label_dir in (
    train_positive, train_negative,
    val_positive, val_negative,
    test_positive, test_negative,
):
    os.makedirs(_label_dir, exist_ok=True)

Now we have three directories: Training, Validation and Testing.
Each directory contains one empty folder per diagnosis (Positive and Negative).
* To get a larger representation of the data, we will augment it synthetically using ImageDataGenerator from Keras.
* First, we will perform some quick EDA, since not all images are suitable for the model (some contain too much white background and not enough cell information).

<h1 style="color: #00BFFF;">03 | EDA</h1>

In [8]:
# Destination for images that turn out to be mostly white background
white_background_dir = os.path.join(effusions, "white_background")
os.makedirs(name=white_background_dir, exist_ok=True) # no error if it is already there

In [9]:
# Mirror the Positive / Negative class folders inside white_background_dir
positive_white_background_dir, negative_white_background_dir = (
    os.path.join(white_background_dir, label) for label in ("Positive", "Negative")
)
for _white_subdir in (positive_white_background_dir, negative_white_background_dir):
    os.makedirs(_white_subdir, exist_ok=True)

In [10]:
# Source folders holding the images of each class
positive, negative = (
    os.path.join(effusions, label) for label in ('Positive', 'Negative')
)

In [21]:
# (class folder, matching white-background folder) pairs
directories = list(zip(
    (positive, negative),
    (positive_white_background_dir, negative_white_background_dir),
))
threshold = 0.95  # share of near-white pixels above which an image is moved out

In [22]:
# Move every image that is almost entirely white background out of its class folder.
for directory, white_background_subdir in directories:
    for filename in os.listdir(directory):
        if not filename.endswith(('.jpg', '.png')):
            continue

        img_path = os.path.join(directory, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) # same as flag 0: load as grayscale
        if img is None: # cv2.imread returns None (no exception) when the file cannot be decoded
            print(f"Skipping unreadable image: {img_path}")
            continue

        white_pixels = np.count_nonzero(img > 190)  # count of white or near-white pixels
        # img.size replaces np.product(img.shape), which NumPy 2.0 no longer provides
        total_pixels = img.size  # total count of pixels

        if white_pixels / total_pixels > threshold:  # Proportion of white or near-white pixels
            shutil.move(img_path, os.path.join(white_background_subdir, filename))

<h3 style="color: #008080;">Unique cytology images for Test</h3>

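Run this cell only once: it moves 15% of the images out of each class folder into the test set, so every additional run moves another 15% of the remaining images.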

In [23]:
# Hold out 15% of each class as the test set (the files are moved, not copied).
classes = ['Positive', 'Negative']

for class_name in classes:
    original = os.path.join(effusions, class_name)
    test_dir_move = os.path.join(test_dir, class_name)

    # Create the class folder itself, not just its parent: the shutil.move call inside
    # move_files needs the destination to already exist as a directory.
    os.makedirs(test_dir_move, exist_ok=True)

    move_files(original, test_dir_move, ratio=0.15)

<h3 style="color: #008080;">Training and Validation split</h3>

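Run this cell only once: it copies a freshly shuffled 70/30 split, so re-running it can leave the same image in both the training and the validation folders.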

In [24]:
# Specify the classes and corresponding directories
classes = ['Positive', 'Negative']

# Mapping training and validation for each class
train_dirs = [train_positive, train_negative]
val_dirs = [val_positive, val_negative]

train_share = 0.7 # fraction of each class that goes to training; the rest goes to validation

# The loop variables are deliberately NOT called train_dir / val_dir: those names hold
# the split root folders, and overwriting them breaks any cell that is re-run afterwards.
for class_name, class_train_dir, class_val_dir in zip(classes, train_dirs, val_dirs):
    original_dir = os.path.join(effusions, class_name)

    # Regular files only (shutil.copyfile cannot copy a sub-folder), in a stable order
    # so that a seeded shuffle gives the same split every time.
    all_files = sorted(
        entry for entry in os.listdir(original_dir)
        if os.path.isfile(os.path.join(original_dir, entry))
    )
    random.shuffle(all_files) # shuffling files
    split_idx = int(len(all_files) * train_share) # index to split training-validation

    # training-validation split
    train_images = all_files[:split_idx]
    val_images = all_files[split_idx:]

    # copying images to their new training / validation directory
    for images, dest_dir in ((train_images, class_train_dir), (val_images, class_val_dir)):
        for image_file in images:
            source = os.path.join(original_dir, image_file)
            dest = os.path.join(dest_dir, image_file)
            shutil.copyfile(source, dest)

<h1 style="color: #00BFFF;">04 | Data Pre-Processing</h1>

<h3 style="color: #008080;">Performing Data Augmentation</h3>

In [25]:
# Augmentation parameters, gathered in one dict so each choice sits next to its rationale
_augmentation_params = {
    "rescale": 1.0/255,              # multiplies pixel values by 1/255
    "rotation_range": 30,            # cells can appear at any orientation
    "width_shift_range": .15,        # cells can be located anywhere horizontally
    "height_shift_range": .15,       # ... and anywhere vertically
    "horizontal_flip": True,         # cell orientation doesn't matter
    "vertical_flip": True,           # cell orientation doesn't matter
    "brightness_range": [0.5, 1.5],  # simulates variable lighting/staining
    "zoom_range": 0.2,               # simulates variable cell sizes/distances
}
datagen = ImageDataGenerator(**_augmentation_params)

In [30]:
# Augment every class folder of the training and validation splits in place.
# The loop variables are not named train_dir / val_dir so the split root folders
# stored under those names are not overwritten.
for class_train_dir, class_val_dir in zip(train_dirs, val_dirs):
    augment_images(class_train_dir, datagen, num_images=2)
    augment_images(class_val_dir, datagen, num_images=2)