# Import libraries

In [1]:
# general libraries
import numpy as np
from collections import Counter

from tqdm import tqdm # to check time progress
import os # to connect

# to check duplicates
from PIL import Image
!pip install imagehash
import imagehash

import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.image as mpimg # read images as array

from scipy import ndimage

!pip install imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re

# Data Loading

In [2]:
# Folder that holds the cropped cell images (relative path)
folder = "filtered_cropped_images_saved/"

# Collect the names of all regular files in the folder (sub-directories are skipped).
# os.listdir returns entries in arbitrary order, so the result is sorted to get
# the same file order on every run and every machine.
filelist = sorted(f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f)))

In [3]:
# Report how many files were found in the image folder
num_files = len(filelist)
print(num_files)

7451


In [4]:
# Build parallel arrays of filenames and binary class labels.
# The cell type is the text between the first underscore of the filename and the next one.
CELL_TYPE_TO_CLASS = {
    'gametocyte': 'infected',
    'ring': 'infected',
    'schizont': 'infected',
    'trophozoite': 'infected',
    'red blood cell': 'uninfected',
}

X_1 = []
y_1 = []

for file in tqdm(filelist):
    match = re.search(r'_(.*?)_', file)
    if match is None:
        # no label in the filename (e.g. a stray ".DS_Store") -> skip this file
        continue

    label = CELL_TYPE_TO_CLASS.get(match.group(1))
    if label is None:
        # cell type is not one of the known classes -> skip this file
        continue

    X_1.append(file)
    y_1.append(label)

# Convert to numpy arrays so they can be indexed with index arrays later on
X_1 = np.array(X_1)
y_1 = np.array(y_1)

100%|██████████| 7451/7451 [00:00<00:00, 608188.36it/s]


In [5]:
# Sanity check: both arrays should have the same length; also show the class balance
label_counts = sorted(Counter(y_1).items())
print("Number of Images in Array: ", len(X_1), len(y_1))
print("Number of images per label: " , label_counts)

Number of Images in Array:  7451 7451
Number of images per label:  [('infected', 2451), ('uninfected', 5000)]


# Pre-processing

Find and eliminate Duplicates

In [6]:
# Function to identify and remove duplicates
def remove_duplicates(directory, file_names, label_names, hash_size=128):
    """Drop images whose average hash equals that of an earlier image.

    Two images count as duplicates only when their hashes are exactly equal.
    The first file seen for a given hash is kept; later ones are reported.

    Parameters
    ----------
    directory : str
        Folder that contains the image files.
    file_names : sequence of str
        Image file names relative to ``directory``.
    label_names : sequence
        Labels aligned with ``file_names`` (same length, same order).
    hash_size : int, default 128
        Value forwarded as ``hash_size`` to ``imagehash.average_hash``.

    Returns
    -------
    updated_file_names : np.ndarray
        Names of the files that were kept, in input order.
    updated_label_names : np.ndarray
        Labels belonging to the kept files.
    duplicates : list of tuple
        ``(duplicate_path, path_of_first_image_with_same_hash)`` pairs.

    Raises
    ------
    ValueError
        If ``file_names`` and ``label_names`` differ in length.

    Notes
    -----
    Paths that do not exist are skipped (neither kept nor reported as
    duplicates); only their count is printed.
    """
    # Fail fast: a length mismatch would otherwise crash mid-run or silently
    # ignore the surplus labels.
    if len(file_names) != len(label_names):
        raise ValueError(
            "file_names and label_names must have the same length, "
            f"got {len(file_names)} and {len(label_names)}"
        )

    image_hashes = {}  # hash -> path of the first image that produced it
    duplicates = []
    updated_file_names = []
    updated_label_names = []
    num_non_existing = 0

    for filename, label in tqdm(zip(file_names, label_names), total=len(file_names)):
        image_path = os.path.join(directory, filename)
        if not os.path.exists(image_path):
            num_non_existing += 1
            continue

        # the context manager closes the image file as soon as the hash is computed
        with Image.open(image_path) as img:
            img_hash = imagehash.average_hash(img, hash_size=hash_size)

        if img_hash in image_hashes:
            duplicates.append((image_path, image_hashes[img_hash]))
        else:
            image_hashes[img_hash] = image_path
            updated_file_names.append(filename)
            updated_label_names.append(label)

    print("Number of non-existing image paths:", num_non_existing)
    return np.array(updated_file_names), np.array(updated_label_names), duplicates

In [7]:
# Deduplicate the complete image list (this runs on all files, before any splitting)
X_cleaned1, y_cleaned1, duplicates = remove_duplicates(
    directory=folder,
    file_names=X_1,
    label_names=y_1,
)

# Report how many images remain and the class balance after deduplication
cleaned_label_counts = sorted(Counter(y_cleaned1).items())
print(len(X_cleaned1), len(y_cleaned1))
print(cleaned_label_counts)

100%|██████████| 7451/7451 [00:33<00:00, 224.85it/s]

Number of non-existing image paths: 0
7451 7451
[('infected', 2451), ('uninfected', 5000)]





In [8]:
# Report the number of duplicate images that were detected
num_duplicates = len(duplicates)
print("Number of duplicates: ", num_duplicates)

Number of duplicates:  0


Create binary labels

In [9]:
# Encode the string labels as integers.
# LabelEncoder numbers the classes in sorted order, so the result is
# 0 = infected, 1 = uninfected (see the counts printed below).
le = LabelEncoder()
le.fit(y_cleaned1)
y_cleaned1 = le.transform(y_cleaned1)

encoded_label_counts = sorted(Counter(y_cleaned1).items())
print(len(y_cleaned1))
print(encoded_label_counts)

7451
[(0, 2451), (1, 5000)]


Shuffle Data

In [10]:
# Shuffle filenames and labels together (both arrays are permuted consistently,
# so every filename keeps its label). Nothing is written to disk here.
# A fixed random_state gives the same permutation on every run for the same input order,
# matching the seeded train_test_split calls used later.
X_cleaned, y_cleaned = shuffle(X_cleaned1, y_cleaned1, random_state=42)

Create Test, Training and Validation Set

In [11]:
# Positions of each class in the shuffled arrays (encoded labels: 0 = infected, 1 = uninfected)
infected_indices = np.nonzero(y_cleaned == 0)[0]
uninfected_indices = np.nonzero(y_cleaned == 1)[0]

# Per-class subsets of filenames and labels
X_infected, y_infected = X_cleaned[infected_indices], y_cleaned[infected_indices]
X_uninfected, y_uninfected = X_cleaned[uninfected_indices], y_cleaned[uninfected_indices]


def _split_train_val_test(X, y):
    """Split X/y into train, validation and test parts with a fixed seed.

    40% is held out first, then that hold-out is halved into validation and
    test. Returns ``(X_train, X_val, X_test), (y_train, y_val, y_test)``.
    """
    X_tr, X_hold, y_tr, y_hold = train_test_split(X, y, test_size=0.4, random_state=42)
    X_va, X_te, y_va, y_te = train_test_split(X_hold, y_hold, test_size=0.5, random_state=42)
    return (X_tr, X_va, X_te), (y_tr, y_va, y_te)


# Split each class separately so every set keeps the overall class ratio
(
    (X_train_infected, X_val_infected, X_test_infected),
    (y_train_infected, y_val_infected, y_test_infected),
) = _split_train_val_test(X_infected, y_infected)

(
    (X_train_uninfected, X_val_uninfected, X_test_uninfected),
    (y_train_uninfected, y_val_uninfected, y_test_uninfected),
) = _split_train_val_test(X_uninfected, y_uninfected)

# Recombine the two classes per set (infected first, then uninfected)
X_train = np.concatenate([X_train_infected, X_train_uninfected])
y_train = np.concatenate([y_train_infected, y_train_uninfected])

X_val = np.concatenate([X_val_infected, X_val_uninfected])
y_val = np.concatenate([y_val_infected, y_val_uninfected])

X_test = np.concatenate([X_test_infected, X_test_uninfected])
y_test = np.concatenate([y_test_infected, y_test_uninfected])

In [12]:
# Shuffle each set so the two classes are interleaved again
# (after concatenation all infected samples come before all uninfected ones).
# A fixed random_state keeps the order of the saved arrays reproducible.
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_val, y_val = shuffle(X_val, y_val, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

In [13]:
# Print the size and the label distribution of every set
split_summary = [
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
]
for set_name, X_part, y_part in split_summary:
    print(f"Number of Items in {set_name} Set: ", len(X_part), len(y_part))
    print(f"Distribution of Labels in {set_name} Set: ", sorted(Counter(y_part).items()))

Number of Items in Training Set:  4470 4470
Distribution of Labels in Training Set:  [(0, 1470), (1, 3000)]
Number of Items in Validation Set:  1490 1490
Distribution of Labels in Validation Set:  [(0, 490), (1, 1000)]
Number of Items in Test Set:  1491 1491
Distribution of Labels in Test Set:  [(0, 491), (1, 1000)]


In [14]:
# Save final numpy arrays of filenames and labels
output_dir = 'Image Numpys'

# np.save does not create missing directories, so make sure the target folder exists
os.makedirs(output_dir, exist_ok=True)

# np.save appends the ".npy" extension to each file name
arrays_to_save = [
    ('y_train', y_train),
    ('X_train', X_train),
    ('y_test', y_test),
    ('X_test', X_test),
    ('y_val', y_val),
    ('X_val', X_val),
]
for array_name, array in arrays_to_save:
    np.save(f'{output_dir}/{array_name}', array)