## Import Libraries


In [1]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg # read images as array
import PIL as pil # to read images

from scipy import ndimage

from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from tqdm import tqdm

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img

## Data Loading

In [4]:
# Load the saved train / validation / test splits:
# X_* are the arrays of image file names, y_* the arrays of matching labels.
X_train, X_val, X_test = (
    np.load(f'Image Numpys_saved/X_{split}.npy') for split in ('train', 'val', 'test')
)
y_train, y_val, y_test = (
    np.load(f'Image Numpys_saved/y_{split}.npy') for split in ('train', 'val', 'test')
)

In [5]:
# Report, for every split, how many items it holds (features and labels)
# and how its labels are distributed.
for split_name, split_X, split_y in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"Number of Items in {split_name} Set: ", len(split_X), len(split_y))
    print(f"Distribution of Labels in {split_name} Set: ", sorted(Counter(split_y).items()))

Number of Items in Training Set:  4470 4470
Distribution of Labels in Training Set:  [(0, 1470), (1, 3000)]
Number of Items in Validation Set:  1490 1490
Distribution of Labels in Validation Set:  [(0, 490), (1, 1000)]
Number of Items in Test Set:  1491 1491
Distribution of Labels in Test Set:  [(0, 491), (1, 1000)]


## Create Image Pixel Array

In [14]:
# Build a numpy array holding the pixels of the given image files
def get_pix_array(file_names, new_array=None,
                  image_dir="filtered_cropped_images_saved/", target_size=(128, 128)):
    """Load image files and stack their resized pixel data into one NumPy array.

    Parameters
    ----------
    file_names : sequence of str
        Image file names, resolved relative to ``image_dir``.
    new_array : list, optional
        Accumulator list the per-image arrays are appended to (it is mutated
        in place, so pass a fresh list for every call). When omitted, a new
        empty list is created.
    image_dir : str, optional
        Directory containing the image files.
    target_size : tuple of int, optional
        (height, width) every image is resized to.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the accumulated per-image arrays, in the order of
        ``file_names`` (preceded by anything already present in ``new_array``).
    """
    if new_array is None:
        new_array = []

    # Iterate the sequence itself rather than enumerate(...): tqdm can then read
    # its length and display a total / ETA instead of a bare iteration counter.
    for file in tqdm(file_names):
        image = load_img(os.path.join(image_dir, file))

        # Convert to float32 before resizing. Per the TensorFlow docs, integer
        # pixel data is rescaled into [0, 1] by this conversion -- presumably
        # load_img yields 8-bit pixels here; TODO confirm.
        image = tf.image.convert_image_dtype(image, tf.float32)
        image = tf.image.resize(image, target_size)

        new_array.append(img_to_array(image))

    return np.array(new_array)

In [15]:
# Build the pixel array for the validation files, starting from a fresh accumulator list
X_val_pix = get_pix_array(X_val, [])

1490it [17:45,  1.40it/s]


In [17]:
# Build the pixel array for the test files, starting from a fresh accumulator list
X_test_pix = get_pix_array(X_test, [])

1491it [17:53,  1.39it/s]


In [19]:
# Build the pixel array for the training files, starting from a fresh accumulator list
X_train_pix = get_pix_array(X_train, [])

4470it [54:37,  1.36it/s]


## Oversampling

In [20]:
# Flatten every image into a single feature row: the resampler is fed a
# 2D (n_samples, n_features) array.
X_train_reshaped = X_train_pix.reshape(X_train_pix.shape[0], -1)
X_val_reshaped = X_val_pix.reshape(X_val_pix.shape[0], -1)

# define resampling (fixed seed so the synthetic samples are reproducible)
over = ADASYN(random_state=42)

# fit and apply resampling; each fit_resample call is fitted on the data passed to it
X_train_resampled, y_train_resampled = over.fit_resample(X_train_reshaped, y_train)
# NOTE(review): the validation split is oversampled as well, while the test
# split is left with its original label distribution -- validation metrics will
# therefore be computed partly on synthetic samples. Confirm this is intended.
X_val_resampled, y_val_resampled = over.fit_resample(X_val_reshaped, y_val)

# Restore the per-image shape taken from the source arrays instead of repeating
# the literal (128, 128, 3), so this cell stays correct if the resize target changes.
X_train_resampled = X_train_resampled.reshape(-1, *X_train_pix.shape[1:])
X_val_resampled = X_val_resampled.reshape(-1, *X_val_pix.shape[1:])

In [21]:
# Save the resampled numpy arrays (np.save appends the ".npy" extension).
# np.save does not create missing folders, so make sure the target exists first.
# NOTE(review): the inputs were loaded from 'Image Numpys_saved/' -- confirm that
# writing to the differently named 'Image Numpys/' folder is intentional.
os.makedirs('Image Numpys', exist_ok=True)
np.save('Image Numpys/X_train_resampled', X_train_resampled)
np.save('Image Numpys/y_train_resampled', y_train_resampled)
np.save('Image Numpys/X_val_resampled', X_val_resampled)
np.save('Image Numpys/y_val_resampled', y_val_resampled)

In [22]:
# Samples per label in the resampled training set
unique, counts = np.unique(y_train_resampled, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 3043, 1: 3000}

In [23]:
# Samples per label in the resampled validation set
unique, counts = np.unique(y_val_resampled, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 1042, 1: 1000}

In [25]:
# Samples per label in the (not resampled) test set
unique, counts = np.unique(y_test, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 491, 1: 1000}