In [1]:
# imports
import os
from pathlib import Path
import cv2 #opencv-python
from matplotlib import pyplot as plt
import pywt
import numpy as np
from sklearn.model_selection import train_test_split #scikit-learn

# Preprocessing

In [10]:
# Preprocess(img: string): given filepath img to an image, returns the preprocessed cv2 image object
    # throws invalidArgument error if img is not a valid path
def Preprocess(img):
    try:
        i = cv2.imread(img)
        img_gray = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
        img_hcontrast = cv2.equalizeHist(img_gray)
        blur = cv2.GaussianBlur(img_hcontrast, (0,0), sigmaX=33, sigmaY=33)
        divide = cv2.divide(img_hcontrast, blur, scale=255)
        th3 = cv2.adaptiveThreshold(divide,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\
            cv2.THRESH_BINARY,11,2)
        return th3
    except:
        raise ValueError(f"input {img} is not a valid filepath.")

In [13]:
# initialize data
ref_imgs = []
nonref_imgs = []

# iterate through all images in dataset
data_dir = os.path.join(os.getcwd(), "../data/SAUNAR/")
img_dirs = [name for name in os.listdir(data_dir) if os.path.isdir(data_dir + name)]
for d in img_dirs:
    print(f"starting directory {d[:2]}")
    # process and store nonref images from the 01 and 02 folders
    if d[:2] == "01" or d[:2] == "02":
        for i in os.listdir(data_dir + d):
            nonref_imgs.append(Preprocess(os.path.join(data_dir + d,i)))
    # pass on non-classifiable images from 00 and 10 folders
    elif d[:2] == "00" or d[:2] == "10":
        pass
    # process and store ref images from all other folders
    else:
        for i in os.listdir(data_dir + d):
            ref_imgs.append(Preprocess(os.path.join(data_dir + d,i)))

starting directory 06
starting directory 05
starting directory 01
starting directory 07
starting directory 03
starting directory 09
starting directory 10
starting directory 00
starting directory 02
starting directory 08
starting directory 04


In [17]:
# save processed image data to file
np.save("../data/processed/ref_imgs.npy", ref_imgs)
np.save("../data/processed/nonref_imgs.npy", nonref_imgs)

In [2]:
# train test split
#sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)[source]
ref_imgs_load = np.load("../data/processed/ref_imgs.npy")
nonref_imgs_load = np.load("../data/processed/nonref_imgs.npy")

Xref_train, Xref_test, yref_train, yref_test = train_test_split(ref_imgs_load, np.zeros(len(ref_imgs_load)), test_size=.2)
Xnref_train, Xnref_test, ynref_train, ynref_test = train_test_split(nonref_imgs_load, np.ones(len(nonref_imgs_load)), test_size=.2)


In [3]:
Xtrain = np.concatenate((Xref_train, Xnref_train))
Ytrain = np.concatenate((yref_train, ynref_train))
Xtest = np.concatenate((Xref_test, Xnref_test))
Ytest = np.concatenate((yref_test, ynref_test))

# save split to files
# save processed image data to file
np.save("../data/test/Xtest.npy", Xtest)
np.save("../data/test/Ytest.npy", Ytest)
np.save("../data/train/Xtrain.npy", Xtrain)
np.save("../data/train/Ytrain.npy", Ytrain)