In [1]:
# imports
import os
from pathlib import Path
import cv2 #opencv-python
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# Preprocessing

In [2]:
def img_reshaper(orig_img):
    new_img = []
    for row in orig_img:
        nrow = []
        for item in row:
            nrow.append([item, item, item])
        new_img.append(nrow)
    return new_img

In [3]:
# Preprocess(img: string): given filepath img to an image, returns the preprocessed cv2 image object
    # throws invalidArgument error if img is not a valid path
def Preprocess(img):
    try:
        i = cv2.imread(img)
        img_gray = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
        img_hcontrast = cv2.equalizeHist(img_gray)
        blur = cv2.GaussianBlur(img_hcontrast, (0,0), sigmaX=33, sigmaY=33)
        divide = cv2.divide(img_hcontrast, blur, scale=255)
        th3 = cv2.adaptiveThreshold(divide,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\
            cv2.THRESH_BINARY,11,2)
        small = cv2.resize(th3, (700,700))
        return small
    except:
        raise ValueError(f"input {img} is not a valid filepath.")

In [4]:
# initialize data
ref_imgs = []
nonref_imgs = []

# iterate through all images in dataset
data_dir = os.path.join(os.getcwd(), "../data/SAUNAR/")
img_dirs = [name for name in os.listdir(data_dir) if os.path.isdir(data_dir + name)]
for d in img_dirs:
    print(f"starting directory {d[:2]}")
    # process and store nonref images from the 01 and 02 folders
    if d[:2] == "01" or d[:2] == "02":
        for i in os.listdir(data_dir + d):
            nonref_imgs.append(Preprocess(os.path.join(data_dir + d,i)))
    # pass on non-classifiable images from 00 and 10 folders
    elif d[:2] == "00" or d[:2] == "10":
        pass
    # process and store ref images from all other folders
    else:
        for i in os.listdir(data_dir + d):
            ref_imgs.append(Preprocess(os.path.join(data_dir + d,i)))

starting directory 06
starting directory 05
starting directory 01


: 

: 

In [None]:
# save processed image data to file
np.save("../data/processed/ref_imgsn.npy", ref_imgs)
np.save("../data/processed/nonref_imgsn.npy", nonref_imgs)

In [None]:
# train test split
#sklearn.model_selection.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)[source]
ref_imgs_load = np.load("../data/processed/ref_imgsn.npy")
nonref_imgs_load = np.load("../data/processed/nonref_imgsn.npy")

Xref_train, Xref_test, yref_train, yref_test = train_test_split(ref_imgs_load, np.zeros(len(ref_imgs_load)), test_size=.2)
Xnref_train, Xnref_test, ynref_train, ynref_test = train_test_split(nonref_imgs_load, np.ones(len(nonref_imgs_load)), test_size=.2)


In [None]:
Xtrain = np.concatenate((Xref_train, Xnref_train))
Ytrain = np.concatenate((yref_train, ynref_train))
Xtest = np.concatenate((Xref_test, Xnref_test))
Ytest = np.concatenate((yref_test, ynref_test))

# save split to files
# save processed image data to file
np.save("../data/test/Xtestnew.npy", Xtest)
np.save("../data/test/Ytestnew.npy", Ytest)
np.save("../data/train/Xtrainnew.npy", Xtrain)
np.save("../data/train/Ytrainnew.npy", Ytrain)

# grayscale conversion

In [2]:
# load data
Xtrain = np.load("../data/train/Xtrain.npy")
Ytrain = np.load("../data/train/Ytrain.npy")
# Xtest = np.load("../data/test/Xtest.npy")
# Ytest = np.load("../data/test/Ytest.npy")

# balance classes
Ytrain_use = Ytrain[:1124]
Xtrain_use = Xtrain[:1124]
del Xtrain
del Ytrain
# Ytest_use = Ytest[:282]
# Xtest_use = Xtest[:282]

# define image reshaper function to create rgb values for each pixel in the image
def img_reshaper(orig_img):
    new_img = []
    for row in orig_img:
        nrow = []
        for item in row:
            nrow.append([item, item, item])
        new_img.append(nrow)
    return new_img

In [3]:
# reshape Xtrain
Xtrain_temp = []
c = 1
for img in Xtrain_use:
    Xtrain_temp.append(img_reshaper(img))
    print(f"{c} out of 1124 complete.")
    c+=1
Xtrain_new = np.array(Xtrain_temp)

In [14]:
# for i in range(4):
#     Xtrain_temp.append(img_reshaper(Xtrain_use[i]))
#     print(f"{c} out of 1124 complete.")
#     c+=1
#     Xtrain_use = Xtrain_use[1:]

37 out of 1124 complete.
38 out of 1124 complete.
39 out of 1124 complete.
40 out of 1124 complete.


In [6]:
# check shape for correctness
#Xtrain_temp.shape
len(Xtrain_temp)

8

In [None]:
# save modified Xtrain and Ytrain
np.save("../data/train/Xtrain_final.npy", Xtrain_new)
np.save("../data/train/Ytrain_final.npy", Ytrain_use)

In [None]:
# reshape Xtest
Xtest_temp = []
c = 1
for img in Xtest_use:
    Xtest_temp.append(img_reshaper(img))
    print(f"{c} out of 1124 complete.")
    c+=1
Xtest_new = np.array(Xtrain_temp)

In [None]:
# check shape for correctness
Xtest_new.shape

In [None]:
# save modified Xtest and Ytest
np.save("../data/train/Xtest_final.npy", Xtest_new)
np.save("../data/train/Ytest_final.npy", Ytest_use)