## Prepare Dataset and Image Preprocessing

The original dataset was obtained from the [APTOS 2019 Blindness Detection competition](https://www.kaggle.com/c/aptos2019-blindness-detection/).

Because the competition's test set has no public classification labels, only the competition's training set is used in this study. It is split below into a training subset and a held-out subset (named `test` in the code) that is used for validation.

### 1. Load necessary libraries and dataset

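The original dataset is not provided in the repository. Download it from the competition page and place it in `aptos2019-blindness-detection/`, with the label file at `train.csv` and the images under `train_images/`; these are the paths the code below reads.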

In [None]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [9]:
# Folder holding the downloaded competition data; train.csv and train_images/
# are read from it.
data_path = "aptos2019-blindness-detection/"
# Label table. The cells below expect an 'image' column (file id without
# extension) and a 'level' column (class 0-4).
df = pd.read_csv(data_path + 'train.csv')

In [26]:
# Split every severity level 80/20 on its own so that both subsets keep the
# class proportions of the full table, then stitch the per-level pieces
# together once at the end.
# The seed is fixed so that re-running the notebook reproduces the same
# train/test partition (and therefore the same saved CSV files).
SPLIT_SEED = 42
train_parts = []
test_parts = []
for i in range(5):
    print(str(i))
    tr, te = train_test_split(
        df[df['level'] == i], test_size=0.2, random_state=SPLIT_SEED
    )
    train_parts.append(tr)
    test_parts.append(te)
    # Per-level sizes, printed as (rows, columns): train first, then test.
    print(tr.shape)
    print(te.shape)
train = pd.concat(train_parts)
test = pd.concat(test_parts)

0
(1444, 2)
(361, 2)
1
(296, 2)
(74, 2)
2
(799, 2)
(200, 2)
3
(154, 2)
(39, 2)
4
(236, 2)
(59, 2)


In [5]:
def crop_image_from_gray(img, tol=7):
    """
    Crop out black borders
    https://www.kaggle.com/ratthachat/aptos-updated-preprocessing-ben-s-cropping

    Rows and columns that contain no pixel brighter than ``tol`` are removed.
    For a 3-D image the brightness test is done on its BGR->grey conversion
    and the same rows/columns are dropped from every channel.

    Args:
        img: 2-D (grey) or 3-D (BGR colour) numpy image array.
        tol: brightness threshold; a pixel counts as content when it is
            strictly greater than this value.

    Returns:
        The cropped image (a new array). If no pixel exceeds ``tol`` the
        input is returned unchanged, so callers never receive an empty image.

    Raises:
        ValueError: if ``img`` is neither 2-D nor 3-D.
    """
    if img.ndim == 2:
        mask = img > tol
    elif img.ndim == 3:
        mask = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) > tol
    else:
        raise ValueError(
            "expected a 2-D or 3-D image, got %d dimension(s)" % img.ndim
        )

    # Entirely dark image: cropping would leave nothing, keep it as it is.
    if not mask.any():
        return img

    # Keep only the rows / columns that hold at least one bright pixel.
    # Indexing the first two axes leaves any channel axis untouched, so one
    # lookup serves both the grey and the colour case.
    keep = np.ix_(mask.any(1), mask.any(0))
    return img[keep]

def circle_crop(img):
    """
    Create circular crop around image centre

    Steps: strip the black border, stretch the result to a square whose side
    is the longer of the two edges, blank everything outside the centred
    circle, and strip the border once more.
    """
    trimmed = crop_image_from_gray(img)

    # Stretch to a square with the longer edge as side length.
    rows, cols, _ = trimmed.shape
    side = np.max((rows, cols))
    square = cv2.resize(trimmed, (side, side))

    rows, cols, _ = square.shape
    centre_x = int(cols / 2)
    centre_y = int(rows / 2)
    radius = np.amin((centre_x, centre_y))

    # Single-channel mask: non-zero inside the filled circle, zero outside.
    disc = np.zeros((rows, cols), np.uint8)
    cv2.circle(disc, (centre_x, centre_y), int(radius), 1, thickness=-1)

    # Pixels outside the disc become black and are trimmed by the final crop.
    masked = cv2.bitwise_and(square, square, mask=disc)
    return crop_image_from_gray(masked)

In [6]:
# Output roots for the preprocessed images. The preprocessing loops below
# write to <root>/<train|test>/<level>/<image id>.png.
grey_path = "grey_cropped_2019/"
color_path = "color_cropped_2019/"

In [29]:
import os

# Preprocess every image of the training split and save a colour and a grey
# version under <root>/train/<level>/. Ids that cannot be read, processed or
# written are collected in error_list instead of stopping the run.
img_path = data_path
error_list = []
for idx, row in train.iterrows():
    img = row['image']
    ori_name = img_path+'train_images/'+img+".png"
    try:
        cimg = cv2.imread(ori_name, cv2.IMREAD_COLOR)
        if cimg is None:
            # cv2.imread does not raise for a missing or unreadable file,
            # it returns None.
            raise FileNotFoundError(ori_name)
        cirimg = circle_crop(cimg)
        cirimg = cv2.resize(cirimg, (224,224))
        # Contrast enhancement: 4*image - 4*blurred image + 128, i.e. the
        # blurred local average is subtracted and the result is centred on
        # mid-grey.
        gimg=cv2.addWeighted(cirimg,4, cv2.GaussianBlur(cirimg, (0,0) , 10) ,-4 ,128)
        ggimg = cv2.cvtColor(cirimg, cv2.COLOR_BGR2GRAY)
        ggimg=cv2.addWeighted ( ggimg,4, cv2.GaussianBlur( ggimg , (0,0) , 10) ,-4 ,128)
        classname = row['level']
        gname = grey_path+"train/"+str(classname)+"/"+img+".png"
        cname = color_path+"train/"+str(classname)+"/"+img+".png"
        # cv2.imwrite does not create missing folders, so make sure the
        # per-class output folders exist.
        os.makedirs(os.path.dirname(gname), exist_ok=True)
        os.makedirs(os.path.dirname(cname), exist_ok=True)
        # cv2.imwrite reports failure through its return value rather than
        # an exception; check it so unsaved images are recorded too.
        grey_saved = cv2.imwrite(gname, ggimg)
        color_saved = cv2.imwrite(cname, gimg)
        if not (grey_saved and color_saved):
            raise OSError("could not write output for " + img)
    except Exception:
        # Best effort: remember the id and continue with the next image.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        error_list.append(img)
error_list

train done


['5.49381E+11',
 '7.21214E+11',
 '8.91E+20',
 '9.46545E+11',
 '5.35683E+11',
 '7.10E+10',
 '3.89552E+11',
 '2.93E+10',
 '9.34E+76',
 '5.95447E+11',
 '1.94E+14',
 '9.21433E+11',
 '2.3255E+11',
 '4.41118E+11',
 'f0546a45ef10',
 '991a0b7a8c87',
 '3ca8be3b40d6',
 '9d62478042b6',
 'fdc685055659',
 '6bb30ec3231a',
 'f64b6e85f1c9',
 '78937523f7a8',
 '61c667663f2f',
 '2eba4279e503',
 '152db3de8120',
 'f4df3d86688d',
 'beb00fa6e7c9',
 '13d014ccd136',
 'bc7bf19b84e3',
 '5321ab64f9ea',
 'a1b28bcbce00',
 'ce6f33a81ad5',
 '810ed108f5b7',
 'e529c5757d64',
 '2399d68d407f',
 'b8e9a8f4617d',
 'f0860c21533b',
 '6e018411ba4a',
 'e6a5e4718873',
 'ed648b9bcd95',
 'db52626d450c',
 'ae58ccb5905e',
 'e47770a2e5d1',
 '8660e1864665',
 '5814cbd2e9bf',
 'a6731dd737af',
 '7d11dbc1e738',
 'a0e635689259',
 '9e99ae6ee7af',
 'd5ad3362424c',
 '43823561c3f0',
 'f7508f14dd7b',
 '76cab26493f1',
 '099021fac3c9',
 '1d37f1c8b6d8',
 'e087bd4b88f2',
 'ee36ca728641',
 'b67ae80f7eba',
 '9039cbfcbb2f',
 '53c874dbc594',
 '5a27b9b2

In [30]:
import os

# Preprocess every image of the held-out (test) split and save a colour and a
# grey version under <root>/test/<level>/. The source images still come from
# train_images/ because the split was cut from the competition training set.
# Ids that cannot be read, processed or written are collected in error_list.
error_list = []
for idx, row in test.iterrows():
    img = row['image']
    ori_name = img_path+'train_images/'+img+".png"
    try:
        cimg = cv2.imread(ori_name, cv2.IMREAD_COLOR)
        if cimg is None:
            # cv2.imread does not raise for a missing or unreadable file,
            # it returns None.
            raise FileNotFoundError(ori_name)
        cirimg = circle_crop(cimg)
        cirimg = cv2.resize(cirimg, (224,224))
        # Contrast enhancement: 4*image - 4*blurred image + 128, i.e. the
        # blurred local average is subtracted and the result is centred on
        # mid-grey.
        gimg=cv2.addWeighted(cirimg,4, cv2.GaussianBlur(cirimg, (0,0) , 10) ,-4 ,128)
        ggimg = cv2.cvtColor(cirimg, cv2.COLOR_BGR2GRAY)
        ggimg=cv2.addWeighted ( ggimg,4, cv2.GaussianBlur( ggimg , (0,0) , 10) ,-4 ,128)
        classname = row['level']
        gname = grey_path+"test/"+str(classname)+"/"+img+".png"
        cname = color_path+"test/"+str(classname)+"/"+img+".png"
        # cv2.imwrite does not create missing folders, so make sure the
        # per-class output folders exist.
        os.makedirs(os.path.dirname(gname), exist_ok=True)
        os.makedirs(os.path.dirname(cname), exist_ok=True)
        # cv2.imwrite reports failure through its return value rather than
        # an exception; check it so unsaved images are recorded too.
        grey_saved = cv2.imwrite(gname, ggimg)
        color_saved = cv2.imwrite(cname, gimg)
        if not (grey_saved and color_saved):
            raise OSError("could not write output for " + img)
    except Exception:
        # Best effort: remember the id and continue with the next image.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        error_list.append(img)
error_list

[]

In [33]:
# Save the training split. The DataFrame index is written as well, as the
# first (unnamed) column of the file.
train.to_csv("train_2019.csv")

In [34]:
# Save the held-out split. The DataFrame index is written as well, as the
# first (unnamed) column of the file.
test.to_csv("test_2019.csv")

In [36]:
# Image ids that read as numbers in scientific notation instead of the usual
# 12-character hex ids. They were copied from the error list of the training
# preprocessing loop, so no image could be loaded for them.
# NOTE(review): presumably these ids were mangled when train.csv was edited in
# a spreadsheet - confirm against the original competition file.
wrong = ['5.49381E+11',
 '7.21214E+11',
 '8.91E+20',
 '9.46545E+11',
 '5.35683E+11',
 '7.10E+10',
 '3.89552E+11',
 '2.93E+10',
 '9.34E+76',
 '5.95447E+11',
 '1.94E+14',
 '9.21433E+11',
 '2.3255E+11',
 '4.41118E+11']
# Training split without the rows whose id is in `wrong`.
real_train = train[~train['image'].isin(wrong)]

In [37]:
# (rows, columns) of the training split after dropping the ids in `wrong`.
real_train.shape

(2915, 2)

In [38]:
# (rows, columns) of the unfiltered training split, for comparison.
train.shape

(2929, 2)

In [39]:
# Overwrite train_2019.csv with the filtered training split (index included).
real_train.to_csv("train_2019.csv")

In [4]:
# test grey images
path = "grey_cropped_2019/train/0/"
img = cv2.imread(path+"00cc2b75cddd.png", cv2.IMREAD_GRAYSCALE)

In [5]:
# Display the pixel array. Black background pixels are expected to show up as
# 128, because the contrast enhancement adds a constant offset of 128.
img

array([[128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128],
       ...,
       [128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128],
       [128, 128, 128, ..., 128, 128, 128]], dtype=uint8)

In [2]:
# Reload the saved training split. The file was written together with the
# DataFrame index, so the reloaded frame has one extra column.
train = pd.read_csv("train_2019.csv")

In [3]:
# Print, for each level 0-4, the level followed by the (rows, columns) shape
# of the matching part of the reloaded training split.
for level in range(5):
    rows_in_level = train[train['level'] == level]
    print(str(level))
    print(rows_in_level.shape)

0
(1435, 3)
1
(295, 3)
2
(795, 3)
3
(154, 3)
4
(236, 3)
