In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.imputation.mice import MICEData
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.metrics import precision_score
from xgboost import XGBClassifier
import cv2
from keras.applications import VGG16

In [2]:
IMG_SIZE = 224

In [3]:
root_path = '/kaggle/working/Data'
if os.path.exists(root_path):
    shutil.rmtree(root_path)

train_dir = os.makedirs(os.path.join(root_path, 'training'))
val_dir = os.makedirs(os.path.join(root_path, 'validation'))
class_names = ['0','1','2','3','4']

train_healthy = os.makedirs(os.path.join(root_path, 'training/0'))
train_mildDR = os.makedirs(os.path.join(root_path, 'training/1'))
train_moderateDR = os.makedirs(os.path.join(root_path, 'training/2'))
train_proliferateDR = os.makedirs(os.path.join(root_path, 'training/3'))
train_severeDR = os.makedirs(os.path.join(root_path, 'training/4'))

val_healthy = os.makedirs(os.path.join(root_path, 'validation/0'))
val_mildDR = os.makedirs(os.path.join(root_path, 'validation/1'))
valval_moderateDR = os.makedirs(os.path.join(root_path, 'validation/2'))
val_proliferateDR = os.makedirs(os.path.join(root_path, 'validation/3'))
val_severeDR = os.makedirs(os.path.join(root_path, 'validation/4'))

In [4]:
source_path = '/kaggle/input/diabetic-retinopathy-resized-arranged'

In [5]:
source_path_healthy = os.path.join(source_path, class_names[0])
source_path_mildDR = os.path.join(source_path, class_names[1])
source_path_moderateDR = os.path.join(source_path, class_names[2])
source_path_proliferateDR = os.path.join(source_path, class_names[3])
source_path_severeDR = os.path.join(source_path, class_names[4])

DR_class_count = [source_path_healthy, source_path_mildDR, source_path_moderateDR, source_path_proliferateDR, source_path_severeDR]

# os.listdir returns a list containing all files under the given path
for i in range(len(DR_class_count)):
    print(f"There are {len(os.listdir(DR_class_count[i]+'/'))} images of "+str(i))

There are 25810 images of 0
There are 2443 images of 1
There are 5292 images of 2
There are 873 images of 3
There are 708 images of 4


In [6]:
def preprocessing_fn(img):
    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image = cv2.addWeighted(image,3.1, cv2.GaussianBlur(image, (15,15) ,IMG_SIZE/10),-3.5 ,128)
    adjusted = cv2.convertScaleAbs(image, alpha=1.8, beta=1.5)
    adjusted = cv2.resize(adjusted, (IMG_SIZE, IMG_SIZE))
    return adjusted.astype('float64')

In [7]:
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
    all_files = []
    count = 1000
    for fn in os.listdir(SOURCE_DIR):
        count-=1
        file_path = SOURCE_DIR + fn
        if os.path.getsize(file_path)>0:
            all_files.append(fn)
        else:
            print("\n" + fn + " is zero length, so ignoring.")
        if count<=0:
            break
    n_files = len(all_files)
    split_point = int(n_files * SPLIT_SIZE)
    shuffled = random.sample(all_files, n_files)

    train_set = shuffled[:split_point]
    test_set = shuffled[split_point:]

    for file_name in train_set:
        #shutil.copyfile(SOURCE_DIR + file_name, TRAINING_DIR + file_name)
        img = cv2.imread(SOURCE_DIR + file_name)
        img = cv2.resize(img, (IMG_SIZE,IMG_SIZE))
        img = preprocessing_fn(img)
        cv2.imwrite(TRAINING_DIR + file_name, img)

    for file_name in test_set:
        #shutil.copyfile(SOURCE_DIR + file_name, VALIDATION_DIR + file_name)
        img = cv2.imread(SOURCE_DIR + file_name)
        img = cv2.resize(img, (IMG_SIZE,IMG_SIZE))
        img = preprocessing_fn(img)
        cv2.imwrite(VALIDATION_DIR + file_name, img)

HEALTHY_SOURCE_DIR = "/kaggle/input/diabetic-retinopathy-resized-arranged/0/"
TRAINING_HEALTHY_DIR = "/kaggle/working/Data/training/0/"
VALIDATION_HEALTHY_DIR = "/kaggle/working/Data/validation/0/"

MILDDR_SOURCE_DIR = "/kaggle/input/diabetic-retinopathy-resized-arranged/1/"
TRAINING_MILDDR_DIR = "/kaggle/working/Data/training/1/"
VALIDATION_MILDDR_DIR = "/kaggle/working/Data/validation/1/"

MODERATEDR_SOURCE_DIR = "/kaggle/input/diabetic-retinopathy-resized-arranged/2/"
TRAINING_MODERATEDR_DIR = "/kaggle/working/Data/training/2/"
VALIDATION_MODERATEDR_DIR = "/kaggle/working/Data/validation/2/"

PROLIFERATEDR_SOURCE_DIR = "/kaggle/input/diabetic-retinopathy-resized-arranged/3/"
TRAINING_PROLIFERATEDR_DIR = "/kaggle/working/Data/training/3/"
VALIDATION_PROLIFERATEDR_DIR = "/kaggle/working/Data/validation/3/"

SEVEREDR_SOURCE_DIR = "/kaggle/input/diabetic-retinopathy-resized-arranged/4/"
TRAINING_SEVEREDR_DIR = "/kaggle/working/Data/training/4/"
VALIDATION_SEVEREDR_DIR = "/kaggle/working/Data/validation/4/"

split_size = .85
split_data(HEALTHY_SOURCE_DIR, TRAINING_HEALTHY_DIR, VALIDATION_HEALTHY_DIR, split_size)
split_data(MILDDR_SOURCE_DIR, TRAINING_MILDDR_DIR, VALIDATION_MILDDR_DIR, split_size)
split_data(MODERATEDR_SOURCE_DIR, TRAINING_MODERATEDR_DIR, VALIDATION_MODERATEDR_DIR, split_size)
split_data(PROLIFERATEDR_SOURCE_DIR, TRAINING_PROLIFERATEDR_DIR, VALIDATION_PROLIFERATEDR_DIR, split_size)
split_data(SEVEREDR_SOURCE_DIR, TRAINING_SEVEREDR_DIR, VALIDATION_SEVEREDR_DIR, split_size)

In [8]:
TRAINING_DIR = '/kaggle/working/Data/training'
VALIDATION_DIR = '/kaggle/working/Data/validation'
healthy_imgs = len(os.listdir('/kaggle/working/Data/training/0')) + len(os.listdir('/kaggle/working/Data/validation/0'))
dr_imgs = len(os.listdir('/kaggle/working/Data/training/3')) + len(os.listdir('/kaggle/working/Data/validation/3'))
healthy_imgs, dr_imgs

(1000, 873)

In [9]:
def merge_fn(DIR, img_list2, img):
    l = len(img_list2)
    index = random.randint(0,l-1)
    img2 = cv2.imread(DIR + img_list2[index])
    img2 = cv2.addWeighted(img, 0.5, img2, 0.5, 1)
    return img2

In [10]:
def oversampling(DIR,n):
    img_list = [name for name in os.listdir(DIR)]
    lim = len(img_list)
    left_img_list = [name for name in img_list if 'left' in name]
    right_img_list = [name for name in img_list if 'right' in name]

    for i in range(n):
        index = random.randint(0,lim-1)
        fn = img_list[index]
        file_path = DIR + fn

        if os.path.getsize(file_path)>0:
            img = cv2.imread(file_path)
            r = random.randint(0,2)

            if r==0:
                flip = cv2.flip(img,0)
                cv2.imwrite(file_path+str(i)+'.jpeg',flip)
            elif r==1:
                if 'left' in fn:
                    merge = merge_fn(DIR, left_img_list, img)
                else:
                    merge = merge_fn(DIR, right_img_list, img)
                cv2.imwrite(file_path+str(i)+'.jpeg',merge)
            else:
                cv2.imwrite(file_path+str(i)+'.jpeg',img)

# len_healthy = len(os.listdir(TRAINING_HEALTHY_DIR))

len_healthy = int(1000*split_size)
len_dr = len(os.listdir(TRAINING_MILDDR_DIR))
oversampling(TRAINING_MILDDR_DIR, len_healthy - len_dr)

len_dr = len(os.listdir(TRAINING_MODERATEDR_DIR))
oversampling(TRAINING_MODERATEDR_DIR, len_healthy - len_dr)

len_dr = len(os.listdir(TRAINING_PROLIFERATEDR_DIR))
oversampling(TRAINING_PROLIFERATEDR_DIR, len_healthy - len_dr)

len_dr = len(os.listdir(TRAINING_SEVEREDR_DIR))
oversampling(TRAINING_SEVEREDR_DIR, len_healthy - len_dr)

In [11]:
def validation_oversampler(DIR, n):
    img_list = [name for name in os.listdir(DIR)]
    lim = len(img_list)
    left_img_list = [name for name in img_list if 'left' in name]
    right_img_list = [name for name in img_list if 'right' in name]
    for i in range(n):
        index = random.randint(0,lim-1)
        fn = img_list[index]
        file_path = DIR + fn

        if os.path.getsize(file_path)>0:
            img = cv2.imread(file_path)
            r = random.randint(0,2)

            if r==0:
                flip = cv2.flip(img,0)
                cv2.imwrite(file_path+str(i)+'.jpeg',flip)
            else:
                cv2.imwrite(file_path+str(i)+'.jpeg',img)

len_healthy = int(1000*(1-split_size))

len_dr = len(os.listdir(VALIDATION_MILDDR_DIR))
validation_oversampler(VALIDATION_MILDDR_DIR, len_healthy - len_dr)

len_dr = len(os.listdir(VALIDATION_MODERATEDR_DIR))
oversampling(VALIDATION_MODERATEDR_DIR, len_healthy - len_dr)

len_dr = len(os.listdir(VALIDATION_PROLIFERATEDR_DIR))
oversampling(VALIDATION_PROLIFERATEDR_DIR, len_healthy - len_dr)

len_dr = len(os.listdir(VALIDATION_SEVEREDR_DIR))
oversampling(VALIDATION_SEVEREDR_DIR, len_healthy - len_dr)

In [12]:
len(os.listdir(TRAINING_MODERATEDR_DIR)), len(os.listdir(TRAINING_SEVEREDR_DIR))

(850, 850)

In [13]:
# def train_val_generators(TRAINING_DIR, VALIDATION_DIR):
#     train_datagen = ImageDataGenerator(rescale=1./255,
#                                        horizontal_flip=True,
#                                       preprocessing_function=preprocessing_fn)

#     train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
#                                                         batch_size=64,
#                                                         class_mode='categorical',
#                                                         target_size=(IMG_SIZE, IMG_SIZE))

#     validation_datagen = ImageDataGenerator(rescale=1./255,
#                                             horizontal_flip=True,
#                                             preprocessing_function=preprocessing_fn)
#     validation_generator = validation_datagen.flow_from_directory(directory=VALIDATION_DIR,
#                                                                 batch_size=32,
#                                                                 class_mode='categorical',
#                                                                 target_size=(IMG_SIZE, IMG_SIZE))

#     return train_generator, validation_generator

# train_generator, validation_generator = train_val_generators(TRAINING_DIR, VALIDATION_DIR)

## **Conversion of images to tabular data**

In [14]:
rows_train = []

In [15]:
conv = VGG16(
    include_top = False,
    weights = 'imagenet',
    input_shape=(224,224,3)
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [16]:
def converter(img,rows, c):
    img2 = conv(np.expand_dims(img, axis=0))
    img3 = np.sum(img2, axis=3)/512
    rows.append(list(img3.flatten()) + [c])

In [17]:
def genDataset(direc):
    rows = []
    for i in class_names:
        theDirec = os.path.join(direc, i)        
        for j in os.listdir(theDirec):
            img = cv2.imread(direc+'/'+i+'/'+j)
            img = np.array(img)
            converter(img, rows, int(i))
    random.shuffle(rows)
    rows = np.array(rows)
    rows = pd.DataFrame(rows, columns=None, index=np.arange(0,len(rows)))
    return rows
            
    

In [18]:
# df_train = genDataset(TRAINING_DIR)
# df_test = genDataset(VALIDATION_DIR)
df_train = pd.read_csv('/kaggle/input/drmachinelearning/train_file (1).csv')
df_test = pd.read_csv('/kaggle/input/drmachinelearning/test_file (1).csv')

In [19]:
df_train.drop('Unnamed: 0',axis=1, inplace=True)
df_test.drop('Unnamed: 0',axis=1, inplace=True)

In [20]:
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,2.264182,3.747727,2.313169,2.635653,2.98309,4.386437,2.280173,1.243878,3.585125,2.356004,...,3.96306,2.42595,2.590348,4.594205,4.080608,1.931603,1.834204,5.193177,3.334594,4.0
1,2.534084,4.617288,5.043473,3.817172,5.181656,6.039627,2.906878,1.568009,4.438841,3.779143,...,6.386855,3.045848,2.754045,3.607669,2.304223,2.233987,4.887716,6.807181,3.360308,1.0
2,2.117104,4.641054,3.69842,2.423517,3.153149,4.59539,2.546879,1.428409,3.732916,2.770275,...,4.870719,3.060979,3.013932,4.662717,2.397598,2.124692,4.102462,7.140022,3.78118,3.0
3,1.92599,4.480029,4.325315,4.007621,3.714158,5.097728,2.634568,0.829914,3.33442,4.24458,...,5.825408,2.317929,1.746776,2.866825,3.300934,2.694438,4.072708,4.820017,2.527001,2.0
4,1.429226,4.610854,4.972523,3.995689,2.509166,2.166258,1.36784,0.996258,4.078973,5.020054,...,2.731688,1.323433,1.748123,2.343807,1.866626,1.718506,1.995262,2.244982,1.490078,2.0


In [21]:
df_train.to_csv('/kaggle/working/train_file.csv')
df_test.to_csv('/kaggle/working/test_file.csv')

In [22]:
df_train.shape, df_test.shape

((4250, 50), (750, 50))

In [23]:
np.array(1).reshape(-1,1).shape

(1, 1)

In [24]:
# img = cv2.imread('/kaggle/working/Data/validation/0/10003_left.jpeg')
# img3 = conv(np.expand_dims(img, axis=0))
# img2 = np.sum(img3, axis=3)/512
# # converter(img, rows_train, np.array(int('0')))
# n1 = list(img2.flatten()) + [1]
# n2 = list(img2.flatten()) + [1]

# pd.DataFrame(np.array([n1,n2]))

In [25]:
# img3.numpy().shape

## **Machine Learning Approach**

In [26]:
X_train = df_train.drop('49', axis=1)
y_train = df_train['49']
X_test = df_test.drop('49', axis=1)
y_test = df_test['49']

In [27]:
y_train.describe()

count    4250.00000
mean        2.00000
std         1.41438
min         0.00000
25%         1.00000
50%         2.00000
75%         3.00000
max         4.00000
Name: 49, dtype: float64

In [28]:
# Defining functions for grid search
def grid_check(grid, X, X_test, y_train, y_test):
    grid.fit(X, y_train)
    
    train_score = grid.score(X, y_train)
    print('Train accuracy: ', train_score)
    
    test_score = grid.score(X_test, y_test)
    print('\nTest accuracy: ', test_score)
    
    par = grid.best_params_
    print('\nBest parameters\n',par)
    print("\n")
    return grid.best_estimator_

In [29]:
def ResultFunction(grid_model):
    noModel = grid_check(grid_model, X_train, X_test, y_train, y_test)
    return noModel

In [30]:
# Decision tree
modelTree = DecisionTreeClassifier()
params_tree = {
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 3, 4],
    'splitter': ['best', 'random'],
    'criterion': ['log_loss', 'gini', 'entropy']
}
grid_tree = GridSearchCV(modelTree, params_tree, cv=5, n_jobs=-1, verbose=1)

In [31]:
l = ResultFunction(grid_tree)

Fitting 5 folds for each of 504 candidates, totalling 2520 fits
Train accuracy:  0.3517647058823529

Test accuracy:  0.224

Best parameters
 {'criterion': 'log_loss', 'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'random'}




In [32]:
# modelXGB = XGBClassifier()
# params = {
#         'booster':['gbtree','dart'],
#         'min_child_weight': [1, 5],
#         'gamma': [0.5, 1, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [4, 5]
#         }
# grid_XGB = GridSearchCV(modelXGB, params, cv=5, n_jobs=-1, verbose=1)

In [33]:
# l2 = ResultFunction(grid_XGB)

In [34]:
modelXGB = XGBClassifier(
    booster='gbtree',
    max_depth=4
)
modelXGB.fit(X_train, y_train)
modelXGB.score(X_train, y_train)

0.9618823529411765

In [35]:
modelXGB.score(X_test, y_test)

0.33066666666666666

In [36]:
cat1 = CatBoostClassifier(verbose=0)
cat1.fit(X_train, y_train) 
cat1.score(X_train, y_train)

0.9971764705882353

In [37]:
cat1.score(X_test, y_test)

0.3293333333333333