In [None]:
import numpy as np 
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import random
from PIL import Image

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from skopt import BayesSearchCV

In [None]:
# Directory holding the original (full-size) training images.
directory_path = '/Users/karansagar/Downloads/archive/Training_Set/Training_Set/Training'

# Keep only regular files whose stem (the part before the first '.') is purely numeric.
# The sort key below calls int() on that stem and would raise ValueError on stray
# entries such as hidden files, so they are filtered out up front.
file_names = [
    file for file in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, file)) and file.split('.')[0].isdecimal()
]

# Sort by the numeric value of the stem so that e.g. 10 comes after 9 rather than after 1.
file_names.sort(key=lambda x: int(x.split('.')[0]))

# Show a short summary (count + first few names) instead of dumping the whole list.
print(len(file_names), file_names[:5])

The retina images provided in the dataset are quite large and computationally heavy to process. As a result, I've resized them to 256 x 256.

In [None]:
def resize_images(output_folder, size, input_folder=None, files=None):
    """Resize images and write the results to ``output_folder`` under the same file names.

    Parameters
    ----------
    output_folder : str
        Destination directory; created (including parents) if it does not exist.
    size : tuple of int
        Target ``(width, height)`` passed to ``Image.resize``.
    input_folder : str, optional
        Directory containing the source images. Defaults to the notebook-level
        ``directory_path``.
    files : iterable of str, optional
        File names (relative to ``input_folder``) to process. Defaults to the
        notebook-level ``file_names``.
    """
    if input_folder is None:
        input_folder = directory_path
    if files is None:
        files = file_names

    # exist_ok avoids the separate existence check and the race it implies.
    os.makedirs(output_folder, exist_ok=True)

    for file in files:
        # The context manager closes the underlying file handle after each image,
        # so handles do not accumulate while looping over the whole dataset.
        with Image.open(os.path.join(input_folder, file)) as img:
            img.resize(size).save(os.path.join(output_folder, file))


output_folder = '/Users/karansagar/Desktop/Retina_Training'
size = (256, 256)  # Desired size (width, height)
resize_images(output_folder, size)

In [None]:
# Flattened pixel vectors, one 1-D array per image, in the same order as `file_names`.
data = []

for file in file_names:
    # Read from the folder the resize step wrote to (`output_folder`) so the two cells
    # cannot drift apart; the previous hard-coded path pointed at a different directory.
    path = os.path.join(output_folder, file)
    with Image.open(path) as image:  # handle is closed once the pixels are copied out
        data.append(np.asarray(image).flatten())  # flatten() returns an independent copy

In [None]:
# Label table for the training images; `Disease_Risk` is the binary (0 or 1) target.
df = pd.read_csv('/Users/karansagar/Desktop/Retina_Classification/RFMiD_Training_Labels.csv')

# Response as a column vector of shape (n_samples, 1).
# NOTE(review): rows are assumed to be in the same order as the sorted image files — confirm.
y = df['Disease_Risk'].to_numpy().reshape(-1, 1)

In [None]:
# Tally how many samples carry each distinct label value.
unique_values, counts = np.unique(y, return_counts=True)

# Present the tally as {label: number_of_samples}.
class_counts = {label: count for label, count in zip(unique_values, counts)}
print(class_counts)

From the cell above, we see that our classes are not balanced. 

In [None]:
# Hold out 20% of the images for testing; stratify so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the per-feature mean/std from the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Estimators expect a 1-D target vector rather than a column vector.
y_train = y_train.ravel()

Below I'm using Bayesian Optimization to find the optimal hyperparameters.

In [None]:
# Stratified 5-fold cross-validation with a fixed shuffle seed.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# PCA for dimensionality reduction followed by a random forest. Both steps are seeded so
# that a given hyperparameter combination always receives the same score.
clf = Pipeline([
    ("pca", PCA(random_state=42)),
    ("rnd", RandomForestClassifier(random_state=42))
])

# Candidate values for each hyperparameter; every array lists the discrete options to try.
# NOTE(review): 1228 looks like the number of training samples inside each CV fold, which
# caps the number of PCA components — confirm against the actual dataset size.
param_grid = {
    "pca__n_components": np.arange(50, 1228 - 50, 50),  # number of PCA components
    "rnd__n_estimators": np.arange(10, 500, 25),  # number of trees
    "rnd__max_depth": np.arange(10, 500, 25),  # maximum depth of each tree
    "rnd__max_leaf_nodes": np.arange(10, 500, 25),  # maximum number of leaves per tree
}

# Bayesian optimization over the grid, scored by ROC AUC. The optimizer itself is seeded
# too; without it the sequence of sampled candidates changes on every run.
opt = BayesSearchCV(clf, param_grid, n_iter=32, cv=cv_folds, scoring='roc_auc', random_state=42)
opt.fit(X_train_scaled, y_train)

# Best parameters
print("Best parameters found: ", opt.best_params_)

In [None]:
# Project onto the first 50 principal components. The seed matters because PCA may pick a
# randomized SVD solver for inputs this large, which would otherwise make the projection
# (and every model trained on it) vary between runs.
pca = PCA(n_components=50, random_state=42)

X_train_pca = pca.fit_transform(X_train_scaled)  # fit on the training set and transform it
X_test_pca = pca.transform(X_test_scaled)  # apply the already-fitted projection to the test set

In [None]:
# Hyperparameters for the final random forest, with a fixed seed for repeatability.
rf_params = {
    "n_estimators": 260,
    "max_depth": 185,
    "max_leaf_nodes": 35,
    "random_state": 42,
}

# Train on the PCA-reduced training set (fit returns the estimator itself).
rnd_clf = RandomForestClassifier(**rf_params).fit(X_train_pca, y_train)

# Predicted class labels for the PCA-reduced test set.
y_pred_rnd = rnd_clf.predict(X_test_pca)

In [None]:
# Tabulate test-set predictions against the true labels, using the forest's class order.
cm_rnd = confusion_matrix(y_test, y_pred_rnd, labels=rnd_clf.classes_)

# Draw the matrix as a blue heat map with integer cell counts and a descriptive title.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_rnd, display_labels=rnd_clf.classes_)
disp.plot(cmap='Blues', values_format='d')
disp.ax_.set_title('Confusion Matrix for Random Forest')
plt.show()

In [None]:
# Logistic-regression model trained with stochastic gradient descent. SGD shuffles the
# training data each epoch, so a fixed seed is required for repeatable predictions.
sgd_clf = SGDClassifier(loss='log_loss', random_state=42)
sgd_clf.fit(X_train_pca, y_train)  # fit on the PCA-reduced training set

y_pred_sgd = sgd_clf.predict(X_test_pca)  # predicted class labels for the test set

In [None]:
# Tabulate test-set predictions against the true labels, using the SGD model's class order.
cm_sgd = confusion_matrix(y_test, y_pred_sgd, labels=sgd_clf.classes_)

# Draw the matrix as a blue heat map with integer cell counts and a descriptive title.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_sgd, display_labels=sgd_clf.classes_)
disp.plot(cmap='Blues', values_format='d')
disp.ax_.set_title('Confusion Matrix for Stochastic Gradient Descent')
plt.show()

In [None]:
# PCA for dimensionality reduction followed by a support vector classifier.
# PCA is seeded so a given hyperparameter combination always receives the same score.
clf = Pipeline([
    ("pca", PCA(random_state=42)),
    ("svc", SVC())
])

# Search space. C must be strictly positive (SVC rejects C=0) and gamma=0 collapses the RBF
# kernel to a constant, so both are searched as continuous ranges with a positive lower
# bound. A log-uniform prior is used because useful values span several orders of magnitude.
param_grid = {
    "pca__n_components": np.arange(50, 1228 - 50, 50),  # discrete candidates for PCA components
    "svc__gamma": (1e-4, 5.0, 'log-uniform'),  # continuous range for the RBF kernel width
    "svc__C": (1e-2, 500.0, 'log-uniform'),  # continuous range for the regularization strength
}

# Bayesian optimization scored by ROC AUC, reusing the stratified folds defined earlier.
# The optimizer is seeded so the sequence of sampled candidates is repeatable.
opt = BayesSearchCV(clf, param_grid, n_iter=32, cv=cv_folds, scoring='roc_auc', random_state=42)
opt.fit(X_train_scaled, y_train)

# Best parameters
print("Best parameters found: ", opt.best_params_)

In [None]:
# RBF-kernel support vector classifier with the chosen gamma and C.
svc_clf = SVC(kernel='rbf', gamma=0.001, C=1.0)

# Train and predict on the standardized, PCA-reduced features, like the other models.
# The raw pixel arrays used previously bypass the scaling and PCA the hyperparameter search
# was run with, and RBF kernels are highly sensitive to feature scale.
# NOTE(review): this reuses the 50-component projection; confirm it matches the number of
# components selected by the SVC search.
svc_clf.fit(X_train_pca, y_train)

y_pred_svc = svc_clf.predict(X_test_pca)  # predicted class labels for the test set

In [None]:
# Tabulate test-set predictions against the true labels, using the SVC's class order.
cm_svc = confusion_matrix(y_test, y_pred_svc, labels=svc_clf.classes_)

# Draw the matrix as a blue heat map with integer cell counts and a descriptive title.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_svc, display_labels=svc_clf.classes_)
disp.plot(cmap='Blues', values_format='d')
disp.ax_.set_title('Confusion Matrix for Support Vector Machines')
plt.show()