In [3]:
# Import Libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Set the directory path to the main folder containing the class subdirectories
data_dir = "../input/diabetic-retinopathy-224x224-gaussian-filtered/gaussian_filtered_images/gaussian_filtered_images"

# Create an ImageDataGenerator with augmentation and validation split
datagen = ImageDataGenerator(
    rescale=1./255,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    validation_split=0.2
)

# Training data generator
train_data = datagen.flow_from_directory(
    data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='training'
)

# Validation data generator
valid_data = datagen.flow_from_directory(
    data_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)


Found 2931 images belonging to 5 classes.
Found 731 images belonging to 5 classes.


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Define a simple CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(train_data.num_classes, activation='softmax')  # Use number of classes from train_data
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=10
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load a pretrained ResNet50 model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
feature_extractor = Model(inputs=base_model.input, outputs=x)

# Extract features for training and validation sets
train_features = feature_extractor.predict(train_data)
train_labels = train_data.classes

val_features = feature_extractor.predict(valid_data)
val_labels = valid_data.classes


In [9]:
# SVM Classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(train_features, train_labels)
y_pred_svm = svm_classifier.predict(val_features)
print("SVM Classifier Report:\n", classification_report(val_labels, y_pred_svm))

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(train_features, train_labels)
y_pred_rf = rf_classifier.predict(val_features)
print("Random Forest Classifier Report:\n", classification_report(val_labels, y_pred_rf))


  _warn_prf(average, modifier, msg_start, len(result))


SVM Classifier Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        74
           1       0.00      0.00      0.00       199
           2       0.49      1.00      0.66       361
           3       0.00      0.00      0.00        59
           4       0.00      0.00      0.00        38

    accuracy                           0.49       731
   macro avg       0.10      0.20      0.13       731
weighted avg       0.24      0.49      0.33       731

Random Forest Classifier Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        74
           1       0.26      0.09      0.13       199
           2       0.49      0.91      0.64       361
           3       0.00      0.00      0.00        59
           4       0.00      0.00      0.00        38

    accuracy                           0.47       731
   macro avg       0.15      0.20      0.15       731
weighted avg       

  _warn_prf(average, modifier, msg_start, len(result))


# Multi-Stage Models


In [10]:
# Create binary labels for Healthy (0) and Not Healthy (1-4)
train_labels_binary = (train_labels > 0).astype(int)
val_labels_binary = (val_labels > 0).astype(int)

# Train an SVM classifier for binary classification
binary_classifier = SVC(kernel='linear')
binary_classifier.fit(train_features, train_labels_binary)

# Predict and evaluate
y_pred_binary = binary_classifier.predict(val_features)
print("Binary Classifier Report:\n", classification_report(val_labels_binary, y_pred_binary))


Binary Classifier Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        74
           1       0.90      1.00      0.95       657

    accuracy                           0.90       731
   macro avg       0.45      0.50      0.47       731
weighted avg       0.81      0.90      0.85       731



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Filter out only the non-healthy samples for severity classification
non_healthy_train_features = train_features[train_labels > 0]
non_healthy_train_labels = train_labels[train_labels > 0]
non_healthy_val_features = val_features[val_labels > 0]
non_healthy_val_labels = val_labels[val_labels > 0]

# Train a Random Forest classifier for severity levels
severity_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
severity_classifier.fit(non_healthy_train_features, non_healthy_train_labels)

# Predict and evaluate
y_pred_severity = severity_classifier.predict(non_healthy_val_features)
print("Severity Classifier Report:\n", classification_report(non_healthy_val_labels, y_pred_severity))


Severity Classifier Report:
               precision    recall  f1-score   support

           1       0.18      0.05      0.07       199
           2       0.54      0.91      0.68       361
           3       0.00      0.00      0.00        59
           4       0.00      0.00      0.00        38

    accuracy                           0.51       657
   macro avg       0.18      0.24      0.19       657
weighted avg       0.35      0.51      0.39       657



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Function to use the hybrid model pipeline on new images
def classify_image(image):
    """
    Classify a single image using the hybrid model pipeline.
    
    Args:
    - image (numpy array): Image array of shape (224, 224, 3)
    
    Returns:
    - str: Classification result as a string.
    """
    # Preprocess the image
    image = tf.image.resize(image, img_size) / 255.0
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    
    # Step 1: Feature extraction
    feature = feature_extractor.predict(image)
    
    # Step 2: Binary classification (Healthy vs Not Healthy)
    is_not_healthy = binary_classifier.predict(feature)
    
    if is_not_healthy == 0:
        return "No_DR"  # Healthy
    else:
        # Step 3: Severity classification
        severity = severity_classifier.predict(feature)
        if severity == 1:
            return "Mild"
        elif severity == 2:
            return "Moderate"
        elif severity == 3:
            return "Severe"
        elif severity == 4:
            return "Proliferate_DR"  # Corrected quotation mark here


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve
import numpy as np

# Define function to calculate metrics
def calculate_metrics(y_true, y_pred, average='weighted'):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=average, zero_division=0)
    recall = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average, zero_division=0)
    
    # For AUC-ROC, ensure binary labels
    if len(np.unique(y_true)) == 2:  # binary classification
        auc = roc_auc_score(y_true, y_pred)
    else:
        auc = "N/A for multiclass without probabilities"
        
    return accuracy, precision, recall, f1, auc

# Metrics for SVM Classifier
svm_accuracy, svm_precision, svm_recall, svm_f1, svm_auc = calculate_metrics(val_labels, y_pred_svm)
print("SVM Classifier Metrics:")
print(f"Accuracy: {svm_accuracy}, Precision: {svm_precision}, Recall: {svm_recall}, F1 Score: {svm_f1}, AUC-ROC: {svm_auc}")

# Metrics for Random Forest Classifier
rf_accuracy, rf_precision, rf_recall, rf_f1, rf_auc = calculate_metrics(val_labels, y_pred_rf)
print("Random Forest Classifier Metrics:")
print(f"Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1 Score: {rf_f1}, AUC-ROC: {rf_auc}")


SVM Classifier Metrics:
Accuracy: 0.493844049247606, Precision: 0.2438819449772719, Recall: 0.493844049247606, F1 Score: 0.3265159373230509, AUC-ROC: N/A for multiclass without probabilities
Random Forest Classifier Metrics:
Accuracy: 0.4719562243502052, Precision: 0.3140668631496512, Recall: 0.4719562243502052, F1 Score: 0.35098779134295227, AUC-ROC: N/A for multiclass without probabilities


In [15]:
# Binary Classifier Metrics
binary_accuracy, binary_precision, binary_recall, binary_f1, binary_auc = calculate_metrics(val_labels_binary, y_pred_binary)
print("Binary Classifier Metrics:")
print(f"Accuracy: {binary_accuracy}, Precision: {binary_precision}, Recall: {binary_recall}, F1 Score: {binary_f1}, AUC-ROC: {binary_auc}")


Binary Classifier Metrics:
Accuracy: 0.8987688098495212, Precision: 0.8077853735583249, Recall: 0.8987688098495212, F1 Score: 0.8508517407365065, AUC-ROC: 0.5


In [16]:
# Severity Classifier Metrics
severity_accuracy, severity_precision, severity_recall, severity_f1, severity_auc = calculate_metrics(non_healthy_val_labels, y_pred_severity)
print("Severity Classifier Metrics:")
print(f"Accuracy: {severity_accuracy}, Precision: {severity_precision}, Recall: {severity_recall}, F1 Score: {severity_f1}, AUC-ROC: {severity_auc}")


Severity Classifier Metrics:
Accuracy: 0.5114155251141552, Precision: 0.3499462355582859, Recall: 0.5114155251141552, F1 Score: 0.39342310870602015, AUC-ROC: N/A for multiclass without probabilities


In [17]:
from sklearn.preprocessing import label_binarize

# Example: Calculate AUC-ROC for multiclass using one-vs-rest approach
if hasattr(severity_classifier, "predict_proba"):
    non_healthy_val_labels_binarized = label_binarize(non_healthy_val_labels, classes=[1, 2, 3, 4])
    severity_proba = severity_classifier.predict_proba(non_healthy_val_features)
    multiclass_auc = roc_auc_score(non_healthy_val_labels_binarized, severity_proba, average="macro", multi_class="ovr")
    print("Multiclass Severity Classifier AUC-ROC:", multiclass_auc)
else:
    print("AUC-ROC not available as probabilities are not provided by this classifier.")


Multiclass Severity Classifier AUC-ROC: 0.47619311613933757


# 1. SVM Classifier

# SVM doesn't support predict_proba by default. However, you can use the SVC with the probability=True parameter, which enables probability predictions (note: this makes SVM training slower). Here’s how to modify the SVM section:



In [18]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# Modify SVM classifier to enable probability predictions
svm_classifier = SVC(kernel='linear', probability=True)
svm_classifier.fit(train_features, train_labels)
y_pred_svm_proba = svm_classifier.predict_proba(val_features)

# Calculate AUC-ROC for multiclass
val_labels_binarized = label_binarize(val_labels, classes=[0, 1, 2, 3, 4])
svm_auc = roc_auc_score(val_labels_binarized, y_pred_svm_proba, average="macro", multi_class="ovr")
print("SVM Classifier AUC-ROC:", svm_auc)


SVM Classifier AUC-ROC: 0.4891482737103311


# 2. Random Forest Classifier
# Random Forest has a predict_proba method, so you can directly use it to get the probability scores.

In [19]:
# Get probability predictions for Random Forest
y_pred_rf_proba = rf_classifier.predict_proba(val_features)

# Calculate AUC-ROC for multiclass
rf_auc = roc_auc_score(val_labels_binarized, y_pred_rf_proba, average="macro", multi_class="ovr")
print("Random Forest Classifier AUC-ROC:", rf_auc)


Random Forest Classifier AUC-ROC: 0.49953714768644736


# 3. Binary Classifier (Healthy vs. Not Healthy)

In [22]:
# Initialize the SVM classifier with probability enabled
binary_classifier = SVC(kernel='linear', probability=True)
binary_classifier.fit(train_features, train_labels_binary)

# Get probability predictions for the binary classifier
y_pred_binary_proba = binary_classifier.predict_proba(val_features)[:, 1]  # Probability of class 1

# Calculate binary AUC-ROC
binary_auc = roc_auc_score(val_labels_binary, y_pred_binary_proba)
print("Binary Classifier AUC-ROC:", binary_auc)


Binary Classifier AUC-ROC: 0.481879139413386


# 4. Severity Classifier (Multiclass)


In [21]:
# Get probability predictions for the severity classifier
if hasattr(severity_classifier, "predict_proba"):
    severity_proba = severity_classifier.predict_proba(non_healthy_val_features)
    non_healthy_val_labels_binarized = label_binarize(non_healthy_val_labels, classes=[1, 2, 3, 4])
    
    severity_auc = roc_auc_score(non_healthy_val_labels_binarized, severity_proba, average="macro", multi_class="ovr")
    print("Severity Classifier AUC-ROC:", severity_auc)
else:
    print("AUC-ROC not available as probabilities are not provided by the severity classifier.")


Severity Classifier AUC-ROC: 0.47619311613933757


# OVERALL

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# Function to calculate and print metrics
def print_metrics(name, y_true, y_pred, y_proba=None, average='weighted'):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=average, zero_division=0)
    recall = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average, zero_division=0)
    
    if y_proba is not None:
        # Binary or multiclass AUC calculation based on provided probabilities
        if len(set(y_true)) == 2:  # binary
            auc = roc_auc_score(y_true, y_proba)
        else:  # multiclass
            y_true_binarized = label_binarize(y_true, classes=list(set(y_true)))
            auc = roc_auc_score(y_true_binarized, y_proba, average="macro", multi_class="ovr")
    else:
        auc = "N/A"

    print(f"{name} Metrics:")
    print(f"  Accuracy: {accuracy}")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}")
    print(f"  AUC-ROC Score: {auc}")
    print("")

# SVM Classifier
print_metrics(
    "SVM Classifier",
    val_labels,
    svm_classifier.predict(val_features),
    svm_classifier.predict_proba(val_features)
)

# Random Forest Classifier
print_metrics(
    "Random Forest Classifier",
    val_labels,
    rf_classifier.predict(val_features),
    rf_classifier.predict_proba(val_features)
)

# Binary Classifier (Healthy vs Not Healthy)
print_metrics(
    "Binary Classifier",
    val_labels_binary,
    binary_classifier.predict(val_features),
    binary_classifier.predict_proba(val_features)[:, 1]
)

# Severity Classifier (Multiclass)
if hasattr(severity_classifier, "predict_proba"):
    print_metrics(
        "Severity Classifier",
        non_healthy_val_labels,
        severity_classifier.predict(non_healthy_val_features),
        severity_classifier.predict_proba(non_healthy_val_features)
    )
else:
    print("Severity Classifier AUC-ROC not available as probabilities are not provided.")


SVM Classifier Metrics:
  Accuracy: 0.493844049247606
  Precision: 0.2438819449772719
  Recall: 0.493844049247606
  F1 Score: 0.3265159373230509
  AUC-ROC Score: 0.4891482737103311

Random Forest Classifier Metrics:
  Accuracy: 0.4719562243502052
  Precision: 0.3140668631496512
  Recall: 0.4719562243502052
  F1 Score: 0.35098779134295227
  AUC-ROC Score: 0.49953714768644736

Binary Classifier Metrics:
  Accuracy: 0.8987688098495212
  Precision: 0.8077853735583249
  Recall: 0.8987688098495212
  F1 Score: 0.8508517407365065
  AUC-ROC Score: 0.481879139413386

Severity Classifier Metrics:
  Accuracy: 0.5114155251141552
  Precision: 0.3499462355582859
  Recall: 0.5114155251141552
  F1 Score: 0.39342310870602015
  AUC-ROC Score: 0.47619311613933757

