In [None]:
!pip install tensorflow scikit-learn numpy matplotlib


In [None]:
!pip install tensorflow

### Loading Images, Boundaries and Masks 

In [2]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split


train_dir = './Data/WCEcurated/train'
val_dir = './Data/WCEcurated/val'
test_dir = './Data/WCEcurated/test'


class_names = ['normal', 'ulcerative_colitis', 'polyps', 'esophagitis']
num_classes = len(class_names)

# load images and masks
def load_images_and_labels(data_dir):
    images = []
    labels = []

    for label, class_name in enumerate(class_names):
        class_dir = os.path.join(data_dir, f'{label}_{class_name}')
        for image_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, image_name)
            img = img_to_array(load_img(img_path, target_size=(224, 224)))
            images.append(img)
            labels.append(label)

    return np.array(images), np.array(labels)

# Load datasets
train_images, train_labels = load_images_and_labels(train_dir)
val_images, val_labels = load_images_and_labels(val_dir)
test_images, test_labels = load_images_and_labels(test_dir)


2024-09-19 10:54:54.397904: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-19 10:54:54.434911: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-19 10:54:54.435851: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Preprocessing 

#### Normalization

In [3]:
train_images = train_images / 255.0
val_images = val_images / 255.0
test_images = test_images / 255.0

### Feature Extraction using MobileNetV2 

#### MobileNetV2 will be used to extract features from the lesion images (from the images folder). Since these images are in RGB, we will use the default configuration of MobileNetV2. 
Parameters:

    weights='imagenet': Initializes the model with weights pre-trained on ImageNet.
    include_top=False: Removes the final classification layers, enabling us to extract features instead.
    input_shape=(224, 224, 3): Specifies the input shape for RGB images.

In [4]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D

# Load pre-trained MobileNetV2 without the classification head
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add a global average pooling layer to convert 4D feature maps to 2D feature vectors
x = base_model.output
x = GlobalAveragePooling2D()(x)

# Create a feature extractor model
feature_extractor = Model(inputs=base_model.input, outputs=x)


2024-09-19 10:57:04.673493: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [5]:
#Extracting Features
train_features = feature_extractor.predict(train_images)
val_features = feature_extractor.predict(val_images)
test_features = feature_extractor.predict(test_images)



#### Train an SVM classifier

We will use an SVM classifier with an RBF kernel. Before feeding the features into the SVM, we standardize them using a StandardScaler.

StandardScaler: Scales the features to have zero mean and unit variance, which improves the performance of SVM.

SVC(kernel='rbf'): The RBF kernel is suitable for non-linear classification problems like this one

In [6]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline with standard scaling and an SVM classifier
svm_model = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='auto'))

# Train the SVM classifier
svm_model.fit(train_features, train_labels)


#### Model evaluation 
Evaluate the trained SVM model on the validation and test datasets using accuracy and classification reports.

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.preprocessing import label_binarize

###Training 
# Accuracy
train_accuracy = accuracy_score(train_labels, train_predictions)
print(f"Training Accuracy: {train_accuracy}")

# Precision, Recall, F1-score 
train_precision, train_recall, train_f1_score, _ = precision_recall_fscore_support(train_labels, train_predictions, average='macro')
print(f"Training Precision: {train_precision}")
print(f"Training Recall: {train_recall}")
print(f"Training F1-score: {train_f1_score}")

# AUC 
train_labels_bin = label_binarize(train_labels, classes=[0, 1, 2, 3])
train_auc = roc_auc_score(train_labels_bin, svm_model.decision_function(train_features), average='macro')
print(f"Training AUC: {train_auc}")


Training Accuracy: 0.999375
Training Precision: 0.9993757802746566
Training Recall: 0.999375
Training F1-score: 0.9993749997558592
Training AUC: 0.9999977864583334


In [14]:
###Validation 
# Accuracy
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")

# Precision, Recall, F1-score (Macro-average)
val_precision, val_recall, val_f1_score, _ = precision_recall_fscore_support(val_labels, val_predictions, average='macro')
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")
print(f"Validation F1-score: {val_f1_score}")

# AUC (Macro-average)
val_labels_bin = label_binarize(val_labels, classes=[0, 1, 2, 3])
val_auc = roc_auc_score(val_labels_bin, svm_model.decision_function(val_features), average='macro')
print(f"Validation  AUC: {val_auc}")


Validation Accuracy: 0.95
Validation Precision: 0.9540954179644814
Validation Recall: 0.95
Validation F1-score: 0.9495300731060741
Validation  AUC: 0.993578


In [12]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions)

print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[200   0   0   0]
 [  0 162  37   1]
 [  0   3 197   0]
 [  0   0   0 200]]


In [13]:
from sklearn.metrics import recall_score

# Compute recall (sensitivity) for each class
sensitivity_per_class = recall_score(test_labels, test_predictions, average=None)

# Print sensitivity for each class
for i, sensitivity in enumerate(sensitivity_per_class):
    print(f"Sensitivity for class {i}: {sensitivity}")


Sensitivity for class 0: 1.0
Sensitivity for class 1: 0.81
Sensitivity for class 2: 0.985
Sensitivity for class 3: 1.0


In [14]:
specificity_per_class = []

for i in range(len(conf_matrix)):
    # True negatives: sum of all elements that are not in the i-th row or column
    TN = conf_matrix.sum() - (conf_matrix[i, :].sum() + conf_matrix[:, i].sum() - conf_matrix[i, i])
    # False positives: sum of the i-th column, excluding the diagonal element
    FP = conf_matrix[:, i].sum() - conf_matrix[i, i]
    # False negatives: sum of the i-th row, excluding the diagonal element
    FN = conf_matrix[i, :].sum() - conf_matrix[i, i]
    # True positives: diagonal element
    TP = conf_matrix[i, i]
    
    # Specificity
    specificity = TN / (TN + FP)
    specificity_per_class.append(specificity)

# Print specificity for each class
for i, specificity in enumerate(specificity_per_class):
    print(f"Specificity for class {i}: {specificity}")


Specificity for class 0: 1.0
Specificity for class 1: 0.995
Specificity for class 2: 0.9383333333333334
Specificity for class 3: 0.9983333333333333


In [17]:
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# Accuracy
accuracy = accuracy_score(test_labels, test_predictions)

# Classification error (loss)
classification_error = 1 - accuracy
print(f"Classification Error (Loss): {classification_error}")


Classification Error (Loss): 0.05125000000000002


#### Model optimization for Deployment 
To make the feature extraction model lightweight for embedded deployment, i converted it to TensorFlow Lite:

In [None]:
import tensorflow as tf

# Convert the feature extraction model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(feature_extractor)
tflite_model = converter.convert()

# Save the TFLite model to a file
with open('./Model/mobilenetv2_feature_extractor.tflite', 'wb') as f:
    f.write(tflite_model)


In [None]:
import joblib

# Save the trained SVM model
joblib.dump(svm_model, './Model/svm_model.pkl')


## With Data augmentation

In [1]:
import shutil
import os

original_dataset_dir = './Data/WCEcurated'
augmented_dataset_dir = './Data/WCEcurated_augmented'

# Copy the dataset to a new directory to not change the initial dataset 
shutil.copytree(original_dataset_dir, augmented_dataset_dir)
print(f"Dataset copied to {augmented_dataset_dir}")


Dataset copied to ./Data/WCEcurated_augmented


In [2]:
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
import numpy as np

# Paths to the dataset
dataset_dir = './Data/WCEcurated_augmented'  # The dataset copy where i'll augment the data 
splits = ['train', 'val', 'test']  

# Define ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# augment images in a given directory
def augment_images(class_dir, num_needed):
    image_filenames = os.listdir(class_dir)
    num_images = len(image_filenames)
    
    print(f"Augmenting {class_dir}, current images: {num_images}, need to add {num_needed}")
    
    for image_file in image_filenames:
        img_path = os.path.join(class_dir, image_file)
        img = load_img(img_path)
        x = img_to_array(img)
        x = np.expand_dims(x, axis=0)
        
        # Augment the image and save it to the class directory
        i = 0
        for batch in datagen.flow(x, batch_size=1, save_to_dir=class_dir, save_prefix='aug', save_format='jpg'):
            i += 1
            if i >= num_needed:  
                return

# Main augmentation process
for class_name in os.listdir(os.path.join(dataset_dir, 'train')):
    total_images = 0
    class_dirs = {split: os.path.join(dataset_dir, split, class_name) for split in splits}
    
    # Calculate total number of images for this class across all splits
    for split, class_dir in class_dirs.items():
        total_images += len(os.listdir(class_dir))
    
    # If total images are less than 3000, augment
    if total_images < 3000:
        num_to_add = 3000 - total_images
        print(f"Class '{class_name}' has {total_images} images, augmenting by {num_to_add} images.")

        # Distribute the augmentation across train, val, and test
        for split, class_dir in class_dirs.items():
            current_images = len(os.listdir(class_dir))
            proportion = current_images / total_images if total_images > 0 else 1
            num_needed = int(num_to_add * proportion)
            
            if num_needed > 0:
                augment_images(class_dir, num_needed)

print("Data augmentation complete!")



2024-09-12 16:53:19.667422: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-12 16:53:19.926707: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-12 16:53:19.928222: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Class '0_normal' has 1500 images, augmenting by 1500 images.
Augmenting ./Data/WCEcurated_augmented/train/0_normal, current images: 800, need to add 800
Augmenting ./Data/WCEcurated_augmented/val/0_normal, current images: 500, need to add 500
Augmenting ./Data/WCEcurated_augmented/test/0_normal, current images: 200, need to add 200
Class '1_ulcerative_colitis' has 1500 images, augmenting by 1500 images.
Augmenting ./Data/WCEcurated_augmented/train/1_ulcerative_colitis, current images: 800, need to add 800
Augmenting ./Data/WCEcurated_augmented/val/1_ulcerative_colitis, current images: 500, need to add 500
Augmenting ./Data/WCEcurated_augmented/test/1_ulcerative_colitis, current images: 200, need to add 200
Class '2_polyps' has 1500 images, augmenting by 1500 images.
Augmenting ./Data/WCEcurated_augmented/train/2_polyps, current images: 800, need to add 800
Augmenting ./Data/WCEcurated_augmented/val/2_polyps, current images: 500, need to add 500
Augmenting ./Data/WCEcurated_augmented/te

In [36]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split

# Define paths to your dataset
train_dir = './Data/WCEcurated_augmented/train'
val_dir = './Data/WCEcurated_augmented/val'
test_dir = './Data/WCEcurated_augmented/test'

# Define class labels
class_names = ['normal', 'ulcerative_colitis', 'polyps', 'esophagitis']
num_classes = len(class_names)

# Function to load images and masks
def load_images_and_labels(data_dir):
    images = []
    labels = []

    for label, class_name in enumerate(class_names):
        class_dir = os.path.join(data_dir, f'{label}_{class_name}')
        for image_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, image_name)
            img = img_to_array(load_img(img_path, target_size=(224, 224)))
            images.append(img)
            labels.append(label)

    return np.array(images), np.array(labels)

# Load datasets
train_images, train_labels = load_images_and_labels(train_dir)
val_images, val_labels = load_images_and_labels(val_dir)
test_images, test_labels = load_images_and_labels(test_dir)


In [37]:
train_images = train_images / 255.0
val_images = val_images / 255.0
test_images = test_images / 255.0

In [38]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.layers import GlobalAveragePooling2D
import numpy as np
from tensorflow.keras.layers import Dropout, Dense
# Load pre-trained MobileNetV2 without the classification head
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add a global average pooling layer to convert 4D feature maps to 2D feature vectors
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.8)(x)  # Increased dropout rate


# Create a feature extractor model
feature_extractor = Model(inputs=base_model.input, outputs=x)

# `train_images`, `val_images`, and `test_images` are arrays of the augmented dataset images
train_features = feature_extractor.predict(train_images)
val_features = feature_extractor.predict(val_images)
test_features = feature_extractor.predict(test_images)

# Normalize the features if necessary
train_features = np.array(train_features)
val_features = np.array(val_features)
test_features = np.array(test_features)





In [39]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline with standard scaling and an SVM classifier
svm_model = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='auto',probability=True))

# Train the SVM classifier on the training features and labels
svm_model.fit(train_features, train_labels)


In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Make predictions and obtain probabilities
train_predictions = svm_model.predict(train_features)
val_predictions = svm_model.predict(val_features)
test_predictions = svm_model.predict(test_features)

# Obtain probability estimates
train_proba = svm_model.predict_proba(train_features)
val_proba = svm_model.predict_proba(val_features)

# Calculate Accuracy
train_accuracy = accuracy_score(train_labels, train_predictions)
val_accuracy = accuracy_score(val_labels, val_predictions)

# Calculate Precision
train_precision = precision_score(train_labels, train_predictions, average='macro')
val_precision = precision_score(val_labels, val_predictions, average='macro')

# Calculate Recall 
train_recall = recall_score(train_labels, train_predictions, average='macro')
val_recall = recall_score(val_labels, val_predictions, average='macro')

# Calculate AUC 
train_auc = roc_auc_score(train_labels, train_proba, multi_class='ovo', average='macro')
val_auc = roc_auc_score(val_labels, val_proba, multi_class='ovo', average='macro')

print(f"Training Accuracy: {train_accuracy}")
print(f"Validation Accuracy: {val_accuracy}")
print(f"Training Precision: {train_precision}")
print(f"Validation Precision: {val_precision}")
print(f"Training Recall: {train_recall}")
print(f"Validation Recall: {val_recall}")
print(f"Training AUC: {train_auc}")
print(f"Validation AUC: {val_auc}")


Training Accuracy: 0.9998403830806065
Validation Accuracy: 0.9777158774373259
Training Precision: 0.9998404594767071
Validation Precision: 0.977993790638009
Training Recall: 0.9998402555910543
Validation Recall: 0.9777220125316699
Training AUC: 0.9999999320161811
Validation AUC: 0.9978350986805634


In [43]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
conf_matrix = confusion_matrix(test_labels, test_predictions)

print("Confusion Matrix:")
print(conf_matrix)

#Specificity per class 
specificity_per_class = []

for i in range(len(conf_matrix)):
    # True negatives: sum of all elements that are not in the i-th row or column
    TN = conf_matrix.sum() - (conf_matrix[i, :].sum() + conf_matrix[:, i].sum() - conf_matrix[i, i])
    # False positives: sum of the i-th column, excluding the diagonal element
    FP = conf_matrix[:, i].sum() - conf_matrix[i, i]
    # False negatives: sum of the i-th row, excluding the diagonal element
    FN = conf_matrix[i, :].sum() - conf_matrix[i, i]
    # True positives: diagonal element
    TP = conf_matrix[i, i]
    
    # Specificity
    specificity = TN / (TN + FP)
    specificity_per_class.append(specificity)

# Print specificity for each class
for i, specificity in enumerate(specificity_per_class):
    print(f"Specificity for class {i}: {specificity}")


Confusion Matrix:
[[393   6   0   0]
 [  0 369  27   1]
 [  0   4 394   0]
 [  0   0   0 397]]
Specificity for class 0: 1.0
Specificity for class 1: 0.9916247906197655
Specificity for class 2: 0.9773679798826488
Specificity for class 3: 0.9991624790619765


In [44]:
from sklearn.metrics import recall_score

# Compute recall (sensitivity) for each class
sensitivity_per_class = recall_score(test_labels, test_predictions, average=None)

# Print sensitivity for each class
for i, sensitivity in enumerate(sensitivity_per_class):
    print(f"Sensitivity for class {i}: {sensitivity}")


Sensitivity for class 0: 0.9849624060150376
Sensitivity for class 1: 0.929471032745592
Sensitivity for class 2: 0.9899497487437185
Sensitivity for class 3: 1.0
