In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

#### Funtion to retrieve from the folder having structure
data/
    train/
        class1/
            image1.png
            .
            .
        class2/
            .
            .
    test/
        .
    valid/
        .


In [2]:
def load_data(main_foler_path):
    images=[]
    labels=[]
    
    for data_type in os.listdir(main_foler_path):
        type_foler_path=os.path.join(main_foler_path,data_type)
        
        if not os.path.isdir(type_foler_path):
            continue
        for class_folder in os.listdir(type_foler_path):
            if class_folder.startswith('.'):
                continue
                
            class_folder_path=os.path.join(type_foler_path,class_folder)

            if not os.path.isdir(class_folder_path):
                continue

            for filename in tqdm(os.listdir(class_folder_path)):
                if filename.endswith(".png") or filename.endswith(".jpg") or filename.endswith(".jpeg"):
                    image_path=os.path.join(class_folder_path,filename)
                    img=load_img(image_path,target_size=(224,224))
                    img=img_to_array(img)
                    label=class_folder

                    images.append(img)
                    labels.append(label)
    return np.array(images),np.array(labels)

#### Images folder

In [3]:
main_folder_path='data'

In [4]:
X,y=load_data(main_folder_path)

100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 269.97it/s]
100%|██████████████████████████████████████████| 21/21 [00:00<00:00, 347.97it/s]
100%|██████████████████████████████████████████| 23/23 [00:00<00:00, 344.08it/s]
100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 94.31it/s]
100%|██████████████████████████████████████████| 90/90 [00:00<00:00, 250.74it/s]
100%|██████████████████████████████████████████| 51/51 [00:00<00:00, 272.98it/s]
100%|████████████████████████████████████████| 120/120 [00:00<00:00, 241.58it/s]
100%|██████████████████████████████████████████| 54/54 [00:00<00:00, 108.54it/s]
100%|████████████████████████████████████████| 155/155 [00:00<00:00, 303.97it/s]
100%|████████████████████████████████████████| 115/115 [00:00<00:00, 315.56it/s]
100%|████████████████████████████████████████| 195/195 [00:00<00:00, 306.91it/s]
100%|████████████████████████████████████████| 148/148 [00:01<00:00, 137.05it/s]


## Image segmentaion and feature extraction using the K means clustering and pre defined model VGG16

#### Image segmentation

In [5]:
kmeans=KMeans(n_clusters=2,random_state=42)
X_flattened_images=X.reshape(X.shape[0],-1)
kmeans.fit(X_flattened_images)



In [6]:
X_image_clusters=kmeans.predict(X_flattened_images)

#### Encoding labels to 0,1,2 and 3

In [7]:
from sklearn.preprocessing import OrdinalEncoder
encoder=OrdinalEncoder()
y_encoded=encoder.fit_transform(y.reshape(-1,1)).flatten()

#### Data splitting

In [8]:
X_train,X_test,y_train,y_test,X_cluster_train,X_cluster_test=train_test_split(X,y_encoded,
                                                                             X_image_clusters,
                                                                             test_size=0.3,
                                                                             random_state=42)

#### Loading VGG16 model for feature extraction

In [258]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
base_model=VGG16(weights='imagenet',include_top=False,input_shape=(224,224,3))
# Freeze VGG16 layers
for layer in base_model.layers:
    layer.trainable = False

#### Preprocessing image before feature extraction

In [259]:
X_train_preprocessed=preprocess_input(X_train)
X_test_preprocessed=preprocess_input(X_test)

#### Extracting features

In [260]:
X_train_features=base_model.predict(X_train_preprocessed)
X_test_features=base_model.predict(X_test_preprocessed)



#### Reshapping training and testing set

In [261]:
X_train_flatten=X_train_features.reshape(X_train_features.shape[0],-1)
X_test_flatten=X_test_features.reshape(X_test_features.shape[0],-1)

# Model building

### SVM model

In [247]:
from sklearn.svm import SVC
svm_clf_model=SVC(kernel='linear',C=1.0)
svm_clf_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),
                 y_train)
y_pred_svm=svm_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_svm=accuracy_score(y_pred_svm,y_test)
print(f"Accuracy of SVM classifier : {accuracy_score_svm*100:.2f}%")

Accuracy of SVM classifier : 97.67%


### Logistic Regression model

In [1]:
from sklearn.linear_model import LogisticRegression
logr_clf_model=LogisticRegression(random_state=42)
logr_clf_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),
                 y_train)
y_pred_logr=logr_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_logr=accuracy_score(y_pred_logr,y_test)
print(f"Accuracy of Logistic Regression classifier : {accuracy_score_logr*100:.2f}%")

python(2135) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


NameError: name 'np' is not defined

### SVM and Logistic regression performance measures

In [258]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming svm_clf and logr_clf are your trained SVM and Logistic Regression models
# y_test, y_pred_svm, and y_pred_logr are your test labels and predictions

# SVM
svm_precision = precision_score(y_test, y_pred_svm, average='weighted')
svm_recall = recall_score(y_test, y_pred_svm, average='weighted')
svm_f1 = f1_score(y_test, y_pred_svm, average='weighted')

# Logistic Regression
logr_precision = precision_score(y_test, y_pred_logr, average='weighted')
logr_recall = recall_score(y_test, y_pred_logr, average='weighted')
logr_f1 = f1_score(y_test, y_pred_logr, average='weighted')
print(f"Accuracy of SVM classifier : {accuracy_score_svm*100:.2f}%")
print(f"SVM Precision: {svm_precision*100:.2f}%")
print(f"SVM Recall: {svm_recall*100:.2f}%")
print(f"SVM F1 Score: {svm_f1*100:.2f}%")
print(f"Accuracy of Logistic Regression classifier : {accuracy_score_logr*100:.2f}%")
print(f"Logistic Regression Precision: {logr_precision*100:.2f}%")
print(f"Logistic Regression Recall: {logr_recall*100:.2f}%")
print(f"Logistic Regression F1 Score: {logr_f1*100:.2f}%")

Accuracy of SVM classifier : 97.67%
SVM Precision: 97.69%
SVM Recall: 97.67%
SVM F1 Score: 97.67%
Accuracy of Logistic Regression classifier : 96.67%
Logistic Regression Precision: 96.70%
Logistic Regression Recall: 96.67%
Logistic Regression F1 Score: 96.67%


## Other different models

### XGBoost model

In [200]:
from xgboost import XGBClassifier
xgb_clf_model=XGBClassifier()
xgb_clf_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),
                 y_train)
y_pred_xgb=xgb_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_xgb=accuracy_score(y_pred_xgb,y_test)
print(f"Accuracy of XGBoost classifier : {accuracy_score_xgb*100:.2f}%")

Accuracy of XGBoost classifier : 92.67%


### Decision Tree model

In [207]:
from sklearn.tree import DecisionTreeClassifier
dt_clf_model=DecisionTreeClassifier(max_depth=10)
dt_clf_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),y_train)
y_pred_dt=dt_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_dt=accuracy_score(y_pred_dt,y_test)
print(f"Accuracy of Decision Tree classifier : {accuracy_score_dt*100:.2f}%")

Accuracy of Decision Tree classifier : 72.33%


### Random Forest model

In [209]:
from sklearn.ensemble import RandomForestClassifier
rf_clf_model=RandomForestClassifier(max_depth=100)
rf_clf_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),y_train)
y_pred_rf=rf_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_rf=accuracy_score(y_pred_rf,y_test)
print(f"Accuracy of Random Forest classifier : {accuracy_score_rf*100:.2f}%")

Accuracy of Random Forest classifier : 91.00%


### Ensemble model with voting classifier model

In [210]:
from sklearn.ensemble import VotingClassifier
voting_model=VotingClassifier(estimators=[('Random Forest',RandomForestClassifier()),
                                          ('XGBoost',XGBClassifier())],voting='soft')
voting_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),y_train)
y_pred_vm=rf_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_vm=accuracy_score(y_pred_vm,y_test)
print(f"Accuracy of Voting classifier : {accuracy_score_vm*100:.2f}%")

Accuracy of Voting classifier : 91.00%


### KNN model

In [211]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf_model=KNeighborsClassifier(n_neighbors=1)
knn_clf_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),y_train)
y_pred_knn=knn_clf_model.predict(np.concatenate([X_test_flatten,X_cluster_test.reshape(-1,1)],axis=1))
accuracy_score_knn=accuracy_score(y_pred_knn,y_test)
print(f"Accuracy of KNN classifier : {accuracy_score_knn*100:.2f}%")

Accuracy of KNN classifier : 95.67%


## Forward Propagation Neural Network(VGG16 based)

#### Reshapping VGG16 processed data

In [270]:
input_shape=X_train_flatten.shape[1]+1

#### Building neural network

In [346]:
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
import tensorflow as tf

custom_model = models.Sequential()
custom_model.add(Flatten(input_shape=(input_shape,)))
custom_model.add(Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)))
custom_model.add(BatchNormalization())
custom_model.add(Dropout(0.3))
# custom_model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)))
# custom_model.add(BatchNormalization())
# custom_model.add(Dropout(0.3))
custom_model.add(Dense(len(np.unique(y_train)), activation='softmax'))
# Compile the model
custom_model.compile(optimizer='adam',
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

# Train the model
history = custom_model.fit(np.concatenate([X_train_flatten,X_cluster_train.reshape(-1,1)],axis=1),
    y_train,
    epochs=10,
    batch_size=32)

# Evaluate the model on the test set
y_pred_vgg16 = np.argmax(custom_model.predict(np.concatenate([X_test_flatten, X_cluster_test.reshape(-1, 1)], axis=1)), axis=1)
accuracy_score_vgg16 = accuracy_score(y_pred_vgg16, y_test)
print(f"Accuracy of VGG16-based classifier: {accuracy_score_vgg16*100:.2f}%")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy of VGG16-based classifier: 99.33%


In [347]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import numpy as np


# Precision
precision_vgg16 = precision_score(y_test, y_pred_vgg16, average='weighted')

# Recall
recall_vgg16 = recall_score(y_test, y_pred_vgg16, average='weighted')

# F1 Score
f1_score_vgg16 = f1_score(y_test, y_pred_vgg16, average='weighted')

# AUC Score for Multiclass
# Binarize the labels and calculate AUC for each class
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
y_pred_bin = label_binarize(y_pred_vgg16, classes=np.unique(y_test))

n_classes = len(np.unique(y_test))
auc_score_vgg16 = np.zeros(n_classes)

for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
    auc_score_vgg16[i] = auc(fpr, tpr)

# Average the AUC scores across all classes
average_auc_score_vgg16 = np.mean(auc_score_vgg16)

# Confusion Matrix for Multiclass
conf_matrix_vgg16 = confusion_matrix(y_test, y_pred_vgg16)

# Specificity
specificity_vgg16 = np.diag(conf_matrix_vgg16) / np.sum(conf_matrix_vgg16, axis=1)

# Print the results
print(f"Precision of VGG16-based classifier: {precision_vgg16:.2f}")
print(f"Recall of VGG16-based classifier: {recall_vgg16:.2f}")
print(f"F1 Score of VGG16-based classifier: {f1_score_vgg16:.2f}")
print(f"AUC of VGG16-based classifier: {average_auc_score_vgg16:.2f}")
print(f"Specificity of VGG16-based classifier: {np.mean(specificity_vgg16):.2f}")


Precision of VGG16-based classifier: 0.99
Recall of VGG16-based classifier: 0.99
F1 Score of VGG16-based classifier: 0.99
AUC of VGG16-based classifier: 1.00
Specificity of VGG16-based classifier: 0.99


In [291]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, cohen_kappa_score, roc_auc_score
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization
import tensorflow as tf

# Assuming X_train_flatten, X_cluster_train, y_train are your training data
X_train_combined = np.concatenate([X_train_flatten, X_cluster_train.reshape(-1, 1)], axis=1)
y_train_array = np.array(y_train)

# Initialize StratifiedKFold with 5 folds
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store evaluation metrics for each fold
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
mcc_scores = []
kappa_scores = []
roc_auc_scores = []

# Iterate through each fold
for train_index, test_index in kf.split(X_train_combined, y_train_array):
    X_train_fold, X_val_fold = X_train_combined[train_index], X_train_combined[test_index]
    y_train_fold, y_val_fold = y_train_array[train_index], y_train_array[test_index]

    # Create and compile the model
    custom_model = models.Sequential()
    custom_model.add(Flatten(input_shape=(input_shape,)))
    custom_model.add(Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.0001)))
    custom_model.add(BatchNormalization())
    custom_model.add(Dropout(0.3))
    custom_model.add(Dense(len(np.unique(y_train)), activation='softmax'))
    custom_model.compile(optimizer='adam',
                         loss='sparse_categorical_crossentropy',
                         metrics=['accuracy'])

    # Train the model on the current fold
    custom_model.fit(X_train_fold, y_train_fold, epochs=10, batch_size=32, verbose=0)

    # Evaluate the model on the validation set of the current fold
    y_pred_fold = np.argmax(custom_model.predict(X_val_fold), axis=1)
    
    # Calculate evaluation metrics for the current fold
    accuracy_fold = accuracy_score(y_val_fold, y_pred_fold)
    precision_fold = precision_score(y_val_fold, y_pred_fold, average='weighted')
    recall_fold = recall_score(y_val_fold, y_pred_fold, average='weighted')
    f1_fold = f1_score(y_val_fold, y_pred_fold, average='weighted')
    mcc_fold = matthews_corrcoef(y_val_fold, y_pred_fold)
    kappa_fold = cohen_kappa_score(y_val_fold, y_pred_fold)
    roc_auc_fold = roc_auc_score(y_val_fold, custom_model.predict(X_val_fold), multi_class='ovr')

    # Append scores to respective lists
    accuracy_scores.append(accuracy_fold)
    precision_scores.append(precision_fold)
    recall_scores.append(recall_fold)
    f1_scores.append(f1_fold)
    mcc_scores.append(mcc_fold)
    kappa_scores.append(kappa_fold)
    roc_auc_scores.append(roc_auc_fold)

# Calculate the average scores over all folds
average_accuracy = np.mean(accuracy_scores)
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1 = np.mean(f1_scores)
average_mcc = np.mean(mcc_scores)
average_kappa = np.mean(kappa_scores)
average_roc_auc = np.mean(roc_auc_scores)

# Print the average scores
print(f"Average Accuracy over 5 folds: {average_accuracy*100:.2f}%")
print(f"Average Precision over 5 folds: {average_precision*100:.2f}")
print(f"Average Recall over 5 folds: {average_recall*100:.2f}")
print(f"Average F1 Score over 5 folds: {average_f1*100:.2f}")
print(f"Average Matthews Correlation Coefficient (MCC) over 5 folds: {average_mcc:.4f}")
print(f"Average Cohen's Kappa over 5 folds: {average_kappa:.4f}")
print(f"Average ROC AUC Score over 5 folds: {average_roc_auc:.4f}")


Average Accuracy over 5 folds: 95.43%
Average Precision over 5 folds: 95.51
Average Recall over 5 folds: 95.43
Average F1 Score over 5 folds: 95.42
Average Matthews Correlation Coefficient (MCC) over 5 folds: 0.9384
Average Cohen's Kappa over 5 folds: 0.9381
Average ROC AUC Score over 5 folds: 0.9912
