 <span style="font-size:28px;"> **SKIN LESION CLASSIFICATION - VGG16 FOR FEATURE EXTRACTION.** </span>


The HAM10000 dataset has 7 different classes of skin lesion which are listed below :
1. Melanocytic nevi
2. Melanoma
3. Benign keratosis-like lesions
4. Basal cell carcinoma
5. Actinic keratoses
6. Vascular lesions
7. Dermatofibroma

 <span style="font-size:28px;"> **Importing and Installing Essential Libraries.** </span>

In [1]:
from numpy.random import seed
seed(101)
import tensorflow as tf
tf.random.set_seed(101)
import cv2
import tensorflowjs as tfjs
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten


import pandas as pd
import numpy as np
#import keras
#from keras import backend as K

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline

# Directory Structure

We create folders to store images that will later be fed to Keras generators

In [None]:
base_dir = 'base_dir'
os.mkdir(base_dir)

# create a path to 'base_dir' to which we will join the names of the new folders
# train_dir
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)


test_dir = os.path.join(base_dir, 'test_dir')
os.mkdir(test_dir)

# create new folders inside train_dir
nv = os.path.join(train_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)


# create new folders inside test_dir
nv = os.path.join(test_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(test_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(test_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(test_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(test_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(test_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(test_dir, 'df')
os.mkdir(df)

In [2]:
df_data = pd.read_csv('/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv')
df_data.head()


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [4]:
# How many images in each class on HAM10000 dataset
df_data['dx'].value_counts()

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

In [6]:
#X = df_data.drop('dx', axis=1)
y = df_data['dx']

df_train, df_test = train_test_split(df_data, test_size=0.1, random_state=42, stratify=y)

print(df_train.shape)
print(df_test.shape)

(9013, 7)
(1002, 7)


In [7]:
#How many image each class in train set?
df_train['dx'].value_counts()

dx
nv       6034
mel      1002
bkl       989
bcc       463
akiec     294
vasc      128
df        103
Name: count, dtype: int64

In [8]:
#How many image each class in test set?
df_test['dx'].value_counts()

dx
nv       671
mel      111
bkl      110
bcc       51
akiec     33
vasc      14
df        12
Name: count, dtype: int64

# Now we will transfer the images into the created subfolders

In [10]:
# Set the image_id as the index in df_data
df_data.set_index('image_id', inplace=True, drop=False)
df_data

Unnamed: 0_level_0,lesion_id,image_id,dx,dx_type,age,sex,localization
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ISIC_0027419,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
ISIC_0025030,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
ISIC_0026769,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
ISIC_0025661,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
ISIC_0031633,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear
...,...,...,...,...,...,...,...
ISIC_0033084,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen
ISIC_0033550,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen
ISIC_0033536,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen
ISIC_0032854,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face


In [11]:
# Get a list of images in each of the two folders
folder_1 = os.listdir('../input/skin-cancer-mnist-ham10000/ham10000_images_part_1')
folder_2 = os.listdir('../input/skin-cancer-mnist-ham10000/ham10000_images_part_2')

# Get a list of train and val images
train_list = list(df_train['image_id'])
test_list = list(df_test['image_id'])


# Transfer the train images

for image in train_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image,'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('../input/skin-cancer-mnist-ham10000/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('../input/skin-cancer-mnist-ham10000/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(train_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)


# Transfer the test images

        
for image in test_list:
    
    fname = image + '.jpg'
    label = df_data.loc[image, 'dx']
    
    if fname in folder_1:
        # source path to image
        src = os.path.join('../input/skin-cancer-mnist-ham10000/ham10000_images_part_1', fname)
        # destination path to image
        dst = os.path.join(test_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # source path to image
        src = os.path.join('../input/skin-cancer-mnist-ham10000/ham10000_images_part_2', fname)
        # destination path to image
        dst = os.path.join(test_dir, label, fname)
        # copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [12]:
# check how many train images we have in each folder

print(len(os.listdir('base_dir/train_dir/nv')))
print(len(os.listdir('base_dir/train_dir/mel')))
print(len(os.listdir('base_dir/train_dir/bkl')))
print(len(os.listdir('base_dir/train_dir/bcc')))
print(len(os.listdir('base_dir/train_dir/akiec')))
print(len(os.listdir('base_dir/train_dir/vasc')))
print(len(os.listdir('base_dir/train_dir/df')))

6034
1002
989
463
294
128
103


In [13]:
# check how many val images we have in each folder

print(len(os.listdir('base_dir/test_dir/nv')))
print(len(os.listdir('base_dir/test_dir/mel')))
print(len(os.listdir('base_dir/test_dir/bkl')))
print(len(os.listdir('base_dir/test_dir/bcc')))
print(len(os.listdir('base_dir/test_dir/akiec')))
print(len(os.listdir('base_dir/test_dir/vasc')))
print(len(os.listdir('base_dir/test_dir/df')))

671
111
110
51
33
14
12


In [4]:
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image

In [5]:
base_model_vgg = VGG16(include_top=True, weights="imagenet",)
model = Model(inputs=base_model_vgg.input, outputs=base_model_vgg.get_layer('flatten').output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [7]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [40]:
def VGG_16(img):
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    array = cv2.resize(img, (224, 224))
    array = np.expand_dims(array, axis=0)
    array = preprocess_input(array)
    feature = model.predict(array, verbose=0)[0]
    return feature

In [64]:
train_dir = '/kaggle/working/base_dir/train_dir'
test_dir = '/kaggle/working/base_dir/test_dir'
def read_data(data_dir):
    features = []
    labels = []

    for folder in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder)

        for img_name in os.listdir(folder_path):
            img_path = os.path.join(folder_path, img_name)
            img = cv2.imread(img_path)
            feature = VGG_16(img)
            
            features.append(feature)
            labels.append(folder)

    features = np.asarray(features)
    labels = np.asarray(labels)

    return features, labels
    

# FEATURE EXTRACTION AND STORE IN CSV FILE

In [73]:
features_train, label_train = read_data(train_dir)

In [74]:

# Tạo DataFrame từ features_train và label_train
df = pd.DataFrame(features_train, columns=['Feature_' + str(i+1) for i in range(features_train.shape[1])])
df['Label'] = label_train

# Đường dẫn tới tệp CSV cần lưu
csv_file_path = 'train_data.csv'

# Lưu DataFrame vào tệp CSV
df.to_csv(csv_file_path, index=False)

In [66]:
features_test, label_test = read_data(test_dir)

In [69]:

# Tạo DataFrame từ features_test và label_test
df = pd.DataFrame(features_test, columns=['Feature_' + str(i+1) for i in range(features_test.shape[1])])
df['Label'] = label_test

# Đường dẫn tới tệp CSV cần lưu
csv_file_path = 'test_data_flatten.csv'

# Lưu DataFrame vào tệp CSV
df.to_csv(csv_file_path, index=False)