In [None]:
# Skin Cancer Detection

### Import required libraries

import os
import shutil
import textwrap as tw
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.ticker import FuncFormatter
import seaborn as sns

import cv2
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import svm

from skimage.color import rgb2gray
from skimage.filters import threshold_otsu
from skimage.measure import label, regionprops

from scipy import ndimage


# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization, Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.applications.resnet import preprocess_input

import warnings
warnings.filterwarnings('ignore')

### EDA

# Load the ISIC 2019 ground-truth CSV and take a first look at it.
data = pd.read_csv("./ISIC_2019_Training_GroundTruth.csv", header = 0)
data.head(20)

data.info()

data.tail()

data.describe()

# Correlate the numeric columns only. On pandas >= 2.0 a plain data.corr()
# raises as soon as the frame holds a non-numeric column (presumably the
# image identifier here - verify against the CSV); select_dtypes works on
# every pandas version.
data.select_dtypes(include='number').corr()

# Human-readable names for the diagnosis abbreviations.
lesion_type_dict = {
    'NV': 'Melanocytic nevi',
    'MEL': 'Melanoma',
    'BKL': 'Benign keratosis ',
    'BCC': 'Basal cell carcinoma',
    'AK': 'Actinic keratoses',
    'VASC': 'Vascular lesions',
    'DF': 'Dermatofibroma',
    'SCC' : 'Squamous cell carcinoma'
}

# print all columns
pd.set_option('display.max_columns', None)

# Enable memory growth on every visible GPU so TensorFlow allocates GPU
# memory on demand instead of reserving all of it up front (helps to avoid
# out-of-memory errors on the graphics card).
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

### Step 1: Load and preprocess the image within the CNN architecture

def load_data(path: str, random_state=None):
    """Collect every ``*.jpg`` below *path* into a shuffled DataFrame.

    The label of an image is the name of the folder that directly contains
    it, so *path* is expected to hold one sub-folder per class.

    Args:
        path: Root directory that is searched recursively for ``*.jpg`` files.
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` keeps the original behaviour of a different
            shuffle on every call; pass an int for a reproducible order.

    Returns:
        A DataFrame with the string columns ``FilePaths`` and ``Labels``,
        rows shuffled and the index reset to ``0..n-1``.
    """
    root = Path(path)

    # Sort first so the order before shuffling does not depend on the order
    # in which the filesystem happens to list the files.
    image_paths = sorted(root.glob('**/*.jpg'))

    df = pd.DataFrame({
        'FilePaths': [str(image_path) for image_path in image_paths],
        # name of the folder that directly contains the image
        'Labels': [image_path.parent.name for image_path in image_paths],
    }).astype(str)

    # Shuffle complete rows and reset the index
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Root folder of the training images (one sub-folder per disease).
train_dir = './input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Train'

df = load_data(train_dir)

# total categories
len(os.listdir(train_dir))

os.listdir(train_dir)

list_diseases = os.listdir(train_dir)

# Number of images per disease folder. The count has to come from the
# disease's own sub-folder; listing the Train root (as before) yields the
# number of classes for every row.
results2 = []
for disease in list_diseases:
    disease_dir = os.path.join(train_dir, disease)
    if not os.path.isdir(disease_dir):
        # stray file next to the class folders: nothing to count
        continue
    results2.append({
        'disease': disease,
        'count_images': len(os.listdir(disease_dir)),
    })

results = pd.DataFrame(results2)
results

df.head(15)

df.info()

# ordered count of rows per unique label
labels_count = df['Labels'].value_counts(ascending=True)

f = plt.figure(figsize=(15, 6))
# x/y must be passed by keyword: recent seaborn releases no longer accept
# them positionally.
s = sns.barplot(x=labels_count.index, y=labels_count.values)
sns.despine()
# rotate the class names so they do not overlap
s.tick_params(axis='x', labelrotation=30)

def plot_images_per_label(df, label, cols: int, size: tuple):
    """Show up to *cols* example images of one class side by side.

    Args:
        df: DataFrame with the columns ``FilePaths`` and ``Labels``.
        label: Class name to select from ``df['Labels']``.
        cols: Number of subplot columns, i.e. the maximum number of images.
        size: Figure size passed to ``plt.subplots`` as ``figsize``.

    If fewer than *cols* images carry *label*, the remaining axes stay empty.
    """
    # squeeze=False always returns a 2-D array of axes, so cols == 1 works
    # too (plt.subplots would otherwise hand back a single Axes object).
    fig, axs = plt.subplots(nrows=1, ncols=cols, figsize=size, squeeze=False)

    # first `cols` rows of the requested class, in DataFrame order
    matches = df.loc[df['Labels'] == label].head(cols)

    for ax, file_path, title in zip(axs[0], matches['FilePaths'], matches['Labels']):
        ax.imshow(plt.imread(file_path))
        ax.set_title(title)

    plt.tight_layout()
    plt.show()


# unique labels
labels = sorted(df['Labels'].unique())
# show a few example images for every label
for label in labels:
    plot_images_per_label(df, label, 3, (12,9))

from sklearn.model_selection import train_test_split

# stratified train and val (25%) datasets
X_train, X_val = train_test_split(df, test_size=0.25, stratify=df['Labels'], random_state=1)

print('Train Data: ', X_train.shape)
print('Val Data: ', X_val.shape)

# number of samples/images per iteration
BATCH_SIZE = 32
# input image size
IMG_SIZE = (224, 224)
# count of epochs
EPOCHS = 10

# Training images: random augmentation followed by ResNet preprocessing.
img_data_gen = ImageDataGenerator(shear_range=0.2,
                                  zoom_range=0.2,
                                  horizontal_flip=True,
                                  preprocessing_function=preprocess_input)

# Validation images get the same preprocessing but NO random augmentation,
# otherwise the validation metrics change from run to run and are measured
# on distorted images.
val_data_gen = ImageDataGenerator(preprocessing_function=preprocess_input)

# `classes=labels` pins the label -> index mapping so that the training and
# validation generators are guaranteed to agree on it.
X_train = img_data_gen.flow_from_dataframe(dataframe=X_train,
                                           x_col='FilePaths',
                                           y_col='Labels',
                                           target_size=IMG_SIZE,
                                           color_mode='rgb',
                                           class_mode='categorical',
                                           classes=labels,
                                           batch_size=BATCH_SIZE,
                                           seed=1)

X_val = val_data_gen.flow_from_dataframe(dataframe=X_val,
                                         x_col='FilePaths',
                                         y_col='Labels',
                                         target_size=IMG_SIZE,
                                         color_mode='rgb',
                                         class_mode='categorical',
                                         classes=labels,
                                         batch_size=BATCH_SIZE,
                                         seed=1)

# preprocess_input (ResNet) converts RGB to BGR and subtracts these ImageNet
# channel means; both steps are undone below so the samples can be displayed.
IMAGENET_BGR_MEAN = np.array([103.939, 116.779, 123.68])
# class index -> class name, to turn one-hot labels into readable titles
index_to_class = {index: name for name, index in X_train.class_indices.items()}

fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(12,15))

for a in ax.flat:
    # next() works on every Keras iterator version; the .next() method is
    # not available everywhere
    img, label = next(X_train)
    displayable = np.clip(img[0] + IMAGENET_BGR_MEAN, 0, 255)[..., ::-1].astype(np.uint8)
    a.imshow(displayable)
    a.set_title(index_to_class[int(np.argmax(label[0]))])

plt.tight_layout()
plt.show()

#training data
training_dir="./input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Train"

# Training generator: rescale to [0, 1], normalise every image on its own
# and apply random augmentation (rotation, zoom, shifts, flips).
# featurewise_center stays off: it needs dataset statistics from a prior
# training_generator.fit(...) call, which never happens here, so enabling it
# has no effect beyond a warning.
training_generator = ImageDataGenerator(rescale=1/255,
                                        featurewise_center=False,
                                        samplewise_center=True,
                                        featurewise_std_normalization=False,
                                        samplewise_std_normalization=True,
                                        zca_whitening=False,
                                        rotation_range=30,
                                        zoom_range=0.2,
                                        width_shift_range=0.1,
                                        height_shift_range=0.1,
                                        horizontal_flip=True,
                                        vertical_flip=True)

# Batches of training images with one-hot labels. The folder tree holds more
# than two classes, so 'categorical' (not 'binary') is the matching mode.
train_generator=training_generator.flow_from_directory(training_dir,target_size=(224,224),
                                                       batch_size=4,class_mode='categorical')

#validation data
# NOTE(review): this is the same folder as testing_dir below, so validation
# and test metrics are computed on the same images - confirm this is intended.
validation_dir="./input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Test"

# Validation generator: same per-image normalisation as training, but no
# random augmentation.
validation_generator=ImageDataGenerator(rescale=1/255,
                                        samplewise_center=True,
                                        samplewise_std_normalization=True)
val_generator=validation_generator.flow_from_directory(validation_dir,target_size=(224,224), batch_size=4, class_mode='categorical')

#testing data
testing_dir="./input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Test"

# Test generator: deterministic - same normalisation as training, no random
# transformations, so repeated evaluations give the same result.
testing_generator = ImageDataGenerator(rescale=1/255,
                                       featurewise_center=False,
                                       samplewise_center=True,
                                       featurewise_std_normalization=False,
                                       samplewise_std_normalization=True,
                                       zca_whitening=False,
                                       horizontal_flip=False,
                                       vertical_flip=False)

# Built from testing_generator (previously training_generator was used by
# mistake, which augmented the test images). shuffle=False keeps the batch
# order aligned with test_generator.classes.
test_generator=testing_generator.flow_from_directory(testing_dir,target_size=(224,224),
                                                     batch_size=4,class_mode='categorical',
                                                     shuffle=False)

# Build the CNN.
# X_train yields one-hot labels (class_mode='categorical'), so the network
# needs one softmax output per class and a categorical loss. The previous
# single sigmoid unit with binary cross-entropy does not match those labels,
# and np.argmax over a single output column is always 0.
num_classes = len(X_train.class_indices)

# keras here is tensorflow.keras, the same package `layers` comes from.
model = keras.Sequential([
    layers.Conv2D(32, (3,3), input_shape = (224,224,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),

    layers.Conv2D(64, (3,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),
    layers.Dropout(0.2),

    layers.Conv2D(128, (3,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),
    layers.Dropout(0.2),

    layers.Conv2D(256, (3,3), activation = 'relu'),
    layers.MaxPooling2D(2,2),

    layers.Flatten(),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

model.summary()

#compiling the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['acc'])

# Stop when the validation loss has not improved for 3 epochs and keep the
# best weights. Monitoring the training accuracy (as before) rewards
# over-fitting instead of detecting it.
cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train once. The earlier second model.fit() call trained for another EPOCHS
# epochs without early stopping and overwrote the history.
hst = model.fit(X_train, validation_data=X_val, epochs=EPOCHS, callbacks=[cb])

acc = hst.history['acc']
val_acc = hst.history['val_acc']
loss = hst.history['loss']
val_loss = hst.history['val_loss']

epochs = range(len(acc))

# accuracy curves
plt.figure()
plt.plot(epochs, acc, 'r', label = 'Training accuracy')
plt.plot(epochs, val_acc, 'b', label = 'validation accuracy')
plt.title('Training and validation accuracy')
plt.legend(loc= 0)

# loss curves (collected above but never plotted before)
plt.figure()
plt.plot(epochs, loss, 'r', label = 'Training loss')
plt.plot(epochs, val_loss, 'b', label = 'validation loss')
plt.title('Training and validation loss')
plt.legend(loc= 0)
plt.show()


X_test = load_data('./input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Test')

X_test.head(15)

print('Test Data: ', X_test.shape)

# ordered count of rows per unique label
X_test['Labels'].value_counts(ascending=True)

#### Image Preprocessing

# Test images must not be randomly augmented: use a generator that applies
# only the model's input preprocessing.
test_data_gen = ImageDataGenerator(preprocessing_function=preprocess_input)

X_test = test_data_gen.flow_from_dataframe(dataframe=X_test,
                                           x_col='FilePaths',
                                           y_col='Labels',
                                           target_size=IMG_SIZE,
                                           color_mode='rgb',
                                           class_mode='categorical',
                                           # same label -> index mapping as during training
                                           classes=list(X_train.class_indices),
                                           batch_size=BATCH_SIZE,
                                           shuffle=False, # necessary for confusion matrix
                                           seed=1)

# res[0] is the loss, res[1] the accuracy (the only compiled metric)
res = model.evaluate(X_test)

#### Accuracy and loss

print(f'Train Accuracy: {hst.history["acc"][-1] * 100:.2f}')
print(f'Val Accuracy: {hst.history["val_acc"][-1] * 100:.2f}')
print(f'Test Accuracy: {res[1] * 100:.2f}')

# The loss is not a percentage, so it is printed unscaled.
print(f'Train Loss: {hst.history["loss"][-1]:.4f}')
print(f'Val Loss: {hst.history["val_loss"][-1]:.4f}')
print(f'Test Loss: {res[0]:.4f}')

#### Predicted Labels and Rounded Labels

# class probabilities per test image
Y_pred = model.predict(X_test)
print("Y_pred", Y_pred.shape)

# index of the most probable class per test image
y_pred = np.argmax(Y_pred, axis=1)
print("y_pred", y_pred.size)

#### True Labels and Label Classes

y_true = X_test.classes
print("y_true", len(y_true))

class_labels = list(X_test.class_indices.keys())
print("labels", len(class_labels))

# compare with true labels
# scikit-learn expects (y_true, y_pred): rows are the true classes and
# columns the predictions, matching the axis labels set below.
# normalize='true' then scales each true-class row to sum to 1. `labels`
# keeps the matrix aligned with class_labels even if a class is never seen.
cfm = confusion_matrix(y_true, y_pred,
                       labels=list(range(len(class_labels))),
                       normalize='true')

# plot size
fig, ax = plt.subplots(figsize=(15,15))

# print confusion matrix
s = sns.heatmap(cfm,
               annot=True,
               cmap=['#ff0001', '#09AA11'],
               center=0.8,
               fmt='.1%',
               linewidths=.5,
               cbar_kws={'format': FuncFormatter(lambda x, pos: '{:.0%}'.format(x))},
               linecolor='Black',
               ax=ax)

# set labels
s.set(xlabel='Predict', ylabel='True')
s.set(title='Confusion Matrix')
s.set_yticklabels([tw.fill(e, 10) for e in class_labels])
s.set_xticklabels([tw.fill(e, 10) for e in class_labels])

def load_data(path: str, random_state=None):
    """Collect every ``*.jpg`` below *path* into a shuffled DataFrame.

    The label of an image is the name of the folder that directly contains
    it, so *path* is expected to hold one sub-folder per class.

    Args:
        path: Root directory that is searched recursively for ``*.jpg`` files.
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` keeps the original behaviour of a different
            shuffle on every call; pass an int for a reproducible order.

    Returns:
        A DataFrame with the string columns ``FilePaths`` and ``Labels``,
        rows shuffled and the index reset to ``0..n-1``.
    """
    root = Path(path)

    # Sort first so the order before shuffling does not depend on the order
    # in which the filesystem happens to list the files.
    image_paths = sorted(root.glob('**/*.jpg'))

    df = pd.DataFrame({
        'FilePaths': [str(image_path) for image_path in image_paths],
        # name of the folder that directly contains the image
        'Labels': [image_path.parent.name for image_path in image_paths],
    }).astype(str)

    # Shuffle complete rows and reset the index
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

df = load_data('./input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Train')

# Read Sample Image
image_path = "./input/skin-cancer9-classesisic/Skin cancer ISIC The International Skin Imaging Collaboration/Test/basal cell carcinoma/ISIC_0024331.jpg"
image = plt.imread(image_path)

plt.title("Sample Image")
plt.imshow(image)

# The CNN was trained on IMG_SIZE images that went through preprocess_input,
# so the raw sample must be resized and preprocessed the same way before
# prediction; feeding the full-size image does not match the model's input
# shape. (cv2.resize takes (width, height); IMG_SIZE is square, so the order
# does not matter here.) `image` itself is left untouched for later steps.
model_input = preprocess_input(cv2.resize(image, IMG_SIZE).astype(np.float32))

# Model output for the single sample image (first and only batch entry).
preprocessed_image = model.predict(np.expand_dims(model_input, axis=0))[0]

### Step 1: Preprocessing

#### Step 1.1: Apply Hough's transform to remove hair

# Convert image to grayscale. `image` was read with plt.imread and is
# therefore in RGB channel order, so the RGB (not BGR) conversion applies.
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Apply Canny edge detection
edges = cv2.Canny(gray, 50, 150)

# Apply Hough's transform to detect lines representing hair
lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=50, minLineLength=100, maxLineGap=10)

# Create a mask image to draw the detected lines
mask = np.zeros_like(image)

# Draw the detected lines on the mask image.
# HoughLinesP returns None when no line is found; the mask then stays empty.
if lines is not None:
    for line in lines:
        x1, y1, x2, y2 = line[0]
        cv2.line(mask, (x1, y1), (x2, y2), (255, 255, 255), thickness=2)

# Black out the detected hair lines: AND the image with the inverted mask
hair_removed = cv2.bitwise_and(image, cv2.bitwise_not(mask))

plt.title("Hair Removal")
plt.imshow(hair_removed)

#### Step 1.2: Apply filters to remove shade and glare

# Convert image to grayscale (RGB input, see Step 1.1)
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Apply shading correction with a morphological top-hat (the image minus its
# morphological opening) using a 10x10 elliptical kernel
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (10, 10))
shading_corrected = cv2.morphologyEx(gray, cv2.MORPH_TOPHAT, kernel)

# Apply glare removal filter (guided filter, needs the opencv-contrib build).
# NOTE(review): the first argument is the guide image and the second the
# image being filtered - confirm that guide/source are in the intended order.
radius = 10
epsilon = 0.1
glare_removed = cv2.ximgproc.guidedFilter(shading_corrected, gray, radius, epsilon)

# Convert the single-channel result back to three channels if the input was colour
if len(image.shape) == 3:
    glare_removed = cv2.cvtColor(glare_removed, cv2.COLOR_GRAY2RGB)

plt.title("Glare Removal")
plt.imshow(glare_removed)

#### Step 1.3: Perform contrast enhancement

# Convert image to grayscale (RGB input, see Step 1.1)
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Apply contrast enhancement algorithm (histogram equalization)
enhanced = cv2.equalizeHist(gray)

# Convert the enhanced grayscale image back to three channels if the input was colour
if len(image.shape) == 3:
    enhanced = cv2.cvtColor(enhanced, cv2.COLOR_GRAY2RGB)

plt.title("Contrast Enhancement")
plt.imshow(enhanced)

### Step 2: Perform segmentation using Otsu's thresholding

# Convert the image to grayscale
gray_image = rgb2gray(image)

plt.title("Grayscaled Image")
# cmap='gray' so the single-channel image is not drawn with a false-colour map
plt.imshow(gray_image, cmap='gray')

# Apply histogram equalization (equalizeHist needs 8-bit input, hence the
# scaling by 255 and the cast to uint8)
equalized_image = cv2.equalizeHist((gray_image * 255).astype(np.uint8))

plt.title("Histogram Equalization")
plt.imshow(equalized_image, cmap='gray')

# Perform Otsu's thresholding: 1 where the pixel is brighter than the threshold
threshold_value = threshold_otsu(equalized_image)
binary_image = (equalized_image > threshold_value).astype(np.uint8)

plt.title("Binarized Image")
plt.imshow(binary_image, cmap='gray')

### Step 3: Extract features

# Compute the area of the segmented region (number of pixels set to 1)
area = np.sum(binary_image)

# Grayscale image with everything outside the region set to 0
masked_image = gray_image * binary_image

# Compute the mean, variance, and standard deviation of the segmented region.
# Only pixels inside the region are used: taking the statistics of
# masked_image would mix in a zero for every background pixel and make the
# values depend on the region's size rather than on its intensities.
region_pixels = gray_image[binary_image.astype(bool)]
if region_pixels.size:
    mean = np.mean(region_pixels)
    variance = np.var(region_pixels)
    std_dev = np.std(region_pixels)
else:
    # empty segmentation: avoid NaN statistics
    mean = variance = std_dev = 0.0

features = [mean, variance, std_dev, area]

features

### Step 4: Process image to get mean value of red, blue, and green pixels

# `image` comes from plt.imread and is already in RGB order, so no channel
# swap is needed (swapping would mislabel red and blue below and distort the
# colours in plt.imshow). Work on a writable copy, because pixels are
# modified in place further down.
image = np.array(image, copy=True)

# View on the colour channels of the image; writing to it changes `image`
pixels = image[..., :3]

# Total sum of the red, green, and blue values over all pixels.
# Summing with an explicit 64-bit accumulator avoids the wrap-around that
# adding uint8 pixel values one by one can cause.
channel_sums = pixels.sum(axis=(0, 1), dtype=np.int64)
total_red, total_green, total_blue = (int(total) for total in channel_sums)

# Calculate the mean values
num_pixels = image.shape[0] * image.shape[1]
mean_red = total_red / num_pixels
mean_green = total_green / num_pixels
mean_blue = total_blue / num_pixels

# Mean colour distance threshold
threshold = 90

# Euclidean distance between every pixel's colour and the mean colour,
# computed for the whole image at once instead of in a per-pixel Python loop
mean_color = np.array([mean_red, mean_green, mean_blue])
distance = np.sqrt(((pixels.astype(np.float64) - mean_color) ** 2).sum(axis=-1))

# Discard (set to black) every pixel whose colour is closer to the mean
# colour than the threshold
pixels[distance < threshold] = 0

plt.title("RGB Thresholding")
plt.imshow(image)

### Step 5: Perform border extraction to segment the image into lesion and background skin

# Convert image to grayscale (the image is in RGB order, see Step 4)
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Apply edge detection (Canny edge detector)
edges = cv2.Canny(gray, 100, 200)

# Apply threshold to obtain a binary image
_, thresholded = cv2.threshold(edges, 128, 255, cv2.THRESH_BINARY)

# Thicken the detected edges with one dilation pass (3x3 kernel)
kernel = np.ones((3, 3), np.uint8)
dilated = cv2.dilate(thresholded, kernel, iterations=1)

# Create a segmented image by masking the original image with the extracted borders
segmented = cv2.bitwise_and(image, image, mask=dilated)

plt.title("Image Borders")
plt.imshow(segmented)

### Step 6: Classify the image using Support Vector Machines (SVM)

# Hold out 100 rows for testing; a fixed seed makes the split reproducible.
train,test = train_test_split(data, test_size = 100, random_state = 1)
print(train.shape)
print(test.shape)

# Column to predict
target_var = 'MEL'

# Predictor columns. The target must NOT be one of them: with 'MEL' among
# the features the classifier is handed the answer (label leakage) and the
# reported accuracy is meaningless.
prediction_var = ['NV', 'BCC', 'AK', 'BKL','DF','VASC','SCC']

train_X = train[prediction_var]
train_Y = train[target_var]
test_X = test[prediction_var]
test_Y = test[target_var]

# NOTE(review): this rebinds `model`, which held the CNN until here - run the
# CNN cells again before using `model` for image prediction.
model = svm.SVC()
model.fit(train_X, train_Y)

prediction = model.predict(test_X)

print(prediction)

from sklearn import metrics
# scikit-learn convention: true labels first, predictions second
metrics.accuracy_score(test_Y, prediction)