<a href="https://colab.research.google.com/github/iyngaran/skin-disease-detection-using-machine-learning-algorithms/blob/master/LC_model_improvement_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steps



1.   Download the lesion-diagnosis dataset, unzip it, and clone the git repo that provides utility functions to generate the images for training and testing.
2.   Prepare datasets for train and test.
3.   Visualize the images
4.   Building a convolutional neural network
     1.   Preprocess / normalize the images 
     2.   Load all the images
     3.   Build a CNN to find patterns in the images
     4.   Compile our CNN
     5.   Fit the CNN to our training data
     6.   Visualise the training results
     7.   Recreate the model again and compile it and train


# 1. Download the lesion-diagnosis dataset, unzip it, and clone the git repo that provides utility functions to generate the images for training and testing.

In [None]:
import zipfile
import os

#os.system("rm -rf ipythonUtils")
#os.system("rm -rf lesion-diagnosis")
#os.system("rm -rf lesion-diagnosis-ver2.zip")
#os.system("rm -rf test")
#os.system("rm -rf train")

isExist = os.path.exists("ipython-utils")
if not os.path.exists("ipython-utils"):
    !git clone https://github.com/iyngaran/ipythonUtils.git


if not os.path.exists("lesion-diagnosis-ver2.zip"):
    !wget  https://testing.idev55.com/lesion-diagnosis-ver2.zip

if not os.path.exists("lesion-diagnosis"):
    # Unzip the downloaded file
    zip_file = zipfile.ZipFile("lesion-diagnosis-ver2.zip", "r")
    zip_file.extractall()
    zip_file.close()

In [None]:
import os

# Walk through the lesion-diagnosis directory and report, for every folder,
# how many sub-directories and how many files it contains.
for folder, subfolders, files in os.walk("lesion-diagnosis"):
  n_dirs, n_files = len(subfolders), len(files)
  print(f"There are {n_dirs} directories and {n_files} images in '{folder}'.")

**To Balance the data in all the classes, let's use `Augmentor` with `flip_top_bottom` and `zoom` to generate images**

In [None]:
!pip install Augmentor

In [None]:
import Augmentor
# Number of extra images to generate for each class folder. The counts differ per
# class because the goal of this step is to balance the class sizes.
AUGMENT_SAMPLES = {
    "solar-lentigo": 570,
    "vascular-lesion": 739,
    "dermatofibroma": 745,
    "squamous-cell-carcinoma": 323,
    "actinic-keratosis": 88,
}

# Same recipe for every class: a top/bottom flip (applied with probability 0.8)
# followed by a zoom of 1.1x-1.6x (applied with probability 0.3).
for class_dir, n_samples in AUGMENT_SAMPLES.items():
    pipeline = Augmentor.Pipeline(f"lesion-diagnosis/{class_dir}")
    pipeline.flip_top_bottom(probability=0.8)
    pipeline.zoom(probability=0.3, min_factor=1.1, max_factor=1.6)
    pipeline.sample(n_samples)

In [None]:
!mv lesion-diagnosis/solar-lentigo/output/* lesion-diagnosis/solar-lentigo/
!mv lesion-diagnosis/vascular-lesion/output/* lesion-diagnosis/vascular-lesion/
!mv lesion-diagnosis/dermatofibroma/output/* lesion-diagnosis/dermatofibroma/
!mv lesion-diagnosis/squamous-cell-carcinoma/output/* lesion-diagnosis/squamous-cell-carcinoma/
!mv lesion-diagnosis/actinic-keratosis/output/* lesion-diagnosis/actinic-keratosis/


!rm -rf lesion-diagnosis/solar-lentigo/output
!rm -rf lesion-diagnosis/vascular-lesion/output
!rm -rf lesion-diagnosis/dermatofibroma/output
!rm -rf lesion-diagnosis/squamous-cell-carcinoma/output
!rm -rf lesion-diagnosis/actinic-keratosis/output

In [None]:
# Re-count the contents of every folder now that the generated images were moved in.
for folder, subfolders, files in os.walk("lesion-diagnosis"):
  n_dirs, n_files = len(subfolders), len(files)
  print(f"There are {n_dirs} directories and {n_files} images in '{folder}'.")

As we can see here, all the classes now have the same number of images.

# 2. Prepare datasets for train and test 

Generate test and train datasets using the `generate_test_and_train_datasets` utility function

In [None]:
from ipythonUtils.generateTestTrain import generate_test_and_train_datasets
# Only build the split when the train folder for one of the classes is missing
# (presumably generate_test_and_train_datasets creates train/ and test/ - confirm in ipythonUtils).
train_already_built = os.path.exists("train/actinic-keratosis")
if not train_already_built:
   generate_test_and_train_datasets()

Turn our training path into a Python path and create a list of class_names from the subdirectories

In [None]:
import pathlib
import numpy as np

data_dir = pathlib.Path("train")
# Only sub-directories are classes. Without the is_dir() filter a stray file in
# train/ (e.g. .DS_Store) would be counted as a class and inflate num_classes.
class_names = np.array(sorted(item.name for item in data_dir.glob('*') if item.is_dir()))
print(class_names)

In [None]:
# List the entries of train/ to compare against class_names printed above.
!ls train

**Setup the train and test directories**

In [None]:
# Root folders of the two splits, relative to the notebook's working directory.
data_dir_train = pathlib.Path("train")
data_dir_test = pathlib.Path("test")

In [None]:
# Count the .JPG files that sit one level below each split folder
# (the pattern is case-sensitive on Linux, so ".jpg" files are not counted).
image_count_train = sum(1 for path in data_dir_train.glob('*/*.JPG'))
image_count_test = sum(1 for path in data_dir_test.glob('*/*.JPG'))
image_count_train, image_count_test

# 3. Visualize the images

In [None]:
from ipythonUtils.generateTestTrain import view_random_image
# Show a random "melanoma" image from the training split and keep what the helper
# returns (presumably the image array - its .shape is inspected below).
img = view_random_image(target_dir="train", target_class="melanoma")

In [None]:
# Same as above for a random "nevus" image from the test split.
img2 = view_random_image(target_dir="test", target_class="nevus")

In [None]:
# Compare the shapes of the two sampled images.
img.shape, img2.shape

# 4. Building a convolutional neural network

#### 4.1 Preprocess / normalize the images

In [None]:
# Number of images per batch produced by the dataset loaders.
batch_size = 32
# Every image is resized to img_height x img_width when it is loaded.
img_height, img_width = 180, 180
# One output unit per class folder found in train/.
num_classes = len(class_names)

In [None]:
import tensorflow as tf

## Training dataset: the 'training' part of an 80/20 split of the images under
## data_dir_train, resized to img_height x img_width and grouped into batches.
## The same seed must be used for the validation subset so the two parts do not overlap.
train_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    directory=data_dir_train,
    validation_split=0.2,
    subset='training',
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

In [None]:
## Validation dataset: the remaining 20% of the same split (same directory and seed
## as the training dataset), resized and batched identically.
val_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    directory=data_dir_train,
    validation_split=0.2,
    subset='validation',
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

**See the classes distribution**

**Load all the images - take the first training batch and display some of its images with their class labels.**

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
for images, labels in train_dataset.take(1):
  # The grid is 3x3, so never request more than 9 panels, nor more images than
  # the batch actually holds.
  n_shown = min(len(class_names) - 1, 9, len(images))
  for i in range(n_shown):
    # Select the panel first, then draw the i-th image of the batch into it.
    plt.subplot(3, 3, i + 1)
    # Pixel values are floats in 0-255; imshow needs uint8 to render them as-is.
    plt.imshow(images[i].numpy().astype("uint8"))
    plt.title(class_names[labels[i]])
    plt.axis("off")

**Cache DataSet**

Keeps the images in memory using `Dataset.cache()` -  after images are loaded off disk during the first epoch.

Overlaps data preprocessing and model execution while training - `Dataset.prefetch().`

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training data: cache, shuffle with a 1000-element buffer, then prefetch.
train_dataset = (
    train_dataset
    .cache()
    .shuffle(1000)
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation data: cache and prefetch only (no shuffling).
val_dataset = (
    val_dataset
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

#### 4.3 Build a CNN to find patterns in the images

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Baseline CNN: rescale pixels to 0-1, three Conv2D + MaxPooling2D stages
# (16, 32, 64 filters), then a dense head that outputs one logit per class.
model = Sequential()
model.add(layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)))
for n_filters in (16, 32, 64):
  model.add(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
  model.add(layers.MaxPooling2D())
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
# No activation on the last layer: the loss is configured with from_logits=True.
model.add(layers.Dense(num_classes))

#### 4.4 Compile our CNN

In [None]:
# Labels are integer class indices and the model outputs raw logits, hence
# SparseCategoricalCrossentropy with from_logits=True.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# View the summary of all layers (layer types, output shapes and parameter counts)
model.summary()

#### 4.5 Fit the CNN to our training data

In [None]:
# Number of passes over the training dataset; also used for the x-axis of the plots below.
epochs = 20
# `history` keeps the per-epoch loss/accuracy for both the training and validation data.
history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset)

#### 4.6 Visualizing training results

In [None]:
# Per-epoch metrics recorded by model.fit.
metrics_log = history.history
acc, val_accuracy = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs_range = range(epochs)

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_accuracy, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')
plt.show()

As we can see here, the training accuracy has improved a lot; however, the validation accuracy hasn't improved much. This means the model is overfitting and is not able to generalise well.

Similarly, the training loss keeps falling while the validation loss increases dramatically.

To overcome this problem, I am going to use an augmentation technique: I will generate new samples by modifying the existing training data and use them to train the model.


In [None]:
# Random augmentation block used as the first stage of the second model:
# horizontal flip, rotation (factor 0.1) and zoom (factor 0.1).
data_augmentation = keras.Sequential()
data_augmentation.add(
    layers.experimental.preprocessing.RandomFlip("horizontal", input_shape=(img_height, img_width, 3))
)
data_augmentation.add(layers.experimental.preprocessing.RandomRotation(0.1))
data_augmentation.add(layers.experimental.preprocessing.RandomZoom(0.1))

In [None]:
from glob import glob
# Every .JPG directly inside a class folder of the training directory ...
path_list = glob(os.path.join(data_dir_train, '*', '*.JPG'))
# ... and, for each of those files, the name of the class folder that contains it.
lesion_list = [os.path.basename(os.path.dirname(image_path)) for image_path in path_list]
print(len(lesion_list))

In [None]:
# Map each image path to its class label, then peek at the first two entries.
df_dict_original = {image_path: label for image_path, label in zip(path_list, lesion_list)}
list(df_dict_original.items())[:2]

In [None]:
import pandas as pd
# One row per original training image: its file path and its class label.
original_df = pd.DataFrame({
    'Path': list(df_dict_original.keys()),
    'Label': list(df_dict_original.values()),
})
original_df.head()

In [None]:
# Number of original training images per class.
original_df[['Label']].value_counts()

In [None]:
# Same distribution expressed as a fraction of all original training images.
original_df[['Label']].value_counts(normalize=True)

In [None]:
import Augmentor

# Prefix shared by every class folder of the training directory.
path_to_training_dataset = f"{data_dir_train}/"

# For every class: rotate by up to 10 degrees either way (applied with probability 0.7)
# and generate 1000 new images; the cells below look for them under <class>/output.
for class_name in class_names:
    class_pipeline = Augmentor.Pipeline(path_to_training_dataset + class_name)
    class_pipeline.rotate(probability=0.7, max_left_rotation=10, max_right_rotation=10)
    class_pipeline.sample(1000)

In [None]:
# Note: despite the name, this counts only the generated images (those under <class>/output).
augmented_image_paths = list(data_dir_train.glob('*/output/*.JPG'))
total_train_images = len(augmented_image_paths)
print(total_train_images)

**Let's see the distribution of augmented data.**

In [None]:
from glob import glob
# Paths of all generated images (train/<class>/output/*.JPG); show the first two.
augmented_pattern = os.path.join(data_dir_train, '*', 'output', '*.JPG')
path_list_new = list(glob(augmented_pattern))
path_list_new[:2]

In [None]:
# The class label of a generated image is the folder two levels above the file
# (train/<class>/output/<file>).
lesion_list_new = []
for image_path in glob(os.path.join(data_dir_train, '*', 'output', '*.JPG')):
    output_dir = os.path.dirname(image_path)
    class_dir = os.path.dirname(output_dir)
    lesion_list_new.append(os.path.basename(class_dir))
lesion_list_new[:2]

In [None]:
import pandas as pd
# Path -> label mapping for the generated images.
dataframe_dict_new = dict(zip(path_list_new, lesion_list_new))

df_2 = pd.DataFrame(list(dataframe_dict_new.items()),columns = ['Path','Label'])
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows of original_df followed by rows of df_2, original index labels kept).
new_df = pd.concat([original_df, df_2])
new_df.shape

In [None]:
# First rows of the combined (original + generated) table.
new_df.head()

In [None]:
# Checking the class distribution (as fractions) after adding the new images
new_df['Label'].value_counts(normalize=True)

In [None]:
# Absolute number of images per class after adding the new images.
new_df['Label'].value_counts()

**Again : Train the model with complete dataset (including newly created images using Augmentor)**

**Create a training dataset**

In [None]:
# data_dir_train now holds the original training images plus the images created with Augmentor.
# NOTE(review): the 80/20 split is drawn after augmentation, so the validation part may contain
# generated variants of training images - confirm this is intended.
train_dataset = tf.keras.preprocessing.image_dataset_from_directory(
  directory=data_dir_train,
  validation_split=0.2,
  subset='training',
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size,
)

**Create a validation dataset**

In [None]:
# Validation part of the same seeded 80/20 split used for the training dataset.
val_dataset = tf.keras.preprocessing.image_dataset_from_directory(
  directory=data_dir_train,
  validation_split=0.2,
  subset='validation',
  seed=123,
  image_size=(img_height, img_width),
  batch_size=batch_size,
)

**Recreate the model again**

In [None]:
# Second model: same CNN as before, preceded by the random augmentation block and
# with a Dropout(0.2) layer inserted before flattening.
model = Sequential()
model.add(data_augmentation)
model.add(layers.experimental.preprocessing.Rescaling(1./255))
for n_filters in (16, 32, 64):
  model.add(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
  model.add(layers.MaxPooling2D())
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
# No activation on the last layer: the loss is configured with from_logits=True.
model.add(layers.Dense(num_classes))

In [None]:
# Same training configuration as the first model: Adam, sparse categorical
# cross-entropy on logits, accuracy as the reported metric.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the second model for 30 epochs; `epochs` is reused for the plots below.
epochs = 30

history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset)

**Visualize the model results**

In [None]:
# Per-epoch metrics recorded by the second training run.
metrics_log = history.history
acc, val_accuracy = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs_range = range(epochs)

# Two panels side by side: accuracy on the left, loss on the right.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_accuracy, label='Validation Accuracy')
ax_acc.legend(loc='upper left')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')
plt.show()

# Making a prediction with our trained model

In [None]:
import matplotlib.image as mpimg

!wget https://goldencoastdermatology.com/wp-content/uploads/2020/02/ACTINIC-KERATOSES.jpg
actinic_keratosis = mpimg.imread("ACTINIC-KERATOSES.jpg")
plt.imshow(actinic_keratosis)
plt.axis(False);

In [None]:
# Create a function to import an image and resize it to be able to be used with our model
def load_and_prep_image(filename, img_shape=180, scale=True):
  """
  Reads an image from filename, turns it into a tensor
  and reshapes it to (img_shape, img_shape, colour_channel).

  Args:
    filename: path of the image file to read.
    img_shape: side length (in pixels) of the square output image.
    scale: when True (default) pixel values are divided by 255 so they lie
      between 0 and 1; pass False to keep the 0-255 range, which is what a model
      that contains its own Rescaling(1./255) layer expects as input.

  Returns:
    A float tensor of shape (img_shape, img_shape, 3).
  """
  # Read in target file (an image)
  img = tf.io.read_file(filename)

  # Decode the read file into a tensor & ensure 3 colour channels
  # (our model is trained on images with 3 colour channels and sometimes images have 4 colour channels).
  # expand_animations=False makes GIFs decode to a single 3-D frame, so the
  # result is always a 3-D image that tf.image.resize can handle.
  img = tf.image.decode_image(img, channels=3, expand_animations=False)

  # Resize the image (to the same size our model was trained on)
  img = tf.image.resize(img, size = [img_shape, img_shape])

  # Rescale the image (get all values between 0 and 1)
  if scale:
    img = img/255.
  return img

In [None]:
# Load in and preprocess our custom image (resized to 180x180 by default,
# pixel values divided by 255) and display the resulting tensor
actinic_keratosis = load_and_prep_image("ACTINIC-KERATOSES.jpg")
actinic_keratosis

In [None]:
# The model predicts on batches, so prepend a batch axis: (H, W, C) -> (1, H, W, C).
# Note: running this cell again prepends another axis.
print(f"Shape before new dimension: {actinic_keratosis.shape}")
actinic_keratosis = actinic_keratosis[tf.newaxis, ...]  # new leading dimension at axis 0
print(f"Shape after new dimension: {actinic_keratosis.shape}")
actinic_keratosis

In [None]:
# load_and_prep_image already divided the pixels by 255, but the model starts with
# its own Rescaling(1./255) layer and was trained on 0-255 inputs. Scale back to
# 0-255 so the image is not rescaled twice.
pred = model.predict(actinic_keratosis * 255.)
pred

In [None]:
# `pred` holds one logit per class for the single image in the batch; the predicted
# class is the index of the largest logit (rounding the first logit only makes
# sense for a single-output binary classifier).
pred_class = class_names[int(tf.argmax(pred, axis=-1)[0])]
pred_class

In [None]:
def pred_and_plot(model, filename, class_names, pixel_scale=255.):
  """
  Imports an image located at filename, makes a prediction on it with
  a trained model and plots the image with the predicted class as the title.

  Args:
    model: trained multi-class model that outputs one score per class.
    filename: path of the image file to classify.
    class_names: sequence of class names, indexed by the model's output position.
    pixel_scale: factor applied to the 0-1 image before it is passed to the model.
      The default 255 suits models that contain their own Rescaling(1./255)
      layer (as the models in this notebook do); use 1 for a model trained on
      0-1 inputs.
  """
  # Import the target image and preprocess it (values between 0 and 1)
  img = load_and_prep_image(filename)

  # Make a prediction on a batch of one image, in the pixel range the model expects
  pred = model.predict(tf.expand_dims(img * pixel_scale, axis=0))

  # Get the predicted class: index of the highest score among all classes
  pred_class = class_names[int(tf.argmax(pred[0]))]

  # Plot the image and predicted class (imshow expects floats between 0 and 1)
  plt.imshow(img)
  plt.title(f"Prediction: {pred_class}")
  plt.axis(False);

In [None]:
# Test our model on a custom image: plots the image with the predicted class as its title
pred_and_plot(model, "ACTINIC-KERATOSES.jpg", class_names)