In [1]:
#import necessary packages
import os                           #for image iteration/loading
import cv2                          #for image processing and analysis
import matplotlib.pyplot as plt     #for plotting and viz
import numpy as np                  #for math stuffs

In [2]:
#labels are one-hot encoded vectors (two classes)
# [1,0] = benign
# [0,1] = malignant

In [3]:
#target edge length in pixels: every image is resized to img_size x img_size (50x50)
img_size = 50

In [4]:
#directories holding the image files, one per (split, class) pair;
#each path keeps its trailing slash
ben_training_folder, mal_training_folder = (
    "melanoma_cancer_dataset/train/benign/",
    "melanoma_cancer_dataset/train/malignant/",
)
ben_testing_folder, mal_testing_folder = (
    "melanoma_cancer_dataset/test/benign/",
    "melanoma_cancer_dataset/test/malignant/",
)

In [5]:
#accumulators for the training split: one independent empty list per class
ben_training_data, mal_training_data = [], []

In [6]:
#accumulators for the testing split: one independent empty list per class
ben_testing_data, mal_testing_data = [], []

In [7]:
#benign training: read every file in ben_training_folder as grayscale, resize it to
#img_size x img_size, and store it together with the one-hot label [1, 0] (benign)

ben_training_data = []       #start empty so re-running this cell never appends duplicates
ben_training_skipped = []    #names of files that could not be read or resized

#os.listdir returns entries in arbitrary order; sorting makes the list order reproducible
for filename in sorted(os.listdir(ben_training_folder)):
    path = os.path.join(ben_training_folder, filename)

    #grayscale all the images (one value per pixel); case-to-case basis depending on the research problem
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        #cv2.imread does not raise on a corrupt/non-image file, it returns None
        ben_training_skipped.append(filename)
        continue

    try:
        img = cv2.resize(img, (img_size, img_size))     #standardize dimensions
    except cv2.error:
        #skip only genuine OpenCV failures; anything else (KeyboardInterrupt,
        #NameError from a missing config cell, ...) must surface instead of being swallowed
        ben_training_skipped.append(filename)
        continue

    #pixel array paired with its label
    ben_training_data.append([np.array(img), np.array([1, 0])])

#skipping a few corrupt images is fine with a large dataset, but report it rather than hide it
if ben_training_skipped:
    print(f"Skipped {len(ben_training_skipped)} unreadable file(s) in {ben_training_folder}")
        

In [8]:
#malignant training: read every file in mal_training_folder as grayscale, resize it to
#img_size x img_size, and store it together with the one-hot label [0, 1] (malignant)

mal_training_data = []       #start empty so re-running this cell never appends duplicates
mal_training_skipped = []    #names of files that could not be read or resized

#os.listdir returns entries in arbitrary order; sorting makes the list order reproducible
for filename in sorted(os.listdir(mal_training_folder)):
    path = os.path.join(mal_training_folder, filename)

    #grayscale all the images (one value per pixel); case-to-case basis depending on the research problem
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        #cv2.imread does not raise on a corrupt/non-image file, it returns None
        mal_training_skipped.append(filename)
        continue

    try:
        img = cv2.resize(img, (img_size, img_size))     #standardize dimensions
    except cv2.error:
        #skip only genuine OpenCV failures; anything else (KeyboardInterrupt,
        #NameError from a missing config cell, ...) must surface instead of being swallowed
        mal_training_skipped.append(filename)
        continue

    #pixel array paired with its label
    mal_training_data.append([np.array(img), np.array([0, 1])])

#skipping a few corrupt images is fine with a large dataset, but report it rather than hide it
if mal_training_skipped:
    print(f"Skipped {len(mal_training_skipped)} unreadable file(s) in {mal_training_folder}")

In [9]:
#benign testing: read every file in ben_testing_folder as grayscale, resize it to
#img_size x img_size, and store it together with the one-hot label [1, 0] (benign)

ben_testing_data = []       #start empty so re-running this cell never appends duplicates
ben_testing_skipped = []    #names of files that could not be read or resized

#os.listdir returns entries in arbitrary order; sorting makes the list order reproducible
for filename in sorted(os.listdir(ben_testing_folder)):
    path = os.path.join(ben_testing_folder, filename)

    #grayscale all the images (one value per pixel); case-to-case basis depending on the research problem
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        #cv2.imread does not raise on a corrupt/non-image file, it returns None
        ben_testing_skipped.append(filename)
        continue

    try:
        img = cv2.resize(img, (img_size, img_size))     #standardize dimensions
    except cv2.error:
        #skip only genuine OpenCV failures; anything else (KeyboardInterrupt,
        #NameError from a missing config cell, ...) must surface instead of being swallowed
        ben_testing_skipped.append(filename)
        continue

    #pixel array paired with its label
    ben_testing_data.append([np.array(img), np.array([1, 0])])

#report skipped test images explicitly: silently dropping them changes the evaluation set
if ben_testing_skipped:
    print(f"Skipped {len(ben_testing_skipped)} unreadable file(s) in {ben_testing_folder}")

In [10]:
#malignant testing: read every file in mal_testing_folder as grayscale, resize it to
#img_size x img_size, and store it together with the one-hot label [0, 1] (malignant)

mal_testing_data = []       #start empty so re-running this cell never appends duplicates
mal_testing_skipped = []    #names of files that could not be read or resized

#os.listdir returns entries in arbitrary order; sorting makes the list order reproducible
for filename in sorted(os.listdir(mal_testing_folder)):
    path = os.path.join(mal_testing_folder, filename)

    #grayscale all the images (one value per pixel); case-to-case basis depending on the research problem
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        #cv2.imread does not raise on a corrupt/non-image file, it returns None
        mal_testing_skipped.append(filename)
        continue

    try:
        img = cv2.resize(img, (img_size, img_size))     #standardize dimensions
    except cv2.error:
        #skip only genuine OpenCV failures; anything else (KeyboardInterrupt,
        #NameError from a missing config cell, ...) must surface instead of being swallowed
        mal_testing_skipped.append(filename)
        continue

    #pixel array paired with its label
    mal_testing_data.append([np.array(img), np.array([0, 1])])

#report skipped test images explicitly: silently dropping them changes the evaluation set
if mal_testing_skipped:
    print(f"Skipped {len(mal_testing_skipped)} unreadable file(s) in {mal_testing_folder}")

In [11]:
#check training class imbalance: report how many samples each class has per split

#build the report line by line (empty strings are the blank lines), then emit it in one print
count_report = [
    "",
    "",
    f"Benign training count: {len(ben_training_data)}",
    f"Malignant training count: {len(mal_training_data)}",
    "",
    f"Benign testing count: {len(ben_testing_data)}",
    f"Malignant testing count: {len(mal_testing_data)}",
]
print("\n".join(count_report))




Benign training count: 5000
Malignant training count: 4605

Benign testing count: 500
Malignant testing count: 500


In [12]:
#balance the training classes by undersampling the larger one

#truncate both lists to the size of the smaller class: this shortens benign when it is the
#larger class, also covers the opposite case (malignant larger), and is a no-op when
#the classes are already balanced or the cell is re-run
balanced_count = min(len(ben_training_data), len(mal_training_data))
ben_training_data = ben_training_data[:balanced_count]
mal_training_data = mal_training_data[:balanced_count]

In [13]:
#recheck training class imbalance: same per-class counts as before, after undersampling

#assemble the lines first (empty strings are the blank lines), then print them together
recheck_report = "\n".join((
    "",
    "",
    f"Benign training count: {len(ben_training_data)}",
    f"Malignant training count: {len(mal_training_data)}",
    "",
    f"Benign testing count: {len(ben_testing_data)}",
    f"Malignant testing count: {len(mal_testing_data)}",
))
print(recheck_report)




Benign training count: 4605
Malignant training count: 4605

Benign testing count: 500
Malignant testing count: 500


In [14]:
#merge training data: concatenate both classes, shuffle, and convert to an object array
training_data = ben_training_data + mal_training_data

#shuffling is important as the classifier may learn a pattern from the benign-then-malignant
#arrangement; a fixed seed keeps the saved dataset identical across "Restart & Run All"
shuffle_rng = np.random.default_rng(42)
shuffle_rng.shuffle(training_data)

#dtype="object" is required because each row pairs a 2-D image array with a 1-D label array;
#without it NumPy raises "ValueError: setting an array element with a sequence. The requested
#array has an inhomogeneous shape after 1 dimensions."
training_data = np.asarray(training_data, dtype="object")
training_data

array([[array([[176, 175, 182, ..., 180, 182, 176],
               [180, 178, 185, ..., 184, 184, 181],
               [182, 181, 185, ..., 184, 184, 179],
               ...,
               [158, 163, 165, ..., 185, 185, 188],
               [165, 163, 167, ..., 184, 188, 185],
               [166, 154, 167, ..., 183, 184, 188]], dtype=uint8),
        array([0, 1])],
       [array([[246, 246, 244, ..., 110, 107, 112],
               [247, 246, 244, ..., 110, 116, 119],
               [247, 246, 244, ..., 105,  98,  90],
               ...,
               [252, 250, 250, ...,  59,  63,  65],
               [252, 252, 250, ...,  62,  64,  64],
               [251, 250, 248, ...,  67,  69,  66]], dtype=uint8),
        array([0, 1])],
       [array([[102, 137, 144, ...,  93,  91,  79],
               [125, 133, 146, ...,  99,  96,  86],
               [127, 133, 149, ..., 107, 108,  94],
               ...,
               [130, 142, 148, ..., 124, 117, 117],
               [128, 138, 142,

In [15]:
#persist the training set to disk as a .npy file
#NOTE: training_data is built as an object-dtype array, so it is stored via pickle;
#reading it back needs np.load(..., allow_pickle=True)
np.save(
    file="melanoma_training_data.npy",
    arr=training_data,
)

In [16]:
#merge testing data: concatenate both classes, shuffle, and convert to an object array
testing_data = ben_testing_data + mal_testing_data

#shuffle so the samples are not stored benign-then-malignant; a fixed seed keeps the
#saved dataset identical across "Restart & Run All"
shuffle_rng = np.random.default_rng(42)
shuffle_rng.shuffle(testing_data)

#dtype="object" is required because each row pairs a 2-D image array with a 1-D label array;
#without it NumPy raises "ValueError: setting an array element with a sequence. The requested
#array has an inhomogeneous shape after 1 dimensions."
testing_data = np.asarray(testing_data, dtype="object")
testing_data

array([[array([[  6,  49,  75, ...,  98,  83,  65],
               [ 31,  62,  85, ..., 105,  88,  72],
               [ 51,  76,  98, ..., 111, 100,  85],
               ...,
               [ 18,  63,  76, ...,  76,  59,  38],
               [  6,  48,  64, ...,  67,  49,  32],
               [  5,   9,  56, ...,  54,  40,  33]], dtype=uint8),
        array([1, 0])],
       [array([[203, 204, 206, ..., 195, 196, 199],
               [201, 203, 205, ..., 195, 199, 200],
               [201, 203, 201, ..., 198, 201, 202],
               ...,
               [183, 175, 177, ..., 209, 208, 209],
               [186, 188, 190, ..., 208, 209, 207],
               [193, 193, 195, ..., 208, 206, 205]], dtype=uint8),
        array([0, 1])],
       [array([[165, 168, 176, ..., 173, 159, 150],
               [168, 167, 175, ..., 165, 151, 138],
               [170, 176, 176, ..., 168, 161, 130],
               ...,
               [149, 153, 156, ..., 161, 158, 146],
               [149, 147, 149,

In [17]:
#persist the testing set to disk as a .npy file
#NOTE: testing_data is built as an object-dtype array, so it is stored via pickle;
#reading it back needs np.load(..., allow_pickle=True)
np.save(
    file="melanoma_testing_data.npy",
    arr=testing_data,
)