In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2
import numpy as np
from tqdm import tqdm
from keras.utils import np_utils
import helpers

Using TensorFlow backend.


#### Partition ISIC archive to cancer and normal class folders

In [2]:
def crop_and_resize(img, resize_dim=299):
    #val = 240
    #y,x,channel = img.shape
    #startx = x//2-(resize_dim//2)
    #starty = y//2-(resize_dim//2)
    #img = img[starty:starty+resize_dim,startx:startx+resize_dim]
    img=cv2.resize(img,(resize_dim,resize_dim),interpolation=cv2.INTER_AREA)
    
    return img

def get_image(path):
    
    img=cv2.imread(path)
    img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    img=crop_and_resize(img)
    return img

In [3]:
def get_data(path):
    img=cv2.imread(path)
    #img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    return img

In [4]:
#Root directory of the project
ROOT_DIR = os.path.abspath(".")
# Training file directory
DATASET = os.path.join(ROOT_DIR, 'dataset')
DATASET

'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset'

In [5]:
helpers.create_directory("{}/ISIC_ARCHIVE/".format(DATASET))
NEW_DATASET_PATH = "{}/{}".format(DATASET, "ISIC_ARCHIVE")
NEW_DATASET_PATH

'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/ISIC_ARCHIVE'

In [6]:
helpers.create_directory("{}/normal".format(NEW_DATASET_PATH))
helpers.create_directory("{}/cancer".format(NEW_DATASET_PATH))
helpers.create_directory("{}/normal_2629".format(NEW_DATASET_PATH))

In [7]:
# new paths
CANCER_FOLDER = "{}/cancer".format(NEW_DATASET_PATH)
NORMAL_FOLDER = "{}/normal".format(NEW_DATASET_PATH)

CANCER_FOLDER, NORMAL_FOLDER

('C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/ISIC_ARCHIVE/cancer',
 'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/ISIC_ARCHIVE/normal')

In [8]:
# archive path
ANNOT = "{}/data/{}".format(DATASET, "Descriptions")
IMAGES = "{}/data/{}".format(DATASET, "Images")
ANNOT, IMAGES

('C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Descriptions',
 'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Images')

In [9]:
len(os.listdir(ANNOT)), len(os.listdir(IMAGES))

(23906, 23906)

In [10]:
fname = os.listdir(ANNOT)[0]

In [11]:
f=open("{}/{}".format(ANNOT, fname), "r")
f

<_io.TextIOWrapper name='C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Descriptions/ISIC_0000000' mode='r' encoding='cp1252'>

In [12]:
import json
with open("{}/{}".format(ANNOT, fname), "rb") as fin:
    content = json.load(fin)

In [13]:
type(content)

dict

In [14]:
content['meta']['clinical']['benign_malignant']

'benign'

In [15]:
annot_names = ["{}/{}".format(ANNOT, fname) for fname in os.listdir(ANNOT)]
annot_names[:2]

['C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Descriptions/ISIC_0000000',
 'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Descriptions/ISIC_0000001']

In [16]:
image_names = ["{}/{}".format(IMAGES, fname) for fname in os.listdir(IMAGES)]
image_names[:2]

['C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Images/ISIC_0000000.jpeg',
 'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/data/Images/ISIC_0000001.jpeg']

In [17]:

images = []
labels = []
no_label_count = 0

for name, img_path in tqdm(zip(annot_names[:], image_names[:])):
    with open(name, "rb") as annot:
        content = json.load(annot)
        try:
            g_t = content['meta']['clinical']['benign_malignant']
        except KeyError:
            no_label_count+=1
            pass
        
        img = get_image(img_path)
        images.append(img)
        if g_t == 'benign':
            labels.append(0)
        else:
            labels.append(1)
            
images = np.array(images)
labels = np.array(labels)

23906it [50:53,  7.83it/s]


In [None]:
from keras.utils import np_utils

# one hot encoded vectors
num_classes = 2
g_t_cat = np_utils.to_categorical(g_t,num_classes)
print(g_t_cat.shape)

In [20]:
c = []
n = []


cnt = None
number = 0
no_label_count = 0

for name, img_path in tqdm(zip(annot_names[:cnt], image_names[:cnt])):
    with open(name, "rb") as annot:
        content = json.load(annot)
        try:
            g_t = content['meta']['clinical']['benign_malignant']
        except KeyError:
            no_label_count+=1
            pass
        
        if g_t == 'benign':
            img = cv2.imread(img_path) 
            cv2.imwrite(NORMAL_FOLDER + '/norm' + str(number) + ".jpeg", img)
            n.append(g_t)
            img=None
        else:
            img = cv2.imread(img_path) 
            cv2.imwrite(CANCER_FOLDER + '/can' + str(number) + ".jpeg", img)
            c.append(g_t)
            img=None
    number+=1

len(n), len(c)

23906it [1:21:40,  4.88it/s]


(21277, 2629)

In [None]:
no_label_count

In [16]:
NORMAL2629_FOLDER = "{}/normal_2629".format(NEW_DATASET_PATH)
NORMAL2629_FOLDER

'C:\\Users\\Hasib\\Desktop\\works\\lesion-learnt\\dataset/ISIC_ARCHIVE/normal_2629'

In [18]:
c = []
n = []

cnt = None
number = 0
no_label_count = 0

for name, img_path in tqdm(zip(annot_names[17500:cnt], image_names[17500:cnt])):
    with open(name, "rb") as annot:
        content = json.load(annot)
        try:
            g_t = content['meta']['clinical']['benign_malignant']
        except KeyError:
            pass
        
        if no_label_count == 2630:
            break
        else:
            if g_t == 'benign':
                #print(g_t)
                no_label_count+=1
                img = cv2.imread(img_path) 
                cv2.imwrite(NORMAL2629_FOLDER + '/benign' + str(number) + ".jpeg", img)
                n.append(g_t)
            else:
                pass
            
    number+=1
    

len(n), len(c), no_label_count, number


0it [00:00, ?it/s]
1it [00:00,  6.13it/s]
6it [00:00,  8.29it/s]
11it [00:00,  9.73it/s]
16it [00:00, 12.66it/s]
20it [00:00, 15.87it/s]
26it [00:00, 20.30it/s]
31it [00:01, 24.59it/s]
36it [00:01, 25.94it/s]
40it [00:01, 28.02it/s]
48it [00:01, 34.58it/s]
55it [00:01, 39.73it/s]
61it [00:01, 41.72it/s]
67it [00:01, 42.59it/s]
72it [00:01, 43.18it/s]
77it [00:02, 41.03it/s]
82it [00:02, 42.37it/s]
87it [00:02, 42.05it/s]
93it [00:02, 45.56it/s]
98it [00:02, 44.80it/s]
103it [00:02, 44.99it/s]
109it [00:02, 44.33it/s]
115it [00:02, 47.98it/s]
120it [00:03, 20.65it/s]
127it [00:03, 25.92it/s]
132it [00:03, 30.30it/s]
139it [00:03, 35.24it/s]
146it [00:03, 40.48it/s]
153it [00:03, 41.48it/s]
159it [00:04, 41.19it/s]
168it [00:04, 47.94it/s]
176it [00:04, 53.18it/s]
183it [00:04, 53.62it/s]
190it [00:04, 54.19it/s]
197it [00:04, 57.84it/s]
204it [00:04, 60.70it/s]
211it [00:05, 50.60it/s]
217it [00:05, 49.42it/s]
223it [00:05, 51.24it/s]
230it [00:05, 54.30it/s]
238it [00:05, 58.00it/s]
2

(2630, 0, 2630, 2960)


2958it [01:28, 39.70it/s]