#### 간략 정리
* 사용자가 피부병 사진을 업로드하면, 지속적인 예측이 가능하도록 설계
* 이 커널은 인간의 피부병을 분류하는 모델을 제작하는 커널
* 총 7개의 피부병이 존재
* 모델은 MobileNet CNN에서 fine tune을 사용
* 이 커널에서 모든 모델은 training 됨
* 주요 해결 포인트는 class imbalance(클래스 불균형)와 적은 양의 데이터
* class imbalance를 해결하기 위해 augmentation 기법을 활용하여, 한쪽으로 편향된 accuracy score에서 벗어남
* MobileNet은 크기가 작고 빨라 모델 실행 연동 시 유리 --> 차후 알파도 펫케어 어플과 연동하기 위함

### 모델 선정 전략
* 이 커널은 모델 선정 전략에 있어 중요한 가이던스를 제공
* 전체 accuracy는 60% 정도 일 수 있지만, top 3 accuracy에 대해서는 높은 정확률을 나타낼 수 있음  
  이러한 모델은 꽤 좋은 모델이라고 생각할 수 있음

#### 1. 필요한 라이브러리 로딩

In [None]:
from numpy.random import seed
seed(101)
# from tensorflow  import set_random_seed
# set_random_seed(101)

import pandas as pd
import numpy as np

import tensorflow
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

import os

from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline

#### Labels : 총 7개 class --> [Total images = 10015]
* nv : Melanocytic nevi, 6705개
* mel : Melanoma, 1113개
* bkl : Benign keratosis, 1099개
* bcc : Basal cell carcinoma, 514개
* akiec : Actinic Keratoses, 327개
* vasc : Vascular skin, 142개
* df : Dermatofibroma, 115개

In [None]:
os.listdir('../input/skin-cancer-mnist-ham10000/')

#### 디렉토리 구조 만들기
* Keras generator에 사용하기 위한 directory 구조 설계
* 다음과 같은 구조로 설계

#### train_dir
  * nv
  * mel
  * bkl
  * bcc
  * akiec
  * vasc
  * df
  
#### val_dir
  * nv
  * mel
  * bkl
  * bcc
  * akiec
  * vasc
  * df

In [None]:
# Build the folder layout read by the Keras directory iterators:
#   base_dir/train_dir/<class>/  and  base_dir/val_dir/<class>/
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')

# The seven diagnosis codes, used as class sub-folder names.
lesion_classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

# os.makedirs creates missing parents (base_dir and the split folder) and,
# with exist_ok=True, does nothing when a folder is already present, so this
# cell can be re-run without raising FileExistsError.
for split_dir in (train_dir, val_dir):
    for class_name in lesion_classes:
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

#### train, validation set 만들기

In [None]:
# Load the HAM10000 metadata table (one row per image) and preview it.
# Later cells read its 'lesion_id', 'image_id' and 'dx' columns.
df_data = pd.read_csv('../input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv')
df_data.head()

#### 층별화된 validation set 만들기

In [None]:
# Count the rows of every lesion_id. After count() each remaining column holds
# the per-lesion number of non-null values, so `image_id == 1` selects the
# lesions that have exactly one image.
lesion_counts = df_data.groupby('lesion_id').count()
df = lesion_counts.loc[lesion_counts['image_id'] == 1].reset_index()

# Preview the single-image lesions (lesion_id is an ordinary column again).
df.head()

In [None]:
# lesion_id 별로 이미지가 여러개 있는 것들과, 오직 1개만 있는 것들 확인
def identify_duplicates(x, unique_ids=None):
    """Label a lesion id by whether its lesion has more than one image.

    Args:
        x: A ``lesion_id`` value.
        unique_ids: Optional container of the lesion ids that have exactly
            one image. Pass a ``set`` built once by the caller to avoid
            rebuilding it for every row. When omitted, the ids are read from
            the module-level ``df['lesion_id']`` column on each call, which is
            what existing one-argument callers get.

    Returns:
        ``"no_duplicates"`` when ``x`` is among ``unique_ids``, otherwise
        ``"has_duplicates"``.
    """
    if unique_ids is None:
        # A set gives hash lookups instead of a linear scan through a list.
        unique_ids = set(df['lesion_id'])

    return "no_duplicates" if x in unique_ids else "has_duplicates"
    
# Mark every metadata row by whether its lesion has a single image
# ('no_duplicates') or several ('has_duplicates'). `df` holds the lesions with
# exactly one image at this point. isin() checks all rows in one vectorized
# pass instead of running a Python membership test per row.
single_image_mask = df_data['lesion_id'].isin(df['lesion_id'])
df_data['duplicates'] = single_image_mask.map(
    {True: 'no_duplicates', False: 'has_duplicates'})

df_data.head()

In [None]:
df_data['duplicates'].value_counts()

In [None]:
# Keep only the rows whose lesion has a single image.
# The validation set must not contain duplicated lesions, so it is drawn
# from this filtered table on purpose.

df = df_data[df_data['duplicates'] == 'no_duplicates']
df.shape

In [None]:
# The data set is small, so only 17% (test_size = 0.17) of the single-image
# lesions in `df` is held out as the validation/test split. The split is
# stratified on the diagnosis column 'dx'; the other 83% is discarded here.
y = df['dx']
_, df_val = train_test_split(df, test_size = 0.17, random_state = 101, stratify = y)

In [None]:
df_val.shape

In [None]:
# Per-label counts of the validation split. 'nv' is expected to dominate by a
# wide margin: the data is imbalanced as well as small.
df_val['dx'].value_counts()

#### validationSet을 제외한 나머지 데이터로 train Dataset 만들기

In [None]:
def identify_val_rows(x, val_ids=None):
    """Tell whether an image id belongs to the validation or the training split.

    Args:
        x: An ``image_id`` value; it is converted with ``str()`` before the
            lookup.
        val_ids: Optional container of validation image ids. Pass a ``set``
            built once by the caller to avoid rebuilding it for every row.
            When omitted, the ids are read from the module-level
            ``df_val['image_id']`` column on each call, which is what existing
            one-argument callers get.

    Returns:
        ``'val'`` when ``str(x)`` is among ``val_ids``, otherwise ``'train'``.
    """
    if val_ids is None:
        # A set gives hash lookups instead of a linear scan through a list.
        val_ids = set(df_val['image_id'])

    return 'val' if str(x) in val_ids else 'train'
    
    
# Tag every metadata row as 'val' (its image_id was sampled into df_val) or
# 'train' (everything else). isin() checks all rows in one vectorized pass
# instead of running a Python membership test per row. astype(str) mirrors the
# str() conversion applied to each id before the comparison.
in_validation = df_data['image_id'].astype(str).isin(df_val['image_id'])
df_data['train_or_val'] = in_validation.map({True: 'val', False: 'train'})

# The training table is every row that is not part of the validation split.
df_train = df_data[df_data['train_or_val'] == 'train']

In [None]:
print(len(df_train))
print(len(df_val))

In [None]:
df_train['dx'].value_counts()

In [None]:
df_val['dx'].value_counts()

#### 이미지를 폴더로 전송하기

In [None]:
# Make image_id the index of df_data so that a row can be looked up with
# df_data.loc[<image_id>, ...]. This removes the 'image_id' column from
# df_data in place, so running this cell a second time raises a KeyError.
df_data.set_index('image_id', inplace=True)

In [None]:
# Copy every training / validation image into <split dir>/<dx label>/ so the
# Keras directory iterators can infer each file's class from its folder.
part_1_dir = '../input/skin-cancer-mnist-ham10000/ham10000_images_part_1'
part_2_dir = '../input/skin-cancer-mnist-ham10000/ham10000_images_part_2'

# File names available in each of the two source folders.
folder_1 = os.listdir(part_1_dir)
folder_2 = os.listdir(part_2_dir)

# Map file name -> source folder once, instead of scanning both listings for
# every image. part_2 is applied last, so a name present in both folders is
# taken from part_2.
source_dir_by_name = dict.fromkeys(folder_1, part_1_dir)
source_dir_by_name.update(dict.fromkeys(folder_2, part_2_dir))

# image_id lists of the two splits.
train_list = list(df_train['image_id'])
val_list = list(df_val['image_id'])


def _copy_split(image_ids, split_dir):
    """Copy the .jpg of each image id into ``split_dir/<dx label>/``.

    Ids whose file is in neither source folder are skipped silently.
    """
    for image in image_ids:
        fname = image + '.jpg'
        # Expects df_data to be indexed by image_id at this point.
        label = df_data.loc[image, 'dx']

        source_dir = source_dir_by_name.get(fname)
        if source_dir is None:
            continue

        shutil.copyfile(os.path.join(source_dir, fname),
                        os.path.join(split_dir, label, fname))


_copy_split(train_list, train_dir)
_copy_split(val_list, val_dir)

In [None]:
# Number of files in each class folder of the training split.
for class_folder in ('base_dir/train_dir/nv',
                     'base_dir/train_dir/mel',
                     'base_dir/train_dir/bkl',
                     'base_dir/train_dir/bcc',
                     'base_dir/train_dir/akiec',
                     'base_dir/train_dir/vasc',
                     'base_dir/train_dir/df'):
    file_names = os.listdir(class_folder)
    print(len(file_names))

In [None]:
# Number of files in each class folder of the validation split.
for class_folder in ('base_dir/val_dir/nv',
                     'base_dir/val_dir/mel',
                     'base_dir/val_dir/bkl',
                     'base_dir/val_dir/bcc',
                     'base_dir/val_dir/akiec',
                     'base_dir/val_dir/vasc',
                     'base_dir/val_dir/df'):
    file_names = os.listdir(class_folder)
    print(len(file_names))

#### aug_dir 디렉토리로 train dataSet 복사

In [None]:
# Oversample the minority classes: for every class listed here, augmented
# variants of its training images are generated and written straight into that
# class's training folder. 'nv' is not in the list and is left as it is.
class_list = ['mel','bkl','bcc','akiec','vasc','df']

# Scratch folder: flow_from_directory needs <root>/<sub-folder>/<files>, so
# the images of the class being processed are staged in aug_dir/img_dir.
aug_dir = 'aug_dir'
img_dir = os.path.join(aug_dir, 'img_dir')

# Images produced per generator step.
batch_size = 50

# Rough target for the number of files per class after augmentation.
num_aug_images_wanted = 6000

# Random transformations applied to each generated image. The configuration
# is the same for every class, so it is created once.
datagen = ImageDataGenerator(
    rotation_range    = 180,
    width_shift_range = 0.1,
    height_shift_range= 0.1,
    zoom_range        = 0.1,
    horizontal_flip   = True,
    vertical_flip     = True,
    fill_mode         ='nearest')

for item in class_list:

    img_class = item
    path = aug_dir
    save_path = 'base_dir/train_dir/' + img_class

    # Start from an empty scratch folder. A leftover aug_dir from an
    # interrupted run would otherwise mix another class's images into this
    # class's augmented output.
    if os.path.isdir(aug_dir):
        shutil.rmtree(aug_dir)
    os.makedirs(img_dir)

    try:
        # Stage the current training images of this class.
        img_list = os.listdir(save_path)
        for fname in img_list:
            src = os.path.join(save_path, fname)
            dst = os.path.join(img_dir, fname)
            shutil.copyfile(src, dst)

        # Every batch drawn from this iterator is also saved as .jpg files
        # into the class's training folder (save_to_dir).
        aug_datagen = datagen.flow_from_directory(path,
                                                  save_to_dir = save_path,
                                                  save_format = 'jpg',
                                                  target_size = (224,224),
                                                  batch_size  = batch_size)

        # Draw enough batches to bring the class to roughly the target size.
        # The total is approximate: it is rounded up to whole batches.
        num_files = len(os.listdir(img_dir))
        num_batches = int(np.ceil((num_aug_images_wanted - num_files) / batch_size))

        for i in range(num_batches):
            # The images are written to disk as a side effect of next().
            # imgs / labels of the last batch stay available afterwards.
            imgs, labels = next(aug_datagen)
    finally:
        # Always remove the scratch folder, even if augmentation failed.
        shutil.rmtree(aug_dir)

In [None]:
# Number of files per training class folder, augmented images included.
for class_folder in ('base_dir/train_dir/nv',
                     'base_dir/train_dir/mel',
                     'base_dir/train_dir/bkl',
                     'base_dir/train_dir/bcc',
                     'base_dir/train_dir/akiec',
                     'base_dir/train_dir/vasc',
                     'base_dir/train_dir/df'):
    file_names = os.listdir(class_folder)
    print(len(file_names))

##### 50개의 증식된 이미지 시각화

In [None]:
def plots(ims, figsize=(12,6), rows=5, interp=False, titles=None): # 12,6
    """Draw a batch of images on a grid with ``rows`` rows.

    Args:
        ims: Sequence of images. When the first element is a NumPy array the
            whole batch is cast to ``uint8``; if its last axis does not have
            size 3, the axes are reordered with ``transpose((0, 2, 3, 1))``
            (presumably channels-first to channels-last).
        figsize: Figure size passed to ``plt.figure``.
        rows: Number of grid rows; the column count is derived from it.
        interp: When true, ``interpolation=None`` is passed to ``imshow``;
            otherwise ``'none'``.
        titles: Optional sequence with one title per image.

    Returns:
        None. Nothing is drawn for an empty batch.
    """
    if len(ims) == 0:
        return

    if isinstance(ims[0], np.ndarray):
        ims = np.array(ims).astype(np.uint8)
        if (ims.shape[-1] != 3):
            ims = ims.transpose((0,2,3,1))
    f = plt.figure(figsize=figsize)

    # Enough columns for rows * cols >= len(ims). Deciding on the parity of
    # len(ims) under-sizes the grid for counts such as 12 images on 5 rows.
    cols = int(np.ceil(len(ims) / rows))
    for i, image in enumerate(ims):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=16)
        sp.imshow(image, interpolation=None if interp else 'none')

plots(imgs, titles=None) # titles=labels will display the image labels

### generator set up 하기

In [None]:
train_path = 'base_dir/train_dir'
valid_path = 'base_dir/val_dir'

# NOTE(review): len(df_train) counts metadata rows, i.e. the images copied
# before augmentation. The augmented files written into train_path are not
# included, so train_steps batches cover only part of that folder per epoch.
# Confirm this is intended.
num_train_samples = len(df_train)  # number of training rows
num_val_samples   = len(df_val)    # number of validation rows
train_batch_size  = 10             # images per training batch
val_batch_size    = 10             # images per validation batch
image_size        = 224            # images are resized to 224 x 224

# Batches per epoch, rounded up so a final partial batch is counted too.
# np.ceil returns a float, so it is cast to the int a step count should be.
train_steps       = int(np.ceil(num_train_samples / train_batch_size))
val_steps         = int(np.ceil(num_val_samples / val_batch_size))

In [None]:
# A single generator serves all splits: it applies no augmentation, only
# MobileNet's own input preprocessing function.
mobilenet_preprocess = tensorflow.keras.applications.mobilenet.preprocess_input
datagen = ImageDataGenerator(preprocessing_function = mobilenet_preprocess)

# Every image is resized to a square of image_size pixels.
target = (image_size, image_size)

# Training batches, read from the training folder.
train_batches = datagen.flow_from_directory(train_path,
                                            target_size = target,
                                            batch_size  = train_batch_size)

# Validation batches, read from the validation folder.
valid_batches = datagen.flow_from_directory(valid_path,
                                            target_size = target,
                                            batch_size  = val_batch_size)

# The "test" iterator reads the same validation folder one image at a time
# and without shuffling, so its outputs stay in a fixed order that can be
# compared with test_batches.classes.
test_batches = datagen.flow_from_directory(valid_path,
                                           target_size = target,
                                           batch_size  = 1,
                                           shuffle     = False)

### MobileNet 수정하기

In [None]:
# Instantiate MobileNet with its default arguments (per the Keras docs this
# presumably means ImageNet weights and the original classifier top; confirm).
mobile = tensorflow.keras.applications.mobilenet.MobileNet()

In [None]:
mobile.summary()

In [None]:
type(mobile.layers)

In [None]:
# mobileNet이 가지고 있는 layer 수 : 93
len(mobile.layers)

### fine tuning.
* 마지막 5개 layer 제거
* global_average_pooling2d_1 까지의 모든 레이어는 그대로 유지
* 마지막 dense layer의 class --> 7
* 0.25 값의 dropout 포함시킴

In [None]:
# Cut MobileNet at its 6th layer from the end and attach a new head.
# NOTE(review): the intent is to branch off the global-average-pooling layer;
# which layer sits at index -6 depends on the Keras version's MobileNet layer
# list. Confirm with mobile.summary() that this output is a flat feature vector.
x = mobile.layers[-6].output
x = Dropout(0.25)(x)

# New 7-way softmax classifier on top of the dropout output.
predictions  = Dense(7, activation = 'softmax')(x)
model = Model(inputs = mobile.input, outputs = predictions)

In [None]:
model.summary()

In [None]:
# Choose which layers are trained:
# every layer except the last 23 is frozen; the last 23 keep their default
# trainable state.

for layer in model.layers[:-23]:
    layer.trainable = False

### model training 시키기

In [None]:
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy 

def top_3_accuracy(y_true, y_pred):
    """Return Keras' top-k categorical accuracy with k = 3.

    The function name doubles as the metric name: the training history is
    read with the key 'top_3_accuracy' and the callbacks monitor
    'val_top_3_accuracy', so renaming it would break those lookups.
    """
    return top_k_categorical_accuracy(y_true, y_pred, k = 3)

def top_2_accuracy(y_true, y_pred):
    """Return Keras' top-k categorical accuracy with k = 2.

    The function name doubles as the metric name: the training history is
    read with the keys 'top_2_accuracy' / 'val_top_2_accuracy', so renaming
    it would break those lookups.
    """
    return top_k_categorical_accuracy(y_true, y_pred, k = 2)

In [None]:
# Compile with Adam at learning rate 0.01 and categorical cross-entropy.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
# Reported metrics: plain, top-2 and top-3 categorical accuracy.
model.compile(Adam(learning_rate=0.01), loss = 'categorical_crossentropy',
             metrics = [categorical_accuracy, top_2_accuracy, top_3_accuracy]
             )

### model 생성
* melanoma에 좀 더 민감하도록 모델 생성

In [None]:
print(valid_batches.class_indices)

In [None]:
# Loss weight per class index: every class counts once, except index 4,
# which counts three times. Index 4 is expected to be 'mel' (melanoma);
# check it against the class_indices mapping printed above.
class_weights = {class_index: 1.0 for class_index in range(7)}
class_weights[4] = 3.0

In [None]:
# Save the weights to model.h5 whenever validation top-3 accuracy reaches a
# new maximum (save_best_only with mode 'max').
filepath   = "model.h5"
checkpoint = ModelCheckpoint(  filepath, monitor = 'val_top_3_accuracy'
                             , verbose = 1
                             , save_best_only =  True
                             , mode = 'max')

# Halve the learning rate when validation top-3 accuracy has not improved for
# 2 epochs, but never go below min_lr.
reduce_lr = ReduceLROnPlateau( monitor = 'val_top_3_accuracy'
                              , factor = 0.5
                              , patience = 2
                              , verbose = 1
                              , mode = 'max'
                              , min_lr = 0.00001)

callbacks_list = [checkpoint, reduce_lr]

# Model.fit accepts generators directly; fit_generator is a deprecated alias
# for it. The step counts are cast to int in case they were computed as floats.
history = model.fit(
        train_batches,
        steps_per_epoch = int(train_steps),
        class_weight    = class_weights,
        validation_data = valid_batches,
        validation_steps = int(val_steps),
        epochs = 30,
        verbose = 1,
        callbacks = callbacks_list
)

In [None]:
model.metrics_names

In [None]:
# Evaluate with the weights left by the final training epoch.
# Model.evaluate accepts generators directly (evaluate_generator is a
# deprecated alias); verbose = 0 keeps the output limited to the prints below.
# test_batches yields one image per step, so len(df_val) steps make one pass.

val_loss, val_cat_acc, val_top_2_acc, val_top_3_acc = \
model.evaluate(test_batches
               , steps = len(df_val)
               , verbose = 0)

print('val_loss', val_loss)
print('val_cat_acc', val_cat_acc)
print('val_top_2_acc:', val_top_2_acc)
print('val_top_3_acc:', val_top_3_acc)

In [None]:
# Evaluate with the checkpointed weights in model.h5, i.e. the weights of the
# epoch that scored best on the monitored validation metric.
# Model.evaluate accepts generators directly (evaluate_generator is a
# deprecated alias); verbose = 0 keeps the output limited to the prints below.

model.load_weights('model.h5')
val_loss, val_cat_acc, val_top_2_acc, val_top_3_acc = \
model.evaluate(test_batches,
               steps=len(df_val),
               verbose=0)

print('val_loss:', val_loss)
print('val_cat_acc:', val_cat_acc)
print('val_top_2_acc:', val_top_2_acc)
print('val_top_3_acc:', val_top_3_acc)

### training curve plot

In [None]:
# Plot loss and accuracy curves, one figure per metric.

import matplotlib.pyplot as plt

# Per-epoch series recorded during training.
loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['categorical_accuracy']
val_acc = history.history['val_categorical_accuracy']
train_top2_acc = history.history['top_2_accuracy']
val_top2_acc = history.history['val_top_2_accuracy']
train_top3_acc = history.history['top_3_accuracy']
val_top3_acc = history.history['val_top_3_accuracy']
epochs = range(1, len(acc) + 1)

# (training series, validation series, training label, validation label, title)
curves = [
    (loss, val_loss,
     'Training loss', 'Validation loss',
     'Training and validation loss'),
    (acc, val_acc,
     'Training cat acc', 'Validation cat acc',
     'Training and validation cat accuracy'),
    (train_top2_acc, val_top2_acc,
     'Training top2 acc', 'Validation top2 acc',
     'Training and validation top2 accuracy'),
    (train_top3_acc, val_top3_acc,
     'Training top3 acc', 'Validation top3 acc',
     'Training and validation top3 accuracy'),
]

for position, (train_series, val_series, train_label, val_label, heading) in enumerate(curves):
    # The first curve goes on the implicit current figure; every following
    # curve opens a new one.
    if position > 0:
        plt.figure()
    plt.plot(epochs, train_series, 'bo', label=train_label)
    plt.plot(epochs, val_series, 'b', label=val_label)
    plt.title(heading)
    plt.legend()

plt.show()

### Create Confusion Matrix

In [None]:
test_labels  = test_batches.classes

In [None]:
test_batches.class_indices

In [None]:
# Class probabilities for every test image. Model.predict accepts generators
# directly; predict_generator is a deprecated alias for it.
# test_batches yields one image per step, so len(df_val) steps make one pass.
predictions = model.predict(test_batches, steps=len(df_val), verbose=1)
predictions.shape

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: Square NumPy array; rows are true labels, columns are predicted
            labels.
        classes: Tick labels, one per row/column of ``cm``.
        normalize: When true, every row is divided by its sum before
            printing and plotting. Rows that sum to zero (a class with no
            true samples) are shown as zeros instead of NaN.
        title: Plot title.
        cmap: Matplotlib colormap for the heat map.

    Returns:
        None. The matrix is printed and drawn on the current figure.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; empty rows stay 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum are drawn in white text so the number
    # stays readable on the dark end of the colormap.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Confusion matrix of the true class indices against the predicted class
# index (arg-max over each row of predicted probabilities).
cm = confusion_matrix(test_labels, predictions.argmax(axis=1))

In [None]:
test_labels.shape

In [None]:
test_batches.class_indices

In [None]:
# Class names ordered by their class index, taken from the iterator itself so
# the tick labels cannot drift out of sync with the indices used in `cm`.
class_indices = test_batches.class_indices
cm_plot_labels = sorted(class_indices, key=class_indices.get)

plot_confusion_matrix(cm, cm_plot_labels, title='Confusion Matrix')

### Classification Report

In [None]:
# np.argmax returns, along the given axis, the index of the largest value.
# Here: for every test image, the index of the class with the highest
# predicted probability.
y_pred = np.argmax(predictions, axis=1)

# Get the labels of the test images.
y_true = test_batches.classes

In [None]:
from sklearn.metrics import classification_report

# Generate a classification report from the true and predicted class
# indices; cm_plot_labels supplies the display name of each class.
report = classification_report(y_true, y_pred, target_names=cm_plot_labels)

print(report)