In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
from PIL import Image
import matplotlib.image as img
import seaborn as sns
import PIL
# Root folder of the HAM10000 dataset on Kaggle.
# (os.path.join with a single argument returns it unchanged, so the value is assigned directly.)
# Local Windows alternative:
#   os.path.join('C://Users//Mi//Documents//Projets Python//Kaggle//skin cancer', 'HAM')
base_skin_dir = '../input/skin-cancer-mnist-ham10000'

Mon premier objectif avec ce notebook est de prédire les classes des différents types de taches corporelles uniquement à l'aide des images disponibles dans les données. \
Mon objectif futur sera d'améliorer la précision à l'aide des autres données disponibles. \
Je me concentre sur les réseaux de neurones convolutifs (« Convolutional Neural Networks ») dans ce notebook.

Pour les données et certaines explications : https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000

# Exploration des données et premières remarques

In [4]:
# Map every image id (file name without its extension) to the path of the
# matching .jpg file, searching one directory level below the dataset root.
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

# Human-readable label for each diagnosis code of the `dx` column.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',  # fixed: was mislabelled 'dermatofibroma'
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'}

# Load the metadata and attach, for every row, the image path, the readable
# label and an integer class code.
tile_df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))
tile_df['path'] = tile_df['image_id'].map(imageid_path_dict.get)
tile_df['cell_type'] = tile_df['dx'].map(lesion_type_dict.get)
# pd.Categorical sorts the labels, so the codes follow their alphabetical
# order. Assuming all seven dx codes occur in the metadata, 'Melanocytic nevi'
# is code 4 both before and after the label fix above.
tile_df['cell_type_idx'] = pd.Categorical(tile_df['cell_type']).codes

# `data` is an alias of tile_df (same object, not a copy).
data = tile_df
tile_df.head()

In [5]:
# Folder prefix used to build the image file names that are read later on (Kaggle input).
# Local Windows alternative: 'C://Users//Mi//Documents//Projets Python//Kaggle//skin cancer//HAM//'
path = '../input/ham1000-segmentation-and-classification/images/'

Quelques images de la base de données dont on dispose. On peut observer les différents types de cellules et leurs différences à travers des exemples.

In [6]:
# Show one sample image per lesion class on a 4 x 2 grid.
rows = 4
cols = 2
axes = []
# plt.figure() instead of plt.subplots(): the latter also creates a
# full-figure Axes that would remain visible behind the grid built below.
fig = plt.figure(figsize=(10, 10))

for clas in range(7):
    # Second row of the class (iloc[1]); looked up once and reused for both
    # the file name and the subplot title.
    sample = data[data['cell_type_idx'] == clas].iloc[1]
    image = path + sample['image_id'] + '.jpg'
    axes.append(fig.add_subplot(rows, cols, clas + 1))
    axes[-1].set_title(str(sample['cell_type']))
    # Draw on the subplot explicitly rather than on the implicit current axes.
    axes[-1].imshow(img.imread(image), aspect='auto')
fig.tight_layout()
plt.show()

In [7]:
# Class distribution, one bar per class code.
# The previous bins=6 spread 7 integer codes over 6 bins, so the two highest
# classes ended up in the same bar. Integer edges 0..n_classes give one bin per
# class, and align='left' centres each bar on its class code.
n_classes = int(data['cell_type_idx'].max()) + 1
plt.hist(data['cell_type_idx'], bins=np.arange(n_classes + 1), align='left')
plt.xticks(range(n_classes))
plt.title('Repartition des classes')
plt.show()

On va rééquilibrer notre dataset en sous-échantillonnant la classe 4 à 1200 images.

In [8]:
# Down-sample class 4 (presumably the over-represented class) to 1200 rows and
# keep every row of the other classes.
# random_state makes the draw repeatable; without it the balanced set, and
# everything computed from it, changed on every run. 2022 is the same seed
# value as the one used for the train/test split.
is_cell4 = data['cell_type_idx'] == 4
dfcell4 = data[is_cell4].sample(1200, random_state=2022)
dfbalanced = pd.concat([dfcell4, data[~is_cell4]], ignore_index=True)

In [9]:
# Number of rows per class code after balancing (one-column frame -> DataFrame.value_counts).
dfbalanced[['cell_type_idx']].value_counts()

Les classes sont maintenant mieux réparties.

# Sélection des données

On va changer la taille des photos. On prend une taille que mon ordinateur peut accepter pour le modèle.

In [10]:
import cv2

# Full file name of every image, stored next to the metadata.
dfbalanced['image'] = path + dfbalanced['image_id'] + '.jpg'

# Target size handed to cv2.resize; the resulting array shape is printed below.
shape = (60, 45)

# Read every file and resize it; the 'image' column holds exactly the
# path + image_id + '.jpg' strings, so it is iterated directly.
imagesbalanced = [
    cv2.resize(img.imread(image_file), shape)
    for image_file in dfbalanced['image']
]
print('La taille des images est: ' + str(imagesbalanced[0].shape))

## Train/Test pour notre modèle

In [11]:
from tensorflow.keras.utils import to_categorical

# Features: the resized images stacked into a single array.
X = np.asarray(imagesbalanced)
# Targets: the class codes, one-hot encoded over 7 classes.
y = to_categorical(dfbalanced['cell_type_idx'].to_numpy(), num_classes=7)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set (fixed seed for repeatability).
# The split is stratified on the class index (argmax of the one-hot rows) so
# that each class keeps the same share in the train and test subsets; an
# unstratified draw can leave the smaller classes under-represented in the
# test set.
x_train_complet, x_test, y_train_complet, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2022, stratify=y.argmax(axis=1))

In [13]:
# Display the first training image to check what the resized input looks like.
sample_ax = plt.gca()
sample_ax.imshow(x_train_complet[0])
sample_ax.set_title('Exemple de photo resize')
plt.show()

## Modèle Keras

In [127]:
from tensorflow.keras.layers import Rescaling, Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Dropout, Concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import datetime
import tensorflow as tf

num_classes = 7
inputsize = (45, 60, 3)
# Not used by the model below. An earlier draft sketched a second Dense branch
# with this input shape, merged with the image branch via Concatenate; that
# idea is not implemented here.
input2size = (6,)

# Set TensorFlow's global seed so that the random operations that depend on it
# (e.g. weight initialisation) are repeatable between runs. Same seed value as
# the train/test split.
tf.random.set_seed(2022)

# Small CNN: pixel rescaling, two convolution stages, then a dense classifier.
model = Sequential()
model.add(Rescaling(1./255, input_shape=inputsize))  # scale pixel values by 1/255
model.add(Conv2D(32, (3, 3), activation="relu", padding="same"))
model.add(MaxPooling2D())
model.add(Dropout(0.2))
model.add(Conv2D(64, (3, 3), activation="relu", padding="same"))
model.add(MaxPooling2D())
# NOTE(review): two pooling layers in a row. A Conv2D(32) that used to sit
# between them was commented out; confirm the back-to-back pooling is intended.
model.add(MaxPooling2D())
model.add(Flatten())

model.add(Dense(32, activation="relu"))
# Output layer sized from num_classes instead of a hard-coded 7.
model.add(Dense(num_classes, activation="softmax"))

# One-hot targets -> categorical cross-entropy; accuracy is tracked as a metric.
optimizer = Adam(learning_rate=0.0001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])



In [113]:
# Display the Keras summary of the model built above.
model.summary()

In [125]:
# Train for 60 epochs; validation_split=0.1 sets aside a tenth of the training
# data for validation. The returned history object is kept in `hist`.
hist = model.fit(
    x_train_complet,
    y_train_complet,
    epochs=60,
    verbose=1,
    validation_split=0.1,
)

# Vérification du modèle

## Sur le Train

In [120]:
# Per-epoch training and validation loss recorded in the fit history.
loss_train = hist.history['loss']
loss_val = hist.history['val_loss']
# Epochs are numbered from 1 on the x axis.
epochs = range(1, 1 + len(loss_train))

# Same plot as before, drawn through the Axes object instead of pyplot calls.
loss_ax = plt.gca()
loss_ax.plot(epochs, loss_train, 'g', label='Training loss')
loss_ax.plot(epochs, loss_val, 'b', label='validation loss')
loss_ax.set_title('Training and Validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

In [98]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the training set. Both the one-hot targets and the
# predicted class probabilities are reduced to class indices with argmax.
y_true = y_train_complet
y_predic = model.predict(x_train_complet).argmax(axis=-1)
confusion_matrix(y_true.argmax(axis=1), y_predic)

## Sur le Test

In [99]:
# Evaluate the trained model on the held-out test set; the model was compiled
# with categorical cross-entropy loss and the accuracy metric.
model.evaluate(x_test,y_test)

In [100]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test set. Both the one-hot targets and the predicted
# class probabilities are reduced to class indices with argmax.
y_true_test = y_test
y_predic_test = model.predict(x_test).argmax(axis=-1)
confusion_matrix(y_true_test.argmax(axis=1), y_predic_test)