#### Extraction features avec VGG16

In [1]:
%matplotlib inline
import numpy as np

import matplotlib.pyplot as plt
import cv2
import seaborn as sns

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 

In [2]:
# Load the saved VGG16-based model from 'vgg16_8'
# (original author's note: the model trained on its last 8 layers).
model = load_model('vgg16_8')

In [3]:
# Display the layer-by-layer architecture of the loaded model.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, None, None, 512)   14714688  
_________________________________________________________________
global_average_pooling2d_1 ( (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dense_5 (Dense)              (None, 512)               524800    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_7 (Dense)              (None, 8)                 2056      
Total params: 15,898,184
Trainable params: 15,898,184
Non-trainable params: 0
__________________________________________

In [4]:
# Directory passed to flow_from_dataframe as the root for the image filenames.
src_img = '../images_mendeley/dataset_redim/'

#### Récupération de la base d'images et création des jeux d'entraînement, de test et d'évaluation

In [5]:
# Rebuild the same splits as before: train, test and eval.
# Both splits use the same proportion and seed; the second one carves the
# test set out of what remains after the eval set has been removed.
data = pd.read_csv('../mendeley_cells_redim.csv', index_col=0)
split_options = dict(test_size=0.2, random_state=123)
data_train, data_eval = train_test_split(data, **split_options)
data_train, data_test = train_test_split(data_train, **split_options)

In [6]:
# Use an ImageDataGenerator so the images do not have to be loaded in memory.
# shuffle=False keeps the sample order fixed, so the classes can be matched
# back to the samples afterwards.
data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

# Options shared by the train and eval generators.
flow_options = dict(
    directory=src_img,
    x_col='filename',
    y_col='category',
    class_mode='sparse',
    target_size=(224, 224),
    batch_size=32,
    shuffle=False,
)

train_generator = data_generator.flow_from_dataframe(data_train, **flow_options)

eval_generator = data_generator.flow_from_dataframe(data_eval, **flow_options)

Found 10938 validated image filenames belonging to 8 classes.
Found 3419 validated image filenames belonging to 8 classes.


##### Extraction de features

In [7]:
# Truncate the network at model.layers[2]; per the model summary this is the
# first Dense layer (dense_4, 1024 outputs). Its activations are the features.
feature_layer = model.layers[2]
intermediate_layer_model = Model(inputs=model.input, outputs=feature_layer.output)

# Run the truncated model over the train generator first, then the eval one.
X_train_features, X_eval_features = [
    intermediate_layer_model.predict(generator)
    for generator in (train_generator, eval_generator)
]

In [8]:
# Class name of every sample, in generator order (the generators were created
# with shuffle=False, so this order matches the rows of the feature arrays).
class_indices = train_generator.class_indices

# The index -> name lookup below is built from the train generator only, so it
# is valid for the eval generator only if both use the same name -> index mapping.
assert eval_generator.class_indices == class_indices, \
    "train and eval generators disagree on the class -> index mapping"

# Order the names by their integer index explicitly instead of relying on the
# iteration order of the class_indices dict.
lst_classes = sorted(class_indices, key=class_indices.get)
train_class_name = [lst_classes[i] for i in train_generator.classes]
eval_class_name = [lst_classes[i] for i in eval_generator.classes]

#### Modèle SVC avec features

In [9]:
# Fit an SVC (C=100) on the extracted training features, then display its
# score on the evaluation features as the cell output.
svm = SVC(C=100).fit(X_train_features, train_class_name)
svm.score(X_eval_features, eval_class_name)


0.9853758408891489

#### Modèle RandomForest avec features

In [11]:

# Random forest on the same extracted features, scored on the evaluation set.
# random_state is fixed (same seed as the data splits) so the forest, and
# therefore the displayed score, is reproducible from one run to the next.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=123)
rf_clf.fit(X_train_features, train_class_name)
rf_clf.score(X_eval_features, eval_class_name)

0.9850833577069319

> Meilleur score SVC => étude plus en détail des prédictions

In [12]:
# SVC predictions for the evaluation features; used by the classification
# report and the confusion matrix further down.
predict = svm.predict(X_eval_features)

In [13]:
# Per-class precision / recall / f1 of the SVC predictions on the eval set.
# The report is a multi-line string, so it is printed rather than displayed.
report = metrics.classification_report(y_true=eval_class_name, y_pred=predict)
print(report)

              precision    recall  f1-score   support

    basophil       1.00      0.99      0.99       222
  eosinophil       1.00      1.00      1.00       627
erythroblast       0.98      0.99      0.98       311
          ig       0.96      0.98      0.97       590
  lymphocyte       0.96      0.98      0.97       236
    monocyte       0.99      0.95      0.97       302
  neutrophil       0.99      0.99      0.99       682
    platelet       1.00      1.00      1.00       449

    accuracy                           0.99      3419
   macro avg       0.98      0.98      0.98      3419
weighted avg       0.99      0.99      0.99      3419



In [14]:
# Randomized search for better SVC hyper-parameters.
# loguniform is taken from scipy.stats: sklearn.utils.fixes only exposed it as
# a compatibility shim, which recent scikit-learn releases no longer provide.
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

# Search space: C and gamma are sampled log-uniformly, the kernel stays RBF,
# and class weighting is tried both on and off.
distributions = {'C': loguniform(1e0, 1e3),
 'gamma': loguniform(1e-4, 1e-3),
 'kernel': ['rbf'],
 'class_weight':['balanced', None]}

# random_state fixes the sampled candidates so the search (and the reported
# best parameters) can be reproduced on a fresh run.
clf = RandomizedSearchCV(svm, distributions, random_state=123)
search = clf.fit(X_train_features, train_class_name)
search.best_params_

{'C': 178.79439651520408,
 'class_weight': None,
 'gamma': 0.0001739428408958799,
 'kernel': 'rbf'}

In [16]:
# Score the fitted randomized-search object on the evaluation features.
clf.score(X_eval_features,eval_class_name)

0.9847908745247148

> Pas d'amélioration du score par la recherche des meilleurs paramètres

##### Matrice de confusion

In [17]:
# Confusion matrix of the SVC predictions: rows are the true classes,
# columns are the predicted classes.
actual = pd.Series(eval_class_name, name='Réalité')
predicted = pd.Series(predict, name='Prédiction')
pd.crosstab(actual, predicted)

Prédiction,basophil,eosinophil,erythroblast,ig,lymphocyte,monocyte,neutrophil,platelet
Réalité,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basophil,220,0,0,1,0,0,1,0
eosinophil,0,625,0,2,0,0,0,0
erythroblast,0,0,307,2,2,0,0,0
ig,0,0,2,577,1,3,7,0
lymphocyte,0,0,2,2,232,0,0,0
monocyte,1,1,1,4,6,288,1,0
neutrophil,0,0,0,10,0,0,672,0
platelet,0,0,1,0,0,0,0,448


> Précision (≈ 0.96) et rappel (≈ 0.98) des IG en retrait par rapport à la plupart des autres classes : confusions surtout avec neutrophil et monocyte
>
> Toujours le problème de classification des IG