# Préparation du dataset

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
from os import listdir
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve
from glob import glob

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, GlobalAveragePooling1D, Flatten, Dense, Dropout, experimental 
#from tensorflow.keras.layers import Rescaling, RandomFlip, RandomRotation, RandomZoom
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical

# os.environ["TF_KERAS"]='1'
print("La version de tensorflow utilisé est:",tf.version.VERSION)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

La version de tensorflow utilisé est: 2.1.0
Num GPUs Available:  1


In [30]:
df = pd.read_csv('data/cleaned/description_cleaned_spacy.csv')
df.head()

Unnamed: 0,uniq_id,image,description,description_clean,cat_1
0,55b85ea15a1536d46b7190ad6fff8ce7,55b85ea15a1536d46b7190ad6fff8ce7.jpg,Key Features of Elegance Polyester Multicolor ...,"['key', 'feature', 'elegance', 'polyester', 'm...",home furnishing
1,7b72c92c2f6c40268628ec5f14c6d590,7b72c92c2f6c40268628ec5f14c6d590.jpg,Specifications of Sathiyas Cotton Bath Towel (...,"['specification', 'sathiyas', 'cotton', 'bath'...",baby care
2,64d5d4a258243731dc7bbb1eef49ad74,64d5d4a258243731dc7bbb1eef49ad74.jpg,Key Features of Eurospa Cotton Terry Face Towe...,"['key', 'feature', 'eurospa', 'cotton', 'terry...",baby care
3,d4684dcdc759dd9cdf41504698d737d8,d4684dcdc759dd9cdf41504698d737d8.jpg,Key Features of SANTOSH ROYAL FASHION Cotton P...,"['key', 'feature', 'santosh', 'royal', 'fashio...",home furnishing
4,6325b6870c54cd47be6ebfbffa620ec7,6325b6870c54cd47be6ebfbffa620ec7.jpg,Key Features of Jaipur Print Cotton Floral Kin...,"['key', 'feature', 'jaipur', 'print', 'cotton'...",home furnishing


In [31]:
import pickle 
from sklearn import preprocessing

try:
    with open('data/cleaned/description_cleaned_spacy.pkl', 'rb') as f1:
        data = pickle.load(f1)
except:
    data = pd.read_csv('data/cleaned/description_cleaned_spacy.csv')

data = data[['image','cat_1']].rename(columns={"cat_1": "label_name"})

# Definir la liste des catégories
le = preprocessing.LabelEncoder()
data["label"] = le.fit_transform(data["label_name"])

# Récuperer la liste des libellé des catégories
list_labels = data.label_name.unique()

# Definir le nombre de catégories
NBCLASS_ = len(data.label_name.unique())

data

Unnamed: 0,image,label_name,label
0,55b85ea15a1536d46b7190ad6fff8ce7.jpg,home furnishing,4
1,7b72c92c2f6c40268628ec5f14c6d590.jpg,baby care,0
2,64d5d4a258243731dc7bbb1eef49ad74.jpg,baby care,0
3,d4684dcdc759dd9cdf41504698d737d8.jpg,home furnishing,4
4,6325b6870c54cd47be6ebfbffa620ec7.jpg,home furnishing,4
...,...,...,...
1045,958f54f4c46b53c8a0a9b8167d9140bc.jpg,baby care,0
1046,fd6cbcc22efb6b761bd564c28928483c.jpg,baby care,0
1047,5912e037d12774bb73a2048f35a00009.jpg,baby care,0
1048,c3edc504d1b4f0ba6224fa53a43a7ad6.jpg,baby care,0


# Recuprer la liste de path des train et test files

In [32]:
from glob import glob
path_train = 'data/cleaned/Images/train/'
path_test = 'data/cleaned/Images/test/'

data_train_path = glob(path_train+'*/*.jp*')
data_test_path = glob(path_test+'*/*.jp*')

def data_fct(path) :
    data = pd.DataFrame()
    data["image_path"] = path
    data["image_path"] = data["image_path"].str.replace('\\','/') #transforme les \\ en /
    data["label_name"] = data["image_path"].str.split('/',expand=True)[4] #
    return data

data_train = data_fct(data_train_path)
data_test = data_fct(data_test_path)


In [33]:
data_train['label'] = data_train.label_name.map({'baby_care':0, 'beauty_and_personal_care':1, 'computers':2,
       'home_decor_&_festive_needs':3, 'home_furnishing':4, 'kitchen_&_dining':5, 'watches':6})

data_test['label'] = data_test.label_name.map({'baby_care':0, 'beauty_and_personal_care':1, 'computers':2,
       'home_decor_&_festive_needs':3, 'home_furnishing':4, 'kitchen_&_dining':5, 'watches':6})

data_train.sample(10)

Unnamed: 0,image_path,label_name,label
360,data/cleaned/Images/train/Home_Decor_&_Festive...,Home_Decor_&_Festive_Needs,
119,data/cleaned/Images/train/Beauty_and_Personal_...,Beauty_and_Personal_Care,
507,data/cleaned/Images/train/Kitchen_&_Dining/0c7...,Kitchen_&_Dining,
654,data/cleaned/Images/train/Watches/8748b6cd9f03...,Watches,
132,data/cleaned/Images/train/Beauty_and_Personal_...,Beauty_and_Personal_Care,
530,data/cleaned/Images/train/Kitchen_&_Dining/503...,Kitchen_&_Dining,
150,data/cleaned/Images/train/Beauty_and_Personal_...,Beauty_and_Personal_Care,
366,data/cleaned/Images/train/Home_Decor_&_Festive...,Home_Decor_&_Festive_Needs,
239,data/cleaned/Images/train/Computers/6e7cc21610...,Computers,
644,data/cleaned/Images/train/Watches/700dfd088162...,Watches,


In [34]:
data_test.label_name.value_counts()

Home_Decor_&_Festive_Needs    50
Watches                       50
Baby_Care                     50
Kitchen_&_Dining              50
Home_Furnishing               50
Computers                     50
Beauty_and_Personal_Care      50
Name: label_name, dtype: int64

In [35]:
data.groupby("label").count()

Unnamed: 0_level_0,image,label_name
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,150,150
1,150,150
2,150,150
3,150,150
4,150,150
5,150,150
6,150,150


# Etape 1 : étude de faisabilité

## Création du modèle pré-entraîné

In [36]:
base_model = VGG16()
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

print(model.summary())

ResourceExhaustedError: OOM when allocating tensor with shape[25088,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Mul] name: fc1/kernel/Initializer/random_uniform/mul/

## Création des features des images

In [None]:
images_features = []
i=0 
for image_file in data["image"] :
    if i%100 == 0 : print(i)
    i +=1
    image = load_img('data/source/raw_folder_image/'+image_file, target_size=(224, 224))
    image = img_to_array(image) 
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    images_features.append(model.predict(image, verbose=0)[0]) # predict from pretrained model

images_features = np.asarray(images_features)
images_features.shape

## Réduction dimension et analyse

### Réduction de dimension PCA

In [None]:
from sklearn import manifold, decomposition

print(images_features.shape)
pca = decomposition.PCA(n_components=0.99)
feat_pca= pca.fit_transform(images_features)
print(feat_pca.shape)

### Réduction de dimension T-SNE et affichage des images selon vraies classes

In [None]:
from sklearn import manifold, decomposition
import time

temps1 = time.time()

tsne = manifold.TSNE(n_components=2, perplexity=50, n_iter=2000, init='random', random_state=6)
X_tsne = tsne.fit_transform(feat_pca)

duration1=time.time()-temps1
print("temps de T-SNE : ", "%15.2f" % duration1, "secondes")

In [None]:
df_tsne = pd.DataFrame(X_tsne, columns=['tsne1', 'tsne2'])
df_tsne["class"] = data["label_name"]

plt.figure(figsize=(12,10))
sns.scatterplot(
    x="tsne1", y="tsne2",
    hue="class",
    palette=sns.color_palette('tab10', n_colors= NBCLASS_), s=50, alpha=0.6,
    data=df_tsne,
    legend="brief"
)

plt.title('TSNE selon les vraies classes', fontsize = 30, pad = 35, fontweight = 'bold')
plt.xlabel('tsne1', fontsize = 26, fontweight = 'bold')
plt.ylabel('tsne2', fontsize = 26, fontweight = 'bold')
plt.legend(prop={'size': 14}) 

plt.show()


* L'analyse graphique montre visuellement qu'il est réalisable de séparer automatiquement les images selon leurs vraies classes
* Ceci suffit à démontrer la faisabilité de réaliser ultérieurement une classification supervisée pour déterminer automatiquement les classes des images
* Cette étape 1 est très rapide à mettre en oeuvre. Une conclusion négative sur la faisabilité aurait éviter de réaliser des traitements beaucoup plus lourd de classification supervisée
* Cette démarche en 2 étapes (1. Faisabilité, 2. Classification supervisée si étape 1 OK) s'inscrit dans une démarche agile de tout projet Data

### Création de clusters à partir du T-SNE et affichage des images selon clusters
* Attention : ici, il ne s'agit pas de faire une classification non supervisée, mais simplement, par une mesure de l'ARI, de conforter l'analyse graphique précédente qui démontre la faisabilité de réaliser ultérieurement une classification supervisée. Cette mesure de l'ARI nécessite de créer des clusters théoriques via KMeans
* Il s'agit donc de réaliser une mesure de ce que nous voyons graphiquement, donc à partir des données en sortie du t-sne
* Pour réaliser une classification non supervisée, il aurait fallu repartir des données avant t-sne
* Dans la démarche en 2 étapes, il n'est pas utile de réaliser une classification non supervisée, une classification supervisée est bien plus performante. Même le calcul de l'ARI n'est pas indispensable, nous pourrions passer directement du graphique t-sne précédent à l'étape 2 de classification supervisée
* Il n'est donc pas utile de passer du temps à optimiser l'ARI, un ordre de grandeur suffit pour conforter le 1er graphique t-sne. D'ailleurs la meilleure solution de feature engineering ne génère pas toujours le meilleur ARI. L'analyse graphique t-sne est bien plus riche d'enseignement


In [None]:
from sklearn import cluster, metrics

cls = cluster.KMeans(n_clusters= NBCLASS_ , n_init=100)
cls.fit(X_tsne)

In [None]:
df_tsne["cluster"] = cls.labels_

plt.figure(figsize=(12,10))
sns.scatterplot(
    x="tsne1", y="tsne2",
    hue="cluster",
    palette=sns.color_palette('tab10', n_colors= NBCLASS_ ), s=50, alpha=0.6,
    data=df_tsne,
    legend="brief")

plt.title('TSNE selon les clusters', fontsize = 30, pad = 35, fontweight = 'bold')
plt.xlabel('tsne1', fontsize = 26, fontweight = 'bold')
plt.ylabel('tsne2', fontsize = 26, fontweight = 'bold')
plt.legend(prop={'size': 14}) 

plt.show()

labels = data["label"]
print("ARI : ", metrics.adjusted_rand_score(labels, cls.labels_))

* Analyse : le modèle pré-entraîné confond "cloud" avec de la neige ...

### Analyse par classes

In [None]:
conf_mat = metrics.confusion_matrix(data.label, cls.labels_)
print(conf_mat)

In [None]:
def conf_mat_transform(y_true,y_pred) :
    conf_mat = metrics.confusion_matrix(y_true,y_pred)
    
    corresp = np.argmax(conf_mat, axis=0)
    print ("Correspondance des clusters : ", corresp)
    # y_pred_transform = np.apply_along_axis(correspond_fct, 1, y_pred)
    labels = pd.Series(y_true, name="y_true").to_frame()
    labels['y_pred'] = y_pred
    labels['y_pred_transform'] = labels['y_pred'].apply(lambda x : corresp[x]) 
    
    return labels['y_pred_transform']

In [None]:
cls_labels_transform = conf_mat_transform(labels, cls.labels_)
conf_mat = metrics.confusion_matrix(labels, cls_labels_transform)
print(conf_mat)
print()
print(metrics.classification_report(labels, cls_labels_transform))

In [None]:
df_cm = pd.DataFrame(conf_mat, index = [label for label in data.label_name.unique()],
                  columns = [i for i in "0123456"])

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5), tight_layout=True,sharey=True)
sns.heatmap(df_cm, annot=True,fmt='d', cmap="Blues",ax= ax1)
sns.heatmap(df_cm.apply(lambda x:x/x.sum(),axis=1), annot=True,fmt='.2%', cmap="Blues",ax= ax2)

* La classe la moins bien prédite est "kitchen & dining" (CF exemple ci-dessus : confond la neige avec un nuage)

# Etape 2 : classification supervisée
4 approches sont présentées :
* Une approche simple par préparation initiale de l'ensemble des images avant classification supervisée
* Une approche par data generator, permettant facilement la data augmentation. Les images sont directement récupérées à la volée dans le repertoire des images
* Une approche récente proposée par Tensorflow.org par DataSet, sans data augmentation
* Une approche par  DataSet, avec data augmentation intégrée au modèle : layer en début de modèle


## Création du modèle de classification

In [None]:
def create_model_fct() :
    # Récupération modèle pré-entraîné
    model0 = VGG16(include_top=False, weights="imagenet", input_shape=(224, 224, 3))

    # Layer non entraînables = on garde les poids du modèle pré-entraîné
    for layer in model0.layers:
        layer.trainable = False

    # Récupérer la sortie de ce réseau
    x = model0.output
    # Compléter le modèle
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(NBCLASS_, activation='softmax')(x)

    # Définir le nouveau modèle
    model = Model(inputs=model0.input, outputs=predictions)
    # compilation du modèle 
    model.compile(loss="categorical_crossentropy", optimizer= 'adam',#'rmsprop', 
                  metrics=["accuracy"])

    print(model.summary())
    
    return model
    

In [None]:
data

In [None]:
data_train

## Approche préparation initiale des images

In [None]:
def image_prep_fct(data) :
    prepared_images = []
    for image_num in range(len(data['image_path'])) :
        img = (load_img(
            data['image_path'][image_num],
            target_size=(224, 224)))
        img = img_to_array(img)
        img = img.reshape((img.shape[0], img.shape[1], img.shape[2]))
        img = preprocess_input(img)
        prepared_images.append(img)
        prepared_images_np = np.array(prepared_images)
    return prepared_images_np
    
images_np_train = image_prep_fct(data_train)
print(images_np_train.shape)
images_np_test = image_prep_fct(data_test)
print(images_np_test.shape)

In [None]:
X = images_np_train
y = to_categorical(data_train['label'])

X_test = images_np_test
y_test = to_categorical(data_test['label'])

y

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
X_train.shape

In [None]:
%%time
# Création du modèle
with tf.device('/gpu:0'): 
    model1 = create_model_fct()

# Création du callback
model1_save_path1 = "./model1_best_weights.h5"
checkpoint = ModelCheckpoint(model1_save_path1, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]


In [None]:
%%time
# Entraîner sur les données d'entraînement (X_train, y_train)
with tf.device('/gpu:0'): 
    history1 = model1.fit(X_train, y_train, epochs=50, batch_size=64, 
                       callbacks=callbacks_list, validation_data=(X_val, y_val), verbose=1)


In [None]:
# Score du dernier epoch

loss, accuracy = model1.evaluate(X_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
print()
loss, accuracy = model1.evaluate(X_val, y_val, verbose=True)
print("Validation Accuracy:  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model1.load_weights(model1_save_path1)

loss, accuracy = model1.evaluate(X_val, y_val, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model1.evaluate(X_test, y_test, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))


In [None]:
try:
    from plot_keras_history import show_history, plot_history
    import matplotlib.pyplot as plt

    show_history(history1)
    plot_history(history1, path="standard.png")
    plt.close()
except:
    epochs_range = range(11)
    plt.figure(figsize=(15, 10))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, history1.history['accuracy'], label='Training Accuracy')
    plt.plot(epochs_range, history1.history['val_accuracy'], label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, history1.history['loss'], label='Training Loss')
    plt.plot(epochs_range, history1.history['val_loss'], label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()

In [None]:
y_val_num = np.argmax(y_val, axis=1)
y_val_pred = np.argmax(model1.predict(X_val), axis=1)
y_val_num = np.argmax(y_val, axis=1)
print(y_val_num)
print()
print(y_val_pred)

In [None]:
conf_mat = metrics.confusion_matrix(y_val_num, y_val_pred)
print(conf_mat)

In [None]:
y_val_pred_transform = conf_mat_transform(y_val_num, y_val_pred)
conf_mat = metrics.confusion_matrix(y_val_num, y_val_pred_transform)
print(conf_mat)
print()
print(metrics.classification_report(y_val_num, y_val_pred_transform))

In [None]:
df_cm = pd.DataFrame(conf_mat, index = [label for label in list_labels],
                  columns = [i for i in "0123456"])
plt.figure(figsize = (6,4))
sns.heatmap(df_cm, annot=True, cmap="Blues");

</Br>

## Approche ImageDatagenerator avec data augmentation

CF https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator#flow (noté désormais comme "deprecated", incite à utiiser l'approche suivante)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Pour mélanger les images, classées initalement par classe
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
batch_size = 32

def data_flow_fct(data, datagen, data_type=None) :
    data_flow = datagen.flow_from_dataframe(data, directory='',
                                x_col='image_path', y_col='label_name',
                                weight_col=None, target_size=(224, 224),
                                classes=None, class_mode='categorical',
                                batch_size=batch_size, shuffle=True, seed=42,
                                subset=data_type
                                )
    return data_flow

In [None]:
datagen_train = ImageDataGenerator(
#    featurewise_center=True,
#    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.25,
    preprocessing_function=preprocess_input)

train_flow = data_flow_fct(data_train, datagen_train, data_type='training')
val_flow = data_flow_fct(data_train, datagen_train, data_type='validation')

datagen_test = ImageDataGenerator(
    validation_split=0,
    preprocessing_function=preprocess_input)

test_flow = data_flow_fct(data_test, datagen_test, data_type=None)

# compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied)
# datagen.fit(X_train)
# fits the model on batches with real-time data augmentation:

In [None]:
# Création du modèle
with tf.device('/gpu:0'): 
    model2 = create_model_fct()

# Création du callback
model2_save_path = "./model2_best_weights.h5"
checkpoint = ModelCheckpoint(model2_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
%%time
with tf.device('/gpu:0'): 
    history2 = model2.fit(train_flow,
                    validation_data=val_flow,
                    batch_size=batch_size, epochs=50, callbacks=callbacks_list, verbose=1)


In [None]:
# Score du dernier epoch

loss, accuracy = model2.evaluate(train_flow, verbose=True)
print("Training Accuracy   : {:.4f}".format(accuracy))
print()
loss, accuracy = model2.evaluate(val_flow, verbose=True)
print("Validation Accuracy :  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model2.load_weights(model2_save_path)

loss, accuracy = model2.evaluate(val_flow, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model2.evaluate(test_flow, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))


In [None]:
try:
    from plot_keras_history import show_history, plot_history
    import matplotlib.pyplot as plt

    show_history(history2)
    plot_history(history2, path="standard.png")
    plt.close()
except:
    epochs_range = range(len(history2.history['loss']))

    plt.figure(figsize=(15, 10))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, history2.history['accuracy'], label='Training Accuracy')
    plt.plot(epochs_range, history2.history['val_accuracy'], label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, history2.history['loss'], label='Training Loss')
    plt.plot(epochs_range, history2.history['val_loss'], label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()

In [None]:
y_test_num = np.argmax(y_test, axis=1)
y_test_pred = np.argmax(model2.predict(X_test), axis=1)
y_test_num = np.argmax(y_test, axis=1)
print(y_test_num)
print()
print(y_test_pred)

y_test_pred_transform = conf_mat_transform(y_test_num, y_test_pred)
conf_mat = metrics.confusion_matrix(y_test_num, y_test_pred_transform)

disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=list_labels)
disp.plot()
plt.rcParams['figure.figsize']=[6,6]
plt.xticks(rotation=90)
plt.show()

In [None]:
print(metrics.classification_report(y_test_num, y_test_pred_transform))

## Approche nouvelle par Dataset sans data augmentation

CF https://www.tensorflow.org/tutorials/load_data/images

In [None]:
batch_size = 32
try:
    def dataset_fct(path, validation_split=0, data_type=None) :
        dataset = tf.keras.utils.image_dataset_from_directory(
                        path, labels='inferred', label_mode='categorical',
                        class_names=None, batch_size=32, image_size=(224, 224), shuffle=True, seed=42,
                        validation_split=validation_split, subset=data_type
                        )
        return dataset
    
    # test de la création de la fonction
    dataset_fct(path_train, validation_split=0.25, data_type='training')
    
except:
    def dataset_fct(path, validation_split=0, data_type=None) :
        dataset = tf.keras.preprocessing.image_dataset_from_directory(
                        path, labels='inferred', label_mode='categorical',
                        class_names=None, batch_size=32, image_size=(224, 224), shuffle=True, seed=42,
                        validation_split=validation_split, subset=data_type
                        )
        return dataset

In [None]:
dataset_train = dataset_fct(path_train, validation_split=0.25, data_type='training')
dataset_val = dataset_fct(path_train, validation_split=0.25, data_type='validation')
dataset_test = dataset_fct(path_test, validation_split=0, data_type=None)

In [None]:
# Création du modèle
with tf.device('/gpu:0'): 
    model3 = create_model_fct()

# Création du callback
model3_save_path = "./model3_best_weights.h5"
checkpoint = ModelCheckpoint(model3_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
%%time
with tf.device('/gpu:0'): 
    history3 = model3.fit(dataset_train,
                    validation_data=dataset_val,
                    batch_size=batch_size, epochs=50, callbacks=callbacks_list, verbose=1)


In [None]:
# Score du dernier epoch

loss, accuracy = model3.evaluate(dataset_train, verbose=True)
print("Training Accuracy   : {:.4f}".format(accuracy))
print()
loss, accuracy = model3.evaluate(dataset_val, verbose=True)
print("Validation Accuracy :  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model3.load_weights(model3_save_path)

loss, accuracy = model3.evaluate(dataset_val, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model3.evaluate(dataset_test, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))


In [None]:
try:
    from plot_keras_history import show_history, plot_history
    import matplotlib.pyplot as plt

    show_history(history3)
    plot_history(history3, path="standard.png")
    plt.close()
except:
    epochs_range = range(len(history3.history['loss']))

    plt.figure(figsize=(15, 10))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, history3.history['accuracy'], label='Training Accuracy')
    plt.plot(epochs_range, history3.history['val_accuracy'], label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, history3.history['loss'], label='Training Loss')
    plt.plot(epochs_range, history3.history['val_loss'], label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()

In [None]:
y_test_pred = np.argmax(model3.predict(X_test), axis=1)

y_test_pred_transform = conf_mat_transform(y_test_num, y_test_pred)
conf_mat = metrics.confusion_matrix(y_test_num, y_test_pred_transform)

disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=list_labels)
disp.plot()
plt.rcParams['figure.figsize']=[6,6]
plt.xticks(rotation=90)
plt.show()

In [None]:
print(metrics.classification_report(y_test_num, y_test_pred_transform))

## Approche nouvelle par Dataset avec data augmentation intégrée au modèle

CF https://www.tensorflow.org/tutorials/images/data_augmentation

In [None]:
batch_size = 32

try:
    def dataset_fct(path, validation_split=0, data_type=None) :
        dataset = tf.keras.utils.image_dataset_from_directory(
                        path, labels='inferred', label_mode='categorical',
                        class_names=None, batch_size=batch_size, image_size=(224, 224), shuffle=True, seed=42,
                        validation_split=validation_split, subset=data_type
                        )
        return dataset
    
    # test de la création de la fonction
    dataset_fct(path_train, validation_split=0.25, data_type='training')
    
except:
    def dataset_fct(path, validation_split=0, data_type=None) :
        dataset = tf.keras.preprocessing.image_dataset_from_directory(
                        path, labels='inferred', label_mode='categorical',
                        class_names=None, batch_size=batch_size, image_size=(224, 224), shuffle=True, seed=42,
                        validation_split=validation_split, subset=data_type
                        )
        return dataset

In [None]:
dataset_train = dataset_fct(path_train, validation_split=0.25, data_type='training')
dataset_val = dataset_fct(path_train, validation_split=0.25, data_type='validation')
dataset_test = dataset_fct(path_test, validation_split=0, data_type=None)

In [None]:
def resize_and_rescale(image, label):
    image = tf.cast(image, tf.float32)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    image = (image / 255.0)
    return image, label



In [None]:
import importlib.util
test_spec = importlib.util.find_spec("tensorflow.keras.layers.RandomFlip")
test_spec is not None

In [None]:
def create_model_fct2() :
    import importlib.util
    test_spec = importlib.util.find_spec("tensorflow.keras.layers.RandomFlip")
    
    if test_spec is not None:
        # Data augmentation
        data_augmentation = Sequential([
            tf.keras.layers.RandomFlip("horizontal", input_shape=(224, 224, 3)), #A preprocessing layer which randomly flips images during training.
            tf.keras.layers.RandomRotation(0.1), #A preprocessing layer which randomly rotates images during training.
            tf.keras.layers.RandomZoom(0.1), # A preprocessing layer which randomly zooms images during training
            # tf.keras.layers.Rescaling(1./127.5, offset=-1.0)
          ])
        
        # Récupération modèle pré-entraîné
        model_base = VGG16(include_top=False, weights="imagenet", input_shape=(224, 224, 3))
        for layer in model_base.layers:
            layer.trainable = False

        # Définition du nouveau modèle
        model = Sequential([
                    data_augmentation,
                    
                    # To rescale an input in the [0, 255] range to be in the [0, 1] range, you would pass scale=1./255.
                    # To rescale an input in the [0, 255] range to be in the [-1, 1] range, you would pass scale=1./127.5, offset=-1.
                    tf.keras.layers.Rescaling(1./127.5, offset=-1),
                    model_base,
                    GlobalAveragePooling2D(),
                    Dense(256, activation='relu'),
                    Dropout(0.5),
                    Dense(NBCLASS_, activation='softmax')
                    ])
    
    else:
        # Data augmentation
        data_augmentation = Sequential([
            experimental.preprocessing.RandomFlip("horizontal", input_shape=(224, 224, 3)),
            experimental.preprocessing.RandomRotation(0.1),
            experimental.preprocessing.RandomZoom(0.1),
            # experimental.preprocessing.Rescaling(1./127.5, offset=-1.0)
        ])
        
        # Récupération modèle pré-entraîné
        model_base = VGG16(include_top=False, weights="imagenet", input_shape=(224, 224, 3))
        for layer in model_base.layers:
            layer.trainable = False

        # Définition du nouveau modèle
        model = Sequential([
                    data_augmentation,
                    # To rescale an input in the [0, 255] range to be in the [0, 1] range, you would pass scale=1./255.
                    # To rescale an input in the [0, 255] range to be in the [-1, 1] range, you would pass scale=1./127.5, offset=-1.
                    experimental.preprocessing.Rescaling(1./127.5, offset=-1),
                    model_base,
                    GlobalAveragePooling2D(),
                    Dense(256, activation='relu'),
                    Dropout(0.5),
                    Dense(NBCLASS_, activation='softmax')
                    ])

    

    # compilation du modèle 
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])

    print(model.summary())

    return model


In [None]:
# Création du modèle
with tf.device('/gpu:0'): 
    model4 = create_model_fct2()

# Création du callback
model4_save_path = "./model4_best_weights.h5"
checkpoint = ModelCheckpoint(model4_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
%%time
with tf.device('/gpu:0'): 
    history4 = model4.fit(dataset_train,
                    validation_data=dataset_val,
                    batch_size=batch_size, epochs=50, callbacks=callbacks_list, verbose=1)


In [None]:
# Score du dernier epoch

loss, accuracy = model4.evaluate(dataset_train, verbose=True)
print("Training Accuracy   : {:.4f}".format(accuracy))
print()
loss, accuracy = model4.evaluate(dataset_val, verbose=True)
print("Validation Accuracy :  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model4.load_weights(model4_save_path)

loss, accuracy = model4.evaluate(dataset_val, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model4.evaluate(dataset_test, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))


In [None]:
try:
    from plot_keras_history import show_history, plot_history
    import matplotlib.pyplot as plt

    show_history(history4)
    plot_history(history4, path="standard.png")
    plt.close()
except:
    epochs_range = range(len(history4.history['loss']))

    plt.figure(figsize=(15, 10))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, history4.history['accuracy'], label='Training Accuracy')
    plt.plot(epochs_range, history4.history['val_accuracy'], label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, history4.history['loss'], label='Training Loss')
    plt.plot(epochs_range, history4.history['val_loss'], label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()

In [None]:
y_test_pred = np.argmax(model4.predict(X_test), axis=1)

y_test_pred_transform = conf_mat_transform(y_test_num, y_test_pred)
conf_mat = metrics.confusion_matrix(y_test_num, y_test_pred_transform)

disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat, display_labels=list_labels)
disp.plot()
plt.rcParams['figure.figsize']=[6,6]
plt.xticks(rotation=90)
plt.show()

In [None]:
print(metrics.classification_report(y_test_num, y_test_pred_transform))