# Préparation

## Import librairies et data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
from os import listdir


from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve
from glob import glob

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, GlobalAveragePooling1D, Flatten, Dense, Dropout 
from tensorflow.keras.layers import Rescaling, RandomFlip, RandomRotation, RandomZoom
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical

import time
import cv2
from PIL import Image
#tf.config.experimental_run_functions_eagerly(True)

tf.config.run_functions_eagerly(True)

In [4]:
liste_file = []

try:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        files = os.listdir("/content/drive/MyDrive/Notebooks/P6/Dataset/Images")
        path ="/content/drive/MyDrive/Notebooks/P6/Dataset/Images/"
        data_T =  pd.read_csv("/content/drive/MyDrive/Notebooks/P6/basep6.csv")

except ModuleNotFoundError : 
        files = os.listdir(r'C:\Users\Hugues\gdrive\Notebooks\P6\Dataset\Images')
        path ="C:\\Users\\Hugues\\gdrive\\Notebooks\\P6\\Dataset\\Images\\"
        data_T =  pd.read_csv("C:\\Users\\Hugues\\gdrive\\Notebooks\\P6\\basep6.csv")

for file in files:
    try:
        from google.colab import drive
        imgfile = "/content/drive/MyDrive/Notebooks/P6/Dataset/Images/"+file
        liste_file.append ( file)
        
    except ModuleNotFoundError :
        imgfile  = "C:\\Users\\Hugues\\gdrive\\Notebooks\\P6\\Dataset\\Images\\"+file
        liste_file.append ( file)




print (len(liste_file))


Mounted at /content/drive
1053


In [5]:
print (path) 

/content/drive/MyDrive/Notebooks/P6/Dataset/Images/


In [6]:
data_T.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 41 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Unnamed: 0.1                       1050 non-null   int64 
 1   Unnamed: 0                         1050 non-null   int64 
 2   uniq_id                            1050 non-null   object
 3   crawl_timestamp                    1050 non-null   object
 4   product_url                        1050 non-null   object
 5   product_name                       1050 non-null   object
 6   product_category_tree              1050 non-null   object
 7   pid                                1050 non-null   object
 8   retail_price                       1050 non-null   object
 9   discounted_price                   1050 non-null   object
 10  image                              1050 non-null   object
 11  is_FK_Advantage_product            1050 non-null   bool  
 12  descri

## Création des labels des photos à partir des category

In [7]:
#data=data_T.sample(100)
data=data_T

In [8]:
data.groupby("category").count()

Unnamed: 0_level_0,Unnamed: 0.1,Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,...,product_name_length_bow,product_name_length_dl,product_category_tree_length_bow,product_category_tree_length_dl,description_length_bow,description_length_dl,product_specifications_length_bow,product_specifications_length_dl,toute_info,toute_info_dl
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BabyCare,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150
BeautyandPersonalCare,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150
Computers,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150
HomeDecor&FestiveNeeds,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150
HomeFurnishing,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150
Kitchen&Dining,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150
Watches,150,150,150,150,150,150,150,150,150,150,...,150,150,150,150,150,150,150,150,150,150


In [9]:
list_labels= data['category'].unique().tolist()

In [10]:
data["label"] = [list_labels.index(l) for l in data["category"]]

In [11]:
data[["label","category"]].sample(n = 20).sort_values('label')

Unnamed: 0,label,category
444,0,HomeFurnishing
999,0,HomeFurnishing
477,0,HomeFurnishing
971,1,BabyCare
621,1,BabyCare
79,1,BabyCare
556,2,Watches
578,2,Watches
916,3,HomeDecor&FestiveNeeds
195,3,HomeDecor&FestiveNeeds


# préparation initiale des images

In [12]:
print (data.shape)

(1050, 42)


In [13]:
data_backup = data

In [14]:
data,data_test = train_test_split( data_backup, random_state=22, test_size=0.25)

In [15]:
data = data.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

In [16]:
# redimensionnement de l'image en une taille de (224, 224)
#Conversion de l'image en un tableau numpy 
#Normalisation des valeurs de pixels de l'image à l'aide de la fonction preprocess_input() 

print (data.shape)

In [17]:
def image_prep_fct(data) :
    prepared_images = []
    for image_num in range(len(data['image'])) :
        img = (load_img(path+data['image'][image_num],
            target_size=(224, 224)))
        img = img_to_array(img)
        img = img.reshape((img.shape[0], img.shape[1], img.shape[2]))
        img = preprocess_input(img)
        prepared_images.append(img)
        prepared_images_np = np.array(prepared_images)
    return prepared_images_np

images_np = image_prep_fct(data)
print(images_np.shape)
images_np_test = image_prep_fct(data_test)
print(images_np_test.shape)





(787, 224, 224, 3)
(263, 224, 224, 3)


In [18]:
images_np = image_prep_fct(data)
print(images_np.shape)
images_np_test = image_prep_fct(data_test)
print(images_np_test.shape)

(787, 224, 224, 3)
(263, 224, 224, 3)


In [19]:
X = images_np
y = to_categorical(data['label'])

X_test = images_np_test
y_test = to_categorical(data_test['label'])



In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
X_train.shape

(590, 224, 224, 3)

In [21]:
X_val.shape


(197, 224, 224, 3)

In [22]:
def create_model_fct() :
    # Récupération modèle pré-entraîné
    model0 = VGG16(include_top=False, weights="imagenet", input_shape=(224, 224, 3))

    # Layer non entraînables = on garde les poids du modèle pré-entraîné
    for layer in model0.layers:
        layer.trainable = False

    # Récupérer la sortie de ce réseau
    x = model0.output
    # Compléter le modèle
    x = GlobalAveragePooling2D()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(7, activation='softmax')(x)

    # Définir le nouveau modèle
    model = Model(inputs=model0.input, outputs=predictions)
    # compilation du modèle 
    model.compile(loss="categorical_crossentropy", optimizer='rmsprop', metrics=["accuracy"])

    print(model.summary())
    
    return model

# Modele VGG16 

In [23]:
# Création du modèle
with tf.device('/gpu:0'): 
    model1 = create_model_fct()

# Création du callback
model1_save_path1 = path+"./model1_best_weights.h5"
checkpoint = ModelCheckpoint(model1_save_path1, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)    

In [None]:
# Entraîner sur les données d'entraînement (X_train, y_train)
with tf.device('/gpu:0'): 
    history1 = model1.fit(X_train, y_train, epochs=50, batch_size=64, 
                       callbacks=callbacks_list, validation_data=(X_val, y_val), verbose=1)



Epoch 1/50
Epoch 1: val_loss improved from inf to 1.87801, saving model to /content/drive/MyDrive/Notebooks/P6/Dataset/Images/./model1_best_weights.h5
Epoch 2/50
Epoch 2: val_loss improved from 1.87801 to 1.21557, saving model to /content/drive/MyDrive/Notebooks/P6/Dataset/Images/./model1_best_weights.h5
Epoch 3/50
Epoch 3: val_loss improved from 1.21557 to 1.21007, saving model to /content/drive/MyDrive/Notebooks/P6/Dataset/Images/./model1_best_weights.h5
Epoch 4/50
Epoch 4: val_loss improved from 1.21007 to 0.85611, saving model to /content/drive/MyDrive/Notebooks/P6/Dataset/Images/./model1_best_weights.h5
Epoch 5/50
Epoch 5: val_loss did not improve from 0.85611
Epoch 6/50
Epoch 6: val_loss improved from 0.85611 to 0.80510, saving model to /content/drive/MyDrive/Notebooks/P6/Dataset/Images/./model1_best_weights.h5
Epoch 7/50
Epoch 7: val_loss did not improve from 0.80510
Epoch 8/50
Epoch 8: val_loss did not improve from 0.80510
Epoch 9/50
Epoch 9: val_loss did not improve from 0.805

In [None]:
# Score du dernier epoch

loss, accuracy = model1.evaluate(X_train, y_train, verbose=True)
print("Training Accuracy: {:.4f}".format(accuracy))
print()
loss, accuracy = model1.evaluate(X_val, y_val, verbose=True)
print("Validation Accuracy:  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model1.load_weights(model1_save_path1)

loss, accuracy = model1.evaluate(X_val, y_val, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model1.evaluate(X_test, y_test, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))

In [None]:
#affichage de l'historique Keras

In [None]:
!pip install plot_keras_history
from plot_keras_history import show_history, plot_history
import matplotlib.pyplot as plt

show_history(history1)
plot_history(history1, path="standard.png")
plt.close()

In [None]:
y_val_num = np.argmax(y_val, axis=1)
y_val_pred = np.argmax(model1.predict(X_val), axis=1)
y_val_num = np.argmax(y_val, axis=1)
print(y_val_num)
print()
print(y_val_pred)

## Matrice de confusion VGG16 

In [None]:
conf_mat = metrics.confusion_matrix(y_val_num, y_val_pred)
print(conf_mat)

In [None]:
def conf_mat_transform(y_true,y_pred) :
    conf_mat = metrics.confusion_matrix(y_true,y_pred)
    
    corresp = np.argmax(conf_mat, axis=0)
    print ("Correspondance des clusters : ", corresp)
    # y_pred_transform = np.apply_along_axis(correspond_fct, 1, y_pred)
    labels = pd.Series(y_true, name="y_true").to_frame()
    labels['y_pred'] = y_pred
    labels['y_pred_transform'] = labels['y_pred'].apply(lambda x : corresp[x]) 
    
    return labels['y_pred_transform']

In [None]:
y_val_pred_transform = conf_mat_transform(y_val_num, y_val_pred)
conf_mat = metrics.confusion_matrix(y_val_num, y_val_pred_transform)
print(conf_mat)
print()
print(metrics.classification_report(y_val_num, y_val_pred_transform))

In [None]:
df_cm = pd.DataFrame(conf_mat, index = [label for label in list_labels],
                  columns = [i for i in "0123456"])
plt.figure(figsize = (6,4))
sns.heatmap(df_cm, annot=True, cmap="Blues")

In [None]:
#todo vérifier meileur attribution des classes / Labelr 

# Approche ImageDatagenerator avec data augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
#data=data_T.sample(100)

In [None]:
# Pour mélanger les images, classées initalement par classe
#data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data = data_backup.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
data['image']

In [None]:
data['image'] = data['image'].apply(lambda x: path+x)

In [None]:
data_test['image'] = data_test['image'].apply(lambda x: path+x)

In [None]:
batch_size = 32

def data_flow_fct(data, datagen, data_type=None) :
    data_flow = datagen.flow_from_dataframe(data, directory='',
                                x_col='image', y_col='category',
                                weight_col=None, target_size=(224, 224),
                                classes=None, class_mode='categorical',
                                batch_size=batch_size, shuffle=True, seed=42,
                                subset=data_type
                                )
    return data_flow

In [None]:
datagen_train = ImageDataGenerator(
#    featurewise_center=True,
#    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.25,
    preprocessing_function=preprocess_input)

train_flow = data_flow_fct(data, datagen_train, data_type='training')
val_flow = data_flow_fct(data, datagen_train, data_type='validation')

datagen_test = ImageDataGenerator(
    validation_split=0,
    preprocessing_function=preprocess_input)

test_flow = data_flow_fct(data_test, datagen_test, data_type=None)

# compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied)
# datagen.fit(X_train)
# fits the model on batches with real-time data augmentation:

In [None]:
# Création du modèle
with tf.device('/gpu:0'): 
    model2 = create_model_fct()

# Création du callback
model2_save_path = path+"./model2_best_weights.h5"

checkpoint = ModelCheckpoint(model2_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]


In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
with tf.device('/gpu:0'): 
    history2 = model2.fit(train_flow,
                    validation_data=val_flow,
                    batch_size=batch_size, epochs=50, callbacks=callbacks_list, verbose=1) 



In [None]:
# Score du dernier epoch

loss, accuracy = model2.evaluate(train_flow, verbose=True)
print("Training Accuracy   : {:.4f}".format(accuracy))
print()
loss, accuracy = model2.evaluate(val_flow, verbose=True)
print("Validation Accuracy :  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model2.load_weights(model2_save_path)

loss, accuracy = model2.evaluate(val_flow, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model2.evaluate(test_flow, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))


In [None]:
from plot_keras_history import show_history, plot_history
import matplotlib.pyplot as plt

show_history(history2)
plot_history(history2, path="standard.png")
plt.close()

In [None]:
test_flow

In [None]:
dataset_train = train_flow  #dataset_fct(path2, validation_split=0.25, data_type='training')
dataset_val = val_flow # dataset_fct(path2, validation_split=0.25, data_type='validation')
dataset_test = test_flow # dataset_fct(path2_test, validation_split=0, data_type=None)""" 

# Approche avec data augmentation intégrée au modèle

In [None]:




dataset_train = train_flow  #dataset_fct(path2, validation_split=0.25, data_type='training')
dataset_val = val_flow # dataset_fct(path2, validation_split=0.25, data_type='validation')
dataset_test = test_flow # dataset_fct(path2_test, validation_split=0, data_type=None)""" 

In [None]:
def resize_and_rescale(image, label):
    image = tf.cast(image, tf.float32)
    image = tf.image.resize(image, [224, 224])
    image = (image / 255.0)
    return image, label

In [None]:
def create_model_fct2() :
    # Data augmentation
    data_augmentation = Sequential([
        RandomFlip("horizontal", input_shape=(224, 224, 3)),
        RandomRotation(0.2),
        RandomZoom(0.2),
        # Rescaling(1./127.5, offset=-1.0)
      ])
    
    # Récupération modèle pré-entraîné
    model_base = VGG16(include_top=False, weights="imagenet", input_shape=(224, 224, 3))
    for layer in model_base.layers:
        layer.trainable = False

    # Définition du nouveau modèle
    model = Sequential([
                data_augmentation,
                Rescaling(1./127.5, offset=-1),
                model_base,
                GlobalAveragePooling2D(),
                Dense(224, activation='relu'), #changé de 256 à 224 
                Dropout(0.5),
                Dense(7, activation='softmax')
                ])

    # compilation du modèle 
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])

    print(model.summary())
    
    return model
    

In [None]:
# Création du modèle
with tf.device('/gpu:0'): 
    model4 = create_model_fct2()

# Création du callback
model4_save_path = path+"./model4_best_weights.h5"
checkpoint = ModelCheckpoint(model4_save_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
callbacks_list = [checkpoint, es]

In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
with tf.device('/gpu:0'): 
    history4 = model4.fit(dataset_train,
                    validation_data=dataset_val,
                    batch_size=batch_size, epochs=50, callbacks=callbacks_list, verbose=1)

In [None]:
# Score du dernier epoch

loss, accuracy = model4.evaluate(dataset_train, verbose=True)
print("Training Accuracy   : {:.4f}".format(accuracy))
print()
loss, accuracy = model4.evaluate(dataset_val, verbose=True)
print("Validation Accuracy :  {:.4f}".format(accuracy))

In [None]:
# Score de l'epoch optimal

model4.load_weights(model4_save_path)

loss, accuracy = model4.evaluate(dataset_val, verbose=False)
print("Validation Accuracy :  {:.4f}".format(accuracy))

loss, accuracy = model4.evaluate(dataset_test, verbose=False)
print("Test Accuracy       :  {:.4f}".format(accuracy))

In [None]:
from plot_keras_history import show_history, plot_history
import matplotlib.pyplot as plt

show_history(history4)
plot_history(history4, path="standard.png")
plt.close()