# Post-traitement du dataset

# Import des librairies et configuration des variables

In [2]:
import numpy as np
import pandas as pd
from PIL import Image
import shutil
import math
import os
import logging
from datetime import datetime
import cv2
import tensorflow as tf
from tensorflow.keras import layers
print("version de TensorFlow importée :", tf.__version__)

2024-09-16 15:02:20.589222: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 15:02:20.592794: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 15:02:20.607443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-16 15:02:20.628836: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-16 15:02:20.634642: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-16 15:02:20.648337: I tensorflow/core/platform/cpu_feature_gu

version de TensorFlow importée : 2.17.0


In [4]:
# Directory holding the raw data
raw_data_path = "../../dataset/"

# Root directory of the processed data
root = '../../data/'

# Database (sub-folder) to work on
db = 'MO/'

# One directory per processing layer, all located under the data root
LAYER0, LAYER1, LAYER2 = (f"{root}LAYER{level}/" for level in range(3))

## Création de l'architecture

## LAYER 0 : Données brutes (sélection et webscraping déjà effectués)

### Copie des données brutes

In [43]:
# Copy the raw dataset into LAYER0. dirs_exist_ok=True lets the cell be re-run
# (Restart & Run All) without raising FileExistsError once the target exists;
# files already present are overwritten by the copy.
shutil.copytree(raw_data_path, LAYER0+db, dirs_exist_ok=True)

'../../data/LAYER0/MO/'

### Générer les données structurées

In [45]:
# Build one record per species folder found in LAYER0: the folder name, the
# image ids (file names without extension), the file names and the file count.
data = []
for id_folder in os.listdir(LAYER0+db):
    path = LAYER0+db+id_folder
    # Skip anything that is not a folder, otherwise os.listdir(path) would raise
    if not os.path.isdir(path):
        continue
    files = os.listdir(path)
    # os.path.splitext splits "name.ext" into ("name", ".ext"); keep the name only
    files_without_extension = [os.path.splitext(file)[0] for file in files]
    data.append({
        "species_id": id_folder,
        "merged_ids": ','.join(files_without_extension),
        "imgs_files": ','.join(files),
        "total_files": len(files)
    })

# Declaring the columns keeps the frame (and the sum below) valid even when no
# species folder was found.
webscraped = pd.DataFrame(
    data,
    columns=["species_id", "merged_ids", "imgs_files", "total_files"],
)
display(webscraped)
print("Nombre total de fichiers :", webscraped['total_files'].sum())

# Write the CSV file
csv_path = LAYER0+'dataset.csv'
webscraped.to_csv(csv_path, index=False)
print(f'Data saved to {csv_path}')

Unnamed: 0,species_id,merged_ids,imgs_files,total_files
0,42,"1073,113898,11410,114795,115947,117503,118285,...","1073.jpg,113898.jpg,11410.jpg,114795.jpg,11594...",246
1,53,"1003017,1106469,1115838,1115839,1125351,113170...","1003017.jpg,1106469.jpg,1115838.jpg,1115839.jp...",192
2,267,"1042116,1051907,1055774,1055775,1055776,105577...","1042116.jpg,1051907.jpg,1055774.jpg,1055775.jp...",203
3,330,"1000839,101125,101126,1020108,1047060,1058461,...","1000839.jpg,101125.jpg,101126.jpg,1020108.jpg,...",190
4,344,"102050,107133,1071722,1075718,1089687,1090898,...","102050.jpg,107133.jpg,1071722.jpg,1075718.jpg,...",194
5,362,"1001208,1002765,1002766,1002767,1002768,100276...","1001208.jpg,1002765.jpg,1002766.jpg,1002767.jp...",201
6,373,"1017358,1029199,1029201,1029205,1040142,104014...","1017358.jpg,1029199.jpg,1029201.jpg,1029205.jp...",249
7,382,"100599,100600,1047052,1062503,1062531,1069054,...","100599.jpg,100600.jpg,1047052.jpg,1062503.jpg,...",236
8,401,"1027663,1036403,1036404,1036405,103870,107722,...","1027663.jpg,1036403.jpg,1036404.jpg,1036405.jp...",164
9,1174,"1038882,1043175,1043176,1061154,1076908,113092...","1038882.jpg,1043175.jpg,1043176.jpg,1061154.jp...",226


Nombre total de fichiers : 3665
Data saved to ../../data/LAYER0/dataset.csv


# LAYER 1 : Transformation des images

In [22]:
# Crop box applied to every image, as (left, upper, right, lower): the
# top-left 224x224 pixels. Defined once because it never changes.
zone_recadrage = (0, 0, 224, 224)

# Create the DB directory in LAYER1 (no error if it already exists)
os.makedirs(LAYER1+db, exist_ok=True)

# Transformation: mirror every species folder of LAYER0 into LAYER1
for specie in os.listdir(LAYER0+db):
    source_dir = LAYER0+db+specie
    # Skip anything that is not a folder, otherwise os.listdir would raise
    if not os.path.isdir(source_dir):
        continue
    target_dir = LAYER1+db+specie
    os.makedirs(target_dir, exist_ok=True)

    for fichier in os.listdir(source_dir):
        target_file = target_dir+'/'+fichier
        # Files already produced by a previous run are left untouched
        if os.path.isfile(target_file):
            continue
        # The context manager closes the source file handle after each image
        # instead of leaving it to the garbage collector.
        with Image.open(source_dir+'/'+fichier) as image:
            image_recadree = image.crop(zone_recadrage)
            image_recadree.save(target_file)

# LAYER 2 : Augmentation des données et séparation des données

## Fonction personnalisée pour l'augmentation des données et la séparation des données

In [5]:
def augmentData(
        source_path,
        dest_path,
        train_size_percent,
        target_train_size,
        max_class_size,
        min_class_size,
        ):
    """Split per-class image folders into train/validation/test and augment train.

    For every class folder of ``source_path`` holding at least
    ``min_class_size`` files, the original images are copied into
    ``dest_path/<split>/<class>/`` and randomly augmented JPEG variants are
    written next to the train originals.

    Args:
        source_path: Directory containing one sub-folder per class.
        dest_path: Output root. It is deleted first if it already exists.
        train_size_percent: Percentage used to size the train split. It is
            applied to ``max_class_size`` when that is non-zero, otherwise to
            the number of files of the class.
        target_train_size: Number of train images (originals + augmented)
            aimed at for each class.
        max_class_size: When 0, every non-train image goes to validation.
            Otherwise, non-train images whose position in the file list is
            below ``max_class_size`` go to validation and the rest go to test.
        min_class_size: Classes with fewer files than this are skipped.

    Returns:
        None. The function only writes files under ``dest_path``.
    """
    # Start from a clean tree so files of a previous run cannot leak into the splits
    if os.path.isdir(dest_path):
        shutil.rmtree(dest_path)

    # List every class once, keep the ones that are large enough and create
    # their three split folders. The file list is sorted because os.listdir
    # returns entries in arbitrary order and the split depends on that order.
    eligible_species = {}
    for specie in os.listdir(source_path):
        images_source = sorted(os.listdir(os.path.join(source_path, str(specie))))
        if len(images_source) < min_class_size:
            continue
        eligible_species[specie] = images_source
        for split_folder in ("train", "validation", "test"):
            os.makedirs(os.path.join(dest_path, split_folder, specie), exist_ok=True)

    # Image augmentation pipeline
    # NOTE(review): RandomBrightness and RandomContrast receive images already
    # rescaled by 1/255 and are given no explicit value range; confirm the
    # factors were tuned with that in mind.
    data_augmentation = tf.keras.Sequential([
        layers.Rescaling(1./255),                       # Rescaling
        layers.RandomFlip("horizontal_and_vertical"),   # Flip
        layers.RandomRotation(0.2),                     # Rotation
        layers.RandomZoom((-0.2, 0.2)),                 # Zoom
        layers.RandomTranslation(0.2, 0.2),             # Translation
        layers.RandomBrightness(factor= [-.001, .001]), # Brightness adjustment
        layers.RandomContrast(factor= .4),              # Contrast adjustment
    ])

    # Data augmentation, one class at a time
    for specie, images_source in eligible_species.items():

        # Number of source images of this class that go to the train split
        if max_class_size == 0:
            train_count = round(len(images_source) * train_size_percent / 100)
        else:
            train_count = round(max_class_size * train_size_percent / 100)

        train_count_total = 0       # train images written so far (originals + augmented)
        source_pictures_count = 0   # source images processed so far

        for pic in images_source:
            # train_count is decremented after every image, so it is positive
            # only while train images remain to be assigned.
            is_train = train_count > 0

            if is_train:
                split_dir = "train"
                train_count_total += 1
                # Spread the images still missing to reach the target evenly
                # over the train images that remain (this one included).
                augment_ratio = math.floor((target_train_size - train_count_total) / train_count)
            elif max_class_size == 0 or source_pictures_count < max_class_size:
                split_dir = "validation"
            else:
                split_dir = "test"

            # Full path of the source image and of its destination folder
            mushroom_pic_path = os.path.join(source_path, str(specie), str(pic))
            split_path = os.path.join(dest_path, split_dir, str(specie))

            # Copy the original image into the dataset
            shutil.copy(mushroom_pic_path, os.path.join(split_path, str(pic)))

            # Only train images are augmented, so only they need decoding
            if is_train and augment_ratio > 0:
                # Decode the JPEG into an image tensor; channels=3 forces RGB
                image = tf.image.decode_jpeg(tf.io.read_file(mushroom_pic_path), channels=3)

                for i in range(augment_ratio):
                    augmented_image = data_augmentation(image)

                    # saturate=True clips out-of-range values before the cast
                    # so that they cannot overflow the uint8 range.
                    augmented_image_conv = tf.image.convert_image_dtype(
                        augmented_image, tf.uint8, saturate=True)
                    augmented_image_enc = tf.image.encode_jpeg(augmented_image_conv)

                    # File name: "<source file name>_<index>.jpg"
                    fname = str(pic)+"_"+str(i)+".jpg"
                    with open(os.path.join(split_path, fname), 'wb') as f:
                        f.write(augmented_image_enc.numpy())

                    train_count_total += 1
                    if train_count_total == target_train_size:
                        break

            train_count -= 1
            source_pictures_count += 1

## Exécution

In [6]:
# Settings of the LAYER1 -> LAYER2 augmentation run, gathered in one place
augment_settings = {
    "source_path": LAYER1+db,
    "dest_path": LAYER2+db,
    "train_size_percent": 70,
    "target_train_size": 1000,
    "max_class_size": 120,  # Number of pictures kept; the others are used for the test set
    "min_class_size": 135,  # Classes without enough images are ignored
}
augmentData(**augment_settings)

## Vérification

In [7]:
def get_total_size(directory):
    """Return the cumulated size, in mebibytes, of the files under ``directory``.

    The tree is walked recursively with ``os.walk``; entries for which
    ``os.path.isfile`` is false are ignored.
    """
    byte_count = sum(
        os.path.getsize(entry)
        for parent, _subdirs, names in os.walk(directory)
        for entry in (os.path.join(parent, name) for name in names)
        if os.path.isfile(entry)
    )
    return byte_count / 1024 ** 2  # bytes -> MiB

# List the entries generated in LAYER2 with their count and cumulated size
split_entries = os.listdir(LAYER2+db)
size_mb = round(get_total_size(LAYER2+db), 2)
print("Dossier trouvés : ", len(split_entries), " (", size_mb, "Mo ) :\n")
for item in split_entries:
    print(item)

Dossier trouvés :  3  ( 330.21 Mo ) :

train
validation
test
