<a href="https://colab.research.google.com/github/EcovisionSN/EcoVision_PAS_2023/blob/dev/preprocessing/Transform_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
from random import shuffle
import tensorflow as tf
from PIL import Image
import skimage.io as io
import numpy as np
import random
import PIL
import sys
import os
import numpy as np

#### Montage de Google Drive

In [2]:
# Mount Google Drive at /content/drive; every dataset path used below lives under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that is expected to contain the extracted dataset.
dir_name = "/content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/dataset_16_10_2023"
# Extract the zipped backup only when the dataset folder does not exist yet.
if not os.path.exists(dir_name):
  print(f"[INFO] Directory with dataset: {dir_name} not found. Unpacking backup...")
  # IPython shell escape: {dir_name} is interpolated from the Python variable above.
  # NOTE(review): the archive is read from MyDrive/Notebooks/Dataset, a different
  # Drive folder than dir_name - confirm that this is where the backup is stored.
  !unzip /content/drive/MyDrive/Notebooks/Dataset/dataset_16_10_2023.zip -d {dir_name}
else:
  print(f"[INFO] Directory with dataset: {dir_name} was found.")

[INFO] Directory with dataset: /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/dataset_16_10_2023 was found.


#### Définition des fonctions d'aide

In [5]:
from os import listdir
from os.path import isfile, join

def preprocess_one_hot_encode(image_rgb, class_values=(11, 226, 51)):
    """One-hot encode a colour-coded segmentation mask.

    Only the first channel of ``image_rgb`` is inspected. Every pixel whose
    first-channel value equals ``class_values[k]`` is assigned class ``k``;
    all other pixel values are left as they are and are then interpreted
    directly as class indices.

    Args:
        image_rgb: Array-like mask with the channels on the last axis.
        class_values: First-channel value that identifies each class, listed
            in class order. Defaults to the palette used in this notebook.

    Returns:
        A ``float32`` NumPy array of shape
        ``image_rgb.shape[:-1] + (len(class_values),)``.

    Raises:
        IndexError: If a pixel value (after remapping) is not a valid class
            index.
    """
    channel = np.asarray(image_rgb)[..., 0]
    labels = np.copy(channel)
    # Compare against the untouched channel so that a pixel remapped to an
    # index is never remapped a second time when an index equals a palette value.
    for class_index, value in enumerate(class_values):
        labels[channel == value] = class_index
    # Row k of the identity matrix is the one-hot vector of class k; the
    # explicit dtype keeps the result float32 regardless of library defaults.
    identity = np.eye(len(class_values), dtype=np.float32)
    return identity[labels.astype(np.intp)]

def list_dataset(path):
  """
  Collect (image_path, mask_path) pairs for the train and dev splits.

  Folders read below `path`:
    /train/image/img, /train/mask175/img, /dev/image/img, /dev/mask/img

  Only the `.png` entries of the image folders are paired. The mask path is
  built from the same file name and is not checked for existence. The number
  of entries in the train image and train mask folders is printed.

  Returns [pairs_train, pairs_dev].
  """
  train_image_dir = path + "/train/image/img"
  train_mask_dir = path + "/train/mask175/img"
  dev_image_dir = path + "/dev/image/img"
  dev_mask_dir = path + "/dev/mask/img"

  train_entries = listdir(train_image_dir)
  dev_entries = listdir(dev_image_dir)
  print(f"[INFO] Found train images: {len(train_entries)}")
  print(f"[INFO] Found train masks: {len(listdir(train_mask_dir))}")

  def pair_up(entries, image_dir, mask_dir):
    # Keep PNG files only and point each one at the same-named mask file.
    return [
        (f"{image_dir}/{name}", f"{mask_dir}/{name}")
        for name in entries
        if name.endswith(".png")
    ]

  return [
      pair_up(train_entries, train_image_dir, train_mask_dir),
      pair_up(dev_entries, dev_image_dir, dev_mask_dir),
  ]

def list_dataset_new(path):
  """
  Build (image_path, mask_path) pairs for a flat dataset folder.

  path - (/content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/data/)[images/masks]

  Every `.png` file in `<path>/images` is paired with the file of the same
  name in `<path>/masks`. Images that have no same-named mask are skipped
  with a warning instead of producing a pair that fails later when the mask
  is opened. Pairs are returned sorted by file name.

  The printed counts cover every entry of each folder, not only `.png` files.

  Returns a list of (image_path, mask_path) tuples.
  """
  prefix_mask = "/masks"
  prefix_image = "/images"

  image_entries = listdir(path + prefix_image)
  print(f"[INFO] Found train images: {len(image_entries)}")
  mask_entries = set(listdir(path + prefix_mask))
  print(f"[INFO] Found train masks: {len(mask_entries)}")

  # listdir() returns entries in arbitrary order; sort for a stable result.
  images = sorted(f for f in image_entries if f.endswith(".png"))

  missing = [f for f in images if f not in mask_entries]
  if missing:
    print(f"[WARNING] Skipping {len(missing)} image(s) without a matching mask, e.g. {missing[0]}")

  pairs_train = [
      (f"{path}{prefix_image}/{f}", f"{path}{prefix_mask}/{f}")
      for f in images
      if f in mask_entries
  ]

  return pairs_train

# Quick check of the helper on the real dataset folder; the counts it prints appear in the cell output.
# NOTE(review): `l` is not referenced again in the cells visible here.
l = list_dataset_new("/content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/dataset_16_10_2023/data")

[INFO] Found train images: 1500
[INFO] Found train masks: 1500


In [6]:
def printDivisors(n):
    """Return every positive divisor of ``n`` in ascending order.

    Despite the name, nothing is printed: the divisors are returned as a
    list. Values of ``n`` below 1 yield an empty list.
    """
    divisors = []
    candidate = 1
    while candidate <= n:
        remainder = n % candidate
        if remainder == 0:
            divisors.append(candidate)
        candidate += 1
    return divisors

# Helper functions for defining tf types
def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature holding a BytesList."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList cannot unpack a string held by an EagerTensor, so extract it first.
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _int64_feature(value):
  """Wrap a single integer value in a tf.train.Feature holding an Int64List."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

def _float_feature(value):
  """Wrap a single float / double value in a tf.train.Feature holding a FloatList."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def write_image_annotation_pairs_to_tfrecord(filename_pairs, tfrecords_filename):
    """Serialize (image, mask) file pairs into one TFRecord file.

    Every record carries two bytes features:
      * 'image' - the raw, still-encoded bytes of the image file;
      * 'mask'  - the flattened float32 one-hot mask (3 classes) as raw bytes.

    Mask classes come from the first channel of the mask image: the values
    11, 226 and 51 become classes 0, 1 and 2, and every remaining value is
    clipped into the range [0, 2].

    Args:
        filename_pairs: Iterable of (image_path, mask_path) tuples.
        tfrecords_filename: Path of the TFRecord file to create.
    """
    # The context managers close the writer and the input files even when a
    # pair fails to load, so no handle is leaked and the output is flushed.
    with tf.io.TFRecordWriter(tfrecords_filename) as writer:
        for img_path, mask_path in filename_pairs:
            with open(img_path, 'rb') as img_file:
                img = img_file.read()

            with Image.open(mask_path) as mask_image:
                mask_one_channel = np.copy(np.asarray(mask_image)[..., 0])
            for i, num in enumerate([11, 226, 51]):
                mask_one_channel[mask_one_channel == num] = i

            # Force any value outside the palette into the valid class range.
            mask_one_channel = np.clip(mask_one_channel, 0, 2)

            one_hot = tf.keras.utils.to_categorical(mask_one_channel, 3)
            # Pin float32 so the serialized byte layout does not depend on the
            # default dtype of the installed Keras version.
            mask_one_hot = np.asarray(one_hot, dtype=np.float32).ravel().tobytes()

            example = tf.train.Example(features=tf.train.Features(feature={
                  'image': _bytes_feature(img),
                  'mask': _bytes_feature(mask_one_hot),
                  }))

            writer.write(example.SerializeToString())

#### Créer les fichiers TFRecord à partir des paires image/masque et les enregistrer

In [8]:
# Legacy layout with train/dev sub-folders, kept for reference:
# dataset_dir = "data_balanced"
# filename_pairs = list_dataset(f"/content/gdrive/MyDrive/Dataset/dataset_raw/{dataset_dir}")

# list_dataset_new returns one flat list of (image_path, mask_path) tuples.
# Sorting first makes the starting order independent of the directory listing order.
filename_pairs = sorted(list_dataset_new("/content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/dataset_16_10_2023/data"))
# A dedicated, seeded generator gives the same shard assignment on every run
# without touching the global random state.
random.Random(42).shuffle(filename_pairs)
# Shard sizes that divide the dataset evenly.
possible_amounts = printDivisors(len(filename_pairs))
print('[INFO] Good dataset sizes: ')
for i in possible_amounts:
  print(f"- {i}")


[INFO] Found train images: 1500
[INFO] Found train masks: 1500
[INFO] Good dataset sizes: 
- 1
- 2
- 3
- 4
- 5
- 6
- 10
- 12
- 15
- 20
- 25
- 30
- 50
- 60
- 75
- 100
- 125
- 150
- 250
- 300
- 375
- 500
- 750
- 1500


In [9]:
tfrecords_filename_template = "/content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_{}.tfrec"

# Delete shards left over from a previous run (part indices 0..1500 are probed)
# so that stale parts never sit next to the freshly written ones.
for i in range(1501):
  filename = tfrecords_filename_template.format(i)
  if os.path.exists(filename):
    os.remove(filename)
    print(f"Removed {filename}")

print("[INFO] Writing started!")

photos_in_tfrecord = 100
counter = 0

# Every shard holds `photos_in_tfrecord` pairs, except the last one, which also
# absorbs the remainder. A dataset smaller than one shard still yields a single
# file, and an empty dataset yields none.
total_pairs = len(filename_pairs)
shard_count = max(1, total_pairs // photos_in_tfrecord) if total_pairs else 0

for i in range(shard_count):
  filename = tfrecords_filename_template.format(i + 1)  # part numbers start at 1
  start = i * photos_in_tfrecord
  is_last_shard = i == shard_count - 1
  end = total_pairs if is_last_shard else start + photos_in_tfrecord
  files = filename_pairs[start:end]

  write_image_annotation_pairs_to_tfrecord(files, filename)
  # Report the real shard size; the last shard may differ from photos_in_tfrecord.
  print(f'[INFO] File : {filename} written, with {len(files)} pairs of the photos.')
  counter += 1

print(f"[INFO] Files created : {counter}.")

[INFO] Writing started!
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_1.tfrec written, with 100 pairs of the photos.
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_2.tfrec written, with 100 pairs of the photos.
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_3.tfrec written, with 100 pairs of the photos.
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_4.tfrec written, with 100 pairs of the photos.
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_5.tfrec written, with 100 pairs of the photos.
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_6.tfrec written, with 100 pairs of the photos.
[INFO] File : /content/drive/MyDrive/ECOVISION_PAS_CHALLENGE/Dataset/bucket/tfrecords_v2_part_7.tfrec written, with 100 pairs of t