<a href="https://colab.research.google.com/github/linbeta/ingredients_recognition_model/blob/main/02_data_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 載入dataset: 方法二
# https://drive.google.com/file/d/1AZTnvvY8bl5mKSeTdBLE61sDXRI1yvDY/view?usp=sharing
!gdown --id '1AZTnvvY8bl5mKSeTdBLE61sDXRI1yvDY' -O dataset.zip
!unzip dataset.zip

In [None]:
import tensorflow as tf
import cv2
import numpy as np
from glob import glob
from sklearn.model_selection import train_test_split

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that reads images from disk one batch at a time.

    The label of each image is the text before the first ``_`` in the name of
    the folder that contains it, parsed as a number
    (e.g. ``./DataSet/27_xxx/a.jpg`` -> 27).

    Args:
        img_paths: Sequence of image file paths.
        batch_size: Number of samples per batch; must be positive.
        img_size: Side length in pixels of the square images that are returned.
        shuffle: If True, the sample order is shuffled at construction time
            and again after every epoch.
        aug: Augmentation switch. No augmentation is implemented yet, so
            images are returned unchanged either way.
        **kwargs: Forwarded to the ``Sequence`` base-class constructor.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, img_paths, batch_size, img_size, shuffle=True, aug=False, **kwargs):
        # The base class must be initialised; newer Keras versions warn (and
        # lose their worker settings) when a subclass skips this call.
        super().__init__(**kwargs)

        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        self.img_paths = img_paths
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.aug = aug

        # Positions into img_paths; shuffled in place by on_epoch_end().
        self.indexes = np.arange(len(self.img_paths))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.img_paths) / self.batch_size))

    def __getitem__(self, batch_index):
        """Load and return batch number ``batch_index``.

        Returns:
            Tuple ``(x, y)`` where ``x`` is a float32 array of shape
            ``(n, img_size, img_size, 3)`` with pixel values divided by 255 and
            ``y`` is a float32 array of shape ``(n, 1)`` holding the class
            numbers; ``n`` is the number of samples in this batch.

        Raises:
            OSError: If an image file cannot be read.
            ValueError: If a class number cannot be parsed from an image path.
        """
        start = batch_index * self.batch_size
        idxs = self.indexes[start: start + self.batch_size]
        batch_img_paths = [self.img_paths[i] for i in idxs]

        x = np.empty((len(batch_img_paths), self.img_size, self.img_size, 3), dtype=np.float32)
        y = np.empty((len(batch_img_paths), 1), dtype=np.float32)

        for i, img_path in enumerate(batch_img_paths):
            img = self._load_image(img_path)

            if self.aug:
                img = self._augment(img)

            x[i] = img
            y[i] = self._label_from_path(img_path)

        return x, y

    def on_epoch_end(self):
        """Reshuffle the sample order after each epoch when ``shuffle`` is set."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _load_image(self, img_path):
        """Read one image, resize it to ``img_size`` if needed, scale by 1/255."""
        # NOTE: cv2.imread yields channels in BGR order; that order is kept
        # unchanged here so the generator's output matches what it produced before.
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising, so fail here with the offending path.
            raise OSError(f"cv2.imread could not read image: {img_path}")

        target = (self.img_size, self.img_size)
        if img.shape[:2] != target:
            # Images of any other size could not be stored in the batch array.
            img = cv2.resize(img, target)

        return img / 255.

    def _augment(self, img):
        """Augmentation hook; currently returns the image unchanged."""
        return img

    @staticmethod
    def _label_from_path(img_path):
        """Parse the class number from the name of the image's parent folder."""
        # Accept both '/' and '\\' so paths produced by glob on Windows work too.
        parts = img_path.replace('\\', '/').split('/')
        if len(parts) < 2:
            raise ValueError(f"Image path has no parent folder to read a class from: {img_path}")

        prefix = parts[-2].split('_')[0]
        try:
            return float(prefix)
        except ValueError:
            raise ValueError(
                f"Folder name of {img_path} does not start with a class number: {parts[-2]!r}"
            ) from None

In [None]:
# Collect every .jpg photo inside every sub-folder of the DataSet folder.
# glob returns paths in filesystem-dependent order, so sort them to get the
# same listing on every run and every machine.
img_paths = sorted(glob('./DataSet/*/*.jpg'))

In [None]:
# Number of image paths found by the glob above.
len(img_paths)

5000

In [None]:
# Split the paths into training and validation sets (20% held out for validation).
# A fixed random_state makes the same input list always produce the same split.
train_img_paths, val_img_paths = train_test_split(img_paths, test_size=0.2, random_state=42)
len(train_img_paths), len(val_img_paths)

(4000, 1000)

In [None]:
# Training batches are reshuffled every epoch; validation batches keep a fixed order.
train_gen = DataGenerator(
    train_img_paths,
    batch_size=32,
    img_size=224,
    shuffle=True,
    aug=False,
)
val_gen = DataGenerator(
    val_img_paths,
    batch_size=32,
    img_size=224,
    shuffle=False,
    aug=False,
)

In [None]:
# Number of batches per epoch for the training and validation data at the chosen batch_size.
len(train_gen), len(val_gen)

(125, 32)

In [None]:
# Fetch the first training batch and print its size and label values as a sanity check.
imgs, labels = train_gen[0]
print(len(labels), labels)

32 [[ 0.]
 [ 0.]
 [ 2.]
 [55.]
 [34.]
 [28.]
 [25.]
 [ 0.]
 [ 2.]
 [27.]
 [55.]
 [26.]
 [31.]
 [27.]
 [ 5.]
 [25.]
 [55.]
 [ 0.]
 [26.]
 [55.]
 [ 5.]
 [ 5.]
 [ 0.]
 [27.]
 [ 0.]
 [ 5.]
 [27.]
 [25.]
 [ 5.]
 [25.]
 [26.]
 [27.]]


In [None]:
# Batch shapes: images are (batch, img_size, img_size, 3), labels are (batch, 1).
imgs.shape, labels.shape

((32, 224, 224, 3), (32, 1))