### Data Augmention

+ 데이터 증강 기법
+ 데이터 양을 늘리기 위해서 원본 이미지에 각종 변화를 적용해서 데이터 수를 늘리는 기법
+ albumentations

In [41]:
# pip install albumentations --user

import albumentations as A

In [42]:
import os
import math 
import random


import cv2
import tensorflow as tf



from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action="ignore")

import matplotlib.font_manager as fm
font_name = fm.FontProperties(fname="C:/Windows/Fonts/malgun.ttf").get_name()
plt.rc("font", family=font_name)

import matplotlib as mpl
mpl.rcParams["axes.unicode_minus"] = False

In [43]:
class Augmentation:
    def __init__(self, size, mode="train"):
        if mode == "train":
            self.transform = A.Compose([
                    A.HorizontalFlip(p=0.5),
                    A.ShiftScaleRotate(p=0.5, shift_limit=0.05, scale_limit=0.5, rotate_limit=15),
                    A.CoarseDropout(p=0.5, max_holes=8, 
                                    max_width=int(0.1 * size), max_height=int(0.1*size)),
                    A.RandomBrightnessContrast(p=0.2)
                ])
    
    def __call__(self, **kwargs):
        if self.transform:
            augmented = self.transform(**kwargs)
            img = augmented["image"]
            return img
    
# * list 
# ** dict

In [44]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, batch_size, csv_path, image_size, fold, mode, shuffle=True):
        self.batch_size = batch_size
        self.image_size = image_size
        self.fold = fold
        self.mode = mode
        self.shuffle = shuffle
        
        self.df = pd.read_csv(csv_path)
        
        if self.mode == "train":
            self.df = self.df[self.df["fold"] != self.fold]
        elif self.mode == "val":
            self.df = self.df[self.df["fold"] == self.fold]
        
        #### Remove invalid files
        #### https://github.com/tensorflow/models/issues/3134
        invalid_filenames = [
            'Egyptian_Mau_14',
            'Egyptian_Mau_139',
            'Egyptian_Mau_145',
            'Egyptian_Mau_156',
            'Egyptian_Mau_167',
            'Egyptian_Mau_177',
            'Egyptian_Mau_186',
            'Egyptian_Mau_191',
            'Abyssinian_5',
            'Abyssinian_34',
            'chihuahua_121',
            'beagle_116'
        ]
        self.df = self.df[~self.df['file_name'].isin(invalid_filenames)]
        
        self.transform = Augmentation(image_size, mode)
        
        self.on_epoch_end()

    def __len__(self):
        return math.ceil(len(self.df) / self.batch_size)

    def __getitem__(self, idx):
        start = idx * self.batch_size
        fin = (idx+1) * self.batch_size
        data = self.df.iloc[start : fin]
        
        batch_x, batch_y = self.get_data(data)
        
        return np.array(batch_x), np.array(batch_y)
    
    def get_data(self, data):
        batch_x = []
        batch_y = []
        
        for _, j in data.iterrows():
            file_name = j["file_name"]
            
            image = cv2.imread(f"../data/images/{file_name}.jpg")
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)                 
            image = cv2.resize(image, (self.image_size, self.image_size))
            
            if self.mode == "train":
                image = image.astype("uint8")
                image = self.transform(image=image)
            
            image = image.astype("float32")
            image = image / 255.
            
            
            label = int(j["species"]) - 1
            
            batch_x.append(image)
            batch_y.append(label)
        
        return batch_x, batch_y
    
    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

In [45]:
csv_path = "../data/kfolds.csv"

train_generator = DataGenerator(
    batch_size=128, 
    csv_path = csv_path, 
    image_size=256, 
    fold=1, 
    mode="train", 
    shuffle=True
)

valid_generator = DataGenerator(
    batch_size=128, 
    csv_path = csv_path, 
    image_size=256, 
    fold=1, 
    mode="val", 
    shuffle=True
)

In [52]:
class_name = ["Cat", "Dog"]

for batch in train_generator:
    X, y = batch
    plt.figure(figsize=(15, 15))
    
    for i in range(9):
        ax = plt.subplot(3, 3, i+1)
        plt.imshow(X[i])
        plt.title(class_name[y[i]])
        plt.axis("off")
    
    break

## 구멍뚫리게 나옴

error: OpenCV(4.5.5) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'


In [49]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, verbose=1, mode="min", restore_best_weights=False
)

reduce_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.1, patience=10, verbose=1, mode="min", min_lr=0.001
)

filepath = "{epoch:02d}-{val_loss:.2f}.hdf5"
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath, monitor="val_loss", verbose=1, save_best_only=True, save_weights_only=False,
    mode="min"
)

In [50]:
history = model.fit(
                train_generator,
                validation_data = valid_generator,
                epochs=10,
                callbacks = [
                    early_stopping,
                    reduce_on_plateau,
                    model_checkpoint
                ],
                verbose=1
            )

NameError: name 'model' is not defined

In [51]:
import matplotlib.pyplot as plt
history = history.history

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
plt.plot(history['loss'], label='train')
plt.plot(history['val_loss'], label='val')
plt.legend()
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title("Loss")

plt.subplot(1, 2, 2)
plt.plot(history['accuracy'], label='train')
plt.plot(history['val_accuracy'], label='val')
plt.legend()
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title("Accuracy")
plt.show()


NameError: name 'history' is not defined