# Augmentation Hyperparameter Search

In [None]:
# Load the lab_black extension (formats notebook cells with black).
%load_ext lab_black

# Load autoreload and set mode 2, which reloads modules before each cell is
# executed, so that edits to the local library are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Workaround: autocompletion does not work in my local Jupyter (cause unknown),
# so jedi is switched off for the IPython completer.
%config Completer.use_jedi = False

In [None]:
import pandas as pd
from local_lib import utils, train
from pathlib import Path
import random
import shutil
import optuna
import imgaug.augmenters as iaa

In [None]:
# CSV file with the image labels (loaded with pandas further down).
labels_pt = "../data/processed/labels_v0.1.0.csv"
# Directory the images are copied from / augmented from.
source_image_dir = Path("../data/processed/labeling/images")
# Scratch training directory; it is deleted and rebuilt at the start of every
# call to `objective`.
train_image_dir = Path("../data/processed/tmp_training")
# Upper bound on the number of images; the gap between this and the number of
# label rows is the augmentation budget.
# TODO: figure out why 8 had to be subtracted
max_images = 10_000 - 8

In [None]:
# Load every label row; whatever is left of the image budget after counting
# them can be filled with augmented images.
labels = pd.read_csv(labels_pt)
remainder = max_images - len(labels)

# Rows not suggested for removal and not originating from the "val" source.
keep_query = "suggestion != 'remove' and org_source != 'val'"
labels_filtered = labels.query(keep_query)

# Of those, only the rows whose `confusing` column is 'none' are augmented.
# The result is copied so that columns can be added to it later on.
augmentable_query = "confusing == 'none' and org_source != 'val'"
images_2_aug = labels_filtered.query(augmentable_query).copy()

In [None]:
# Decide how many augmented copies each image gets. Every symbol receives the
# same quota (`images_per_group`), split as evenly as possible over the images
# of that symbol.
if remainder < 0:
    # Floor division on a negative budget would hand out negative amounts.
    raise ValueError("labels already exceed max_images; nothing left to augment")

images_2_aug["aug_amount"] = 0
# NOTE(review): 10 is presumably the number of symbol groups — confirm.
images_per_group = remainder // 10
symbol_counts = images_2_aug.symbol.value_counts()

# Iterate (symbol, count) pairs straight from the Series. Going through
# `to_frame().reset_index().itertuples()` depends on the column names that
# `value_counts` produces, which changed in pandas 2.0 ("index"/"symbol" became
# "symbol"/"count"), silently swapping what `row.symbol` and `row.index` mean.
for symbol, n_images in symbol_counts.items():
    n_images = int(n_images)
    # Everyone gets `base_amount`; the first `n_extra` entries get one more so
    # that the amounts add up to exactly `images_per_group`.
    base_amount, n_extra = divmod(images_per_group, n_images)
    sample_amount = [base_amount + 1] * n_extra + [base_amount] * (n_images - n_extra)
    # Shuffle so the extra copies do not always land on the first rows.
    random.shuffle(sample_amount)
    images_2_aug.loc[images_2_aug.symbol == symbol, "aug_amount"] = sample_amount

# Sanity check: never plan more augmented images than the budget allows.
assert images_2_aug.aug_amount.sum() <= remainder

In [None]:
def objective(trial):
    """Run a single Optuna trial and return its score.

    The scratch training directory is recreated, the labelled images are
    copied into it, an imgaug pipeline is assembled from the parameters sampled
    for this trial and handed to ``utils.apply_aug``, and finally a model is
    trained on the directory.

    Args:
        trial: Optuna trial used to sample the augmentation probabilities and
            the on/off switch of every augmenter.

    Returns:
        The second value returned by ``train.train_model`` (presumably the
        accuracy — confirm against ``local_lib.train``).
    """

    def is_enabled(flag_name):
        # Every augmenter has its own boolean switch in the search space.
        return trial.suggest_categorical(name=flag_name, choices=[True, False])

    def sometimes(probability, augmenter):
        # Apply `augmenter` only with the given probability.
        return iaa.Sometimes(p=probability, then_list=augmenter)

    # Start from an empty training directory on every trial.
    if train_image_dir.exists():
        shutil.rmtree(train_image_dir)

    missing = utils.copy_group_to_dir(
        df=labels,
        source_dir=source_image_dir,
        dest_dir=train_image_dir,
        group=["org_source", "symbol"],
    )
    assert len(missing) == 0, "Unexpected missing images"

    # Dummy parameter that nothing depends on: used to check whether optuna
    # correctly filters it out.
    trial.suggest_float(name="fake_value", low=0.01, high=0.5, step=0.01)

    # How frequently each family of augmentations is applied.
    common_aug = trial.suggest_float(name="common_aug", low=0.2, high=0.3, step=0.02)
    freq_aug = trial.suggest_float(name="freq_aug", low=0.04, high=0.2, step=0.02)
    rare_aug = trial.suggest_float(name="rare_aug", low=0.01, high=0.05, step=0.01)

    # Common augmentations as (switch name, factory) pairs. Factories keep the
    # augmenters from being built unless their switch is on.
    common_choices = (
        ("aug_scale", lambda: iaa.Affine(scale=(0.5, 1.5))),
        (
            "aug_crop_pad",
            lambda: iaa.CropAndPad(
                percent=(-0.25, 0.25),
                pad_mode="constant",
                pad_cval=255,
            ),
        ),
        (
            "aug_resize",
            lambda: iaa.Affine(
                translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}
            ),
        ),
        ("aug_rotate", lambda: iaa.Rotate((-25, 25))),
        (
            "aug_coarsed_dropout",
            lambda: iaa.CoarseDropout(p=(0.0, 0.05), size_percent=(0.02, 0.25)),
        ),
    )

    aug_seq = []
    for flag_name, make_augmenter in common_choices:
        if is_enabled(flag_name):
            aug_seq.append(sometimes(common_aug, make_augmenter()))

    # These are never combined with each other, because the combination might
    # make the augmentation too complicated and unrealistic for the data we have.
    exclusive_choices = (
        ("aug_shear", freq_aug, lambda: iaa.Affine(shear=(-16, 16))),
        ("aug_avg_blurk", rare_aug, lambda: iaa.AverageBlur((2, 11))),
    )

    oneof = []
    for flag_name, probability, make_augmenter in exclusive_choices:
        if is_enabled(flag_name):
            oneof.append(sometimes(probability, make_augmenter()))

    if oneof:
        aug_seq.append(iaa.OneOf(oneof))

    seq = iaa.Sequential(aug_seq, random_order=True)

    utils.apply_aug(
        images_2_aug, source_image_dir, train_image_dir, seq, ["org_source", "symbol"]
    )

    _loss, accuracy = train.train_model(
        train_image_dir, "../data/raw/label_book", epochs=25, verbose=0
    )

    return accuracy

TODO: add images from the website for documentation


* dropout
    * From observation, dropout is neither that common nor that extreme in the data, so I am selecting a lower dropout percentage.
    * 
* coarsedropout
* saltandpepper
* AverageBlur
    
* meanshiftblur
* rotate
* geometric 
    * affine
    * perspectivetransform
* imgcorruptlike
    * snow
    * spatter
* pad?
* resize?

In [None]:
# Create the study and persist its trials in a SQLite database file named
# after the study. The objective's return value is maximised.
study_name = "aug-08-17-21"
storage_name = f"sqlite:///{study_name}.db"
study = optuna.create_study(
    storage=storage_name,
    study_name=study_name,
    direction="maximize",
)

In [None]:
# Run 100 trials of `objective`, displaying a progress bar while they execute.
study.optimize(objective, n_trials=100, show_progress_bar=True)