In [2]:
import os

# Move the kernel one level above the directory it was started in.
# The start directory is recorded only once, so re-running this cell keeps
# pointing at the same parent instead of climbing one level higher each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [3]:
# Display the current working directory to check that the chdir above took effect.
os.getcwd()

'C:\\Users\\gueganj\\Desktop\\Eyeglasses Detection'

# Packages

In [4]:
import glob
import os
import random
from datetime import datetime
from functools import partial

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import segmentation_models_pytorch as smp
import ray
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler

from dataset import Dataset_SMP, get_preprocessing, split_data, get_training_augmentation
from models.bisenet import BiSeNet
from models.densenet import FCDenseNet
from models.hrnet import HighResolutionNet
from models.config.hrnet import HRNet as cfg_HRNet
def loguniform(a=0, b=1):
    """Draw one sample whose logarithm is uniform between log(a) and log(b).

    Uses NumPy's global random state (``np.random.uniform``), so the result is
    controlled by ``np.random.seed``.

    Args:
        a: First bound of the interval; must be strictly positive. The default
            of 0 is kept for signature compatibility but is not a usable
            value, so callers have to pass ``a`` explicitly.
        b: Second bound of the interval; must be strictly positive.

    Returns:
        ``exp(u)`` with ``u`` drawn uniformly between ``log(a)`` and ``log(b)``.

    Raises:
        ValueError: If ``a`` or ``b`` is not strictly positive (its logarithm
            is undefined).
    """
    if a <= 0 or b <= 0:
        raise ValueError(
            "loguniform bounds must be strictly positive, got a={!r}, b={!r}".format(a, b)
        )
    return np.exp(np.random.uniform(np.log(a), np.log(b)))

In [3]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [4]:
# Seed every random number generator used in this notebook with the same value
# and ask cuDNN for deterministic kernels.
SEED = 123456
for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True

# Data

In [31]:
# Root folder of the dataset. The default is the original author's machine;
# set the EYEGLASSES_DATA_ROOT environment variable to run elsewhere.
# NOTE(review): the default ends with a path separator — presumably split_data
# concatenates sub-folder names onto it, so keep the trailing separator; confirm.
data_root    = os.environ.get("EYEGLASSES_DATA_ROOT", "C:\\Users\\gueganj\\Desktop\\My_Database\\nature\\")
num_epochs   = 150        # epochs per trial, forwarded to train_function
input_size   = [544,960]  # forwarded to Dataset_SMP; presumably [height, width] — TODO confirm
size_dataset = 118        # forwarded to split_data; exact meaning is defined there
landmarks_dir = False     # forwarded as the last argument of Dataset_SMP
threshold = 0.5           # threshold given to the IoU metric

In [32]:
# Build the train / validation / test subsets; each subset is an (images, masks) pair.
# NOTE(review): use_id=True presumably keeps all images of one person in the same
# subset (the printed summary counts persons per subset) — confirm in split_data.
train_set, valid_set, test_set = split_data(
    data_root,
    "images", ".jpg",
    "masks\\frame", ".png",
    size_dataset,
    use_id=True,
)
(train_image, train_mask), (valid_image, valid_mask), (test_image, test_mask) = (
    train_set,
    valid_set,
    test_set,
)

TOTAL : 136  images -  52  personnes
train : 106  images -  41  personnes
valid : 13  images -  5  personnes
test  : 17  images -  6  personnes


In [33]:
# Augmentation pipeline from the project's dataset module. train_function applies
# it to the training dataset only; the validation dataset is built with None instead.
train_augmentation = get_training_augmentation()

# Full Training Function

In [34]:
def train_function(search_space, model="unet", num_epochs=150, input_size=[544,960]):
    """Train one segmentation model for a single Ray Tune trial.

    The function reads the notebook-level globals ``device``, ``train_image``,
    ``train_mask``, ``valid_image``, ``valid_mask``, ``train_augmentation``,
    ``landmarks_dir`` and ``threshold``.

    After every epoch the validation Dice loss and IoU are sent to Tune as
    ``loss`` and ``score``. Whenever the validation IoU improves, the model and
    optimizer state dicts are saved to ``best_model.ckpt`` one level above the
    Tune checkpoint directory, overwriting the previous best.

    Args:
        search_space: Mapping with the sampled hyper-parameters ``batch_size``,
            ``lr``, ``momentum``, ``weight_decay`` and ``nesterov``.
        model: Architecture name. One of ``"unet"``, ``"BiseNet"``,
            ``"DenseNet"``, ``"HRNet"``, ``"PSPNet"``, ``"DeepLabV3Plus"``,
            ``"UnetPlusPlus"``.
        num_epochs: Number of training epochs.
        input_size: Size forwarded to ``Dataset_SMP`` (never mutated here).

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # ============ SEARCH SPACE ============
    # unpack the sampled hyper-parameters
    batch_size   = search_space['batch_size']
    lr           = search_space['lr']
    momentum     = search_space['momentum']
    weight_decay = search_space['weight_decay']
    nesterov     = search_space['nesterov']
    # ============= MODEL =============
    # Builders are lazy so only the requested architecture is instantiated.
    model_builders = {
        "unet":          lambda: smp.Unet(activation='sigmoid', classes=1),
        "BiseNet":       lambda: BiSeNet(activation='sigmoid', n_classes=1),
        "DenseNet":      lambda: FCDenseNet(activation='sigmoid', n_classes=1),
        "HRNet":         lambda: HighResolutionNet(activation='sigmoid', n_classes=1, config=cfg_HRNet),
        "PSPNet":        lambda: smp.PSPNet(activation='sigmoid', classes=1),
        "DeepLabV3Plus": lambda: smp.DeepLabV3Plus(activation='sigmoid', classes=1),
        "UnetPlusPlus":  lambda: smp.UnetPlusPlus(activation='sigmoid', classes=1),
    }
    if model not in model_builders:
        # Previously an unknown name (including the default) fell through and
        # crashed later on `model.to(device)` with an unrelated AttributeError.
        raise ValueError("Unknown model {!r}; expected one of {}".format(model, sorted(model_builders)))
    net = model_builders[model]()

    preprocessing_fn = smp.encoders.get_preprocessing_fn('resnet18', 'imagenet')
    net.to(device)
    # ============= DATALOADER =============
    train_dataset = Dataset_SMP(train_image, train_mask, input_size, train_augmentation, get_preprocessing(preprocessing_fn), landmarks_dir)
    valid_dataset = Dataset_SMP(valid_image, valid_mask, input_size, None, get_preprocessing(preprocessing_fn), landmarks_dir)
    train_loader  = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    # Validation order does not need to be random; a fixed order keeps the
    # per-epoch validation pass repeatable.
    valid_loader  = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    # ============= OPTIMIZER =============
    params_to_update = net.parameters() # change here to fine-tune only certain layers
    optimizer = optim.SGD(params_to_update, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov)
    # ============= LOSS =============
    loss    = smp.utils.losses.DiceLoss()
    metrics = [smp.utils.metrics.IoU(threshold=threshold)]
    # ============= TRAINING =============
    # create epoch runners
    train_epoch = smp.utils.train.TrainEpoch(net, loss=loss, metrics=metrics, optimizer=optimizer, device=device, verbose=False)
    valid_epoch = smp.utils.train.ValidEpoch(net, loss=loss, metrics=metrics, device=device, verbose=False)
    # train model
    best_score = 0
    for epoch in range(num_epochs):
        train_epoch.run(train_loader)
        valid_logs = valid_epoch.run(valid_loader)
        # save a checkpoint with Ray Tune only when the validation IoU improves
        if valid_logs['iou_score'] > best_score:
            # Track the new best; without this update every epoch with a
            # non-zero IoU overwrote the "best" checkpoint.
            best_score = valid_logs['iou_score']
            with tune.checkpoint_dir(step=epoch) as checkpoint_dir:
                ckpt_path = os.path.join(checkpoint_dir,'..',"best_model.ckpt")
                torch.save((net.state_dict(), optimizer.state_dict()), ckpt_path)

        tune.report(loss=valid_logs['dice_loss'], score=valid_logs['iou_score'])
    print("Finished Training !")

# Configure Search space

In [35]:
# Hyper-parameter search space: values are sampled per trial, not taken from a grid.
search_space = dict(
    lr=tune.loguniform(1e-4, 1e-1),
    batch_size=tune.choice([2, 8, 16]),  # 32 and 64 ==> CUDA out of memory
    # momentum = 1 - x, with x drawn log-uniformly between 1e-4 and 1e-1
    momentum=tune.sample_from(lambda spec: 1-loguniform(1e-4, 1e-1)),
    weight_decay=tune.loguniform(1e-5, 1e1),
    nesterov=tune.choice([True,False]),
)

# Scheduler

In [36]:
# Early stopping with ASHA, driven by the "loss" value the trials report
# (lower is better).
# NOTE(review): max_t (170) is larger than num_epochs (150), so no trial ever
# reaches it — confirm whether the two values are meant to match.
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=170,
    grace_period=50,
    reduction_factor=2,
)

# Reporter

In [37]:
# Progress table for the notebook. The metric columns must use the keys the
# trials pass to tune.report, which are "loss" and "score" (the IoU is reported
# as "score", not "iou_score").
reporter = JupyterNotebookReporter(overwrite=True, metric_columns=["loss", "score", "training_iteration"])

# Main

In [40]:
# Start Ray on this machine with the resources the trials are allowed to share.
# ignore_reinit_error=True lets this cell be re-run in a kernel where Ray is
# already initialised instead of raising.
# To attach to an already running cluster instead, see the `address` argument
# of ray.init (e.g. address="auto").
ray.init(num_cpus=8, num_gpus=2, include_dashboard=False, ignore_reinit_error=True)

{'node_ip_address': '10.18.4.222',
 'raylet_ip_address': '10.18.4.222',
 'redis_address': '10.18.4.222:6379',
 'object_store_address': 'tcp://127.0.0.1:59583',
 'raylet_socket_name': 'tcp://127.0.0.1:54499',
 'webui_url': None,
 'session_dir': 'C:\\Users\\gueganj\\AppData\\Local\\Temp\\ray\\session_2020-11-30_11-36-34_603144_4056',
 'metrics_export_port': 54484}

In [41]:
num_samples = 15 # Number of times to sample from the hyperparameter space
# Run one independent hyper-parameter search per architecture.
# NOTE(review): the same `scheduler` object is passed to every search; ASHA keeps
# internal state, so results of earlier models may influence early stopping of
# later ones — consider building a fresh scheduler per model.
for model in ["BiseNet","HRNet","DenseNet","PSPNet","DeepLabV3Plus","UnetPlusPlus"]:
    # Timestamped experiment name, one per model.
    experiment_name = model + '_' + datetime.now().strftime("%d_%m_%Y-%H_%M")
    result = tune.run(partial(train_function, model=model, num_epochs=num_epochs, input_size=input_size),
                      config=search_space, num_samples=num_samples, resources_per_trial={"cpu": 4, "gpu": 1},
                      verbose=1, scheduler=scheduler, progress_reporter=reporter, name=experiment_name)
    best_trial = result.get_best_trial("loss", "min", "last")
    if best_trial is None:
        # Happens when no trial reported a loss, e.g. when every trial errored.
        print("No trial of {} reported a loss; check the trial error files.".format(model))
        continue
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
    # The trials report the validation IoU under the key "score"; there is no
    # "accuracy" key in the results.
    print("Best trial final validation IoU score: {}".format(best_trial.last_result["score"]))

Trial name,status,loc,batch_size,lr,momentum,nesterov,weight_decay
DEFAULT_ebdfe_00000,ERROR,,8,0.0381339,0.978726,False,1.0657e-05
DEFAULT_ebdfe_00001,ERROR,,2,0.000732811,0.997466,True,0.00495455
DEFAULT_ebdfe_00002,PENDING,,16,0.00219919,0.995286,False,0.003692
DEFAULT_ebdfe_00003,PENDING,,16,0.011807,0.999872,True,0.0183513
DEFAULT_ebdfe_00004,PENDING,,8,0.00334003,0.997464,False,6.94012
DEFAULT_ebdfe_00005,PENDING,,2,0.0152094,0.987368,True,0.0123694
DEFAULT_ebdfe_00006,PENDING,,2,0.0010307,0.99979,False,0.153913
DEFAULT_ebdfe_00007,PENDING,,2,0.0042229,0.999865,True,2.80399e-05
DEFAULT_ebdfe_00008,PENDING,,16,0.0141598,0.990327,True,0.0140636
DEFAULT_ebdfe_00009,PENDING,,8,0.00218153,0.999842,True,0.294131

Trial name,# failures,error file
DEFAULT_ebdfe_00000,1,"C:\Users\gueganj\ray_results\DEFAULT\DEFAULT_ebdfe_00000_0_batch_size=8,lr=0.038134,momentum=0.97873,nesterov=False,weight_decay=1.0657e-05_2020-11-30_11-36-36\error.txt"
DEFAULT_ebdfe_00001,1,"C:\Users\gueganj\ray_results\DEFAULT\DEFAULT_ebdfe_00001_1_batch_size=2,lr=0.00073281,momentum=0.99747,nesterov=True,weight_decay=0.0049545_2020-11-30_11-36-42\error.txt"


KeyboardInterrupt: 

[2m[36m(pid=19804)[0m 2020-11-30 11:37:22,810	ERROR function_runner.py:233 -- Runner Thread raised error.
[2m[36m(pid=19804)[0m Traceback (most recent call last):
[2m[36m(pid=19804)[0m   File "C:\Users\gueganj\Miniconda3\envs\pytorch_env\lib\site-packages\ray\tune\function_runner.py", line 227, in run
[2m[36m(pid=19804)[0m     self._entrypoint()
[2m[36m(pid=19804)[0m   File "C:\Users\gueganj\Miniconda3\envs\pytorch_env\lib\site-packages\ray\tune\function_runner.py", line 290, in entrypoint
[2m[36m(pid=19804)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=19804)[0m   File "C:\Users\gueganj\Miniconda3\envs\pytorch_env\lib\site-packages\ray\tune\function_runner.py", line 497, in _trainable_func
[2m[36m(pid=19804)[0m     output = train_func(config)
[2m[36m(pid=19804)[0m   File "<ipython-input-34-617ec5f77356>", line 38, in train_function
[2m[36m(pid=19804)[0m   File "C:\Users\gueganj\Miniconda3\envs\pytorch_env\lib\site-packages\segmentation_models