<h2> Goal </h2>
<p> This notebook uses ray tune to find training parameters for the MaskRCNN model. </p>

In [2]:
## Imports
from torchvision.transforms import v2 as T
from torch.utils.data import DataLoader
import numpy as np
from pathlib import Path
import os
import math
import sys
import tempfile
from functools import partial

# Model Transforms
from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F

# Model imports
import torch
import torchvision
from torchvision.models.detection import MaskRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection import FasterRCNN

# Data imports
from facet_ml.classification import mask_rcnn
from pathlib import Path
import os

# Ray tune imports
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle

import matplotlib.pyplot as plt

device = "cuda"

  from .autonotebook import tqdm as notebook_tqdm
2024-11-03 23:40:42,949	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-11-03 23:40:43,155	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
### Model Creation Functions ###
def get_model_instance_segmentation(num_classes,
                                    config:dict):
    
    ## load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    ## Make an updated FastRCNN with backgone changes as needed
    backbone = config.get("backbone","resnet50")
    if backbone == "mobilenet_v2":
        backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
        backbone.out_channels = 1280
        fast_rcnn = FasterRCNN(backbone, in_features=in_features, num_classes=num_classes,)
        model.roi_heads.box_predictor = fast_rcnn
    else:
        # Use defualt resnet50 bacbone
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask,
        hidden_layer,
        num_classes
    )

    return model

def get_model_instance_segmentation(num_classes,config):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask,
        hidden_layer,
        num_classes
    )

    return model

def get_optimizer(model,config):
    '''
    Get an optimzier based on the config settings
    '''
    params = [p for p in model.parameters() if p.requires_grad]
    if config["optimizer"] == "Adam":
        optimizer = torch.optim.Adam(params,
                         lr=config["lr"],
                        weight_decay=config["weight_decay"],
                        betas=config["betas"]
            )
    elif config["optimizer"] == "SGD":
        optimizer = torch.optim.SGD(
            params,
            lr=config["lr"],
            momentum=config["momentum"],
            weight_decay=config["weight_decay"] # .0005 starting
        )
    
    return optimizer

def get_scheduler(optimizer,config):
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=3,
        gamma=config["gamma"]
    )
    return lr_scheduler

In [4]:
### Load Transforms and Data ###

# Augmentation iff training!
def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
        transforms.append(T.RandomVerticalFlip(0.5))
        transforms.append(T.RandomRotation(90))
        transforms.append(T.RandomResizedCrop(size=256, scale=(0.6, 1.4)))
        T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.0)
        # transforms.append(T.RandomCrop(size=(224, 224)))


    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

def load_colloidal_data(data_dir=r"C:\Users\Jacob\Desktop\Academics\Mirkin\cC_Manuscript_Data\Coco_v5"
):
    train_dir = Path(data_dir) / "train"
    test_dir  = Path(data_dir) / "test"
    cd_train = mask_rcnn.ManualCocoColloidalDataset(
        str(train_dir),
        str(train_dir / "_annotations.coco.json"),
        transforms=get_transform(True)
    )
    cd_test = mask_rcnn.ManualCocoColloidalDataset(
        str(test_dir),
        str(Path(test_dir) / "_annotations.coco.json"),
        transforms=get_transform(False)
    )


    return cd_train, cd_test

In [5]:
## Ray Tune config parameters
ray_config ={
    ## Opt choice
    "optimizer": tune.choice(["Adam","SGD"]),

    ## Region Choices
    # Unused

    # General choices
    "lr": tune.loguniform(1e-4,1e-1),
    "betas":  tune.choice([(0.9, 0.999), (0.5, 0.999)]),
    "momentum": tune.uniform(0.5, 0.9),
    "weight_decay": tune.loguniform(1e-4,1e-1),
    "gamma": tune.uniform(0.1, 0.9)
}

In [6]:
## Ray Tune Functions ##

def train_colloidal(config,data_dir):
    '''
    Ray tune train loop
    '''
    device = "cuda"

    model = get_model_instance_segmentation(2,config)
    model.to(device)
    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    checkpoint = get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / "data.pkl"
            with open(data_path, "rb") as fp:
                checkpoint_state = pickle.load(fp)
            start_epoch = checkpoint_state["epoch"]
            model.load_state_dict(checkpoint_state["net_state_dict"])
            optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    trainset, testset = load_colloidal_data(data_dir)
    def collate_fn(batch):
        '''
        Collation function receives [(image_1, targets_1{masks,boxes,labels}), (image_10, targets_1{masks,boxes,labels})...]
        Need to stack image_1
        '''
        images = [item[0] for item in batch]
        targets = [item[1] for item in batch]
        # return tuple(zip(*batch))
        return images,targets
    
    train_loader = DataLoader(
        trainset,
        batch_size=2,
        num_workers=0,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        testset,
        batch_size=2,
        num_workers=0,
        collate_fn=collate_fn
    )
    for epoch in range(start_epoch, 10):  # loop over the dataset multiple times
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            images, targets = data
            images = list(image.to(device) for image in images)
            targets = [
                {
                    k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in t.items()
                }
                for t in targets
            ]
            with torch.cuda.amp.autocast(enabled=False):
                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())

            losses_reduced = sum(loss for loss in loss_dict.values())
            loss_value = losses_reduced.item()

            if not math.isfinite(loss_value):
                print(f"Loss is {loss_value}, stopping training")
                # print(loss_dict_reduced)
                sys.exit(1)
            
            losses.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

            test_loss = 0.0
            test_steps = 0

            for i, data in enumerate(test_loader,0):
                with torch.no_grad():
                    images, targets = data
                    images = list(image.to(device) for image in images)
                    targets = [
                        {
                            k: v.to(device) if isinstance(v, torch.Tensor) else v
                            for k, v in t.items()
                        }
                        for t in targets
                    ]
                    loss_dict = model(images, targets)
                    losses_reduced = sum(loss for loss in loss_dict.values())
                    test_loss += losses_reduced
                    test_steps += 1
            
            checkpoint_data = {
                "epoch": epoch,
                "net_state_dict":model.state_dict(),
                "optimizer_state_dict":optimizer.state_dict()
            }
            with tempfile.TemporaryDirectory() as checkpoint_dir:
                data_path = Path(checkpoint_dir) / "data.pkl"
                with open(data_path, "wb") as fp:
                    pickle.dump(checkpoint_data, fp)

                checkpoint = Checkpoint.from_directory(checkpoint_dir)
                train.report(
                    {"loss": test_loss.to("cpu").detach().numpy() / test_steps, 
                    #  "accuracy": correct / total
                     },
                    checkpoint=checkpoint,
                )
    print("Finished Training")

In [7]:
data_dir = os.path.abspath(r"C:\Users\Jacob\Desktop\Academics\Mirkin\colloidal_crystal_ML\ProcessedData\Coco_v5")

test_config ={
    # Opt choice
    "optimizer": "SGD",

    # Region Choices

    # General choices
    "lr": 0.005,
    "betas":  (0.9, 0.999),
    "momentum":0.9,
    "weight_decay": .5,
    "gamma": 0.6
}

In [8]:
def main(config, num_samples=10, max_num_epochs=10, gpus_per_trial=1):
    data_dir = os.path.abspath(r"C:\Users\Jacob\Desktop\Academics\Mirkin\colloidal_crystal_ML\ProcessedData\Coco_v5")
    load_colloidal_data(data_dir)
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    def short_dirname(trial):
        return "trial_" + str(trial.trial_id)
    
    result = tune.run(
        partial(train_colloidal, data_dir=data_dir),
        # resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        trial_dirname_creator=short_dirname,
        max_concurrent_trials=4
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    # print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

In [9]:
main(ray_config,num_samples=30)

2024-11-03 23:40:46,153	INFO worker.py:1783 -- Started a local Ray instance.
2024-11-03 23:40:47,653	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-11-03 23:40:47,655	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-11-04 00:20:03
Running for:,00:39:15.50
Memory:,13.4/31.8 GiB

Trial name,# failures,error file
train_colloidal_57ca6_00015,1,C:/Users/Jacob/AppData/Local/Temp/ray/session_2024-11-03_23-40-44_450333_54200/artifacts/2024-11-03_23-40-47/train_colloidal_2024-11-03_23-40-47/driver_artifacts/trial_57ca6_00015/error.txt
train_colloidal_57ca6_00023,1,C:/Users/Jacob/AppData/Local/Temp/ray/session_2024-11-03_23-40-44_450333_54200/artifacts/2024-11-03_23-40-47/train_colloidal_2024-11-03_23-40-47/driver_artifacts/trial_57ca6_00023/error.txt

Trial name,status,loc,betas,gamma,lr,momentum,optimizer,weight_decay,iter,total time (s),loss
train_colloidal_57ca6_00000,TERMINATED,127.0.0.1:62432,"(0.5, 0.999)",0.71103,0.0260702,0.754554,SGD,0.00444182,1,57.0528,9.20266
train_colloidal_57ca6_00001,TERMINATED,127.0.0.1:62440,"(0.5, 0.999)",0.597532,0.000251792,0.518651,SGD,0.0227673,1,57.6619,9.95315
train_colloidal_57ca6_00002,TERMINATED,127.0.0.1:62416,"(0.9, 0.999)",0.301362,0.000117524,0.848071,Adam,0.0882751,10,2351.29,2.68323
train_colloidal_57ca6_00003,TERMINATED,127.0.0.1:62404,"(0.5, 0.999)",0.227879,0.00030917,0.882355,SGD,0.000102035,1,57.7297,9.46945
train_colloidal_57ca6_00004,TERMINATED,127.0.0.1:30144,"(0.9, 0.999)",0.62551,0.000120133,0.886775,Adam,0.00292122,10,864.212,2.95595
train_colloidal_57ca6_00005,TERMINATED,127.0.0.1:63400,"(0.5, 0.999)",0.124623,0.000333498,0.545806,Adam,0.0377119,10,2280.29,3.05577
train_colloidal_57ca6_00006,TERMINATED,127.0.0.1:61752,"(0.5, 0.999)",0.827628,0.00382336,0.744711,SGD,0.000937597,2,116.176,4.17588
train_colloidal_57ca6_00007,TERMINATED,127.0.0.1:49888,"(0.9, 0.999)",0.742266,0.0250938,0.505058,SGD,0.0714078,1,37.8472,16.3096
train_colloidal_57ca6_00008,TERMINATED,127.0.0.1:41764,"(0.5, 0.999)",0.554964,0.0789794,0.745354,SGD,0.00359084,1,37.5478,8918.79
train_colloidal_57ca6_00009,TERMINATED,127.0.0.1:48384,"(0.5, 0.999)",0.301497,0.00155265,0.848978,Adam,0.000700889,1,38.2843,772.37


[36m(func pid=62416)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=62416)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00002/checkpoint_000000)
[36m(func pid=62440)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)[32m [repeated 3x across cluster][0m


Trial name,loss,should_checkpoint
train_colloidal_57ca6_00000,9.20266,True
train_colloidal_57ca6_00001,9.95315,True
train_colloidal_57ca6_00002,2.68323,True
train_colloidal_57ca6_00003,9.46945,True
train_colloidal_57ca6_00004,2.95595,True
train_colloidal_57ca6_00005,3.05577,True
train_colloidal_57ca6_00006,4.17588,True
train_colloidal_57ca6_00007,16.3096,True
train_colloidal_57ca6_00008,8918.79,True
train_colloidal_57ca6_00009,772.37,True


2024-11-03 23:41:49,060	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
2024-11-03 23:41:49,627	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
2024-11-03 23:41:49,652	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=62440)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00001/checkpoint_000000)[32m [repeated 3x across cluster][0m
[36m(func pid=61752)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=30144)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=30144)[0m Checkpoint successfully created at: Checkpoint(filesystem=local

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff188098f0d8b316521c880fc801000000 Worker ID: 7ff48a6779fad41a5559179386ab41a16a945b4ee7e642e7664ff6ca Node ID: 9c53e45550f8f7a91dbda7951901aa8724c65ce0d30ee3238c0304c1 Worker IP address: 127.0.0.1 Worker port: 50869 Worker PID: 55624 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.


2024-11-03 23:59:09,474	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}


[36m(func pid=55624)[0m Loss is nan, stopping training


[36m(func pid=10028)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=62416)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00002/checkpoint_000001)
[36m(func pid=63400)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00005/checkpoint_000000)
2024-11-03 23:59:36,318	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=52844)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00013/checkpoint_000009)
[36m(func pid=24004)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
2024-11-03 23:59:49,039	INFO tens

[36m(func pid=63020)[0m Loss is nan, stopping training
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff7dd2eea48b21b803947359fb01000000 Worker ID: fa8ecdb741611bfb9b5fa17addcec6fc228a56a60da65353dada7cb4 Node ID: 9c53e45550f8f7a91dbda7951901aa8724c65ce0d30ee3238c0304c1 Worker IP address: 127.0.0.1 Worker port: 51829 Worker PID: 63020 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.


2024-11-04 00:13:26,440	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=62416)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00002/checkpoint_000004)
[36m(func pid=63400)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00005/checkpoint_000005)
[36m(func pid=62472)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
2024-11-04 00:13:45,169	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=30136)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-11-03_23-40-47/trial_57ca6_00024/checkpoint_000001)

TuneError: ('Trials did not complete', [train_colloidal_57ca6_00015, train_colloidal_57ca6_00023])

In [10]:
from ray.tune import ExperimentAnalysis
folder_path = r"C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14" 
analysis = ExperimentAnalysis(folder_path)

completed_trials = [trial for trial in analysis.trials if trial.status == "TERMINATED"]
failed_trials = [trial for trial in analysis.trials if trial.status == "ERROR"]

best_trial = min(completed_trials, key=lambda trial: trial.last_result["loss"])

# Print the best performing trial and its results
print("Best trial config: ", best_trial.config)
print("Best trial final result: ", best_trial.last_result)

Best trial config:  {'optimizer': 'Adam', 'lr': 0.00016517420107310982, 'betas': [0.5, 0.999], 'momentum': 0.5277638298935976, 'weight_decay': 0.0006188018644099798, 'gamma': 0.512368076225412}
Best trial final result:  {'loss': 2.4554710388183594, 'timestamp': 1726167629, 'checkpoint_dir_name': 'checkpoint_000009', 'should_checkpoint': True, 'done': True, 'training_iteration': 10, 'trial_id': '03f42_00012', 'date': '2024-09-12_14-00-29', 'time_this_iter_s': 38.13473606109619, 'time_total_s': 319.3866858482361, 'pid': 22496, 'hostname': 'DESKTOP-RD74FOL', 'node_ip': '127.0.0.1', 'config': {'optimizer': 'Adam', 'lr': 0.00016517420107310982, 'betas': [0.5, 0.999], 'momentum': 0.5277638298935976, 'weight_decay': 0.0006188018644099798, 'gamma': 0.512368076225412}, 'time_since_restore': 319.3866858482361, 'iterations_since_restore': 10, 'experiment_tag': '12_betas=0_5_0_999,gamma=0.5124,lr=0.0002,momentum=0.5278,optimizer=Adam,weight_decay=0.0006'}
