## Goal
This notebook is set up to train a MaskRCNN model using Ray Tune for hyperparameter tuning

In [1]:
## Imports
from torchvision.transforms import v2 as T
from torch.utils.data import DataLoader
import numpy as np
from pathlib import Path
import os
import math
import sys
import tempfile
from functools import partial

# Model Transforms
from torchvision.io import read_image
from torchvision.ops.boxes import masks_to_boxes
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F

# Model imports
import torch
import torchvision
from torchvision.models.detection import MaskRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.detection import FasterRCNN

# Data imports
from facet_ml.classification import mask_rcnn
from pathlib import Path
import os

# Ray tune imports
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle

import matplotlib.pyplot as plt

device = "cuda"

  from .autonotebook import tqdm as notebook_tqdm
2024-09-12 12:11:10,304	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-09-12 12:11:10,483	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
### Model Creation Functions ###
def get_model_instance_segmentation(num_classes,
                                    config:dict):
    
    ## load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    ## Make an updated FastRCNN with backgone changes as needed
    backbone = config.get("backbone","resnet50")
    if backbone == "mobilenet_v2":
        backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
        backbone.out_channels = 1280
        fast_rcnn = FasterRCNN(backbone, in_features=in_features, num_classes=num_classes,)
        model.roi_heads.box_predictor = fast_rcnn
    else:
        # Use defualt resnet50 bacbone
        model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask,
        hidden_layer,
        num_classes
    )

    return model

def get_model_instance_segmentation(num_classes,config):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask,
        hidden_layer,
        num_classes
    )

    return model

def get_optimizer(model,config):
    '''
    Get an optimzier based on the config settings
    '''
    params = [p for p in model.parameters() if p.requires_grad]
    if config["optimizer"] == "Adam":
        optimizer = torch.optim.Adam(params,
                         lr=config["lr"],
                        weight_decay=config["weight_decay"],
                        betas=config["betas"]
            )
    elif config["optimizer"] == "SGD":
        optimizer = torch.optim.SGD(
            params,
            lr=config["lr"],
            momentum=config["momentum"],
            weight_decay=config["weight_decay"] # .0005 starting
        )
    
    return optimizer

def get_scheduler(optimizer,config):
    lr_scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=3,
        gamma=config["gamma"]
    )
    return lr_scheduler

In [3]:
### Load Transforms and Data ###
def get_transform(train):
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
        transforms.append(T.RandomVerticalFlip(0.5))
        transforms.append(T.RandomRotation(90))
        transforms.append(T.RandomResizedCrop(size=256, scale=(0.6, 1.4)))
        T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.0)
        # transforms.append(T.RandomCrop(size=(224, 224)))


    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

def load_colloidal_data(data_dir=r"C:\Users\Jacob\Desktop\Academics\Mirkin\colloidal_crystal_ML\ProcessedData\Coco_v5"
):
    train_dir = Path(data_dir) / "train"
    test_dir  = Path(data_dir) / "test"
    cd_train = mask_rcnn.ManualCocoColloidalDataset(
        str(train_dir),
        str(train_dir / "_annotations.coco.json"),
        transforms=get_transform(True)
    )
    cd_test = mask_rcnn.ManualCocoColloidalDataset(
        str(test_dir),
        str(Path(test_dir) / "_annotations.coco.json"),
        transforms=get_transform(False)
    )


    return cd_train, cd_test

In [4]:
## Ray Tune config parameters
ray_config ={
    # Opt choice
    "optimizer": tune.choice(["Adam","SGD"]),

    # Region Choices

    # General choices
    "lr": tune.loguniform(1e-4,1e-1),
    "betas":  tune.choice([(0.9, 0.999), (0.5, 0.999)]),
    "momentum": tune.uniform(0.5, 0.9),
    "weight_decay": tune.loguniform(1e-4,1e-1),
    "gamma": tune.uniform(0.1, 0.9)
}

In [5]:
## Ray Tune Functions ##

def train_colloidal(config,data_dir):
    '''
    Ray tune train loop
    '''
    device = "cuda"

    model = get_model_instance_segmentation(2,config)
    model.to(device)
    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    checkpoint = get_checkpoint()
    if checkpoint:
        with checkpoint.as_directory() as checkpoint_dir:
            data_path = Path(checkpoint_dir) / "data.pkl"
            with open(data_path, "rb") as fp:
                checkpoint_state = pickle.load(fp)
            start_epoch = checkpoint_state["epoch"]
            model.load_state_dict(checkpoint_state["net_state_dict"])
            optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0

    trainset, testset = load_colloidal_data(data_dir)
    def collate_fn(batch):
        '''
        Collation function receives [(image_1, targets_1{masks,boxes,labels}), (image_10, targets_1{masks,boxes,labels})...]
        Need to stack image_1
        '''
        images = [item[0] for item in batch]
        targets = [item[1] for item in batch]
        # return tuple(zip(*batch))
        return images,targets
    
    train_loader = DataLoader(
        trainset,
        batch_size=2,
        num_workers=0,
        collate_fn=collate_fn
    )
    test_loader = DataLoader(
        testset,
        batch_size=2,
        num_workers=0,
        collate_fn=collate_fn
    )
    for epoch in range(start_epoch, 10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            images, targets = data
            images = list(image.to(device) for image in images)
            targets = [
                {
                    k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in t.items()
                }
                for t in targets
            ]
            with torch.cuda.amp.autocast(enabled=False):
                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())

            losses_reduced = sum(loss for loss in loss_dict.values())
            loss_value = losses_reduced.item()

            print(losses)
            if not math.isfinite(loss_value):
                print(f"Loss is {loss_value}, stopping training")
                # print(loss_dict_reduced)
                sys.exit(1)
            
            losses.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

            test_loss = 0.0
            test_steps = 0
            total = 0
            correct = 0

            for i, data in enumerate(test_loader,0):
                with torch.no_grad():
                    images, targets = data
                    images = list(image.to(device) for image in images)
                    targets = [
                        {
                            k: v.to(device) if isinstance(v, torch.Tensor) else v
                            for k, v in t.items()
                        }
                        for t in targets
                    ]
                    loss_dict = model(images, targets)
                    losses_reduced = sum(loss for loss in loss_dict.values())
                    test_loss += losses_reduced
                    test_steps += 1
            
            checkpoint_data = {
                "epoch": epoch,
                "net_state_dict":model.state_dict(),
                "optimizer_state_dict":optimizer.state_dict()
            }
            with tempfile.TemporaryDirectory() as checkpoint_dir:
                data_path = Path(checkpoint_dir) / "data.pkl"
                with open(data_path, "wb") as fp:
                    pickle.dump(checkpoint_data, fp)

                checkpoint = Checkpoint.from_directory(checkpoint_dir)
                train.report(
                    {"loss": test_loss.to("cpu").detach().numpy() / test_steps, 
                    #  "accuracy": correct / total
                     },
                    checkpoint=checkpoint,
                )
    print("Finished Training")

In [14]:
data_dir = os.path.abspath(r"C:\Users\Jacob\Desktop\Academics\Mirkin\colloidal_crystal_ML\ProcessedData\Coco_v5")
print(ray_config["optimizer"])
test_config ={
    # Opt choice
    "optimizer": "SGD",

    # Region Choices

    # General choices
    "lr": 0.005,
    "betas":  (0.9, 0.999),
    "momentum":0.9,
    "weight_decay": .5,
    "gamma": 0.6
}
train_colloidal(test_config,data_dir=data_dir)

<ray.tune.search.sample.Categorical object at 0x000002B061ECCF70>


  return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


tensor(8.9019, device='cuda:0', grad_fn=<AddBackward0>)




tensor(3.1666, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5968, device='cuda:0', grad_fn=<AddBackward0>)
tensor(35.0724, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.3626, device='cuda:0', grad_fn=<AddBackward0>)
tensor(9982.0693, device='cuda:0', grad_fn=<AddBackward0>)
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
Loss is nan, stopping training


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [7]:
def main(config, num_samples=10, max_num_epochs=10, gpus_per_trial=1):
    data_dir = os.path.abspath(r"C:\Users\Jacob\Desktop\Academics\Mirkin\colloidal_crystal_ML\ProcessedData\Coco_v5")
    load_colloidal_data(data_dir)
    # config = {
    #     "l1": tune.choice([2**i for i in range(9)]),
    #     "l2": tune.choice([2**i for i in range(9)]),
    #     "lr": tune.loguniform(1e-4, 1e-1),
    #     "batch_size": tune.choice([2, 4, 8, 16]),
    # }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    def short_dirname(trial):
        return "trial_" + str(trial.trial_id)
    
    result = tune.run(
        partial(train_colloidal, data_dir=data_dir),
        # resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        trial_dirname_creator=short_dirname,
        max_concurrent_trials=4
    )

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    # print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

In [8]:
main(ray_config,num_samples=30)

2024-09-12 12:11:12,864	INFO worker.py:1783 -- Started a local Ray instance.
2024-09-12 12:11:14,054	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-09-12 12:11:14,055	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-09-12 14:10:41
Running for:,01:59:27.85
Memory:,11.8/31.8 GiB

Trial name,# failures,error file
train_colloidal_03f42_00008,1,C:/Users/Jacob/AppData/Local/Temp/ray/session_2024-09-12_12-11-11_479769_26952/artifacts/2024-09-12_12-11-14/train_colloidal_2024-09-12_12-11-14/driver_artifacts/trial_03f42_00008/error.txt
train_colloidal_03f42_00015,1,C:/Users/Jacob/AppData/Local/Temp/ray/session_2024-09-12_12-11-11_479769_26952/artifacts/2024-09-12_12-11-14/train_colloidal_2024-09-12_12-11-14/driver_artifacts/trial_03f42_00015/error.txt
train_colloidal_03f42_00022,1,C:/Users/Jacob/AppData/Local/Temp/ray/session_2024-09-12_12-11-11_479769_26952/artifacts/2024-09-12_12-11-14/train_colloidal_2024-09-12_12-11-14/driver_artifacts/trial_03f42_00022/error.txt
train_colloidal_03f42_00023,1,C:/Users/Jacob/AppData/Local/Temp/ray/session_2024-09-12_12-11-11_479769_26952/artifacts/2024-09-12_12-11-14/train_colloidal_2024-09-12_12-11-14/driver_artifacts/trial_03f42_00023/error.txt

Trial name,status,loc,betas,gamma,lr,momentum,optimizer,weight_decay,iter,total time (s),loss
train_colloidal_03f42_00000,TERMINATED,127.0.0.1:34412,"(0.5, 0.999)",0.553449,0.0130623,0.801122,Adam,0.00108585,1,229.197,inf
train_colloidal_03f42_00001,TERMINATED,127.0.0.1:20128,"(0.5, 0.999)",0.397102,0.00125083,0.604578,SGD,0.000301029,10,6205.02,3.10163
train_colloidal_03f42_00002,TERMINATED,127.0.0.1:10444,"(0.9, 0.999)",0.354488,0.000180204,0.736511,SGD,0.00884615,10,6219.1,3.02893
train_colloidal_03f42_00003,TERMINATED,127.0.0.1:19260,"(0.9, 0.999)",0.662824,0.00253106,0.741076,Adam,0.0357407,1,229.046,110258000.0
train_colloidal_03f42_00004,TERMINATED,127.0.0.1:8976,"(0.5, 0.999)",0.378463,0.000818231,0.605687,Adam,0.00021017,2,1726.07,16.8359
train_colloidal_03f42_00005,TERMINATED,127.0.0.1:20088,"(0.9, 0.999)",0.139124,0.00906127,0.539653,SGD,0.00535453,8,5995.78,3.30357
train_colloidal_03f42_00006,TERMINATED,127.0.0.1:11856,"(0.5, 0.999)",0.842264,0.00304824,0.518698,Adam,0.00387564,1,24.8512,20982800000.0
train_colloidal_03f42_00007,TERMINATED,127.0.0.1:23384,"(0.5, 0.999)",0.201527,0.000102438,0.875539,SGD,0.0468364,2,4172.16,6.5979
train_colloidal_03f42_00009,TERMINATED,127.0.0.1:16644,"(0.5, 0.999)",0.644479,0.00725294,0.823485,SGD,0.00233057,4,104.723,7084660.0
train_colloidal_03f42_00010,TERMINATED,127.0.0.1:3368,"(0.5, 0.999)",0.157441,0.00330996,0.55121,Adam,0.000204109,1,26.8196,3379720000000.0


[36m(func pid=10444)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=10444)[0m tensor(8.0813, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(7.9802, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19260)[0m tensor(8.3867, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=34412)[0m tensor(5.9127, device='cuda:0', grad_fn=<AddBackward0>)


Trial name,loss,should_checkpoint
train_colloidal_03f42_00000,inf,True
train_colloidal_03f42_00001,3.10163,True
train_colloidal_03f42_00002,3.02893,True
train_colloidal_03f42_00003,110258000.0,True
train_colloidal_03f42_00004,16.8359,True
train_colloidal_03f42_00005,3.30357,True
train_colloidal_03f42_00006,20982800000.0,True
train_colloidal_03f42_00007,6.5979,True
train_colloidal_03f42_00008,,True
train_colloidal_03f42_00009,7084660.0,True


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000000)
[36m(func pid=19260)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)[32m [repeated 3x across cluster][0m
2024-09-12 12:15:06,345	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
NaN or Inf found in input tensor.
2024-09-12 12:15:06,444	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=34412)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00000/checkpoint_000000)[32m [repeated 3x across cluster][0m
[36m(func pid=20088)[0m   return torch.as_tensor(data, dtype=dtype, device=device).

[36m(func pid=10444)[0m tensor(7.2119, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(3.3882, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=8976)[0m tensor(11.1042, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20088)[0m tensor(8.8892, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000001)
[36m(func pid=20128)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00001/checkpoint_000001)


[36m(func pid=10444)[0m tensor(4.4884, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(2.5757, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=8976)[0m tensor(5.5953, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000002)[32m [repeated 3x across cluster][0m


[36m(func pid=20088)[0m tensor(2.6395, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=10444)[0m tensor(4.2694, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(2.8671, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=20088)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00005/checkpoint_000001)[32m [repeated 2x across cluster][0m
2024-09-12 12:43:57,162	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=11856)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=8976)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00004/checkpoint_000001)[32m [repeated 3x across cluster][0m


[36m(func pid=10444)[0m tensor(2.8731, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(3.0673, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20088)[0m tensor(4.0548, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=11856)[0m tensor(9.8381, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000004)
[36m(func pid=20128)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00001/checkpoint_000004)
2024-09-12 12:44:27,815	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=11856)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00006/checkpoint_000000)[32m [repeated 2x across cluster][0m
[36m(func pid=23384)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=10444)[0m tensor(3.4427, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(2.7722, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20088)[0m tensor(30.3317, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000005)


[36m(func pid=23384)[0m tensor(7.3120, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=23384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00007/checkpoint_000000)[32m [repeated 3x across cluster][0m


[36m(func pid=10444)[0m tensor(2.8396, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(2.9092, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20088)[0m tensor(2.7556, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000006)
[36m(func pid=20128)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00001/checkpoint_000006)


[36m(func pid=23384)[0m tensor(6.5855, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(2.6954, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=10444)[0m tensor(2.5689, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=20088)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00005/checkpoint_000004)
[36m(func pid=20128)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00001/checkpoint_000007)


[36m(func pid=20128)[0m tensor(2.4392, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 13:54:03,224	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=23384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00007/checkpoint_000001)


[36m(func pid=20088)[0m tensor(13.6717, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000007)
[36m(func pid=9652)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=20128)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00001/checkpoint_000008)


[36m(func pid=10444)[0m tensor(2.6182, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=9652)[0m tensor(9.3498, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000008)[32m [repeated 2x across cluster][0m


[36m(func pid=20088)[0m tensor(3.4934, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=20128)[0m tensor(2.4619, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=9652)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00008/checkpoint_000000)
[36m(func pid=20088)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00005/checkpoint_000006)
2024-09-12 13:54:42,228	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}


[36m(func pid=10444)[0m tensor(2.4638, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=16644)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=20128)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00001/checkpoint_000009)
2024-09-12 13:54:54,475	ERROR tune_controller.py:1331 -- Trial task failed for trial train_colloidal_03f42_00008
Traceback (most recent call last):
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffde733cd3639a7e291e02ced501000000 Worker ID: c87f7c0096a77534feb8f5532f2d57c57a7afec0212c8c8f46d4f8d0 Node ID: 1840945993186c985884395cd85cb7685e4524dac9e433ce3e7ae42d Worker IP address: 127.0.0.1 Worker port: 53709 Worker PID: 9652 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.


2024-09-12 13:54:54,481	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}


[36m(func pid=9652)[0m tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=9652)[0m Loss is nan, stopping training


2024-09-12 13:54:56,337	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=10444)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00002/checkpoint_000009)


[36m(func pid=20088)[0m tensor(3.1801, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=3368)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=16644)[0m tensor(8.7886, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 13:55:06,913	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=20088)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00005/checkpoint_000007)
[36m(func pid=3780)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=22496)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=16644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00009/checkpoint_000000)


[36m(func pid=3368)[0m tensor(8.0254, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=3780)[0m tensor(8.4376, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 13:55:24,616	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=3368)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00010/checkpoint_000000)
2024-09-12 13:55:25,946	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}


[36m(func pid=16644)[0m tensor(2.7255, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=22496)[0m tensor(10.0544, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=18756)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=3780)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00011/checkpoint_000000)
[36m(func pid=16644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00009/checkpoint_000001)
[36m(func pid=24744)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=18756)[0m tensor(8.7528, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=24744)[0m tensor(6.2158, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=16644)[0m tensor(3.5496, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 13:55:55,525	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=18756)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00013/checkpoint_000000)[32m [repeated 2x across cluster][0m


[36m(func pid=22496)[0m tensor(2.9866, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=27284)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000000)
[36m(func pid=16644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00009/checkpoint_000002)


[36m(func pid=24744)[0m tensor(2.8268, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=27284)[0m tensor(10.1306, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=16644)[0m tensor(66.3349, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=22496)[0m tensor(2.3543, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000001)[32m [repeated 2x across cluster][0m
2024-09-12 13:56:30,588	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=22496)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00012/checkpoint_000002)[32m [repeated 3x across cluster][0m
[36m(func pid=7804)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=24744)[0m tensor(2.3521, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 13:56:49,618	ERROR tune_controller.py:1331 -- Trial task failed for trial train_colloidal_03f42_00015
Traceback (most recent call last):
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\worker.py", line 2661, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\worker.py", line 873, in get_ob

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffa38ab862ca2ef2f566659a6f01000000 Worker ID: cce792ace74bfc8c4251bea361785488b7f5f7d39474a6ad198dcacc Node ID: 1840945993186c985884395cd85cb7685e4524dac9e433ce3e7ae42d Worker IP address: 127.0.0.1 Worker port: 53910 Worker PID: 27284 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.
[36m(func pid=27284)[0m tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=27284)[0m Loss is nan, stopping training


2024-09-12 13:56:49,625	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}


[36m(func pid=7804)[0m tensor(5.8713, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000002)


[36m(func pid=22496)[0m tensor(2.2480, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19324)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
NaN or Inf found in input tensor.
2024-09-12 13:56:59,622	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=7804)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00016/checkpoint_000000)
[36m(func pid=35752)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=22496)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00012/checkpoint_000003)


[36m(func pid=19324)[0m tensor(9.2268, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=24744)[0m tensor(2.1570, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=22496)[0m tensor(2.5688, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35752)[0m tensor(8.1625, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000003)
[36m(func pid=22496)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00012/checkpoint_000004)[32m [repeated 2x across cluster][0m


[36m(func pid=24744)[0m tensor(2.2469, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19324)[0m tensor(2.7686, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000004)[32m [repeated 2x across cluster][0m


[36m(func pid=22496)[0m tensor(2.1365, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00017/checkpoint_000001)


[36m(func pid=35752)[0m tensor(2.5619, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=22496)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00012/checkpoint_000005)


[36m(func pid=24744)[0m tensor(2.4801, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19324)[0m tensor(3.4163, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000005)[32m [repeated 2x across cluster][0m


[36m(func pid=22496)[0m tensor(2.4819, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19324)[0m tensor(3.2750, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=24744)[0m tensor(2.3024, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35752)[0m tensor(2.5860, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00017/checkpoint_000003)[32m [repeated 2x across cluster][0m
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000002)[32m [repeated 3x across cluster][0m


[36m(func pid=19324)[0m tensor(3.1440, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=24744)[0m tensor(2.5768, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=22496)[0m tensor(2.1607, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00017/checkpoint_000004)


[36m(func pid=35752)[0m tensor(2.3357, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000007)
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000003)[32m [repeated 2x across cluster][0m


[36m(func pid=24744)[0m tensor(2.2047, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19324)[0m tensor(3.4087, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=22496)[0m tensor(2.3238, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35752)[0m tensor(2.5351, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000008)
[36m(func pid=19324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00017/checkpoint_000005)
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000004)[32m [repeated 2x across cluster][0m


[36m(func pid=19324)[0m tensor(2.7961, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=24744)[0m tensor(2.1743, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=22496)[0m tensor(1.9997, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00017/checkpoint_000006)


[36m(func pid=35752)[0m tensor(2.2047, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 14:00:29,686	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=22496)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00012/checkpoint_000009)
2024-09-12 14:00:29,789	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=35256)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=24744)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00014/checkpoint_000009)
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000005)

[36m(func pid=35256)[0m tensor(5.7696, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35244)[0m tensor(8.8664, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19324)[0m tensor(2.7986, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35752)[0m tensor(2.3633, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 14:01:04,153	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=35244)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=35244)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00020/checkpoint_000000)
[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000000)
[36m(func pid=17584)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=35256)[0m tensor(4.2942, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000001)


[36m(func pid=35256)[0m tensor(3.0672, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000002)


[36m(func pid=35256)[0m tensor(2.8177, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000003)


[36m(func pid=35256)[0m tensor(2.8885, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000004)


[36m(func pid=35256)[0m tensor(2.7057, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000005)


[36m(func pid=35256)[0m tensor(2.8716, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000006)


[36m(func pid=35256)[0m tensor(2.4647, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 14:04:42,780	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=35256)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00019/checkpoint_000007)


[36m(func pid=17584)[0m tensor(8.1162, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=9580)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
2024-09-12 14:04:52,409	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=19324)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00017/checkpoint_000007)
[36m(func pid=6344)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000006)
NaN or Inf found in input tensor.
2024-09-12 14:05:00,616	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=17584)[0m Checkpoint successfull

[36m(func pid=9580)[0m tensor(9.2500, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35644)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=9580)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00022/checkpoint_000000)


[36m(func pid=35752)[0m tensor(2.2051, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=6344)[0m tensor(7.7901, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35644)[0m tensor(5.7055, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000007)
[36m(func pid=6344)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00023/checkpoint_000000)
2024-09-12 14:05:35,562	ERROR tune_controller.py:1331 -- Trial task failed for trial train_colloidal_03f42_00022
Traceback (most recent call last):
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\client_mode_h

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffaa551f0a4a3cd09d6e68685b01000000 Worker ID: 3345395cc20eee85c5c434ef6868f66825af6c089e422daf9a9beb09 Node ID: 1840945993186c985884395cd85cb7685e4524dac9e433ce3e7ae42d Worker IP address: 127.0.0.1 Worker port: 54341 Worker PID: 9580 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.
[36m(func pid=9580)[0m tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=9580)[0m Loss is nan, stopping training


2024-09-12 14:05:35,567	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}


[36m(func pid=35752)[0m tensor(2.3664, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19364)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=35644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00024/checkpoint_000000)
2024-09-12 14:05:48,042	ERROR tune_controller.py:1331 -- Trial task failed for trial train_colloidal_03f42_00023
Traceback (most recent call last):
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\air\execution\_internal\event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "c:\Users\Jacob\miniconda3\envs\colloidal_crystal_env\lib\site-packages\ray\_private\client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\

[36m(func pid=6344)[0m tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=6344)[0m Loss is nan, stopping training
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffab033cf7d15a6e79612d6fe201000000 Worker ID: 879ff3652676aca15337823cf6eb4f347523b99e2bb45768ca151963 Node ID: 1840945993186c985884395cd85cb7685e4524dac9e433ce3e7ae42d Worker IP address: 127.0.0.1 Worker port: 54359 Worker PID: 6344 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code 1.


2024-09-12 14:05:48,046	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000008)


[36m(func pid=35644)[0m tensor(4.0804, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=23300)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)


[36m(func pid=19364)[0m tensor(7.9016, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=35644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00024/checkpoint_000001)
[36m(func pid=19364)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00025/checkpoint_000000)


[36m(func pid=23300)[0m tensor(5.8830, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35644)[0m tensor(3.1771, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35752)[0m tensor(2.1353, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19364)[0m tensor(2.8881, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 14:07:56,452	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=23300)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00026/checkpoint_000000)
[36m(func pid=35644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00024/checkpoint_000002)
2024-09-12 14:07:57,641	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=18736)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
[36m(func pid=35752)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00018/checkpoint_000009)

[36m(func pid=18736)[0m tensor(8.2184, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=10124)[0m tensor(7.8781, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=19364)[0m tensor(2.6472, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=35644)[0m tensor(2.8815, device='cuda:0', grad_fn=<AddBackward0>)


[36m(func pid=19364)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00025/checkpoint_000002)
[36m(func pid=10124)[0m   return torch.as_tensor(data, dtype=dtype, device=device).requires_grad_(requires_grad)
2024-09-12 14:10:05,522	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=18736)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00027/checkpoint_000000)
2024-09-12 14:10:11,467	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
[36m(func pid=35644)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00024/checkpoint_000003)

[36m(func pid=19364)[0m tensor(3.2521, device='cuda:0', grad_fn=<AddBackward0>)
[36m(func pid=28928)[0m tensor(9.3012, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 14:10:31,026	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=19364)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00025/checkpoint_000003)
2024-09-12 14:10:32,797	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.9, 0.999)}
[36m(func pid=28928)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00029/checkpoint_000000)


[36m(func pid=10124)[0m tensor(3.4085, device='cuda:0', grad_fn=<AddBackward0>)


2024-09-12 14:10:41,919	INFO tensorboardx.py:308 -- Removed the following hyperparameter values when logging to tensorboard: {'betas': (0.5, 0.999)}
2024-09-12 14:10:41,941	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14' in 0.0205s.
[36m(func pid=10124)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14/trial_03f42_00028/checkpoint_000001)


TuneError: ('Trials did not complete', [train_colloidal_03f42_00008, train_colloidal_03f42_00015, train_colloidal_03f42_00022, train_colloidal_03f42_00023])

In [13]:
from ray.tune import ExperimentAnalysis
folder_path = r"C:/Users/Jacob/ray_results/train_colloidal_2024-09-12_12-11-14" 
analysis = ExperimentAnalysis(folder_path)

completed_trials = [trial for trial in analysis.trials if trial.status == "TERMINATED"]
failed_trials = [trial for trial in analysis.trials if trial.status == "ERROR"]

best_trial = min(completed_trials, key=lambda trial: trial.last_result["loss"])

# Print the best performing trial and its results
print("Best trial config: ", best_trial.config)
print("Best trial final result: ", best_trial.last_result)

Best trial config:  {'optimizer': 'Adam', 'lr': 0.00016517420107310982, 'betas': [0.5, 0.999], 'momentum': 0.5277638298935976, 'weight_decay': 0.0006188018644099798, 'gamma': 0.512368076225412}
Best trial final result:  {'loss': 2.4554710388183594, 'timestamp': 1726167629, 'checkpoint_dir_name': 'checkpoint_000009', 'should_checkpoint': True, 'done': True, 'training_iteration': 10, 'trial_id': '03f42_00012', 'date': '2024-09-12_14-00-29', 'time_this_iter_s': 38.13473606109619, 'time_total_s': 319.3866858482361, 'pid': 22496, 'hostname': 'DESKTOP-RD74FOL', 'node_ip': '127.0.0.1', 'config': {'optimizer': 'Adam', 'lr': 0.00016517420107310982, 'betas': [0.5, 0.999], 'momentum': 0.5277638298935976, 'weight_decay': 0.0006188018644099798, 'gamma': 0.512368076225412}, 'time_since_restore': 319.3866858482361, 'iterations_since_restore': 10, 'experiment_tag': '12_betas=0_5_0_999,gamma=0.5124,lr=0.0002,momentum=0.5278,optimizer=Adam,weight_decay=0.0006'}
