In [3]:
import os
import torch
import tempfile
import pytorch_lightning as pl
import torch.nn.functional as F
from filelock import FileLock
from torchmetrics import Accuracy
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms

In [4]:
class MNISTClassifier(pl.LightningModule):
    def __init__(self, config):
        super(MNISTClassifier, self).__init__()
        self.accuracy = Accuracy(task="multiclass", num_classes=10, top_k=1)
        self.layer_1_size = config["layer_1_size"]
        self.layer_2_size = config["layer_2_size"]
        self.lr = config["lr"]

        # mnist images are (1, 28, 28) (channels, width, height)
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)
        self.eval_loss = []
        self.eval_accuracy = []

    def cross_entropy_loss(self, logits, labels):
        return F.nll_loss(logits, labels)

    def forward(self, x):
        batch_size, channels, width, height = x.size()
        x = x.view(batch_size, -1)

        x = self.layer_1(x)
        x = torch.relu(x)

        x = self.layer_2(x)
        x = torch.relu(x)

        x = self.layer_3(x)
        x = torch.log_softmax(x, dim=1)

        return x

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        self.eval_loss.append(loss)
        self.eval_accuracy.append(accuracy)
        return {"val_loss": loss, "val_accuracy": accuracy}

    def on_validation_epoch_end(self):
        avg_loss = torch.stack(self.eval_loss).mean()
        avg_acc = torch.stack(self.eval_accuracy).mean()
        self.log("ptl/val_loss", avg_loss, sync_dist=True)
        self.log("ptl/val_accuracy", avg_acc, sync_dist=True)
        self.eval_loss.clear()
        self.eval_accuracy.clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer


class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, batch_size=128):
        super().__init__()
        self.data_dir = tempfile.mkdtemp()
        self.batch_size = batch_size
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

    def setup(self, stage=None):
        with FileLock(f"{self.data_dir}.lock"):
            mnist = MNIST(
                self.data_dir, train=True, download=True, transform=self.transform
            )
            self.mnist_train, self.mnist_val = random_split(mnist, [55000, 5000])

            self.mnist_test = MNIST(
                self.data_dir, train=False, download=True, transform=self.transform
            )

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=4)

In [5]:
default_config = {
    "layer_1_size": 128,
    "layer_2_size": 256,
    "lr": 1e-3,
}

In [6]:
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)


def train_func(config):
    dm = MNISTDataModule(batch_size=config["batch_size"])
    model = MNISTClassifier(config)

    trainer = pl.Trainer(
        devices="auto",
        accelerator="auto",
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, datamodule=dm)

In [7]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

In [8]:
search_space = {
    "layer_1_size": tune.choice([32, 64, 128]),
    "layer_2_size": tune.choice([64, 128, 256]),
    "lr": tune.loguniform(1e-4, 1e-1),
    "batch_size": tune.choice([32, 64]),
}

In [9]:
# The maximum training epochs
num_epochs = 5

# Number of sampls from parameter space
num_samples = 10

In [10]:
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

scaling_config = ScalingConfig(
    num_workers=3, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 0}
)

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="ptl/val_accuracy",
        checkpoint_score_order="max",
    ),
)

In [11]:
from ray.train.torch import TorchTrainer

# Define a TorchTrainer without hyper-parameters for Tuner
ray_trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)

In [None]:
def tune_mnist_asha(num_samples=10):
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric="ptl/val_accuracy",
            mode="max",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    return tuner.fit()


results = tune_mnist_asha(num_samples=num_samples)

0,1
Current time:,2024-12-28 17:35:10
Running for:,00:14:51.07
Memory:,4.9/15.3 GiB

Trial name,status,loc,train_loop_config/ba tch_size,train_loop_config/la yer_1_size,train_loop_config/la yer_2_size,train_loop_config/lr
TorchTrainer_0334c_00000,PENDING,,64,64,64,0.00331451
TorchTrainer_0334c_00001,PENDING,,64,64,256,0.002094
TorchTrainer_0334c_00002,PENDING,,32,128,256,0.000175745
TorchTrainer_0334c_00003,PENDING,,32,128,128,0.0559725
TorchTrainer_0334c_00004,PENDING,,32,64,256,0.0980119
TorchTrainer_0334c_00005,PENDING,,32,128,64,0.00592966
TorchTrainer_0334c_00006,PENDING,,64,64,64,0.00157499
TorchTrainer_0334c_00007,PENDING,,32,128,128,0.000116221
TorchTrainer_0334c_00008,PENDING,,32,128,128,0.000275549
TorchTrainer_0334c_00009,PENDING,,32,64,64,0.0727857


[36m(autoscaler +49s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.
[33m(autoscaler +49s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +49s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +1m24s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +1m24s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +1m59s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +1m59s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +2m34s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +2m34s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +3m9s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +3m9s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +3m45s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +3m45s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +4m20s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +4m20s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +4m55s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +4m55s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +5m30s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +5m30s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +6m5s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +6m5s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +6m40s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +6m40s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +7m15s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +7m15s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +7m50s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +7m50s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +8m25s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +8m25s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +9m0s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +9m0s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +9m35s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +9m35s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +10m10s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +10m10s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +10m45s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +10m45s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +11m20s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +11m20s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +11m55s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +11m55s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +12m30s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +12m30s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +13m6s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +13m6s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +13m41s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +13m41s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +14m16s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +14m16s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.




[33m(autoscaler +14m51s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +14m51s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +15m26s)[0m Error: No available node types can fulfill resource request {'CPU': 2.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
[33m(autoscaler +15m26s)[0m Error: No available node types can fulfill resource request {'CPU': 1.0, 'GPU': 1.0}. Add suitable node types to this cluster to resolve this issue.
