In [1]:
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.utils as utils
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from torch import nn
from torch import optim  # Import the optim module
from torch.nn import functional as F
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from PIL import Image
import os
from pytorch_lightning.loggers import TensorBoardLogger
import torch.utils.data as data_utils

# Define the CNN model class
class CNN(torch.nn.Module):

    def __init__(self, in_channels, out_channels, kernel_size, num_hidden, hidden_channels):
        super().__init__()

        self.first_conv = torch.nn.Conv2d(
            in_channels=in_channels,
            out_channels=hidden_channels,
            kernel_size=kernel_size,
            padding=0
        )

        self.convs = torch.nn.ModuleList()
        for i in range(num_hidden):
            self.convs.append(
                torch.nn.Conv2d(
                    in_channels=hidden_channels,
                    out_channels=hidden_channels,
                    kernel_size=kernel_size,
                    padding=0
                )
            )

        self.final_linear = torch.nn.Linear(hidden_channels, out_channels)
    
    def forward(self, x):

        x = self.first_conv(x)
        x = torch.nn.functional.layer_norm(x, x.shape[-3:])
        x = torch.nn.functional.relu(x)

        for conv in self.convs:
            x = conv(x)
            x = torch.nn.functional.layer_norm(x, x.shape[-3:])
            x = torch.nn.functional.relu(x)
        
        # Apply average pooling over remaining spatial dimensions.
        x = torch.nn.functional.adaptive_avg_pool2d(x, 1).squeeze()

        x = self.final_linear(x)
        return x
        
# NUM_IMAGES = 4
# images = [train_ds[idx][0] for idx in range(NUM_IMAGES)]
# orig_images = [Image.fromarray(train_ds.data[idx].numpy()) for idx in range(NUM_IMAGES)]
# orig_images = [test_transform(img) for img in orig_images]

# img_grid = utils.make_grid(torch.stack(images + orig_images, dim=0), nrow=4, normalize=True, pad_value=0.5)
# img_grid = img_grid.permute(1, 2, 0)

# plt.figure(figsize=(8, 8))
# plt.title("Images sampled from the MNIST train set, augmented with test transforms.")
# plt.imshow(img_grid)
# plt.axis('off')
# plt.show()


In [2]:
!module load anaconda3/2022.05 cuda/11.8

In [2]:
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
# Define the LightningModule for training and evaluation
class DataModule(pl.LightningModule):
    def __init__(self, model_name, model_hparams, optimizer_name, optimizer_hparams):
        """
        Inputs:
            model_name - Name of the model/CNN to run. Used for creating the model (see function below)
            model_hparams - Hyperparameters for the model, as dictionary.
            optimizer_name - Name of the optimizer to use. Currently supported: Adam, SGD
            optimizer_hparams - Hyperparameters for the optimizer, as dictionary. This includes learning rate, weight decay, etc.
        """
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()
        # Create model
        self.model = create_model(model_name, model_hparams)
        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()
        
    def forward(self, imgs):
        return self.model(imgs)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=1e-2, weight_decay=1e-4)
        return [optimizer], []

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()
        self.log('train_acc', acc, on_step=False, on_epoch=True)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()
        self.log('val_acc', acc, prog_bar=True)
        self.log('val_loss', loss)

    def test_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs).argmax(dim=-1)
        acc = (labels == preds).float().mean()
        self.log('test_acc', acc, prog_bar=True)

def train_model(model_name, logger_name, save_name=None, **kwargs):
    """
    Inputs:
        model_name - Name of the model you want to run. Is used to look up the class in "model_dict"
        save_name (optional) - If specified, this name will be used for creating the checkpoint and logging directory.
    """
    if save_name is None:
        save_name = model_name

    # Create logger
    logger = TensorBoardLogger("tb_logger", name = logger_name)

    # Create a PyTorch Lightning trainer with the generation callback
    trainer = pl.Trainer(default_root_dir=os.path.join(CHECKPOINT_PATH, save_name),                          # Where to save models
                         logger=logger,
                         accelerator='auto',                                             # We run on a single GPU (if possible)
                         max_epochs=10,                                                                      # How many epochs to train for if no patience is set
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),  # Save the best checkpoint based on the maximum val_acc recorded. Saves only weights and not optimizer
                                    LearningRateMonitor("epoch")])
    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, save_name + ".ckpt")
    
    if os.path.isfile(pretrained_filename):
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        model = DataModule.load_from_checkpoint(pretrained_filename) # Automatically loads the model with the saved hyperparameters
    else:
        pl.seed_everything(12) # To be reproducable
        model = DataModule(model_name=model_name, **kwargs)
        trainer.fit(model, train_loader, test_loader)
        model = DataModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path) # Load best checkpoint after training

    # Test best model on test set
    val_result = trainer.test(model.to(device), test_loader, verbose=False)
    result = {"val": val_result[0]["test_acc"]}

    return model, result

def create_model(model_name, model_hparams):
    if model_name in model_dict:
        return model_dict[model_name](**model_hparams)
    else:
        assert False, f"Unknown model name \"{model_name}\". Available models are: {str(model_dict.keys())}"

# # Create a PyTorch Lightning trainer
# trainer = pl.Trainer(default_root_dir="./checkpoints", accelerator='auto', max_epochs=10,
#                      callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),
#                                 LearningRateMonitor("epoch")])

# # Define and train the model
# model = DataModule()
# trainer.fit(model, train_loader, test_loader)

# # Test the model on the test set
# test_result = trainer.test(model, test_loader, verbose=False)
# print(f"Test Accuracy: {test_result[0]['test_acc'] * 100:.2f}%")

In [3]:
model_dict = {
    'CNN': CNN,
    # 'GCNN': GroupEquivariantCNN
}

# Path to the folder where the datasets are be downloaded (e.g. MNIST)
DATASET_PATH = "project/data"
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "project/saved_models"

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
batch_size = 64
num_data = 30000
rotations = [30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360]

for ind_rot in rotations:
    
    # Define data transforms
    train_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomRotation([0, ind_rot], interpolation=Image.BILINEAR, fill=0),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomRotation([0, 360], interpolation=Image.BILINEAR, fill=0),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    

    
    # Set the random seed for reproducibility
    pl.seed_everything(12)
    # Load the MNIST dataset
    train_ds = datasets.MNIST(root="./project/data", train=True, transform=train_transform, download=True)
    test_ds = datasets.MNIST(root="./project/data", train=False, transform=test_transform)

    # Change number of data 
    indices = torch.arange(num_data)
    train_ds_less = data_utils.Subset(train_ds, indices)
    
    train_loader = torch.utils.data.DataLoader(train_ds_less, batch_size=64, num_workers = 4,  shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=64, num_workers = 4,  shuffle=False)
    
    
    cnn_model, cnn_results = train_model(model_name="CNN",
                                         logger_name = "CNN_" + str(num_data) + "_data_deg" + str(ind_rot),
                                         model_hparams={"in_channels": 1,
                                                        "out_channels": 10,
                                                        "kernel_size": 5,
                                                        "num_hidden":4,
                                                        "hidden_channels":32},
                                         optimizer_name="Adam",
                                         optimizer_hparams={"lr": 1e-2,
                                                            "weight_decay": 1e-4},
                                         save_name='cnn-pretrained')

[rank: 0] Seed set to 12
/home/yu.sea/.conda/envs/pytorch_env/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:191: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /home/yu.sea/.conda/envs/pytorch_env/lib/python3.10/ ...
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
[rank: 0] Seed set to 12

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | CNN              | 103 K 
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
103 K     Trainable params
0         Non-trainable params
103 K     Total params
0.415     Total estimated model params size (MB)


Epoch 0:   8%|▊         | 38/469 [00:05<01:02,  6.88it/s, v_num=1]         

/home/yu.sea/.conda/envs/pytorch_env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


IsADirectoryError: [Errno 21] Is a directory: '/home/yu.sea/cs6140'

In [5]:
# # Store the training and validation losses
# train_losses = []
# val_losses = []

# # # Create a PyTorch Lightning trainer
# # trainer = pl.Trainer(default_root_dir="./checkpoints", accelerator='auto', max_epochs=10,
# #                      callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),
# #                                 LearningRateMonitor("epoch")])

# # Define and train the model
# model = DataModule()

# # Define the optimizer (e.g., SGD or Adam) and its parameters
# optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)  # Modify with your optimizer and hyperparameters

# for epoch in range(trainer.max_epochs):
#     epoch_train_loss = 0.0  # Initialize epoch training loss
    
#     # Training loop
#     model.train()  # Set the model to training mode
#     for batch in train_loader:
#         imgs, labels = batch
#         optimizer.zero_grad()
#         preds = model(imgs)
#         loss = model.loss_module(preds, labels)
#         loss.backward()
#         optimizer.step()
        
#         epoch_train_loss += loss.item()
    
#     epoch_train_loss /= len(train_loader)
#     train_losses.append(epoch_train_loss)  # Append epoch training loss
    
#     # Calculate and append validation loss manually
#     val_loss = 0.0
#     model.eval()  # Set model to evaluation mode
#     with torch.no_grad():
#         for batch in test_loader:
#             imgs, labels = batch
#             preds = model(imgs)
#             loss = model.loss_module(preds, labels)
#             val_loss += loss.item()
    
#     val_loss /= len(test_loader)
#     val_losses.append(val_loss)

#     # Test the model on the test set
#     test_result = trainer.test(model, test_loader, verbose=False)
#     print(f"Epoch [{epoch + 1}/{trainer.max_epochs}] - Test Accuracy: {test_result[0]['test_acc'] * 100:.2f}%")

# # Plot the loss curves
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, trainer.max_epochs + 1), train_losses, label='Training Loss')
# plt.plot(range(1, trainer.max_epochs + 1), val_losses, label='Validation Loss')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.title('Training and Validation Loss Curves')
# plt.grid(True)
# plt.show()


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:05<00:00, 28.76it/s]
Epoch [1/3] - Test Accuracy: 66.46%



KeyboardInterrupt



In [8]:
len(train_ds)

60000