In [21]:
%load_ext autoreload
%autoreload 2

import os, sys
import logging
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import time
from datetime import datetime
now = datetime.now()

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
# from torch.utils.data.dataset import Datasets
from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim

# Report the library versions and which accelerator backends this kernel sees.
for _label, _value in (
    ("PyTorch version:", torch.__version__),
    ("Torchvision version:", torchvision.__version__),
    ("CUDA is available:", torch.cuda.is_available()),
):
    print(_label, _value)

# Apple-silicon (MPS) backend: first whether it is usable, then whether this
# PyTorch build was compiled with it. Printed without labels.
for _mps_probe in (torch.backends.mps.is_available, torch.backends.mps.is_built):
    print(_mps_probe())


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
PyTorch version: 2.0.0+cu118
Torchvision version: 0.15.1+cu118
CUDA is available: True
False
False


In [22]:
# ---- Optimisation settings ----
lr = 0.001             # learning rate handed to the Adam optimizer
weight_decay = 0.00001  # weight decay handed to the Adam optimizer
max_epochs = 100       # number of training epochs; also T_max of the cosine LR schedule
val_interval = 1       # run validation every `val_interval` epochs

# ---- Data settings ----
input_ch = 4  # 3 or 4
batch_size = 32   # samples per batch for the train and validation loaders
num_workers = 10  # DataLoader worker processes

In [23]:
# Per-run output locations. Every run gets its own sub-directory named after
# `now` (captured in the imports cell), formatted as day_month_year_hour_minute.
data_dir = Path('/dlab/ldrive/CBT/USLJ-DSDE_DATA-I10008/shihch3/projects/HPA_single_data')
timestamp = now.strftime("%d_%m_%Y_%H_%M")

# Checkpoints: <data_dir>/checkpoints/<timestamp>, created up front.
checkpoint_dir = data_dir / 'checkpoints' / timestamp
checkpoint_dir.mkdir(parents=True, exist_ok=True)

# TensorBoard event files: <data_dir>/log/<timestamp>.
log_dir = data_dir / 'log' / timestamp
writer = SummaryWriter(log_dir)


In [24]:
from HPASCDataset import HPASCDataset

# Preprocessing handed to the dataset: resize to 2048x2048, convert to a
# tensor, then shift/scale with mean 0.5 and std 0.5.
# NOTE(review): Normalize is given three values while `input_ch` may be 4 —
# how HPASCDataset applies this transform is not visible here; confirm.
transform = transforms.Compose([
    transforms.Resize(size=(2048, 2048)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# transform = transforms.Compose(
#     [transforms.ToTensor()])

# Location of the HPA single-cell benchmark data: the label CSV and the
# directory of training images both live directly under `datadir`.
datadir = Path('/dlab/ldrive/CBT/USLJ-DSDE_DATA-I10008/BenchmarkDatasets/hpa-single-cell-image-classification')
train_csv = datadir / 'train.csv'
train_dataset_dir = datadir / 'train'

# Full labelled dataset; it is divided into train/validation subsets afterwards.
HPA_dataset = HPASCDataset(
    input_csv=train_csv,
    root=train_dataset_dir,
    split='train',
    transform=transform,
    input_ch=input_ch,
    n_class=19,
    # debug_size=100,  # uncomment for a quick run on a small subset
)

# Seed the global RNG so the 90/10 train/validation split is reproducible.
torch.manual_seed(1947)
train_ds, val_ds = random_split(HPA_dataset, [0.9, 0.1])
print("train_ds size:", len(train_ds))
print("val_ds size:", len(val_ds))

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_ds, batch_size = batch_size, shuffle = True,
                        num_workers = num_workers)
# Validation is iterated in a fixed order. The validation loss is averaged per
# batch, so shuffling would change which samples land in the (smaller) final
# batch and make the metric fluctuate between epochs for an identical model.
val_loader = DataLoader(val_ds, batch_size = batch_size, shuffle = False,
                        num_workers = num_workers)


train_ds size: 19626
val_ds size: 2180


In [25]:
# functions to show an image
def imshow(img):
    """Display a channel-first image tensor with matplotlib.

    The tensor is mapped back from the normalized range with ``img / 2 + 0.5``
    (the inverse of a mean-0.5 / std-0.5 normalization), converted to NumPy and
    transposed from (C, H, W) to (H, W, C) before plotting.

    Args:
        img: torch tensor laid out as (C, H, W). It may live on any device and
            may require grad; it is detached and copied to the CPU first,
            because ``Tensor.numpy()`` only works on CPU tensors that do not
            require grad.
    """
    img = img.detach().cpu() / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    # matplotlib expects channels last.
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# get some random training images
# dataiter = iter(trainloader)
# images, labels = next(dataiter)
# images, labels = next(iter(train_loader))

In [26]:
# print(len(images[0]))
# print(images[0].shape)
# print(images[0].dtype)

In [27]:
# show images
# imshow(torchvision.utils.make_grid(images))
# print labels
# print(' '.join(f'{labels[j]}' for j in range(batch_size)))

In [28]:
# Pick the compute device: GPU index 3 when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:3')
else:
    device = torch.device('cpu')
print(f"Computation device: {device}\n")

Computation device: cuda:3



In [9]:
class Net(nn.Module):
    """Small LeNet-style CNN producing one logit per class.

    Architecture: conv(5x5, 6) -> ReLU -> maxpool(2) -> conv(5x5, 16) -> ReLU
    -> maxpool(2) -> flatten -> fc(120) -> ReLU -> fc(84) -> ReLU -> fc(n_class).
    The output is raw logits (no sigmoid/softmax is applied here).

    Args:
        in_channels: number of input image channels. ``None`` (default) reads
            the notebook-level ``input_ch`` setting at construction time.
        n_class: number of output logits. Defaults to 19.
        input_size: height/width of the (square) input images. Defaults to
            2048, which gives the original ``16 * 509 * 509`` flattened size.
            At that size ``fc1`` alone holds 4,145,296 * 120 (about 497M)
            weights, so a smaller ``input_size`` shrinks the model a lot.

    Raises:
        ValueError: if ``input_size`` is too small to survive both
            conv/pool stages.
    """

    def __init__(self, in_channels=None, n_class=19, input_size=2048):
        super().__init__()
        if in_channels is None:
            in_channels = input_ch  # notebook-level configuration value
        self.conv1 = nn.Conv2d(in_channels, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        side = self._feature_side(input_size)
        self.fc1 = nn.Linear(16 * side * side, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, n_class)

    @staticmethod
    def _feature_side(input_size):
        """Return the spatial side length left after both conv+pool stages."""
        side = input_size
        for _ in range(2):
            # An unpadded 5x5 convolution removes 4 pixels; 2x2 pooling halves (floor).
            side = (side - 4) // 2
        if side < 1:
            raise ValueError(
                f"input_size={input_size} is too small for two 5x5 conv + 2x2 pool stages"
            )
        return side

    def forward(self, x):
        """Map a batch of images (N, C, H, W) to logits of shape (N, n_class)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Build the network with its default settings, then move its parameters to
# the selected compute device.
# NOTE(review): the saved output of this cell shows a DataParallel wrapper and
# a "Using GPUs" line that this code does not produce — the output looks stale;
# re-run the cell to refresh it.
model = Net()
model = model.to(device)

print(model)

Using GPUs: [3, 0, 1, 2]
DataParallel(
  (module): Net(
    (conv1): Conv2d(4, 6, kernel_size=(5, 5), stride=(1, 1))
    (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (fc1): Linear(in_features=4145296, out_features=120, bias=True)
    (fc2): Linear(in_features=120, out_features=84, bias=True)
    (fc3): Linear(in_features=84, out_features=19, bias=True)
  )
)


In [10]:
# Loss: binary cross-entropy computed on raw logits.
# (alternative kept for reference: criterion = nn.CrossEntropyLoss())
criterion = nn.BCEWithLogitsLoss()

# Optimizer: Adam over every model parameter, using the notebook-level
# learning rate and weight decay.
# (alternative kept for reference: optim.SGD(model.parameters(), lr=lr, momentum=0.9))
optimizer = optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=weight_decay,
)

# Cosine-annealed learning rate with a period of `max_epochs` scheduler steps.
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=max_epochs,
)

In [11]:
# Training / validation driver.
#
# Each epoch: one pass over `train_loader` with gradient updates, one LR
# scheduler step, then (every `val_interval` epochs) one pass over
# `val_loader` without gradients. Mean losses go to TensorBoard under
# 'Loss/train' and 'Loss/valid'. The model weights are saved to
# `checkpoint_dir/best_checkpoint.pth` whenever the validation loss reaches a
# new minimum.
best_metric_epoch = -1          # 1-based epoch of the best checkpoint; -1 until one is saved
epoch_loss_values = []          # mean training loss of every finished epoch
min_val_loss = float("inf")     # lowest validation loss seen so far

for epoch in range(max_epochs):  # loop over the dataset multiple times
    epoch_start = time.time()
    print("-" * 10)
    print(f"epoch {epoch + 1}/{max_epochs}")

    # Validation switches the model to eval mode; switch back before training.
    model.train()
    epoch_loss = 0.0
    step = 0
    for inputs, labels in tqdm(train_loader):
        step_start = time.time()
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        step += 1
        batch_loss = loss.item()
        epoch_loss += batch_loss
        print(
            f"{step}/{len(train_loader)}"
            f", train_loss: {batch_loss:.4f}"
            f", step time: {(time.time() - step_start):.4f}"
        )

    lr_scheduler.step()
    epoch_loss /= max(step, 1)  # guard against an empty loader
    epoch_loss_values.append(epoch_loss)
    print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}")
    print(f"epoch {epoch + 1} training time: {(time.time() - epoch_start):.1f}s")

    writer.add_scalar('Loss/train', epoch_loss, epoch+1)

    if (epoch + 1) % val_interval == 0:  # validate every `val_interval` epochs
        epoch_val_loss = 0.0
        val_step = 0
        model.eval()
        with torch.no_grad():
            for val_inputs, val_labels in val_loader:
                val_inputs = val_inputs.to(device)
                val_labels = val_labels.to(device)
                val_outputs = model(val_inputs)
                val_loss = criterion(val_outputs, val_labels)
                epoch_val_loss += val_loss.item()
                val_step += 1

        if val_step == 0:
            # Nothing was evaluated; a loss of 0.0 here would be meaningless
            # and must not be recorded as the best result.
            print("Validation loader is empty; skipping validation")
            continue

        epoch_val_loss /= val_step
        writer.add_scalar('Loss/valid', epoch_val_loss, epoch+1)

        if epoch_val_loss < min_val_loss:
            print(f'Validation Loss Decreased({min_val_loss:.6f}--->{epoch_val_loss:.6f}) \t Saving The Model')
            min_val_loss = epoch_val_loss
            best_metric_epoch = epoch + 1
            print(f'Best Metric Epoch: {best_metric_epoch}')
            # Saving State Dict
            torch.save(model.state_dict(), checkpoint_dir.joinpath('best_checkpoint.pth'))
        else:
            # Keep `min_val_loss` untouched so it remains the true minimum.
            print(
                f'Validation Loss {epoch_val_loss:.6f} did not improve on '
                f'{min_val_loss:.6f} (epoch {best_metric_epoch})'
            )

# Make sure every logged scalar reaches disk.
writer.flush()
print('Finished Training')

----------
epoch 1/100


1it [01:02, 62.84s/it]

0/153, train_loss: 0.6970, step time: 21.6264


1it [01:07, 67.61s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB (GPU 3; 31.75 GiB total capacity; 24.12 GiB already allocated; 1.42 GiB free; 25.65 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF