In [2]:
import numpy as np
import os
import sys
import gc
import pandas as pd
from PIL import Image

Image.MAX_IMAGE_PIXELS = 10000000000  # Ignore PIL warnings about large images
from tqdm import tqdm
from typing import List, Tuple
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
import random
import torch
from torch import nn
import cv2
from torch import Tensor
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import traceback
import albumentations as A
from albumentations.pytorch import ToTensorV2
from time import time
import wandb
from albumentations import ImageOnlyTransform

import segmentation_models_pytorch as smp

In [3]:
class CFG:
    seed = 1337
    comp_name = 'vesuvius'
    mode = "train"  # 'test'  # "train"

    # ============== model cfg =============
    model_name = 'Unet'
    backbone = 'efficientnet-b0'  # 'se_resnext50_32x4d'
    model_to_load = None  # '../model_checkpoints/vesuvius_notebook_clone_exp_holdout_3/models/Unet-zdim_6-epochs_30-step_15000-validId_3-epoch_9-dice_0.5195_dict.pt'
    target_size = 1
    in_chans = 4  # 8  # 6
    pretrained = True
    inf_weight = 'best'

    # ============== training cfg =============
    epochs = 50  # 15 # 30
    train_steps = 15000
    size = 224  # Size to shrink image to
    tile_size = 224
    stride = tile_size // 2

    train_batch_size = 1
    valid_batch_size = train_batch_size * 2
    valid_id = 4
    use_amp = True

    scheduler = 'GradualWarmupSchedulerV2'  # 'CosineAnnealingLR'
    min_lr = 1e-6
    weight_decay = 1e-6
    max_grad_norm = 1000
    num_workers = 0

    # objective_cv = 'binary'  # 'binary', 'multiclass', 'regression'
    metric_direction = 'maximize'  # maximize, 'minimize'
    # metrics = 'dice_coef'

    # adamW warmup
    warmup_factor = 10
    lr = 1e-4 / warmup_factor

    # ============== Experiment cfg =============
    # ToDO consolidate these names into one
    # exp_name = f'vesuvius_notebook_clone_exp_holdout_{valid_id}'
    EXPERIMENT_NAME = f"{model_name}-zdim_{in_chans}-epochs_{epochs}-validId_{valid_id}"

    # ============== Inference cfg =============
    THRESHOLD = 0.3  # .52 score had a different value of .25

    # ============== set dataset paths =============
    comp_dir_path = '../'
    comp_dataset_path = comp_dir_path + 'data/'
    outputs_path = comp_dir_path + f'model_checkpoints/{EXPERIMENT_NAME}/'

    submission_dir = outputs_path + 'submissions/'
    submission_path = submission_dir + f'submission_{EXPERIMENT_NAME}.csv'
    model_dir = outputs_path + 'models/'
    figures_dir = outputs_path + 'figures/'

    # ============== Augmentation =============
    train_aug_list = [
        # A.RandomResizedCrop(
        #     size, size, scale=(0.85, 1.0)),
        A.Resize(size, size),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.75),
        A.ShiftScaleRotate(p=0.75),
        A.OneOf([
            A.GaussNoise(var_limit=[10, 50]),
            A.GaussianBlur(),
            A.MotionBlur(),
        ], p=0.4),
        A.GridDistortion(num_steps=5, distort_limit=0.3, p=0.5),
        A.CoarseDropout(max_holes=1, max_width=int(size * 0.3), max_height=int(size * 0.3),
                        mask_fill_value=0, p=0.5),
        # A.Cutout(max_h_size=int(size * 0.6),
        #          max_w_size=int(size * 0.6), num_holes=1, p=1.0),
        A.Normalize(
            mean=[0] * in_chans,
            std=[1] * in_chans
        ),
        ToTensorV2(transpose_mask=True),
    ]

    valid_aug_list = [
        A.Resize(size, size),
        A.Normalize(
            mean=[0] * in_chans,
            std=[1] * in_chans
        ),
        ToTensorV2(transpose_mask=True),
    ]

# Setup Data

In [4]:
def read_image_and_labels(fragment_id: str, is_train: bool = True, mode: str = "train"):
    images = []

    mid = 65 // 2
    start = mid - CFG.in_chans // 2
    end = mid + CFG.in_chans // 2
    idxs = range(start, end)

    for i in tqdm(idxs):
        image = cv2.imread(CFG.comp_dataset_path + f"{mode}/{fragment_id}/surface_volume/{i:02}.tif", 0)

        pad0 = (CFG.tile_size - image.shape[0] % CFG.tile_size)
        pad1 = (CFG.tile_size - image.shape[1] % CFG.tile_size)

        image = np.pad(image, [(0, pad0), (0, pad1)], constant_values=0)

        images.append(image)
    images = np.stack(images, axis=2)  # Shape: (8288, 6496, 6)

    print(f"Length of image stack: {images.size}")
    if is_train:
        labels = cv2.imread(CFG.comp_dataset_path + f"train/{fragment_id}/inklabels.png", 0)
        labels = np.pad(labels, [(0, pad0), (0, pad1)], constant_values=0)

        labels = labels.astype('float32')
        labels /= 255.0  # Normalizing?
    else:
        labels = None

    return images, labels

In [5]:
def get_train_valid_dataset():
    train_images = []
    train_labels = []

    valid_images = []
    valid_labels = []
    valid_xyxys = []

    for frag_id in range(3, 5):
        print(f"Load images for fragment: {frag_id}")
        image, label = read_image_and_labels(frag_id)

        x1_list = list(range(0, image.shape[1]-CFG.tile_size+1, CFG.stride))
        y1_list = list(range(0, image.shape[0]-CFG.tile_size+1, CFG.stride))

        for y1 in y1_list:
            y2 = y1 + CFG.tile_size
            for x1 in x1_list:
                x2 = x1 + CFG.tile_size

                if frag_id == CFG.valid_id:
                    valid_images.append(image[y1:y2, x1:x2])
                    valid_labels.append(label[y1:y2, x1:x2, None])

                    valid_xyxys.append([x1, y1, x2, y2])
                else:
                    train_images.append(image[y1:y2, x1:x2])
                    train_labels.append(label[y1:y2, x1:x2, None])

    return train_images, train_labels, valid_images, valid_labels, valid_xyxys

def get_transforms(data, cfg):
    return A.Compose(cfg.train_aug_list) if data == 'train' else A.Compose(cfg.valid_aug_list)

## Create data object

In [6]:
from Scripts.segmentation_model import ImageDataset

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)
train_images, train_masks, valid_images, valid_masks, valid_xyxys = get_train_valid_dataset()
valid_xyxys = np.stack(valid_xyxys)

train_dataset = ImageDataset(train_images, labels=train_masks, transform=get_transforms(data='train', cfg=CFG))
train_loader = DataLoader(train_dataset,
                          batch_size=CFG.train_batch_size,
                          shuffle=True,
                          num_workers=CFG.num_workers, pin_memory=True, drop_last=True,
                          )
for images, labels in train_loader:
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)
    break

  0%|          | 0/4 [00:00<?, ?it/s]

cuda
Load images for fragment: 3


100%|██████████| 4/4 [00:03<00:00,  1.08it/s]
  0%|          | 0/4 [00:00<?, ?it/s]

Length of image stack: 163774464
Load images for fragment: 4


100%|██████████| 4/4 [00:03<00:00,  1.18it/s]


Length of image stack: 293429248


In [11]:
print(images[:, :3, :, :].shape)
print(images.shape)

torch.Size([1, 3, 224, 224])
torch.Size([1, 4, 224, 224])


# Define Your Model

In [53]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, height, width):
        super(PositionalEncoding, self).__init__()
        self.height = height
        self.width = width
        self.pe = nn.Parameter(torch.zeros(1, d_model, height, width))

    def forward(self, x):
        x = x + self.pe[:, :, :x.size(2), :x.size(3)]
        return x


class TransformerEncoder(nn.Module):
    def __init__(self, d_model, num_heads, dim_feedforward, dropout):
        super(TransformerEncoder, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(d_model, num_heads, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)

    def forward(self, x):
        x = x.permute(2, 0, 1, 3)  # Reshape input tensor [batch_size, channels, height, width] to [height, batch_size, channels, width]
        x = x.flatten(2)  # Flatten the height and channel dimensions
        x = x.permute(1, 0, 2)  # Reshape back to [batch_size, sequence_length, channels]
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)  # Reshape back to [batch_size, sequence_length, channels]
        return x


class FCTransformer(nn.Module):
    def __init__(self, d_model=301056, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1, num_classes=2):
        super(FCTransformer, self).__init__()
        self.embedding_dim = d_model
        self.pos_encoder = PositionalEncoding(d_model=d_model, height=224, width=224)
        self.transformer_encoder = TransformerEncoder(d_model, nhead, dim_feedforward, dropout)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.fc(x)
        return x


class FCTSegmentationModel(nn.Module):
    def __init__(self, in_channels, num_classes):
        super(FCTSegmentationModel, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
        )
        # self.transformer = TransformerEncoder(d_model=64, num_heads=8, dim_feedforward=256, dropout=0.1)
        self.transformer = FCTransformer()
        self.decoder = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, num_classes, kernel_size=1, stride=1),
        )
        self.positional_encoding = PositionalEncoding(d_model=64, height=224, width=224)

    def forward(self, x):
        print("before encoder")
        print(x.shape)
        x = self.encoder(x)
        print("before positional encoder")
        print(x.shape)
        x = self.positional_encoding(x)
        print("before transformer")
        print(x.shape)
        x = self.transformer(x)
        print("before decoder")
        print(x.shape)
        x = self.decoder(x)
        print("Returning")
        print(x.shape)
        return x

# Begin your testing

In [55]:
model = FCTSegmentationModel(in_channels=4, num_classes=2)
# inputs = torch.randn(1, 3, 256, 256)  # Example input tensor
model.to(DEVICE)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of model params is: {num_params:,}")
outputs = model(images)  # [:, :3, :, :]

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 60423143424 bytes.

In [16]:
patch_embed = nn.Conv2d(4, 256, kernel_size=16, stride=16)
patch_embed(images)

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [21]:
conv = nn.Conv2d(4, 8, kernel_size=8, stride=8, padding=0).to(DEVICE)
r = conv(images)
r.shape

torch.Size([1, 8, 28, 28])

In [None]:
voxel = torch.randn(1, 8, 224, 224)
vox_conv = nn.Conv2d(in_channels=8, out_channels=1, kernel_size=8, stride=8, padding=0)
vox_conv(voxel).shape

In [None]:
voxel = torch.randn(1, 8, 224, 224)
vox_conv = nn.Conv2d(in_channels=8, out_channels=1, kernel_size=8, stride=8, padding=0)
vox_conv(voxel).shape

---

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class FCTBlock(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super(FCTBlock, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        attn_output, _ = self.self_attn(x, x, x)
        x2 = x + self.dropout1(attn_output)
        x = self.norm1(x2)
        x2 = self.linear2(self.dropout(F.relu(self.linear1(x))))
        x = x + self.dropout2(x2)
        x = self.norm2(x)
        return x


class FCTSegmentationModel(nn.Module):
    def __init__(self, in_channels, d_model, nhead, num_layers, dim_feedforward, dropout, num_classes):
        super(FCTSegmentationModel, self).__init__()
        self.embedding_dim = d_model
        self.conv = nn.Conv2d(in_channels, d_model, kernel_size=3, stride=1, padding=1)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.ModuleList([
            FCTBlock(d_model, nhead, dim_feedforward, dropout=dropout) for _ in range(num_layers)
        ])
        self.fc = nn.Conv2d(d_model, num_classes, kernel_size=1)

    def forward(self, x):
        x = self.conv(x)
        x = x.flatten(2).transpose(1, 2)
        x = self.pos_encoder(x)
        for transformer_block in self.transformer:
            x = transformer_block(x)
        x = x.transpose(1, 2).reshape(x.shape[0], self.embedding_dim, x.shape[3], x.shape[2])
        x = self.fc(x)
        return x


# Usage example
model = FCTSegmentationModel(in_channels=4, d_model=256, nhead=4, num_layers=6, dim_feedforward=1024, dropout=0.1, num_classes=1).to(DEVICE)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of model params is: {num_params:,}")

Number of model params is: 4,748,289


In [9]:
model(images)

OutOfMemoryError: CUDA out of memory. Tried to allocate 196.00 MiB (GPU 0; 6.00 GiB total capacity; 5.14 GiB already allocated; 0 bytes free; 5.15 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()