In [1]:
!pip install -q segmentation_models_pytorch
!pip install -qU wandb

In [2]:
!git clone https://github.com/labmlai/annotated_deep_learning_paper_implementations

Cloning into 'annotated_deep_learning_paper_implementations'...
remote: Enumerating objects: 15544, done.[K
remote: Counting objects: 100% (3235/3235), done.[K
remote: Compressing objects: 100% (364/364), done.[K
remote: Total 15544 (delta 2919), reused 3092 (delta 2864), pack-reused 12309[K
Receiving objects: 100% (15544/15544), 146.87 MiB | 18.75 MiB/s, done.
Resolving deltas: 100% (10415/10415), done.


In [3]:
from annotated_deep_learning_paper_implementations.labml_nn.unet import UNet

In [4]:
import numpy as np
import pandas as pd
pd.options.plotting.backend = "plotly"
import random
from glob import glob
import os, shutil
from tqdm import tqdm
tqdm.pandas()
import time
import copy
from matplotlib.patches import Rectangle
# import joblib
from collections import defaultdict
import gc
# from IPython import display as ipd

# visualization
import cv2
import matplotlib.pyplot as plt

# Sklearn
from sklearn.model_selection import StratifiedGroupKFold

# PyTorch 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A

import rasterio
from joblib import Parallel, delayed

# For colored terminal text
from colorama import Fore, Back, Style
c_  = Fore.GREEN
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import wandb
import segmentation_models_pytorch as smp




In [5]:
api_key = "5793e9c9290a2733cb520d9e4f5fa0dfa4e6284f" #my api
wandb.login(key=api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
class CFG:
    seed          = 101
    debug         = False # set debug=False for Full Training
    model_name    = 'Unet'
    backbone      = 'efficientnet-b1'
    train_bs      = 64
    valid_bs      = train_bs
    img_size      = [224, 224]
    epochs        = 10
    lr            = 2e-3
    scheduler     = 'CosineAnnealingLR'
    min_lr        = 1e-6
    T_max         = int(30000/train_bs*epochs)+50
    T_0           = 25
    warmup_epochs = 0
    wd            = 1e-6
    n_accumulate  = max(1, 32//train_bs)
    n_fold        = 5
    num_classes   = 3
    fold          = 0
    device        = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
np.random.seed(CFG.seed)
random.seed(CFG.seed)
torch.manual_seed(CFG.seed)
torch.cuda.manual_seed(CFG.seed)
# When running on the CuDNN backend, two further options must be set
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Set a fixed value for the hash seed
os.environ['PYTHONHASHSEED'] = str(CFG.seed)

In [8]:
image_base_path = "/kaggle/input/uw-madison-gi-tract-image-segmentation"
mask_base_path  = "/kaggle/input/uwmgi-mask-dataset"
df = pd.read_csv(mask_base_path + "/train.csv")
df["segmentation"] = df["segmentation"].notna() 
df["mask_path"] = df["mask_path"].str.replace("/png/", "/np").str.replace(".png", ".npy")
df = df.groupby(["id"]).head(1).reset_index(drop = True)
segmented, unsegmented = df['segmentation'].value_counts()
print(f"There are {segmented} segmented slices")
print(f"There are {unsegmented} unsegmented slices")

There are 24411 segmented slices
There are 14085 unsegmented slices


In [9]:
def load_img(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    img = img.astype("float32")
    img = img / np.max(img)
    return img

def load_msk(mask_path_npy):
    mask = np.load(mask_path_npy)
    mask = mask.astype("float32")
    mask = mask / 255.0
    return mask

In [10]:
skf = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['segmentation'], groups = df["case"])):
    df.loc[val_idx, 'fold'] = fold
display(df.groupby(['fold','segmentation'])['id'].count())

fold  segmentation
0.0   False           4577
      True            2655
1.0   False           5225
      True            3303
2.0   False           5161
      True            2775
3.0   False           4841
      True            2503
4.0   False           4607
      True            2849
Name: id, dtype: int64

In [11]:
data_transforms = {
    "train": A.Compose([
        A.Resize(*CFG.img_size, interpolation=cv2.INTER_NEAREST),
        A.HorizontalFlip(p=0.5),
#         A.VerticalFlip(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.05, rotate_limit=10, p=0.5),
        A.OneOf([
            A.GridDistortion(num_steps=5, distort_limit=0.05, p=1.0),
# #             A.OpticalDistortion(distort_limit=0.05, shift_limit=0.05, p=1.0),
            A.ElasticTransform(alpha=1, sigma=50, alpha_affine=50, p=1.0)
        ], p=0.25),
        A.CoarseDropout(max_holes=8, max_height=CFG.img_size[0]//20, max_width=CFG.img_size[1]//20,
                         min_holes=5, fill_value=0, mask_fill_value=0, p=0.5),
        ], p=1.0),
    
    "valid": A.Compose([
        A.Resize(*CFG.img_size, interpolation=cv2.INTER_NEAREST),
        ], p=1.0)
}

In [12]:
class BuildDataset(torch.utils.data.Dataset):
    def __init__(self, df, label=True, transforms=None):
        self.df         = df
        self.label      = label
        self.img_paths  = df['image_path'].tolist()
        self.msk_paths  = df['mask_path'].tolist()
        self.transforms = transforms
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        img_path  = self.img_paths[index]
        img = []
        img = load_img(img_path)
        
        if self.label:
            msk_path = self.msk_paths[index]
            msk = load_msk(msk_path)
            if self.transforms:
                data = self.transforms(image=img, mask=msk)
                img  = data['image']
                msk  = data['mask']
            img = np.transpose(img, (2, 0, 1))
            msk = np.transpose(msk, (2, 0, 1))
            return torch.tensor(img), torch.tensor(msk)
        else:
            if self.transforms:
                data = self.transforms(image=img)
                img  = data['image']
            img = np.transpose(img, (2, 0, 1))
            return torch.tensor(img)

In [13]:
def prepare_loaders(fold, debug=False):
    train_df = df.query("fold!=@fold").reset_index(drop=True)
    valid_df = df.query("fold==@fold").reset_index(drop=True)
    if debug:
        train_df = train_df.head(32*5).query("segmentation==True")
        valid_df = valid_df.head(32*3).query("segmentation==True")
    train_dataset = BuildDataset(train_df, transforms=data_transforms['train'])
    valid_dataset = BuildDataset(valid_df, transforms=data_transforms['valid'])

    train_loader = DataLoader(train_dataset, batch_size=CFG.train_bs if not debug else 20, 
                              num_workers=4, shuffle=True, pin_memory=True, drop_last=False)
    valid_loader = DataLoader(valid_dataset, batch_size=CFG.valid_bs if not debug else 20, 
                              num_workers=4, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader


In [14]:
train_loader, valid_loader = prepare_loaders(fold=0, debug=True)

In [15]:
imgs, msks = next(iter(train_loader))
imgs.size(), msks.size()

(torch.Size([20, 3, 224, 224]), torch.Size([20, 3, 224, 224]))

In [16]:
def criterion(y_pred, y_true):
    BCELoss     = smp.losses.SoftBCEWithLogitsLoss()
    return BCELoss(y_pred, y_true)

In [17]:
def dice_coef(y_true, y_pred, thr=0.5, dim=(2,3), epsilon=0.001):
    y_true = y_true.to(torch.float32)
    y_pred = (y_pred>thr).to(torch.float32)
    inter = (y_true*y_pred).sum(dim=dim)
    den = y_true.sum(dim=dim) + y_pred.sum(dim=dim)
    dice = ((2*inter+epsilon)/(den+epsilon)).mean(dim=(1,0))
    return dice

def iou_coef(y_true, y_pred, thr=0.5, dim=(2,3), epsilon=0.001):
    y_true = y_true.to(torch.float32)
    y_pred = (y_pred>thr).to(torch.float32)
    inter = (y_true*y_pred).sum(dim=dim)
    union = (y_true + y_pred - y_true*y_pred).sum(dim=dim)
    iou = ((inter+epsilon)/(union+epsilon)).mean(dim=(1,0))
    return iou

In [18]:
gc.collect()

267

In [19]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    torch.cuda.empty_cache()
    scaler = amp.GradScaler()
    '''
    If the forward pass for a particular op has float16 inputs, the backward pass for that op will
    produce float16 gradients. Gradient values with small magnitudes may not be representable in float16. 
    These values will flush to zero (“underflow”), so the update for the corresponding parameters will be
    lost. 
    To prevent underflow, “gradient scaling” multiplies the network’s loss(es) by a scale factor and invokes
    a backward pass on the scaled loss(es). Gradients flowing backward through the network are then scaled by
    the same factor. In other words, gradient values have a larger magnitude, so they don’t flush to zero.
    '''
    
    dataset_size = 0
    running_loss = 0.0
    
    pbar = tqdm(enumerate(dataloader), total=len(dataloader), desc='Train ')
    for step, (images, masks) in pbar:         
        images = images.to(device, dtype=torch.float)
        masks  = masks.to(device, dtype=torch.float)
        
        batch_size = images.size(0)
                # Enables autocasting for the forward pass (model + loss)
        with amp.autocast(enabled=True):
            y_pred = model(images)
            loss   = criterion(y_pred, masks)
            loss   = loss / CFG.n_accumulate
        # Exits the context manager before backward() 
        scaler.scale(loss).backward()
    
        if (step + 1) % CFG.n_accumulate == 0:
            '''
            All optimizers implement a step() method, that updates the parameters.This is a simplified version supported by most optimizers. 
            The function can be called once the gradients are computed using e.g. backward().
            '''
            scaler.step(optimizer)
            '''
            update()
            Updates the scale factor.
            If any optimizer steps were skipped the scale is multiplied by backoff_factor to reduce it.
            If growth_interval unskipped iterations occurred consecutively, the scale is multiplied by growth_factor
            to increase it.
            Passing new_scale sets the new scale value manually. (new_scale is not used directly, it’s used to
            fill GradScaler’s internal scale tensor. So if new_scale was a tensor, later in-place changes to that
            tensor will not further affect the scale GradScaler uses internally.)
            '''
            scaler.update()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()
                
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        '''  
        torch.cuda.memory_reserved()
        Returns the current GPU memory managed by the caching allocator in bytes for a given device.
        '''       
        epoch_loss = running_loss / dataset_size
        
        mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(train_loss=f'{epoch_loss:0.4f}',
                        lr=f'{current_lr:0.5f}',
                        gpu_mem=f'{mem:0.2f} GB')
    '''
    torch.cuda.empty_cache()
    Releases all unoccupied cached memory currently held by the caching allocator so that those
    can be used in other GPU application and visible in nvidia-smi.
    '''
    torch.cuda.empty_cache()
    gc.collect()
    
    return epoch_loss

In [20]:
@torch.no_grad()

def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    val_scores = []
    
    pbar = tqdm(enumerate(dataloader), total=len(dataloader), desc='Valid ')
    for step, (images, masks) in pbar:        
        images  = images.to(device, dtype=torch.float)
        masks   = masks.to(device, dtype=torch.float)
        
        batch_size = images.size(0)
        
        y_pred  = model(images)
        loss    = criterion(y_pred, masks)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        y_pred = nn.Sigmoid()(y_pred)
        val_dice = dice_coef(masks, y_pred).cpu().detach().numpy()
        val_iou = iou_coef(masks, y_pred).cpu().detach().numpy()
        val_scores.append([val_dice, val_iou])
        
        mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0
        current_lr = optimizer.param_groups[0]['lr']
        pbar.set_postfix(valid_loss=f'{epoch_loss:0.4f}',
                        lr=f'{current_lr:0.5f}',
                        gpu_memory=f'{mem:0.2f} GB')
    val_scores  = np.mean(val_scores, axis=0)
    torch.cuda.empty_cache()
    gc.collect()
    
    return epoch_loss, val_scores

In [21]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    wandb.watch(model, log_freq=100)
    if torch.cuda.is_available():
        print("cuda: {}\n".format(torch.cuda.get_device_name()))
    
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_dice      = -np.inf
    best_epoch     = -1
    history = defaultdict(list)
    
    for epoch in range(1, num_epochs + 1): 
        gc.collect()
        print(f'Epoch {epoch}/{num_epochs}', end='')
        train_loss = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=CFG.device, epoch=epoch)
        
        val_loss, val_scores = valid_one_epoch(model, valid_loader, 
                                                 device=CFG.device, 
                                                 epoch=epoch)
        val_dice, val_iou = val_scores
    
        history['Train Loss'].append(train_loss)
        history['Valid Loss'].append(val_loss)
        history['Valid Dice'].append(val_dice)
        history['Valid IOU'].append(val_iou)
        
        # Log the metrics
        wandb.log({"Train Loss": train_loss, 
                   "Valid Loss": val_loss,
                   "Valid Dice": val_dice,
                   "Valid IOU": val_iou,
                   "LR":scheduler.get_last_lr()[0]})
        
        print(f'Valid Dice: {val_dice:0.4f} | Valid IOU: {val_iou:0.4f}')
        
        # deep copy the model
        if val_dice >= best_dice:
            best_dice = val_dice
            best_iou = val_iou
            best_epoch = epoch
            run.summary["Best Dice"]    = best_dice
            run.summary["Best IOU"] = best_iou
            run.summary["Best Epoch"]   = best_epoch
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), f"best_epoch-{fold:02d}.bin")
            # Save a model file from the current directory
#             wandb.save( f"best_epoch-{fold:02d}.bin")
            print(f"Model Saved{sr_}")
        last_model_wts = copy.deepcopy(model.state_dict())
        torch.save(model.state_dict(), f"last_epoch-{fold:02d}.bin")
    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Score: {:.4f}".format(max(best_iou, best_dice)))
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model, history

In [22]:
fold = 4
run = wandb.init(project="orginal UNET", 
                 config={k:v for k, v in dict(vars(CFG)).items() if '__' not in k},
                 name=f"fold-{fold}|dim-{CFG.img_size[0]}x{CFG.img_size[1]} | orginal-unet | official"
                )
train_loader, valid_loader = prepare_loaders(fold=fold, debug= CFG.debug)
model = UNet(3, 3).to(CFG.device)
optimizer = optim.Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.wd)
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer,T_0=CFG.T_0, eta_min=CFG.min_lr)
model, history = run_training(model, optimizer, scheduler,
                              device=CFG.device,
                              num_epochs=CFG.epochs)
run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mlamthanhdo45[0m ([33mlamlam452002[0m). Use [1m`wandb login --relogin`[0m to force relogin


cuda: Tesla P100-PCIE-16GB

Epoch 1/10

Train : 100%|██████████| 485/485 [10:06<00:00,  1.25s/it, gpu_mem=15.16 GB, lr=0.00131, train_loss=0.1463]
Valid : 100%|██████████| 117/117 [01:05<00:00,  1.79it/s, gpu_memory=12.58 GB, lr=0.00131, valid_loss=0.0252]


Valid Dice: 0.6990 | Valid IOU: 0.6990
Model Saved[0m
Epoch 2/10

Train : 100%|██████████| 485/485 [09:57<00:00,  1.23s/it, gpu_mem=15.64 GB, lr=0.00019, train_loss=0.0212]
Valid : 100%|██████████| 117/117 [01:04<00:00,  1.83it/s, gpu_memory=12.54 GB, lr=0.00019, valid_loss=0.0227]


Valid Dice: 0.6883 | Valid IOU: 0.6874
Epoch 3/10

Train : 100%|██████████| 485/485 [09:57<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00181, train_loss=0.0194]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.84it/s, gpu_memory=12.45 GB, lr=0.00181, valid_loss=0.0224]


Valid Dice: 0.6977 | Valid IOU: 0.6977
Epoch 4/10

Train : 100%|██████████| 485/485 [09:57<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00069, train_loss=0.0180]
Valid : 100%|██████████| 117/117 [01:04<00:00,  1.83it/s, gpu_memory=12.44 GB, lr=0.00069, valid_loss=0.0202]


Valid Dice: 0.6811 | Valid IOU: 0.6800
Epoch 5/10

Train : 100%|██████████| 485/485 [09:58<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00200, train_loss=0.0170]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.83it/s, gpu_memory=12.45 GB, lr=0.00200, valid_loss=0.0185]


Valid Dice: 0.6863 | Valid IOU: 0.6722
Epoch 6/10

Train : 100%|██████████| 485/485 [09:57<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00131, train_loss=0.0152]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.84it/s, gpu_memory=12.44 GB, lr=0.00131, valid_loss=0.0187]


Valid Dice: 0.6636 | Valid IOU: 0.6342
Epoch 7/10

Train : 100%|██████████| 485/485 [09:56<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00019, train_loss=0.0132]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.83it/s, gpu_memory=12.45 GB, lr=0.00019, valid_loss=0.0143]


Valid Dice: 0.7464 | Valid IOU: 0.7147
Model Saved[0m
Epoch 8/10

Train : 100%|██████████| 485/485 [09:56<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00181, train_loss=0.0118]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.83it/s, gpu_memory=12.44 GB, lr=0.00181, valid_loss=0.0130]


Valid Dice: 0.7551 | Valid IOU: 0.7182
Model Saved[0m
Epoch 9/10

Train : 100%|██████████| 485/485 [09:56<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00069, train_loss=0.0108]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.84it/s, gpu_memory=12.45 GB, lr=0.00069, valid_loss=0.0120]


Valid Dice: 0.7970 | Valid IOU: 0.7625
Model Saved[0m
Epoch 10/10

Train : 100%|██████████| 485/485 [09:55<00:00,  1.23s/it, gpu_mem=15.54 GB, lr=0.00200, train_loss=0.0099]
Valid : 100%|██████████| 117/117 [01:03<00:00,  1.83it/s, gpu_memory=12.45 GB, lr=0.00200, valid_loss=0.0106]


Valid Dice: 0.8084 | Valid IOU: 0.7738
Model Saved[0m
Training complete in 1h 50m 34s
Best Score: 0.8084


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
LR,▅▁▇▃█▅▁▇▃█
Train Loss,█▂▁▁▁▁▁▁▁▁
Valid Dice,▃▂▃▂▂▁▅▅▇█
Valid IOU,▄▄▄▃▃▁▅▅▇█
Valid Loss,█▇▇▆▅▅▃▂▂▁

0,1
Best Dice,0.80844
Best Epoch,10.0
Best IOU,0.77379
LR,0.002
Train Loss,0.00988
Valid Dice,0.80844
Valid IOU,0.77379
Valid Loss,0.01057


In [23]:
!rm -r ./wandb