# Severstal Steel Defect Detection

In [0]:
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from models import ModelBuilder, SegmentationModule

In [0]:
# --- Hyper-parameters -------------------------------------------------------
seed = 42
batch_size = 4
num_epochs = 20
test_percent = 0.20
learning_rate = 3e-4
# Gradients are accumulated over `accumulation_steps` mini-batches before each
# optimizer step, so the effective batch size is accumulation_steps * batch_size.
accumulation_steps = 4

num_workers = 4

# --- Reproducibility --------------------------------------------------------
random.seed(seed)
np.random.seed(seed)
# Seed the default (CPU) generator as well; seeding only the CUDA generator
# leaves weight initialisation and DataLoader shuffling unseeded.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the "Load the data" cell passes data_folder='data' to provider()
# rather than this value, and SteelDataset appends "train_images" itself.
data_folder = 'data/train_images'

## RLE-Mask utility functions

In [0]:
#https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def mask2rle(img):
    '''
    Run-length encode a binary mask.

    img: numpy array, 1 -> mask, 0 -> background
    Returns the encoding as a space-separated string of "start length" pairs,
    with 1-based starts counted over the column-major (transposed) flattening.
    '''
    flat = img.T.flatten()
    # Pad with background on both ends so every run has a rising and falling edge.
    padded = np.concatenate([[0], flat, [0]])
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Edges alternate start, end; turn each end into a run length.
    edges[1::2] = edges[1::2] - edges[::2]
    return ' '.join(map(str, edges))

def make_mask(row_id, df, shape=(256, 1600)):
    '''Given a row index, return image_id and mask (H, W, 4) from the dataframe `df`.

    row_id: positional index of the row in `df`.
    df: dataframe indexed by image id whose first four columns hold the
        run-length encoding ("start length start length ...") of each class,
        or NaN when the class is absent.
    shape: (height, width) of the decoded mask; defaults to (256, 1600).

    Returns (fname, masks) where masks is float32 with one channel per class.

    Starts are decoded as 1-based offsets into the column-major flattening,
    which is the convention `mask2rle` in this file produces.
    '''
    row = df.iloc[row_id]
    fname = row.name
    labels = row.iloc[:4]
    height, width = shape
    masks = np.zeros((height, width, 4), dtype=np.float32)  # float32 is V.Imp
    # 4: class 1-4 (ch: 0-3)

    for idx, label in enumerate(labels.values):
        # Missing classes arrive as NaN (a float); only strings carry an encoding.
        if not isinstance(label, str) or not label.strip():
            continue
        tokens = label.split()
        starts = map(int, tokens[0::2])
        lengths = map(int, tokens[1::2])
        flat = np.zeros(height * width, dtype=np.uint8)
        for start, length in zip(starts, lengths):
            begin = start - 1  # RLE starts are 1-based
            flat[begin:begin + length] = 1
        masks[:, :, idx] = flat.reshape(height, width, order='F')
    return fname, masks

## Dataloader

In [0]:
class SteelDataset(Dataset):
    """Dataset yielding (image, mask) tensor pairs for steel defect segmentation.

    df: dataframe indexed by image file name; `make_mask` decodes its rows.
    data_folder: root directory; images are read from <data_folder>/train_images.
    mean, std: per-channel statistics handed to `transforms.Normalize`.
    phase: "train" enables random flips; any other value disables augmentation.
    """

    # Spatial size (height, width) every sample is resized to.
    output_size = (256, 256)

    def __init__(self, df, data_folder, mean, std, phase):
        self.df = df
        self.root = data_folder
        self.mean = mean
        self.std = std
        self.phase = phase
        self.fnames = self.df.index.tolist()

        # normalize
        self.normalize = transforms.Normalize(mean=mean, std=std)

    def __getitem__(self, idx):
        """Return (image, mask): float tensors of shape (3, H, W) and (4, H, W)."""
        # Imported here because the notebook never imports OpenCV at top level.
        import cv2

        image_id, mask = make_mask(idx, self.df)
        image_path = os.path.join(self.root, "train_images", image_id)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV loads BGR; the normalisation statistics are given in RGB order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return self.transform(self.phase, img, mask)

    def __len__(self):
        return len(self.fnames)

    def transform(self, phase, img, mask):
        """Convert an HWC image and HWC mask to resized, augmented CHW tensors.

        phase: "train" applies random horizontal/vertical flips.
        img: uint8 array (H, W, 3).
        mask: float array (H, W, C).

        `F` is `torch.nn.functional` in this file, so resizing uses
        `F.interpolate` on tensors instead of PIL-based torchvision ops.
        """
        # To tensor: HWC -> CHW, image scaled to [0, 1]
        img = torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float() / 255.0
        mask = torch.from_numpy(np.ascontiguousarray(mask)).permute(2, 0, 1).float()

        # Resize (interpolate expects a batch dimension)
        img = F.interpolate(
            img.unsqueeze(0), size=self.output_size, mode="bilinear", align_corners=False
        ).squeeze(0)
        # Nearest-neighbour keeps the mask strictly binary.
        mask = F.interpolate(mask.unsqueeze(0), size=self.output_size, mode="nearest").squeeze(0)

        if phase == 'train':
            # The former 256x256 random crop of an already 256x256 image was an
            # identity operation and has been dropped.

            # Random Horizontal Flipping (width is the last axis)
            if random.random() > 0.50:
                img = torch.flip(img, dims=[2])
                mask = torch.flip(mask, dims=[2])

            # Random Vertical Flipping (height is axis 1)
            if random.random() > 0.50:
                img = torch.flip(img, dims=[1])
                mask = torch.flip(mask, dims=[1])

        # Normalize
        img = self.normalize(img)

        return img, mask

def provider(data_folder, df_path, phase, mean=None, std=None, batch_size=8, num_workers=4):
    '''Returns dataloader for the model training.

    data_folder: root directory handed to `SteelDataset`.
    df_path: CSV with `ImageId_ClassId` and `EncodedPixels` columns.
    phase: "train" selects the training split; anything else the validation split.
    mean, std: normalisation statistics forwarded to `SteelDataset`.
    batch_size, num_workers: forwarded to `DataLoader`.

    The split uses the module-level `test_percent` and `seed`, stratified on
    the number of defect classes present per image.
    '''
    df = pd.read_csv(df_path)
    # https://www.kaggle.com/amanooo/defect-detection-starter-u-net
    # Split on the LAST underscore only, so image ids containing '_' stay intact.
    id_parts = df['ImageId_ClassId'].str.rsplit('_', n=1, expand=True)
    df['ImageId'] = id_parts[0]
    df['ClassId'] = id_parts[1].astype(int)
    df = df.pivot(index='ImageId', columns='ClassId', values='EncodedPixels')
    df['defects'] = df.count(axis=1)

    train_df, val_df = train_test_split(
        df, test_size=test_percent, stratify=df["defects"], random_state=seed
    )
    is_train = phase == "train"
    df = train_df if is_train else val_df
    image_dataset = SteelDataset(df, data_folder, mean, std, phase)
    # Shuffle only while training: Meter averages per-batch scores, so a fixed
    # validation order keeps validation metrics comparable across epochs.
    dataloader = DataLoader(
        image_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=is_train,
    )

    return dataloader

## Metrics: IoU and Dice 

In [0]:
def metric(probability, truth, threshold=0.5, reduction='none'):
    '''Calculates dice of positive and negative images separately.

    probability: torch tensor of predicted probabilities, first axis = batch.
    truth: torch tensor of ground-truth masks with the same number of elements.
    threshold: cut-off applied to `probability` to binarise predictions.
    reduction: accepted for interface compatibility; not used.

    Returns (dice, dice_neg, dice_pos, num_neg, num_pos):
      dice     - mean dice over all samples in the batch
      dice_neg - mean score over samples with an empty truth mask
                 (1 when the prediction is empty too), 0 if there are none
      dice_pos - mean dice over samples with a non-empty truth mask, 0 if none
      num_neg / num_pos - number of samples in each group
    '''
    batch_size = len(truth)
    with torch.no_grad():
        # reshape (not view) so non-contiguous inputs are accepted as well
        probability = probability.reshape(batch_size, -1)
        truth = truth.reshape(batch_size, -1)
        assert(probability.shape == truth.shape)

        p = (probability > threshold).float()
        t = (truth > 0.5).float()

        t_sum = t.sum(-1)
        p_sum = p.sum(-1)
        neg_index = torch.nonzero(t_sum == 0)
        pos_index = torch.nonzero(t_sum >= 1)

        dice_neg = (p_sum == 0).float()
        # 0/0 -> NaN for empty-vs-empty samples; those are never selected below
        dice_pos = 2 * (p*t).sum(-1)/((p+t).sum(-1))

        dice_neg = dice_neg[neg_index]
        dice_pos = dice_pos[pos_index]
        dice = torch.cat([dice_pos, dice_neg])

        # The mean of an empty group is NaN; nan_to_num maps it to 0 by default.
        # The former second positional argument (0) was bound to `copy`, not to
        # the fill value, and copy=False may be rejected for non-array input.
        dice_neg = np.nan_to_num(dice_neg.mean().item())
        dice_pos = np.nan_to_num(dice_pos.mean().item())
        dice = dice.mean().item()

        num_neg = len(neg_index)
        num_pos = len(pos_index)

    return dice, dice_neg, dice_pos, num_neg, num_pos


def compute_ious(pred, label, classes, ignore_index=255, only_present=True):
    '''Per-class IoU between one predicted mask and one ground-truth mask.

    Note: `pred` is modified in place - positions where `label` equals
    `ignore_index` are zeroed. With `only_present`, classes missing from
    `label` contribute NaN. Classes whose union is empty are skipped, and
    `[1]` is returned when nothing was collected.
    '''
    pred[label == ignore_index] = 0
    scores = []
    for cls in classes:
        truth_hits = label == cls
        if only_present and not np.any(truth_hits):
            scores.append(np.nan)
            continue
        pred_hits = pred == cls
        union = np.logical_or(pred_hits, truth_hits).sum()
        if union == 0:
            continue
        overlap = np.logical_and(pred_hits, truth_hits).sum()
        scores.append(overlap / union)
    return scores or [1]


def compute_iou_batch(outputs, labels, classes=None):
    '''Mean IoU over a batch of predicted masks and ground-truth masks.

    Each sample's score is the NaN-ignoring mean of its per-class IoUs; the
    batch score is the NaN-ignoring mean of those sample scores.
    '''
    # Work on a copy: compute_ious writes into the prediction it receives.
    pred_batch = np.copy(outputs)
    label_batch = np.array(labels)  # tensor to np
    per_sample = [
        np.nanmean(compute_ious(sample_pred, sample_label, classes))
        for sample_pred, sample_label in zip(pred_batch, label_batch)
    ]
    return np.nanmean(per_sample)

class Meter:
    '''Accumulates per-batch dice and IoU scores over one epoch.'''

    def __init__(self, phase, epoch):
        # `phase` and `epoch` are accepted but not stored.
        self.base_threshold = 0.5  # probability cut-off used for every score
        self.base_dice_scores = []
        self.dice_neg_scores = []
        self.dice_pos_scores = []
        self.iou_scores = []

    def update(self, targets, outputs):
        '''Score one batch: `outputs` are raw logits, `targets` the truth masks.'''
        probs = torch.sigmoid(outputs)
        batch_dice, batch_neg, batch_pos, _, _ = metric(probs, targets, self.base_threshold)
        self.base_dice_scores.append(batch_dice)
        self.dice_pos_scores.append(batch_pos)
        self.dice_neg_scores.append(batch_neg)

        binary_preds = predict(probs, self.base_threshold)
        self.iou_scores.append(compute_iou_batch(binary_preds, targets, classes=[1]))

    def get_metrics(self):
        '''Return ([dice, dice_neg, dice_pos], iou) averaged over recorded batches.'''
        tracked = (self.base_dice_scores, self.dice_neg_scores, self.dice_pos_scores)
        dices = [np.mean(history) for history in tracked]
        # IoU entries may be NaN for some batches, hence the NaN-ignoring mean.
        return dices, np.nanmean(self.iou_scores)

## Logger Functions

In [0]:
def epoch_log(phase, epoch, epoch_loss, meter, start):
    '''Print the epoch's loss, IoU and dice figures; return (dice, iou).

    `phase`, `epoch` and `start` are accepted but not used.
    '''
    (dice, dice_neg, dice_pos), iou = meter.get_metrics()
    summary = (epoch_loss, iou, dice, dice_neg, dice_pos)
    print("Loss: %0.4f | IoU: %0.4f | dice: %0.4f | dice_neg: %0.4f | dice_pos: %0.4f" % summary)
    return dice, iou

In [0]:
def predict(X, threshold):
    '''Binarise sigmoid outputs `X`: 1 where strictly above `threshold`, else 0 (uint8).'''
    # np.copy leaves the caller's data untouched and yields a numpy array.
    above = np.copy(X) > threshold
    return above.astype('uint8')

## Load the data

In [0]:
# One DataLoader per phase, all built from the same CSV and image root.
phases = ["train", "val"]
dataloaders = {
    split: provider(
        data_folder='data',
        df_path='data/train.csv',
        phase=split,
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
        batch_size=batch_size,
        num_workers=num_workers,
    )
    for split in phases
}
# Per-phase histories, one value appended per epoch by the training loop.
losses, iou_scores, dice_scores = ({split: [] for split in phases} for _ in range(3))

## Model

In [0]:
# Encoder / decoder are created by the project's ModelBuilder.
# Optional arguments left disabled for the encoder:
#   fc_dim=cfg.MODEL.fc_dim, weights=cfg.MODEL.weights_encoder
net_encoder = ModelBuilder.build_encoder(
    arch="resnet18dilated",
)
# Optional arguments left disabled for the decoder:
#   fc_dim=cfg.MODEL.fc_dim, num_class=cfg.DATASET.num_class, weights=cfg.MODEL.weights_decoder
net_decoder = ModelBuilder.build_decoder(
    arch="ppm_deepsup",
    num_class=4,
)

Downloading: "http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth" to ./pretrained/resnet18-imagenet.pth


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [0]:
# Move both networks to the selected device before creating the optimizers.
net_encoder = net_encoder.to(device)
net_decoder = net_decoder.to(device)

# BCEWithLogitsLoss applies the sigmoid itself, so the networks must emit logits.
criterion = torch.nn.BCEWithLogitsLoss()

# Encoder and decoder each get their own optimizer and LR scheduler.
encoder_optimizer = optim.Adam(net_encoder.parameters(), lr=learning_rate, weight_decay=1e-5)
decoder_optimizer = optim.Adam(net_decoder.parameters(), lr=learning_rate, weight_decay=1e-5)

# ReduceLROnPlateau was referenced without an import; reach it through the
# already-imported `optim` package instead.
encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    encoder_optimizer, mode="min", patience=3, verbose=True
)
decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    decoder_optimizer, mode="min", patience=3, verbose=True
)

## Training and Validation

In [0]:
# Lowest validation loss seen so far; a checkpoint is written when it improves.
best_loss = float('inf')
for epoch in range(num_epochs):
    # Each epoch has a training and validation phase
    for phase in phases:
        is_train = phase == "train"
        running_loss = 0.0
        meter = Meter(phase, epoch)
        # Keep the timestamp in `start`; assigning it to `time` would shadow the module.
        start = time.strftime("%H:%M:%S")
        dataloader = dataloaders[phase]
        total_batches = len(dataloader)

        print(f"Starting epoch: {epoch} | phase: {phase} | ⏰: {start}")

        if is_train:
            net_encoder.train()
            net_decoder.train()
        else:
            net_encoder.eval()
            net_decoder.eval()

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        for itr, batch in enumerate(dataloader):
            images, masks = batch
            images = images.to(device)
            masks = masks.to(device)

            # forward
            # track history if only in train
            with torch.set_grad_enabled(is_train):
                # NOTE(review): assumes the decoder returns a single logits tensor
                # shaped like `masks` - TODO confirm against models.py.
                outputs = net_decoder(net_encoder(images))
                loss = criterion(outputs, masks)
                # Scale so the accumulated gradient matches one large batch.
                loss = loss / accumulation_steps

                if is_train:
                    loss.backward()
                    # Step every `accumulation_steps` batches, and on the last
                    # batch so leftover gradients are not discarded.
                    if (itr + 1) % accumulation_steps == 0 or (itr + 1) == total_batches:
                        encoder_optimizer.step()
                        decoder_optimizer.step()
                        encoder_optimizer.zero_grad()
                        decoder_optimizer.zero_grad()

            running_loss += loss.item()
            outputs = outputs.detach().cpu()
            meter.update(masks.cpu(), outputs)

        # Undo the accumulation scaling to report the mean per-batch loss.
        epoch_loss = (running_loss * accumulation_steps) / total_batches
        dice, iou = epoch_log(phase, epoch, epoch_loss, meter, start)
        losses[phase].append(epoch_loss)
        dice_scores[phase].append(dice)
        iou_scores[phase].append(iou)
        torch.cuda.empty_cache()

        # The validation phase is named "val" in `phases`.
        if phase == 'val':
            encoder_scheduler.step(epoch_loss)
            decoder_scheduler.step(epoch_loss)
            if epoch_loss < best_loss:
                print("******** New optimal found, saving state ********")
                best_loss = epoch_loss
                state = {
                    "epoch": epoch,
                    "best_loss": best_loss,
                    "encoder_state_dict": net_encoder.state_dict(),
                    "decoder_state_dict": net_decoder.state_dict(),
                    "encoder_optimizer": encoder_optimizer.state_dict(),
                    "decoder_optimizer": decoder_optimizer.state_dict()
                }
                torch.save(state, f"./model_{epoch}_{start}.pth")
    print()

In [0]:
# class Trainer(object):
#     '''This class takes care of training and validation of our model'''
#     def __init__(self, model):
#         self.best_loss = float("inf")
#         self.phases = ["train", "val"]
#         self.net = model 
#         self.net = self.net.to(self.device)
#         cudnn.benchmark = True
#         self.dataloaders = {
#             phase: provider(
#                 data_folder=data_folder,
#                 df_path=train_df_path,
#                 phase=phase,
#                 mean=(0.485, 0.456, 0.406),
#                 std=(0.229, 0.224, 0.225),
#                 batch_size=self.batch_size[phase],
#                 num_workers=self.num_workers,
#             )
#             for phase in self.phases
#         }
#         self.losses = {phase: [] for phase in self.phases}
#         self.iou_scores = {phase: [] for phase in self.phases}
#         self.dice_scores = {phase: [] for phase in self.phases}
        
#     def forward(self, images, targets):
#         images = images.to(self.device)
#         masks = targets.to(self.device)
#         outputs = self.net(images)
#         loss = self.criterion(outputs, masks)
#         return loss, outputs

#     def iterate(self, epoch, phase):
#         meter = Meter(phase, epoch)
#         start = time.strftime("%H:%M:%S")
#         print(f"Starting epoch: {epoch} | phase: {phase} | ⏰: {start}")
#         batch_size = self.batch_size[phase]
#         self.net.train(phase == "train")
#         dataloader = self.dataloaders[phase]
#         running_loss = 0.0
#         total_batches = len(dataloader)
# #         tk0 = tqdm(dataloader, total=total_batches)
#         self.optimizer.zero_grad()
#         for itr, batch in enumerate(dataloader): # replace `dataloader` with `tk0` for tqdm
#             images, targets = batch
#             loss, outputs = self.forward(images, targets)
#             loss = loss / self.accumulation_steps
#             if phase == "train":
#                 loss.backward()
#                 if (itr + 1 ) % self.accumulation_steps == 0:
#                     self.optimizer.step()
#                     self.optimizer.zero_grad()
#             running_loss += loss.item()
#             outputs = outputs.detach().cpu()
#             meter.update(targets, outputs)
# #             tk0.set_postfix(loss=(running_loss / ((itr + 1))))
#         epoch_loss = (running_loss * self.accumulation_steps) / total_batches
#         dice, iou = epoch_log(phase, epoch, epoch_loss, meter, start)
#         self.losses[phase].append(epoch_loss)
#         self.dice_scores[phase].append(dice)
#         self.iou_scores[phase].append(iou)
#         torch.cuda.empty_cache()
#         return epoch_loss

#     def start(self):
#         for epoch in range(self.num_epochs):
#             self.iterate(epoch, "train")
#             state = {
#                 "epoch": epoch,
#                 "best_loss": self.best_loss,
#                 "state_dict": self.net.state_dict(),
#                 "optimizer": self.optimizer.state_dict(),
#             }
#             with torch.no_grad():
#                 val_loss = self.iterate(epoch, "val")
#                 self.scheduler.step(val_loss)
#             if val_loss < self.best_loss:
#                 print("******** New optimal found, saving state ********")
#                 state["best_loss"] = self.best_loss = val_loss
#                 torch.save(state, "./model.pth")
#             print()

In [0]:
# model_trainer = Trainer(model)
# model_trainer.start()

## Plotting

In [0]:
# PLOT TRAINING
# losses = model_trainer.losses
# dice_scores = model_trainer.dice_scores # overall dice
# iou_scores = model_trainer.iou_scores

def plot(scores, name):
    """Plot the per-epoch train and validation curves of one metric.

    scores: dict with "train" and "val" lists of per-epoch values.
    name: metric name used in the legend, title and y-axis label.
    """
    # Imported here because the notebook never imports matplotlib at top level.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(15, 5))
    for split in ("train", "val"):
        history = scores[split]
        # Each curve gets an x-axis of its own length, so a differing number of
        # train and val entries no longer breaks the plot.
        plt.plot(range(len(history)), history, label=f'{split} {name}')
    plt.title(f'{name} plot')
    plt.xlabel('Epoch')
    plt.ylabel(f'{name}')
    plt.legend()
    plt.show()

# Draw one figure per tracked quantity, in the order loss, dice, IoU.
plot(scores=losses, name="BCE loss")
plot(scores=dice_scores, name="Dice score")
plot(scores=iou_scores, name="IoU score")