# This is the code to create the dataset for training/validation

# Import libraries

In [1]:

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss
import pickle
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import warnings
import sys
import pandas as pd
import os
import gc
import sys
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
import cv2

import scipy as sp
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from functools import partial

import argparse
import importlib
import torch
import torch.nn as nn
from torch.optim import Adam, SGD, AdamW

import datetime

In [2]:
import numpy as np
from torch.utils.data import DataLoader, Dataset
import cv2
import torch
import os
import albumentations as A
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

# Configuration

**The stride, tile size, number of channels, etc. can be changed here.**

In [3]:
import os
import albumentations as A
from albumentations.pytorch import ToTensorV2

class CFG:
    """Static configuration for building the tiled training/validation dataset.

    Every value is a class attribute evaluated once, in order, when the class
    body runs. The class body also prints a message as a side effect.
    """
    # ============== comp exp name =============
    comp_name = 'vesuvius'

    # Root directory that holds the competition data; the commented-out
    # alternatives are for a local (non-Kaggle) layout.
    # comp_dir_path = './'
    comp_dir_path = '/kaggle/input/'
    comp_folder_name = 'vesuvius-challenge-ink-detection'
    # comp_dataset_path = f'{comp_dir_path}datasets/{comp_folder_name}/'
    comp_dataset_path = f'{comp_dir_path}{comp_folder_name}/'
    
    # Experiment name; used in the output directory and the log file name.
    exp_name = 'vesuvius_2d_slide_exp006'


    target_size = 1
    # Channel count used for the Normalize mean/std lists below.
    # NOTE(review): the trailing "65" looks like an earlier/alternative value -- confirm.
    in_chans = 3 # 65
    seed = 42
    # ============== training cfg =============
    # Side length that A.Resize targets in both augmentation lists.
    size = 224
    # Side length of the square tiles cut from each fragment.
    tile_size = 224
    # Step between tile origins; equal to tile_size means tiles do not overlap.
    stride = tile_size

    # ============== set dataset path =============
    # Runs once, at class-definition time.
    print('set dataset path')

    outputs_path = f'/kaggle/working/outputs/{comp_name}/{exp_name}/'

    data_dir = outputs_path + \
        f'{comp_name}-data/'

    log_dir = outputs_path + 'logs/'
    log_path = log_dir + f'{exp_name}.txt'

    # ============== augmentation =============
    # Plain list of albumentations transforms for training; it is not wrapped
    # in A.Compose anywhere in the code shown here.
    train_aug_list = [
        # A.RandomResizedCrop(
        #     size, size, scale=(0.85, 1.0)),
        A.Resize(size, size),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.75),
        A.ShiftScaleRotate(p=0.75),
        # At most one of the noise/blur transforms is applied (p=0.4 for the group).
        A.OneOf([
                A.GaussNoise(var_limit=[10, 50]),
                A.GaussianBlur(),
                A.MotionBlur(),
                ], p=0.4),
        A.GridDistortion(num_steps=5, distort_limit=0.3, p=0.5),
        # A single hole of up to 30% of `size` per side.
        A.CoarseDropout(max_holes=1, max_width=int(size * 0.3), max_height=int(size * 0.3), 
                        mask_fill_value=0, p=0.5),
        # A.Cutout(max_h_size=int(size * 0.6),
        #          max_w_size=int(size * 0.6), num_holes=1, p=1.0),
        # mean 0 / std 1 per channel.
        # NOTE(review): presumably this only rescales by the transform's
        # max_pixel_value -- confirm against the albumentations docs.
        A.Normalize(
            mean= [0] * in_chans,
            std= [1] * in_chans
        ),
        ToTensorV2(transpose_mask=True),
    ]

    # Validation pipeline: resize, normalize and tensor conversion only.
    valid_aug_list = [
        A.Resize(size, size),
        A.Normalize(
            mean= [0] * in_chans,
            std= [1] * in_chans
        ),
        ToTensorV2(transpose_mask=True),
    ]


set dataset path


In [4]:
def init_logger(log_file):
    """Create or reconfigure this module's logger to write to console and a file.

    The logger is looked up by module name, so every call returns the same
    object. Handlers left over from an earlier call are replaced rather than
    stacked; otherwise re-running this (e.g. re-executing a notebook cell)
    would emit every message once per previous call.

    Args:
        log_file: Path of the log file. Its parent directory must already
            exist, because the file handler opens the file immediately.

    Returns:
        The module logger, set to INFO, with exactly one stream handler and
        one file handler attached; both emit the bare message text.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Build the new handlers first so that a failure to open `log_file`
    # leaves the logger's existing configuration untouched.
    console_handler = StreamHandler()
    console_handler.setFormatter(Formatter("%(message)s"))
    file_handler = FileHandler(filename=log_file)
    file_handler.setFormatter(Formatter("%(message)s"))

    # Drop handlers from previous calls and close them so the old log file
    # handle is released.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    return logger

def set_seed(seed=None, cudnn_deterministic=True):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed; ``None`` falls back to 42.
        cudnn_deterministic: Value assigned to
            ``torch.backends.cudnn.deterministic``.

    Also exports the seed as ``PYTHONHASHSEED`` and switches the cuDNN
    benchmark mode off.
    """
    seed = 42 if seed is None else seed

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seeding order as before: NumPy, stdlib random, torch CPU, torch CUDA.
    for seeder in (np.random.seed, random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = cudnn_deterministic
    cudnn.benchmark = False

In [5]:
def make_dirs(cfg):
    """Create the data and log directories named by ``cfg`` if they are missing.

    Args:
        cfg: Object exposing ``data_dir`` and ``log_dir`` path attributes.
    """
    for target in (cfg.data_dir, cfg.log_dir):
        # exist_ok makes repeated calls harmless.
        os.makedirs(target, exist_ok=True)

In [6]:
def cfg_init(cfg, mode='train'):
    """Seed the RNGs from ``cfg.seed`` and, in train mode, create output dirs.

    Args:
        cfg: Configuration object; ``seed`` is always read, and the directory
            attributes used by ``make_dirs`` are read when ``mode == 'train'``.
        mode: Only the value ``'train'`` triggers directory creation.
    """
    set_seed(cfg.seed)

    if mode != 'train':
        return

    make_dirs(cfg)

In [7]:
# Seed the RNGs and, since the default mode is 'train', create the output directories.
cfg_init(CFG)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Must run after cfg_init: the log directory has to exist before the log file is opened.
Logger = init_logger(log_file=CFG.log_path)

Logger.info('\n\n-------- exp_info -----------------')
# Logger.info(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))



-------- exp_info -----------------


# Change the input channels according to the training config

In [8]:
def read_image_mask(fragment_id):
    """Load the padded image stack and ink-label mask for one training fragment.

    Slices 29, 30 and 31 of the fragment's ``surface_volume`` are read as
    single-channel images, zero-padded on the bottom and right, and stacked
    along a new last axis. The ink-label mask is padded by the same amounts
    and scaled from 0..255 to 0..1.

    The padding is ``tile_size - dim % tile_size``, so a dimension that is
    already a multiple of the tile size still gains one full tile of zeros.
    This is kept unchanged so that tile counts match previously built data.

    NOTE(review): the slice window is hard-coded to three layers; presumably
    it has to match ``CFG.in_chans`` -- confirm before changing either.

    Args:
        fragment_id: Name of the fragment directory under ``train/``.

    Returns:
        A tuple ``(images, mask)``: ``images`` has shape
        ``(height, width, 3)`` and ``mask`` is a float32 array of shape
        ``(height, width)``, both after padding.

    Raises:
        FileNotFoundError: If a slice or the mask cannot be read. Without the
            check, ``cv2.imread`` returning ``None`` would surface later as an
            unrelated-looking error.
    """
    fragment_dir = CFG.comp_dataset_path + f"train/{fragment_id}/"

    # idxs = range(65)
    start = 29
    end = 32
#     start = mid - CFG.in_chans // 2
#     end = mid + CFG.in_chans // 2
    idxs = range(start, end)

    images = []
    for i in tqdm(idxs):
        slice_path = fragment_dir + f"surface_volume/{i:02}.tif"
        image = cv2.imread(slice_path, 0)
        if image is None:
            raise FileNotFoundError(
                f"could not read surface volume slice: {slice_path}")

        pad0 = (CFG.tile_size - image.shape[0] % CFG.tile_size)
        pad1 = (CFG.tile_size - image.shape[1] % CFG.tile_size)

        image = np.pad(image, [(0, pad0), (0, pad1)], constant_values=0)

        images.append(image)
    images = np.stack(images, axis=2)

    mask_path = fragment_dir + "inklabels.png"
    mask = cv2.imread(mask_path, 0)
    if mask is None:
        raise FileNotFoundError(f"could not read ink label mask: {mask_path}")

    # Reuses the pad widths computed for the last slice, so the mask lines up
    # with the image stack as long as it has the same size as the slices.
    mask = np.pad(mask, [(0, pad0), (0, pad1)], constant_values=0)

    mask = mask.astype('float32')
    mask /= 255.0

    return images, mask

In [13]:
def get_train_valid_dataset():
    """Cut fragments 1-3 into square tiles and collect them in parallel lists.

    Each fragment is loaded with ``read_image_mask`` and walked row by row in
    steps of ``CFG.stride``, taking ``CFG.tile_size`` x ``CFG.tile_size``
    windows that lie fully inside the padded image.

    Returns:
        A tuple ``(images, masks, xyxys, fragment_no)`` of equally long lists:
        image tiles, mask tiles with an added trailing axis, ``[x1, y1, x2, y2]``
        tile coordinates, and the fragment id each tile came from.
    """
    tile = CFG.tile_size
    step = CFG.stride

    images, masks, xyxys, fragment_no = [], [], [], []

    for fragment_id in range(1, 4):
        volume, label = read_image_mask(fragment_id)
        print(volume.shape)

        height, width = volume.shape[0], volume.shape[1]

        # Rows first, then columns, so tiles are emitted in row-major order.
        for top in range(0, height - tile + 1, step):
            bottom = top + tile
            for left in range(0, width - tile + 1, step):
                right = left + tile

                images.append(volume[top:bottom, left:right])
                masks.append(label[top:bottom, left:right, None])
                fragment_no.append(fragment_id)
                xyxys.append([left, top, right, bottom])

    return images, masks, xyxys, fragment_no

In [14]:
# Tile all fragments up front; the four lists hold one entry per tile.
images, masks, xyxys, frag_no = get_train_valid_dataset()

  0%|          | 0/3 [00:00<?, ?it/s]

(8288, 6496, 3)


  0%|          | 0/3 [00:00<?, ?it/s]

(15008, 9632, 3)


  0%|          | 0/3 [00:00<?, ?it/s]

(7616, 5376, 3)


In [15]:
# xyxys = np.stack(xyxys)  # NOTE: run this later, when preparing the data for training

In [17]:
# Sanity check: all four lists should have the same length (one entry per tile).
len(images),len(masks),len(xyxys),len(frag_no)

(18615, 18615, 18615, 18615)

In [18]:
# Both are still plain Python lists of per-tile arrays at this point.
type(images),type(masks)

(list, list)

In [19]:
# One row per tile: source fragment id, image tile, mask tile and tile coordinates.
# Each list is wrapped in pd.Series before being handed to the DataFrame.
# NOTE(review): presumably this keeps each array as a single cell instead of
# letting pandas interpret the nested data -- confirm before simplifying.
df = pd.DataFrame({'Frag_no': pd.Series(frag_no),
                   'Images': pd.Series(images),
                   'Masks': pd.Series(masks),
                   'XYXYS': pd.Series(xyxys)})

In [20]:
# Preview the first five tiles.
df.head(5)

Unnamed: 0,Frag_no,Images,Masks,XYXYS
0,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[0, 0, 224, 224]"
1,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[112, 0, 336, 224]"
2,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[224, 0, 448, 224]"
3,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[336, 0, 560, 224]"
4,1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0...","[448, 0, 672, 224]"


In [None]:
# (number of tiles, number of columns)
df.shape

In [None]:
# The CSV is kept for backward compatibility, but it is lossy: every array
# cell is written as its string form, and NumPy abbreviates large arrays with
# '...', so the tile pixels cannot be recovered from this file.
data_csv = CFG.data_dir+'data.csv'
df.to_csv(data_csv, index=False)

# Lossless copy of the same frame; reload it with pd.read_pickle.
data_pkl = CFG.data_dir + 'data.pkl'
df.to_pickle(data_pkl)
