In [None]:
import re
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
import torch

from sklearn.model_selection import train_test_split

In [None]:
# RLE decoding function
def rle_decode(mask_rle, shape):
    """Expand a run-length-encoded string into a uint8 mask of the given shape.

    Args:
        mask_rle: Whitespace-separated string of alternating ``start length``
            values; starts are 1-based positions in the flattened mask.
        shape: ``(height, width)`` of the mask to build.

    Returns:
        ``np.uint8`` array of ``shape`` with 1 inside every run and 0 elsewhere.
    """
    tokens = mask_rle.split()
    # Even-positioned tokens are run starts (shifted to 0-based), odd ones are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1
    return flat_mask.reshape(shape)

# RLE encoding function
def rle_encode(mask):
    """Run-length encode a mask into a ``start length start length ...`` string.

    The mask is flattened first; starts are 1-based positions in that
    flattened array. An all-zero mask yields an empty string.
    """
    # Zero sentinels at both ends make runs touching either border show up as changes.
    padded = np.concatenate([[0], mask.flatten(), [0]])
    # 1-based positions at which the value differs from the previous element.
    change_points = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Every second entry is a run end; turn it into a run length in place.
    change_points[1::2] -= change_points[::2]
    return ' '.join(map(str, change_points))

# Divide

In [None]:
# Output layout for the tiled dataset: <result>/images and <result>/masks.
result = 'data_512_'
img_dir = os.path.join(result, 'images')
mask_dir = os.path.join(result, 'masks')

# exist_ok=True keeps the cell safe to re-run.
os.makedirs(result, exist_ok=True)
os.makedirs(img_dir, exist_ok=True)
os.makedirs(mask_dir, exist_ok=True)

# Table whose 'img_path' and 'mask_rle' columns drive the tiling loop.
df = pd.read_csv('./train2.csv')

In [None]:
# NOTE: the bare module-level ``global num`` statement that used to sit here had
# no effect and was dropped. Callers must still assign ``num`` before calling.

def divide_img(filename, dir, image, mask, stride=200, size=224, real_size=512):
    """Cut an image/mask pair into square tiles and save every tile as a PNG.

    A ``size`` x ``size`` window slides over the image with step ``stride``
    along both axes. Each tile is written to ``<dir>/images`` and its mask to
    ``<dir>/masks`` under the name ``<filename without ".png">_<num>.png``.

    Args:
        filename: Source file name; ".png" is stripped to form the tile prefix.
        dir: Output root; this function does not create the ``images`` and
            ``masks`` sub-folders.
        image: Array sliced as ``image[rows, cols, :]``.
        mask: Array sliced as ``mask[rows, cols]``; presumably the same
            height/width as ``image`` (not checked here).
        stride: Step between consecutive window origins, in pixels.
        size: Side length of the cropped window, in pixels.
        real_size: Side length tiles are resized to when it differs from ``size``.

    Side effects:
        Reads and increments the module-level counter ``num`` once per tile.

    Raises:
        OSError: If OpenCV reports that a tile could not be written.
    """
    global num
    # Derive the sliding range from the actual image instead of assuming 1024x1024.
    height, width = image.shape[:2]
    stem = filename.replace(".png", "")

    for a in range(0, height - size + 1, stride):
        for b in range(0, width - size + 1, stride):
            image_tile = image[a:a+size, b:b+size, :]
            mask_tile = mask[a:a+size, b:b+size]

            if size != real_size:
                image_tile = cv2.resize(image_tile, (real_size, real_size))
                # Nearest-neighbour keeps mask values as valid labels; the default
                # bilinear interpolation blends neighbouring labels at the borders.
                mask_tile = cv2.resize(mask_tile, (real_size, real_size),
                                       interpolation=cv2.INTER_NEAREST)

            tile_name = f'{stem}_{num}.png'
            for subdir, tile in (('images', image_tile), ('masks', mask_tile)):
                out_path = os.path.join(dir, subdir, tile_name)
                # cv2.imwrite returns False instead of raising when the write fails.
                if not cv2.imwrite(out_path, tile):
                    raise OSError(f'cv2.imwrite failed for {out_path}')
            num += 1

In [None]:
# Tile every image listed in df. With stride 256 and size 512 a 1024-pixel side
# gives 3 window positions, i.e. 9 tiles per 1024x1024 image.
for img_path, mask_rle in zip(tqdm(df['img_path']), df['mask_rle']):
    filename = os.path.basename(img_path)
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    mask = rle_decode(mask_rle, image.shape[:2])

    # divide_img numbers tiles with the module-level counter; restart it per image.
    num = 0
    divide_img(filename=filename,
               dir=result,
               image=image,
               mask=mask,
               stride=256,
               size=512,
               real_size=512)

In [None]:
# Same tiling as the previous cell, but images are read from the local
# 'train_img' folder using only the base name stored in the CSV.
for idx in tqdm(range(len(df))):
    row = df.iloc[idx]
    filename = os.path.basename(row['img_path'])
    img_path = os.path.join('train_img', filename)
    mask_rle = row['mask_rle']

    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    mask = rle_decode(mask_rle, image.shape[:2])

    # divide_img numbers tiles with the module-level counter; restart it per image.
    num = 0
    # `dir` is a required parameter of divide_img; omitting it raised TypeError.
    divide_img(filename=filename,
               dir=result,
               image=image,
               mask=mask,
               stride=256,
               size=512)

# Train Test Split

In [None]:
# NOTE: the bare module-level ``global num`` statement that used to sit here had
# no effect and was dropped. Callers must still assign ``num`` before calling.

def divide_img(filename, dir, image, mask, stride=200, size=224, real_size=512):
    """Cut an image/mask pair into square tiles and save every tile as a PNG.

    A ``size`` x ``size`` window slides over the image with step ``stride``
    along both axes. Each tile is written to ``<dir>/images`` and its mask to
    ``<dir>/masks`` under the name ``<filename without ".png">_<num>.png``.

    Args:
        filename: Source file name; ".png" is stripped to form the tile prefix.
        dir: Output root; this function does not create the ``images`` and
            ``masks`` sub-folders.
        image: Array sliced as ``image[rows, cols, :]``.
        mask: Array sliced as ``mask[rows, cols]``; presumably the same
            height/width as ``image`` (not checked here).
        stride: Step between consecutive window origins, in pixels.
        size: Side length of the cropped window, in pixels.
        real_size: Side length tiles are resized to when it differs from ``size``.

    Side effects:
        Reads and increments the module-level counter ``num`` once per tile.

    Raises:
        OSError: If OpenCV reports that a tile could not be written.
    """
    global num
    # Derive the sliding range from the actual image instead of assuming 1024x1024.
    height, width = image.shape[:2]
    stem = filename.replace(".png", "")

    for a in range(0, height - size + 1, stride):
        for b in range(0, width - size + 1, stride):
            image_tile = image[a:a+size, b:b+size, :]
            mask_tile = mask[a:a+size, b:b+size]

            if size != real_size:
                image_tile = cv2.resize(image_tile, (real_size, real_size))
                # Nearest-neighbour keeps mask values as valid labels; the default
                # bilinear interpolation blends neighbouring labels at the borders.
                mask_tile = cv2.resize(mask_tile, (real_size, real_size),
                                       interpolation=cv2.INTER_NEAREST)

            tile_name = f'{stem}_{num}.png'
            for subdir, tile in (('images', image_tile), ('masks', mask_tile)):
                out_path = os.path.join(dir, subdir, tile_name)
                # cv2.imwrite returns False instead of raising when the write fails.
                if not cv2.imwrite(out_path, tile):
                    raise OSError(f'cv2.imwrite failed for {out_path}')
            num += 1

In [None]:
# Output layout: <result>/{train,val}/{images,masks}.
result = 'data_512_91_masked'
train_dir = os.path.join(result, 'train')
val_dir = os.path.join(result, 'val')

os.makedirs(result, exist_ok=True)
# makedirs creates the intermediate train/ and val/ folders as well.
for split_dir in (train_dir, val_dir):
    for subdir in ('images', 'masks'):
        os.makedirs(os.path.join(split_dir, subdir), exist_ok=True)

In [None]:
# df = pd.read_csv('./train2.csv')
# t_df = pd.read_csv('./my_train.csv')
# x_train, y_train = [], []

# for i in range(len(t_df)):
#     img_path = df[df['img_id'] == t_df.iloc[i][0]]['img_path'].iloc[0]
#     mask_rle = df[df['img_id'] == t_df.iloc[i][0]]['mask_rle'].iloc[0]

#     x_train.append(img_path)
#     y_train.append(mask_rle)

# v_df = pd.read_csv('./my_val.csv')
# x_val, y_val = [], []

# for i in range(len(v_df)):
#     img_path = df[df['img_id'] == v_df.iloc[i][0]]['img_path'].iloc[0]
#     mask_rle = df[df['img_id'] == v_df.iloc[i][0]]['mask_rle'].iloc[0]

#     x_val.append(img_path)
#     y_val.append(mask_rle)

In [None]:
df = pd.read_csv('./train2.csv')

# Hold out 10% of the rows for validation; the fixed seed makes the shuffle repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df['img_path'],
    df['mask_rle'],
    test_size=0.1,
    shuffle=True,
    random_state=18,
)

In [None]:
# Tile the training split into <train_dir>/images and <train_dir>/masks.
for img_path, mask_rle in zip(tqdm(x_train), y_train):
    filename = os.path.basename(img_path)
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    mask = rle_decode(mask_rle, image.shape[:2])

    # divide_img numbers tiles with the module-level counter; restart it per image.
    num = 0

    # 1024 -> split 512 * 9 (stride 256 gives 3 window positions per axis)
    divide_img(filename=filename,
               dir=train_dir,
               image=image,
               mask=mask,
               stride=256,
               size=512,
               real_size=512)

In [None]:
# Tile the validation split into 224x224 crops (stride 200 gives 5 window
# positions along a 1024-pixel side) without any resizing.
for img_path, mask_rle in zip(tqdm(x_val), y_val):
    filename = os.path.basename(img_path)
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    mask = rle_decode(mask_rle, image.shape[:2])

    # divide_img numbers tiles with the module-level counter; restart it per image.
    num = 0
    divide_img(filename=filename,
               dir=val_dir,
               image=image,
               mask=mask,
               stride=200,
               size=224,
               real_size=224)

# STD, MEAN

In [None]:
# Load the training table; the statistics cell below iterates over its 'img_path' column.
df = pd.read_csv('./train.csv')

In [None]:
# Per-channel running totals over every pixel of every image. float64 keeps the
# sums exact enough for a large number of pixels.
pixel_sum = torch.zeros(3, dtype=torch.float64)
pixel_sq_sum = torch.zeros(3, dtype=torch.float64)
pixel_count = 0

for img_path in tqdm(df['img_path']):
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    for c in range(3):
        channel = image[:, :, c].astype(np.float64)
        pixel_sum[c] += float(channel.sum())
        pixel_sq_sum[c] += float(np.square(channel).sum())
    pixel_count += image.shape[0] * image.shape[1]

# Dataset-level statistics in RGB order, on the raw pixel scale (no rescaling is
# applied). Averaging per-image std values, as before, is not the std of the
# dataset; the pooled form sqrt(E[x^2] - E[x]^2) is.
mean = (pixel_sum / pixel_count).float()
# clamp guards against a tiny negative variance caused by rounding.
std = (pixel_sq_sum / pixel_count - (pixel_sum / pixel_count) ** 2).clamp(min=0).sqrt().float()

In [None]:
# Show the per-channel statistics from the previous cell. Channels are in RGB
# order (the images were converted from BGR) and no rescaling of pixel values
# was applied before the statistics were taken.
print('mean', mean)
print('std', std)

# RLE TO MASK

In [None]:
# RLE decoding function
def rle_decode(mask_rle, shape):
    """Expand a run-length-encoded string into a uint8 mask of the given shape.

    Args:
        mask_rle: Whitespace-separated string of alternating ``start length``
            values; starts are 1-based positions in the flattened mask.
        shape: ``(height, width)`` of the mask to build.

    Returns:
        ``np.uint8`` array of ``shape`` with 1 inside every run and 0 elsewhere.
    """
    tokens = mask_rle.split()
    # Even-positioned tokens are run starts (shifted to 0-based), odd ones are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1
    return flat_mask.reshape(shape)

# RLE encoding function
def rle_encode(mask):
    """Run-length encode a mask into a ``start length start length ...`` string.

    The mask is flattened first; starts are 1-based positions in that
    flattened array. An all-zero mask yields an empty string.
    """
    # Zero sentinels at both ends make runs touching either border show up as changes.
    padded = np.concatenate([[0], mask.flatten(), [0]])
    # 1-based positions at which the value differs from the previous element.
    change_points = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Every second entry is a run end; turn it into a run length in place.
    change_points[1::2] -= change_points[::2]
    return ' '.join(map(str, change_points))

In [None]:
# Output layout for the exported image/mask pairs: <result>/images and <result>/masks.
result = 'train2'
img_dir = os.path.join(result, 'images')
mask_dir = os.path.join(result, 'masks')

# exist_ok=True keeps the cell safe to re-run.
os.makedirs(result, exist_ok=True)
os.makedirs(img_dir, exist_ok=True)
os.makedirs(mask_dir, exist_ok=True)

In [None]:
# Reload ./train.csv; the export loop below reads its 'img_path' and 'mask_rle' columns.
df = pd.read_csv('./train.csv')

In [None]:
# Copy every image into img_dir and write its decoded mask to mask_dir under
# the same file name.
for idx in tqdm(range(len(df))):
    row = df.iloc[idx]
    img_path = os.path.join('train_img', os.path.basename(row['img_path']))
    mask_rle = row['mask_rle']

    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    mask = rle_decode(mask_rle, image.shape[:2])

    # Strip the existing extension before appending '.png'; appending it to the
    # full base name produced files such as 'X.png.png'.
    stem = os.path.splitext(os.path.basename(img_path))[0]
    cv2.imwrite(os.path.join(img_dir, f'{stem}.png'), image)
    cv2.imwrite(os.path.join(mask_dir, f'{stem}.png'), mask)

# KFOLD

In [None]:
from glob import glob
from tqdm import tqdm
import numpy as np
import cv2
import os
import pandas as pd
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import re

# Change mask_rle to mask

In [None]:
def rle_decode(mask_rle, shape):
    """Expand a run-length-encoded string into a uint8 mask of the given shape.

    Args:
        mask_rle: Whitespace-separated string of alternating ``start length``
            values; starts are 1-based positions in the flattened mask.
        shape: ``(height, width)`` of the mask to build.

    Returns:
        ``np.uint8`` array of ``shape`` with 1 inside every run and 0 elsewhere.
    """
    tokens = mask_rle.split()
    # Even-positioned tokens are run starts (shifted to 0-based), odd ones are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1
    return flat_mask.reshape(shape)

In [None]:
# Load ./train2.csv; later cells in this section read its 'img_path' and 'mask_rle' columns.
df = pd.read_csv('./train2.csv')

In [None]:
# Folder that receives the decoded masks written by the next cell.
mask_dir = 'train_mask'
os.makedirs(mask_dir, exist_ok=True)

In [None]:
# Decode every row's RLE string and save it as an image next to the source
# path, with the 'train_img' path component swapped for mask_dir.
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    filename = row['img_path'].replace('train_img', mask_dir)
    # The mask shape is fixed at 1024x1024 here; the source image is not read.
    mask = rle_decode(row['mask_rle'], (1024, 1024))
    cv2.imwrite(filename, mask)

## KFold

In [None]:
# Regex that extracts the image identifier ("TRAIN_" followed by four digits) from a path.
pattern = r'TRAIN_\d{4}'
# Number of folds.
# NOTE(review): `num` is also the module-level tile counter that divide_img
# increments and that the tiling loops reset to 0, so this cell must be re-run
# before the fold directories or the KFold splitter are created again.
num = 5

In [None]:
# One sub-folder per fold: kfold/kfold_0 ... kfold/kfold_<num-1>.
kfold_dir = 'kfold'
os.makedirs(kfold_dir, exist_ok=True)

for fold in range(num):
    fold_dir = os.path.join(kfold_dir, f'kfold_{fold}')
    os.makedirs(fold_dir, exist_ok=True)

In [None]:
# Sorted so that the image order is deterministic across runs.
images = sorted(glob("./train_img/*.png"))

# Keep only the identifier matched by `pattern` from each path, as an array so
# it can be indexed with the integer index arrays produced by KFold.split.
filenames = np.array([re.findall(pattern, path)[0] for path in images])

print(f'filenames: {len(filenames)}')

In [None]:
# Splitter with `num` folds. No shuffle or random_state is passed, so the split
# relies on scikit-learn's defaults.
# NOTE(review): confirm that unshuffled, order-based folds are intended.
kfold = KFold(n_splits=num)

In [None]:
# For each fold write kfold/kfold_<idx>/train.txt and val.txt.
for idx, (train_idx, val_idx) in enumerate(kfold.split(filenames)):
    train, val = filenames[train_idx], filenames[val_idx]
    fold_dir = os.path.join(kfold_dir, f'kfold_{idx}')

    # Training entries are listed per tile: every image id gets suffixes _0 .. _8.
    with open(os.path.join(fold_dir, 'train.txt'), 'w') as file:
        file.writelines(f'{img}_{i}\n' for img in train for i in range(9))

    # Validation entries are listed per source image, without a tile suffix.
    with open(os.path.join(fold_dir, 'val.txt'), 'w') as file:
        file.writelines(f'{img}\n' for img in val)

# (1024x1024) -> (512x512) x 9

In [None]:
# Output layout for the 512x512 tiles: data_512/images and data_512/masks.
result_dir = 'data_512'
os.makedirs(result_dir, exist_ok=True)

for subdir in ('images', 'masks'):
    os.makedirs(os.path.join(result_dir, subdir), exist_ok=True)

In [None]:
# NOTE: the bare module-level ``global num`` statement that used to sit here had
# no effect and was dropped. Callers must still assign ``num`` before calling.

def divide_img(filename, dir, image, mask, stride=200, size=224, real_size=512):
    """Cut an image/mask pair into square tiles and save every tile as a PNG.

    A ``size`` x ``size`` window slides over the image with step ``stride``
    along both axes. Each tile is written to ``<dir>/images`` and its mask to
    ``<dir>/masks`` under the name ``<filename without ".png">_<num>.png``.

    Args:
        filename: Source file name; ".png" is stripped to form the tile prefix.
        dir: Output root; this function does not create the ``images`` and
            ``masks`` sub-folders.
        image: Array sliced as ``image[rows, cols, :]``.
        mask: Array sliced as ``mask[rows, cols]``; presumably the same
            height/width as ``image`` (not checked here).
        stride: Step between consecutive window origins, in pixels.
        size: Side length of the cropped window, in pixels.
        real_size: Side length tiles are resized to when it differs from ``size``.

    Side effects:
        Reads and increments the module-level counter ``num`` once per tile.

    Raises:
        OSError: If OpenCV reports that a tile could not be written.
    """
    global num
    # Derive the sliding range from the actual image instead of assuming 1024x1024.
    height, width = image.shape[:2]
    stem = filename.replace(".png", "")

    for a in range(0, height - size + 1, stride):
        for b in range(0, width - size + 1, stride):
            image_tile = image[a:a+size, b:b+size, :]
            mask_tile = mask[a:a+size, b:b+size]

            if size != real_size:
                image_tile = cv2.resize(image_tile, (real_size, real_size))
                # Nearest-neighbour keeps mask values as valid labels; the default
                # bilinear interpolation blends neighbouring labels at the borders.
                mask_tile = cv2.resize(mask_tile, (real_size, real_size),
                                       interpolation=cv2.INTER_NEAREST)

            tile_name = f'{stem}_{num}.png'
            for subdir, tile in (('images', image_tile), ('masks', mask_tile)):
                out_path = os.path.join(dir, subdir, tile_name)
                # cv2.imwrite returns False instead of raising when the write fails.
                if not cv2.imwrite(out_path, tile):
                    raise OSError(f'cv2.imwrite failed for {out_path}')
            num += 1

In [None]:
# Look masks up by file name. Pairing the globbed, sorted `images` list with
# df['mask_rle'] by position silently assigns wrong masks whenever the CSV is
# ordered differently or has a different number of rows.
rle_by_filename = dict(zip(df['img_path'].map(os.path.basename), df['mask_rle']))
images_without_mask = []

for img_path in tqdm(images):
    filename = os.path.basename(img_path)
    if filename not in rle_by_filename:
        images_without_mask.append(filename)
        continue
    mask_rle = rle_by_filename[filename]

    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread returns None for a missing or unreadable file instead of raising.
        raise FileNotFoundError(f'cv2.imread could not read {img_path}')
    mask = rle_decode(mask_rle, image.shape[:2])

    # divide_img numbers tiles with the module-level counter; restart it per image.
    num = 0
    # 1024 -> split 512 * 9 (stride 256 gives 3 window positions per axis)
    divide_img(filename=filename,
               dir=result_dir,
               image=image,
               mask=mask,
               stride=256,
               size=512,
               real_size=512)

if images_without_mask:
    print(f'skipped {len(images_without_mask)} images with no row in df')

In [None]:
# Read the entries of fold 0, one per line, with surrounding whitespace removed.
with open(os.path.join('./kfold', 'kfold_0/train.txt'), 'r') as f:
    train_list = list(map(str.strip, f))

with open(os.path.join('./kfold', 'kfold_0/val.txt'), 'r') as f:
    val_list = list(map(str.strip, f))

In [None]:
# Image and mask paths for every validation entry, in val_list order.
val_images = [f'./data/train_img/{name}.png' for name in tqdm(val_list)]
val_masks = [f'./data/train_mask/{name}.png' for name in val_list]

In [None]:
# Target folders for the validation tiles and their masks.
val_img_dir, val_mask_dir = 'val_img', 'val_mask'

for folder in (val_img_dir, val_mask_dir):
    os.makedirs(folder, exist_ok=True)

In [None]:
# NOTE: the bare module-level ``global num`` statement that used to sit here had
# no effect and was dropped. Callers must still assign ``num`` before calling.

def divide_img(filename, image, mask, stride=200, size=224, real_size=512):
    """Cut an image/mask pair into square tiles and save them as validation PNGs.

    A ``size`` x ``size`` window slides over the image with step ``stride``
    along both axes. Each tile is written to the module-level ``val_img_dir``
    and its mask to ``val_mask_dir`` under the name
    ``<filename without ".png">_<num>.png``.

    Args:
        filename: Source file name; ".png" is stripped to form the tile prefix.
        image: Array sliced as ``image[rows, cols, :]``.
        mask: Array sliced as ``mask[rows, cols]``; presumably the same
            height/width as ``image`` (not checked here).
        stride: Step between consecutive window origins, in pixels.
        size: Side length of the cropped window, in pixels.
        real_size: Side length tiles are resized to when it differs from ``size``.

    Side effects:
        Reads and increments the module-level counter ``num`` once per tile.

    Raises:
        OSError: If OpenCV reports that a tile could not be written.
    """
    global num
    # Derive the sliding range from the actual image instead of assuming 1024x1024.
    height, width = image.shape[:2]
    stem = filename.replace(".png", "")

    for a in range(0, height - size + 1, stride):
        for b in range(0, width - size + 1, stride):
            image_tile = image[a:a+size, b:b+size, :]
            mask_tile = mask[a:a+size, b:b+size]

            if size != real_size:
                image_tile = cv2.resize(image_tile, (real_size, real_size))
                # Nearest-neighbour keeps mask values as valid labels; the default
                # bilinear interpolation blends neighbouring labels at the borders.
                mask_tile = cv2.resize(mask_tile, (real_size, real_size),
                                       interpolation=cv2.INTER_NEAREST)

            tile_name = f'{stem}_{num}.png'
            for out_dir, tile in ((val_img_dir, image_tile), (val_mask_dir, mask_tile)):
                out_path = os.path.join(out_dir, tile_name)
                # cv2.imwrite returns False instead of raising when the write fails.
                if not cv2.imwrite(out_path, tile):
                    raise OSError(f'cv2.imwrite failed for {out_path}')
            num += 1

In [None]:
# Tile every validation image/mask pair into 224x224 crops without resizing.
for img_path, mask_path in zip(tqdm(val_images), val_masks):
    filename = os.path.basename(img_path)
    # The lists were built with a './data' prefix; the files are read without it.
    image_file = img_path.replace('/data', '')
    mask_file = mask_path.replace('/data', '')

    image = cv2.imread(image_file)
    mask = cv2.imread(mask_file, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for a missing or unreadable file instead of raising.
    if image is None:
        raise FileNotFoundError(f'cv2.imread could not read {image_file}')
    if mask is None:
        raise FileNotFoundError(f'cv2.imread could not read {mask_file}')

    # divide_img numbers tiles with the module-level counter; restart it per image.
    num = 0
    divide_img(filename=filename,
               image=image,
               mask=mask,
               stride=200,
               size=224,
               real_size=224)