# 준비

## 데이터셋 다운로드

In [2]:
import kaggle

In [4]:
# !kaggle competitions download -c histopathologic-cancer-detection

## 라이브러리 호출

In [1]:
import tensorflow as tf
import tensorflow_io as tfio
import numpy as np
import pandas as pd

## 데이터 info

In [2]:
# Load the competition label table (one row per training image: id, label).
all_data = pd.read_csv(
    filepath_or_buffer="D:Dataset/dataset/Kaggle/histopathologic-cancer-detection/train_labels.csv"
)

def id_path(x):
    """Build the path of the training ``.tif`` file that belongs to image id ``x``.

    ``x`` must be a string; the id is placed between the training directory
    prefix and the ``.tif`` extension.
    """
    parts = (
        "D:Dataset/dataset/Kaggle/histopathologic-cancer-detection/train/",
        x,
        ".tif",
    )
    return "".join(parts)

# Replace every bare image id with the full path of its .tif file, then show the table.
all_data["id"] = all_data["id"].map(id_path)
display(all_data)

Unnamed: 0,id,label
0,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
1,D:Dataset/dataset/Kaggle/histopathologic-cance...,1
2,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
3,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
4,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
...,...,...
220020,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
220021,D:Dataset/dataset/Kaggle/histopathologic-cance...,1
220022,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
220023,D:Dataset/dataset/Kaggle/histopathologic-cance...,0


In [3]:
from glob import glob
# List every file in the training directory and show how many there are plus one example.
all_data_path = glob("D:Dataset/dataset/Kaggle/histopathologic-cancer-detection/train/*")
print(f"all_data counts : {len(all_data_path)}")
print(f"all_data[0] sample : {all_data_path[0]}")

all_data counts : 220025
all_data[0] sample : D:Dataset/dataset/Kaggle/histopathologic-cancer-detection/train\00001b2b5609af42ab0ab276dd4cd41c3e7745b5.tif


In [5]:
from sklearn.model_selection import train_test_split as tts
# Hold out 10% of the rows for validation; stratifying on the label keeps the
# class ratio the same in both splits, and the fixed seed makes the split repeatable.
train_data, valid_data = tts(
    all_data,
    test_size=0.1,
    random_state=905,
    shuffle=True,
    stratify=all_data["label"],
)
print(f"train_data.shape :  {train_data.shape}")
print(f"valid_data.shape :  {valid_data.shape}")
display(train_data)

train_data.shape :  (198022, 2)
valid_data.shape :  (22003, 2)


Unnamed: 0,id,label
27656,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
170078,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
136124,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
151390,D:Dataset/dataset/Kaggle/histopathologic-cance...,1
94886,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
...,...,...
101866,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
20635,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
11131,D:Dataset/dataset/Kaggle/histopathologic-cance...,0
105157,D:Dataset/dataset/Kaggle/histopathologic-cance...,1


## 데이터 셋

In [30]:
from PIL import Image
import cv2
from numpy import random

resize, horizontal_flip, crop, vertical_flip, rotate, hue, contrast

In [32]:
def do_random_flip(image):
    """Randomly mirror a 3-D (rows, cols, channels) image array.

    Three independent draws, each succeeding with probability 0.5, decide
    whether to apply ``cv2.flip`` with code 0, ``cv2.flip`` with code 1, and a
    swap of the two spatial axes (reflection across the y = x diagonal).

    Returns:
        The (possibly) mirrored image as a C-contiguous array.
    """
    # One draw per flip code, in the order 0 then 1.
    for flip_code in (0, 1):
        if np.random.rand() > 0.5:
            image = cv2.flip(image, flip_code)

    # Diagonal reflection: axes (0, 1, 2) -> (1, 0, 2).
    swap_axes = np.random.rand() > 0.5
    if swap_axes:
        image = image.transpose(1, 0, 2)

    # transpose() only produces a view, so force a contiguous memory layout.
    return np.ascontiguousarray(image)

def do_random_rot90(image):
    """Rotate an image by a random multiple of 90 degrees.

    The number of quarter turns is drawn uniformly from {0, 1, 2, 3}, so the
    identity and each of the three rotations are equally likely.

    The previous implementation drew from
    ``[0, cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_90_COUNTERCLOCKWISE, cv2.ROTATE_180]``
    and used ``0`` as the "no rotation" marker. ``cv2.ROTATE_90_CLOCKWISE`` is
    itself ``0``, so the clockwise rotation was never applied and the identity
    was chosen half of the time. Drawing the number of quarter turns directly
    avoids the collision.

    Args:
        image: Array whose first two axes are the spatial axes.

    Returns:
        ``image`` itself when no rotation is drawn, otherwise a new
        C-contiguous array holding the rotated image (the two spatial
        dimensions are swapped for odd numbers of quarter turns).
    """
    quarter_turns = np.random.randint(4)
    if quarter_turns == 0:
        return image

    # np.rot90 returns a view; make it contiguous so it can be handed to
    # OpenCV functions afterwards.
    return np.ascontiguousarray(np.rot90(image, quarter_turns))
        
def do_random_contast(image, mag=0.3):
    """Scale all pixel values by one random gain and clip the result to [0, 1].

    The gain is drawn uniformly from [1 - mag, 1 + mag]; with the default
    ``mag`` of 0.3 that is the range 0.7 .. 1.3.
    """
    gain = 1 + mag * random.uniform(-1, 1)
    return np.clip(gain * image, 0, 1)

def do_random_hsv(image, mag=[0.15,0.25,0.25]):
    """Randomly scale the hue, saturation and value channels of an image.

    The input is multiplied by 255 and cast to uint8, converted to HSV, each
    channel is multiplied by its own random gain drawn from
    ``[1 - mag[i], 1 + mag[i]]``, and the result is converted back.

    Args:
        image: Array that is multiplied by 255 before the uint8 cast.
            NOTE(review): presumably float pixels in [0, 1] - confirm with callers.
        mag: Three magnitudes, for hue, saturation and value respectively.

    Returns:
        float32 array, divided by 255 after the conversion back from HSV.

    NOTE(review): the conversion codes used are BGR<->HSV; if the caller
    supplies RGB data the hue axis is interpreted with R and B swapped - confirm
    this is acceptable.
    """
    pixels_u8 = (image*255).astype(np.uint8)
    hsv = cv2.cvtColor(pixels_u8, cv2.COLOR_BGR2HSV)

    # Work on float copies so the scaling is not truncated by uint8 arithmetic.
    hue, sat, val = [hsv[:, :, ch].astype(np.float32) for ch in range(3)]

    # Gains are drawn in the order hue -> saturation -> value.
    hue_gain = 1 + random.uniform(-1,1)*mag[0]
    sat_gain = 1 + random.uniform(-1,1)*mag[1]
    val_gain = 1 + random.uniform(-1,1)*mag[2]

    hue = (hue*hue_gain) % 180  # wrap hue around instead of saturating
    sat = sat*sat_gain
    val = val*val_gain

    # Write the channels back, clipped to their upper bounds.
    for ch, (plane, upper) in enumerate(((hue, 180), (sat, 255), (val, 255))):
        hsv[:, :, ch] = np.clip(plane, 0, upper).astype(np.uint8)

    bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    return bgr.astype(np.float32)/255

def do_random_noise(image, mag=0.1):
    """Add uniform random noise to an image and clip the result to [0, 1].

    One noise value in ``[-mag, mag]`` is drawn per pixel position; the noise
    array has a trailing axis of length 1, so the same offset is broadcast
    over that position's last axis.
    """
    rows, cols = image.shape[:2]
    jitter = mag * np.random.uniform(-1, 1, (rows, cols, 1))
    return np.clip(image + jitter, 0, 1)

def do_random_rotate_scale(image, angle=30, scale=(0.8, 1.2)):
    """Rotate and zoom an image about its centre by random amounts.

    Args:
        image: Array whose first two axes are (height, width).
        angle: Maximum absolute rotation in degrees; the applied angle is drawn
            uniformly from ``[-angle, angle]``.
        scale: ``(low, high)`` pair from which the zoom factor is drawn
            uniformly, or ``None`` to keep the scale fixed at 1.

    Returns:
        The warped image with the same width and height as the input; pixels
        that fall outside the source are filled with 0.
    """
    angle = np.random.uniform(-angle, angle)
    scale = np.random.uniform(*scale) if scale is not None else 1

    height, width = image.shape[:2]
    # OpenCV expects the centre as an (x, y) point, i.e. (column, row). The
    # previous (height // 2, width // 2) ordering was only correct for square
    # images and rotated rectangular ones about an off-centre point.
    center = (width // 2, height // 2)

    transform = cv2.getRotationMatrix2D(center, angle, scale)
    return cv2.warpAffine(
        image,
        transform,
        (width, height),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=(0, 0, 0),
    )

In [33]:
def train_augment5b(image):
    """Apply the training augmentation pipeline to a single image.

    Steps, in order:
      1. random flip, then random 90-degree rotation (always applied);
      2. two draws (with replacement) from {identity, noise, contrast, HSV};
      3. one draw from {identity, random rotate + scale}.

    Returns the augmented image.
    """
    image = do_random_rot90(do_random_flip(image))

    photometric_ops = [
        lambda image: (image),
        lambda image: do_random_noise(image, mag=0.1),
        lambda image: do_random_contast(image, mag=0.40),
        lambda image: do_random_hsv(image, mag=[0.40, 0.40, 0]),
    ]
    geometric_ops = [
        lambda image: (image),
        lambda image: do_random_rotate_scale(image, angle=45, scale=[0.50, 2.0]),
    ]

    # Pixel-value augmentations: the same op may be picked twice.
    for op in np.random.choice(photometric_ops, 2):
        image = op(image)

    # Geometric augmentation: applied with probability 1/2.
    for op in np.random.choice(geometric_ops, 1):
        image = op(image)

    return image

In [34]:
class ImageDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that reads images listed in a DataFrame batch by batch.

    Each batch is a pair ``(X, y)`` where ``X`` has shape
    ``(n, dim, dim, 3)`` with pixel values divided by 255 and ``y`` holds the
    matching values of the ``label`` column.

    Args:
        df: DataFrame with an ``id`` column (image file paths) and a ``label``
            column.
        batch_size: Number of rows per batch; the last batch may be smaller.
        train: When true, every image goes through ``train_augment5b`` and the
            rows are reshuffled at the end of each epoch.
        dim: Side length the images are resized to. When omitted, the
            notebook-level ``img_size`` variable is used, as before.
    """

    def __init__(self, df, batch_size=32, train=False, dim=None):
        super().__init__()
        self.df = df.reset_index(drop=True)
        # Fall back to the global for callers that rely on the old behaviour.
        self.dim = img_size if dim is None else dim
        self.train = train
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch as a plain ``int``."""
        return int(np.ceil(len(self.df) / self.batch_size))

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch (training mode only)."""
        if self.train:
            self.df = self.df.sample(frac=1.0).reset_index(drop=True)

    def __getitem__(self, idx):
        """Load, resize, scale and (in training mode) augment batch ``idx``."""
        start = idx * self.batch_size
        batch = self.df.iloc[start:start + self.batch_size]
        paths = batch["id"].values
        labels = batch["label"].values

        X = np.zeros((len(paths), self.dim, self.dim, 3))

        for i, path in enumerate(paths):
            # The context manager closes the file handle as soon as the pixels
            # are copied out; convert("RGB") guarantees three channels.
            with Image.open(f"{path}") as handle:
                resized = handle.convert("RGB").resize((self.dim, self.dim))
                image = np.array(resized) / 255.
            # Use the instance flag: the bare name `train` referred to a
            # global that is not defined anywhere in the notebook.
            if self.train:
                image = train_augment5b(image)
            X[i] = image

        return X, labels

In [52]:
batch_size = 16
img_size = 224

# Training batches are augmented and reshuffled every epoch.
train_loader = ImageDataGenerator(train_data, batch_size, True)
# Validation must read the held-out split; it was previously built from
# train_data, which made the validation metrics a second look at training data.
valid_loader = ImageDataGenerator(valid_data, batch_size, False)

In [40]:
# Sanity check: pull the first batch only and show the image / label array shapes.
for x, y in train_loader:
    print(x.shape, y.shape, sep="\n")
    break

(16, 224, 224, 3)
(16,)


# 모델

In [65]:
def build_model():
    """Build the binary classifier: ResNet50 backbone plus a small dense head.

    The network takes 224 x 224 x 3 inputs, runs them through ResNet50 without
    its classification top (all other constructor arguments left at their
    defaults), global-average-pools the feature map, and finishes with a
    64-unit leaky-ReLU layer and a single sigmoid output.

    Returns:
        An uncompiled ``tf.keras`` model.
    """
    inputs = tf.keras.layers.Input(shape=(224,224,3))
    features = tf.keras.applications.ResNet50(include_top=False)(inputs)
    pooled = tf.keras.layers.GlobalAveragePooling2D()(features)
    hidden = tf.keras.layers.Dense(64, activation="leaky_relu")(pooled)
    probability = tf.keras.layers.Dense(1, activation="sigmoid")(hidden)

    return tf.keras.models.Model(inputs, probability)

In [66]:
# Instantiate the classifier and print its layer / parameter overview.
model = build_model()
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 224, 224, 3)]     0         
                                                                 
 resnet50 (Functional)       (None, None, None, 2048)  23587712  
                                                                 
 global_average_pooling2d_1   (None, 2048)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_9 (Dense)             (None, 64)                131136    
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 23,718,913
Trainable params: 23,665,793
Non-trainable params: 53,120
__________________________________________

In [69]:
import tensorflow_addons as tfa
# Binary cross-entropy matches the single sigmoid output of the model.
losses = tf.keras.losses.BinaryCrossentropy()
# AdamW applies decoupled weight decay on top of Adam.
optimizers = tfa.optimizers.AdamW(
    learning_rate=1e-4,
    weight_decay=1e-3,
)

In [71]:
# Train for two epochs, reporting accuracy on the validation loader after each one.
model.compile(
    optimizer=optimizers,
    loss=losses,
    metrics=["acc"],
)
model.fit(
    x=train_loader,
    validation_data=valid_loader,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2a149fb6b20>

In [72]:
# Persist the trained model to an HDF5 file in the current working directory.
model.save("keep.h5")