In [None]:
import pandas as pd
import numpy as np

# Show up to 100 characters per cell so long string values (e.g. file paths) stay readable.
pd.set_option('max_colwidth', 100)

# Load the train and test metadata tables of the plant-pathology-2020-fgvc7 dataset.
train_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/train.csv')
test_df = pd.read_csv('../input/plant-pathology-2020-fgvc7/test.csv')

# Preview the first rows of the training table.
train_df.head()

In [None]:
# Directory that holds the images; each file path is built as '<IMAGE_DIR>/<image_id>.jpg'.
IMAGE_DIR = '/kaggle/input/plant-pathology-2020-fgvc7/images'
train_df['path'] = IMAGE_DIR + '/' + train_df['image_id'] + '.jpg'

# Collapse the one-hot target columns into one readable label so the data is easier to inspect.
def get_label(x):
    """Return the name of the first target column whose value is 1.

    Args:
        x: Row-like mapping (e.g. a DataFrame row) with the keys 'healthy',
            'multiple_diseases', 'rust' and 'scab'.

    Returns:
        The matching column name, checked in the order healthy,
        multiple_diseases, rust, scab; None when no column equals 1.
    """
    for column in ('healthy', 'multiple_diseases', 'rust', 'scab'):
        if x[column] == 1:
            return column
    # No target column is set for this row.
    return None

# apply(axis=1) already hands each row to the callable, so get_label is passed directly.
train_df['label'] = train_df.apply(get_label, axis = 1)
train_df.head()

In [None]:
# Count the samples per label to see how the classes are distributed.
train_df['label'].value_counts()

# Image 데이터 확인하기

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
%matplotlib inline

def show_images(image_path_list, augmentator = None, ncols = 4, title = None):
    """Display up to ``ncols`` images side by side in a single row.

    Args:
        image_path_list: Sequence of image file paths; at most the first
            ``ncols`` entries are drawn.
        augmentator: Optional callable invoked as ``augmentator(image=img)``
            that returns a mapping holding the transformed image under 'image'.
        ncols: Number of subplot columns to create.
        title: Title placed above every drawn image.

    Raises:
        FileNotFoundError: If an image cannot be read from its path.
    """
    # squeeze=False keeps `axs` two-dimensional even when ncols == 1, so the
    # row of axes can always be iterated.
    figure, axs = plt.subplots(figsize = (22, 4), nrows = 1, ncols = ncols, squeeze = False)
    axes_row = axs[0]

    # zip() stops at the shorter input, so a list with fewer than `ncols`
    # paths no longer raises IndexError.
    for ax, image_path in zip(axes_row, image_path_list):
        bgr_image = cv2.imread(image_path)
        if bgr_image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: {}'.format(image_path))
        image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

        if augmentator is not None:
            image = augmentator(image = image)['image']

        ax.imshow(image)
        ax.set_title(title)

    # Hide the axis decorations on every panel, including unused ones.
    for ax in axes_row:
        ax.axis('off')

# First six image paths of every label, selected with a single .loc lookup per class.
rust_image_list = train_df.loc[train_df['label'] == 'rust', 'path'].head(6).tolist()
scab_image_list = train_df.loc[train_df['label'] == 'scab', 'path'].head(6).tolist()
healthy_image_list = train_df.loc[train_df['label'] == 'healthy', 'path'].head(6).tolist()
multiple_diseases_list = train_df.loc[train_df['label'] == 'multiple_diseases', 'path'].head(6).tolist()

# One preview row per class, without augmentation.
show_images(image_path_list = rust_image_list, ncols = 6, title = 'rust')
show_images(image_path_list = scab_image_list, ncols = 6, title = 'scab')
show_images(image_path_list = healthy_image_list, ncols = 6, title = 'healthy')
show_images(image_path_list = multiple_diseases_list, ncols = 6, title = 'multiple_disease')

# Image Augmentator 적용
- Image Classification 시 문제를 야기할 수 있는 변환은 적용하지 않는 것이 좋습니다.
- Image가 중앙에 위치해 있으므로, Scale을 적용할 수 있습니다.

In [None]:
import albumentations as A

# Augmentation pipeline; each transform carries its own application probability `p`.
augmentator_01 = A.Compose([
    A.HorizontalFlip(p = 0.5),
    A.VerticalFlip(p = 0.5),
    # NOTE(review): albumentations documents a tuple scale_limit as an offset from 1, so
    # (0.7, 0.9) would mean a 1.7x-1.9x zoom-in rather than a shrink - confirm this is intended.
    A.ShiftScaleRotate(scale_limit = (0.7, 0.9), p = 0.5, rotate_limit = 30),
    A.RandomBrightnessContrast(brightness_limit = (-0.2, 0.2), contrast_limit = (-0.2, 0.2), p = 0.5),
    A.Blur(p = 0.2)
])

# Show the same sample images again, this time passed through the augmentation pipeline.
show_images(rust_image_list, title = 'rust' , augmentator = augmentator_01,  ncols = 6)
show_images(scab_image_list, title = 'scab' , augmentator = augmentator_01, ncols = 6)
show_images(healthy_image_list, title = 'healthy' , augmentator = augmentator_01, ncols = 6)
show_images(multiple_diseases_list, title = 'multiple_disease' , augmentator = augmentator_01, ncols = 6)

# Sequence Dataset 생성
- Image Size가 다르므로, image_size를 튜플로 입력합니다.
- Test Data에는 Label이 없는 경우를 고려합니다.

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence
import sklearn
import cv2

class Plant_dataset(Sequence):
    """Keras ``Sequence`` that reads, augments, resizes and preprocesses images per batch.

    Args:
        image_filenames: Array-like of image file paths.
        labels: Array-like of targets aligned with ``image_filenames``, or
            None when no labels exist (e.g. test data).
        image_size: Target ``(height, width)`` of every image in a batch.
        batch_size: Maximum number of images per batch.
        augmentator: Optional callable invoked as ``augmentator(image=img)``
            that returns a mapping holding the transformed image under 'image'.
        shuffle: When True, samples are re-ordered at construction time and
            after every epoch.
        pre_func: Optional function applied to each resized image before it
            is stored in the batch (e.g. a model-specific ``preprocess_input``).
    """

    def __init__(self, image_filenames, labels, image_size = (224, 224), batch_size = 64,
                 augmentator = None, shuffle = False, pre_func = None):
        # Let the Keras base class initialise its own state.
        super().__init__()

        self.image_filenames = image_filenames
        self.labels = labels
        self.image_size = image_size
        self.batch_size = batch_size
        self.augmentator = augmentator
        self.pre_func = pre_func
        self.shuffle = shuffle

        if self.shuffle:
            # Shuffle once up front so the first epoch is randomised as well.
            self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.image_filenames) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            Tuple ``(image_batch, label_batch)``: ``image_batch`` is a float32
            array of shape (n, height, width, 3); ``label_batch`` is the
            matching slice of ``labels``, or None when there are no labels.

        Raises:
            FileNotFoundError: If an image of the batch cannot be read.
        """
        start = index * self.batch_size
        stop = start + self.batch_size

        image_name_batch = self.image_filenames[start:stop]
        label_batch = self.labels[start:stop] if self.labels is not None else None

        height, width = self.image_size[0], self.image_size[1]
        image_batch = np.zeros((len(image_name_batch), height, width, 3), dtype = 'float32')

        for image_index, image_path in enumerate(image_name_batch):
            bgr_image = cv2.imread(image_path)
            if bgr_image is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise FileNotFoundError('Could not read image: {}'.format(image_path))
            image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

            if self.augmentator is not None:
                image = self.augmentator(image = image)['image']

            # cv2.resize takes the target size as (width, height), whereas
            # image_size is stored as (height, width).
            image = cv2.resize(image, (width, height))

            if self.pre_func is not None:
                image = self.pre_func(image)

            image_batch[image_index] = image

        return image_batch, label_batch

    def on_epoch_end(self):
        """Re-shuffle paths (and labels, in the same order) when ``shuffle`` is True."""
        if not self.shuffle:
            return

        order = np.random.permutation(len(self.image_filenames))
        self.image_filenames = np.asarray(self.image_filenames)[order]
        if self.labels is not None:
            self.labels = np.asarray(self.labels)[order]
    

In [None]:
def get_train_valid(train_df, valid_size = 0.2, random_state= 256):
    """Split image paths and one-hot targets into training and validation parts.

    Args:
        train_df: DataFrame with a 'path' column and the target columns
            'healthy', 'multiple_diseases', 'rust' and 'scab'.
        valid_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        Tuple ``(train_path, valid_path, train_label, valid_label)``.
    """
    target_columns = ['healthy', 'multiple_diseases', 'rust', 'scab']
    all_paths = train_df['path'].values
    all_labels = train_df[target_columns].values

    # train_test_split returns the pieces in the order
    # [paths_train, paths_valid, labels_train, labels_valid].
    return tuple(train_test_split(all_paths, all_labels,
                                  test_size = valid_size, random_state = random_state))

In [None]:
from tensorflow.keras.applications.xception import preprocess_input as xcp_preprocess_input
from tensorflow.keras.applications.efficientnet import preprocess_input as eff_preprocess_input

# Global defaults; the first make_submit_df definition in this file reads IMAGE_SIZE and BATCH_SIZE.
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 64

# Train/validation split with get_train_valid's default valid_size and random_state.
train_path, valid_path, train_label, valid_label = get_train_valid(train_df)

# Training dataset uses augmentation; the validation dataset does not.
# Both apply the Xception preprocessing function to every image.
tr_ds = Plant_dataset(train_path, train_label, image_size= IMAGE_SIZE, batch_size= BATCH_SIZE,
                      augmentator= augmentator_01, pre_func= xcp_preprocess_input)
va_ds = Plant_dataset(valid_path, valid_label, image_size= IMAGE_SIZE, batch_size= BATCH_SIZE, pre_func= xcp_preprocess_input)

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Conv2D, Dropout, Flatten, Activation, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.metrics import AUC

from tensorflow.keras.applications import Xception, ResNet50V2, EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3
from tensorflow.keras.applications import EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7
import tensorflow as tf

def create_model(model_type = 'efficientnetb0', in_shape = (224, 224, 3), n_classes = 4):
    input_tensor = Input(shape = in_shape)
    
    if model_type == 'resnet50v2':
        base_model = tf.keras.applications.ResNet50V2(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'xception':
        base_model = tf.keras.applications.Xception(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb0':
        base_model = tf.keras.applications.EfficientNetB0(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb1':
        base_model = tf.keras.applications.EfficientNetB1(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb2':
        base_model = tf.keras.applications.EfficientNetB2(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb3':
        base_model = tf.keras.applications.EfficientNetB3(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb4':
        base_model = tf.keras.applications.EfficientNetB4(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb5':
        base_model = tf.keras.applications.EfficientNetB5(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb6':
        base_model = tf.keras.applications.EfficientNetB6(include_top=False, weights='imagenet', input_tensor=input_tensor)
    elif model_type == 'efficientnetb7':
        base_model = tf.keras.applications.EfficientNetB7(include_top=False, weights='imagenet', input_tensor=input_tensor)
        
    
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation = 'relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(4, activation = 'softmax')(x)
    model = Model(inputs = input_tensor, outputs = preds)
    
    return model

In [None]:
def make_submit_df(test_df, model, config = None):
    """Predict on the test images and build the submission DataFrame.

    Args:
        test_df: DataFrame with 'image_id' and 'path' columns.
        model: Trained model exposing ``predict``.
        config: Optional object with IMAGE_SIZE, BATCH_SIZE and PRE_FUNC
            attributes. When None, the module-level IMAGE_SIZE, BATCH_SIZE
            and ``xcp_preprocess_input`` are used.

    Returns:
        DataFrame with 'image_id' followed by one probability column per class.
    """
    if config is None:
        image_size, batch_size, pre_func = IMAGE_SIZE, BATCH_SIZE, xcp_preprocess_input
    else:
        image_size, batch_size, pre_func = config.IMAGE_SIZE, config.BATCH_SIZE, config.PRE_FUNC

    test_path = test_df['path'].values

    # Test data has no labels, so the dataset is built with labels=None.
    test_ds = Plant_dataset(image_filenames= test_path, labels = None, image_size= image_size,
                            batch_size= batch_size, pre_func= pre_func)

    preds = model.predict(test_ds)

    # Reuse test_df's index so the column-wise concat lines up row by row even
    # when test_df does not carry a default 0..n-1 index.
    preds_df = pd.DataFrame(preds, columns = ['healthy', 'multiple_diseases', 'rust', 'scab'],
                            index = test_df.index)
    submit_df = pd.concat([test_df['image_id'], preds_df], axis = 1)

    return submit_df

# 일반적으로 Class를 만들어 적용하는 게 좋습니다.
 - 다양한 학습을 편리하게 할 수 있기 때문입니다.

In [None]:
from tensorflow.keras.applications.efficientnet import preprocess_input as eff_preprocess_input
from tensorflow.keras.applications.xception import preprocess_input as xcp_preprocess_input
import tensorflow as tr

# Learning-rate schedule functions

def lrfn_01(epoch):
    """Return the learning rate for ``epoch``: linear warm-up, plateau, then step decay.

    The rate rises linearly from 1e-5 towards 1e-4 during the first 2 epochs,
    stays at 1e-4 for 1 epoch, and is then multiplied by 0.75 once every
    2 further epochs.
    """
    start_lr, peak_lr = 1e-5, 1e-4
    warmup_epochs, hold_epochs = 2, 1
    decay_factor = 0.75

    # Warm-up phase.
    if epoch < warmup_epochs:
        return (peak_lr - start_lr) / warmup_epochs * epoch + start_lr

    # Plateau phase.
    if epoch < warmup_epochs + hold_epochs:
        return peak_lr

    # Decay phase: one decay step per two epochs after the plateau.
    decay_steps = (epoch - warmup_epochs - hold_epochs) // 2
    return peak_lr * decay_factor**decay_steps

def lrfn_02(epoch):
    """Return the learning rate for ``epoch``: linear warm-up, plateau, then step decay.

    The rate rises linearly from 1e-6 towards 2e-5 during the first 2 epochs,
    stays at 2e-5 for 1 epoch, and is then multiplied by 0.75 once every
    2 further epochs.
    """
    start_lr, peak_lr = 1e-6, 2e-5
    warmup_epochs, hold_epochs = 2, 1
    decay_factor = 0.75

    # Warm-up phase.
    if epoch < warmup_epochs:
        return (peak_lr - start_lr) / warmup_epochs * epoch + start_lr

    # Plateau phase.
    if epoch < warmup_epochs + hold_epochs:
        return peak_lr

    # Decay phase: one decay step per two epochs after the plateau.
    decay_steps = (epoch - warmup_epochs - hold_epochs) // 2
    return peak_lr * decay_factor**decay_steps

# Per-epoch learning-rate schedulers driven by lrfn_01 / lrfn_02.
lr01_cb = LearningRateScheduler(lrfn_01, verbose = 1)
lr02_cb = LearningRateScheduler(lrfn_02, verbose = 1)
# Plateau-based LR reduction on val_loss; not referenced by the Config classes visible in this file.
rlr_cb = ReduceLROnPlateau(monitor= 'val_loss', factor = 0.2, patience = 3, mode = 'min', verbose = 1)
# Early stopping on val_loss with a patience of 10 epochs.
ely_cb = EarlyStopping(monitor = 'val_loss', patience = 10, mode = 'min', verbose = 1)

# Augmentation pipeline referenced by Config.AUGMENTOR; it lists the same transforms as augmentator_01.
augmentor_01 = A.Compose([
    A.HorizontalFlip(p = 0.5),
    A.VerticalFlip(p = 0.5),
    # NOTE(review): albumentations documents a tuple scale_limit as an offset from 1, so
    # (0.7, 0.9) would mean a 1.7x-1.9x zoom-in rather than a shrink - confirm this is intended.
    A.ShiftScaleRotate(scale_limit=(0.7, 0.9), p = 0.5, rotate_limit= 30),
    A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p = 0.5),
    A.Blur(p = 0.2)
])


class Config:
    """Training settings read by train_model and make_submit_df (Xception variant)."""
    # Backbone key passed to create_model.
    MODEL_TYPE = 'xception'
    # (height, width) used for the datasets and for the model input shape.
    IMAGE_SIZE = (320, 512)
    BATCH_SIZE = 32
    # Not read by train_model.
    N_EPOCHS = 10
    # When True, train_model runs two stages: frozen backbone first, then unfrozen layers.
    IS_FINE_TUNING = False
    # Epoch count for the first (or only) training run.
    FIRST_EPOCHS = 15
    # Epoch count intended for the second, fine-tuning run.
    SECOND_EPOCHS = 15
    # Callbacks for the first and the second training run respectively.
    FIRST_CALLBACKS = [lr01_cb, ely_cb]
    SECOND_CALLBACKS = [lr02_cb, ely_cb]
    # Augmentation applied to the training dataset only.
    AUGMENTOR = augmentor_01
    # Preprocessing function the datasets apply to every image.
    PRE_FUNC = xcp_preprocess_input
    # Learning rate handed to the Adam optimizer in train_model.
    INITIAL_LR = 0.0001
    # Flag checked by train_model's debug branch.
    DEBUG = True

# Train_Model 생성하기


In [None]:
from tensorflow.keras.optimizers import Adam

def train_model(train_df, config = Config):
    """Train a classifier on ``train_df`` according to ``config``.

    The data is split 80/20 into training and validation parts, wrapped in
    ``Plant_dataset`` objects, and fitted with the backbone named by
    ``config.MODEL_TYPE``. When ``config.IS_FINE_TUNING`` is True, training
    runs in two stages: first only the last four layers are trainable, then
    the layers are unfrozen and training continues with the second-stage
    epochs and callbacks.

    Args:
        train_df: DataFrame with a 'path' column and the four target columns.
        config: Object exposing MODEL_TYPE, IMAGE_SIZE, BATCH_SIZE, AUGMENTOR,
            PRE_FUNC, INITIAL_LR, IS_FINE_TUNING, FIRST_EPOCHS, SECOND_EPOCHS,
            FIRST_CALLBACKS, SECOND_CALLBACKS and DEBUG.

    Returns:
        Tuple ``(model, history)`` where ``history`` is the return value of
        the last ``model.fit`` call.
    """
    train_path, valid_path, train_label, valid_label = get_train_valid(train_df, valid_size = 0.2, random_state = 256)

    tr_ds = Plant_dataset(train_path, train_label, image_size = config.IMAGE_SIZE, batch_size = config.BATCH_SIZE,
                         augmentator = config.AUGMENTOR, pre_func = config.PRE_FUNC)
    val_ds = Plant_dataset(valid_path, valid_label, image_size = config.IMAGE_SIZE, batch_size = config.BATCH_SIZE,
                           augmentator = None, pre_func= config.PRE_FUNC)

    if config.DEBUG:
        # Cheap sanity check that does not load any image.
        print('### DEBUG : train/valid samples', len(train_path), len(valid_path),
              '| train/valid batches', len(tr_ds), len(val_ds))

    # Build the model.
    print('### 모델 생성 : ', config.MODEL_TYPE)
    model = create_model(model_type=config.MODEL_TYPE, in_shape= (config.IMAGE_SIZE[0], config.IMAGE_SIZE[1], 3))

    def compile_model():
        # `learning_rate` replaces the deprecated `lr` keyword. Changes to
        # `layer.trainable` only take effect once the model is compiled again,
        # so this is called after every change.
        model.compile(optimizer = Adam(learning_rate = config.INITIAL_LR),
                      loss = 'categorical_crossentropy', metrics = [AUC()])

    if config.IS_FINE_TUNING:
        print('### 미세조정')
        # Stage 1: freeze everything except the last four layers.
        for layer in model.layers[:-4]:
            layer.trainable = False
        compile_model()

        print('### Classification Layer 학습')
        history = model.fit(tr_ds, epochs = config.FIRST_EPOCHS, validation_data = val_ds,
                            callbacks = (config.FIRST_CALLBACKS), verbose = 1)

        # Stage 2: unfreeze the layers. For EfficientNet backbones the
        # BatchNormalization layers are left as they are.
        is_efficientnet = 'efficientnet' in config.MODEL_TYPE
        for layer in model.layers:
            if is_efficientnet and isinstance(layer, BatchNormalization):
                continue
            layer.trainable = True
        compile_model()

        print('### Fine Tunning 미세 조정 완료')
        print('### 두번째 학습 시행')
        history = model.fit(tr_ds, epochs = config.SECOND_EPOCHS, validation_data = val_ds,
                            callbacks = (config.SECOND_CALLBACKS), verbose = 1)

    else:
        compile_model()
        print('학습 수행')
        history = model.fit(tr_ds, epochs = config.FIRST_EPOCHS, validation_data = val_ds,
                            callbacks = (config.FIRST_CALLBACKS), verbose = 1)

    return model, history

# EfficientNetB7로 Pretrained 수행

In [None]:
class Config:
    """Training settings for the EfficientNetB7 run; redefines the earlier Config class.

    train_model's default ``config`` argument was bound when that function was
    defined, so this class has to be passed explicitly to take effect.
    """
    # Backbone key passed to create_model.
    MODEL_TYPE = 'efficientnetb7'
    # (height, width) used for the datasets and for the model input shape.
    IMAGE_SIZE = (456, 456)
    BATCH_SIZE = 4
    # Not read by train_model.
    N_EPOCHS = 20
    # When True, train_model runs two stages: frozen backbone first, then unfrozen layers.
    IS_FINE_TUNING = False
    # Epoch count for the first (or only) training run.
    FIRST_EPOCHS = 15
    # Epoch count intended for the second, fine-tuning run.
    SECOND_EPOCHS = 15
    # Callbacks for the first and the second training run respectively.
    FIRST_CALLBACKS = [lr01_cb, ely_cb]
    SECOND_CALLBACKS = [lr02_cb, ely_cb]
    # Augmentation applied to the training dataset only.
    AUGMENTOR = augmentor_01
    # EfficientNet preprocessing function the datasets apply to every image.
    PRE_FUNC = eff_preprocess_input
    # Learning rate handed to the Adam optimizer in train_model.
    INITIAL_LR = 0.0001
    # Flag checked by train_model's debug branch.
    DEBUG = True

In [None]:
# Train with the EfficientNetB7 Config; yields the fitted model and the result of the last fit() call.
effb7_model, history = train_model(train_df, config = Config)

In [None]:
def make_submit_df(test_df, model, config=Config):
    """Predict on the test images and build the submission DataFrame.

    Args:
        test_df: DataFrame with 'image_id' and 'path' columns.
        model: Trained model exposing ``predict``.
        config: Object with IMAGE_SIZE, BATCH_SIZE and PRE_FUNC attributes
            that define how the test images are prepared.

    Returns:
        DataFrame with 'image_id' followed by one probability column per class.
    """
    test_path = test_df['path'].values
    # Test data has no labels, so the dataset is built with labels=None.
    test_ds = Plant_dataset(image_filenames=test_path, labels=None, image_size=config.IMAGE_SIZE, batch_size=config.BATCH_SIZE,
                            augmentator=None, shuffle=False, pre_func=config.PRE_FUNC)

    # Run inference over the whole test dataset.
    preds = model.predict(test_ds)

    # Put the predictions in their own DataFrame. Reusing test_df's index keeps
    # the column-wise concat aligned row by row even when test_df does not
    # carry a default 0..n-1 index.
    preds_df = pd.DataFrame(preds, columns=['healthy', 'multiple_diseases', 'rust', 'scab'],
                            index=test_df.index)

    # Join the image ids with the predictions to form the submission table.
    submit_df = pd.concat([test_df['image_id'], preds_df], axis = 1)

    return submit_df

In [None]:
IMAGE_DIR = '/kaggle/input/plant-pathology-2020-fgvc7/images'
test_df = pd.read_csv("../input/plant-pathology-2020-fgvc7/test.csv")
test_df['path'] = IMAGE_DIR + '/' + test_df['image_id'] + '.jpg'

# Predict with the trained model. `history` is the value returned by fit(),
# not a model, so it must not be passed (or indexed) here.
submit_df = make_submit_df(test_df, effb7_model, config=Config)

In [None]:
# Inspect the submission table before writing it to disk.
submit_df

In [None]:
# Write the submission file without the DataFrame index column.
submit_df.to_csv('submission.csv', index = False)