In [2]:
# Hunter Mitchell - 5/26/20 - Plant Pathology 2020 Competition Code

# This code uses TPU and an ensemble of multiple deep learning models to classify the diseases on images of apple leaves



!pip install -q efficientnet


import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
import tensorflow as tf

import cv2

import tensorflow.keras.layers as L
import efficientnet.tfkeras as efn
from tensorflow.keras.applications import DenseNet201,InceptionResNetV2,Xception,ResNet50V2
from keras.preprocessing.image import ImageDataGenerator

from kaggle_datasets import KaggleDatasets

from sklearn.model_selection import train_test_split,KFold
from sklearn.utils.class_weight import compute_class_weight







### SETTINGS ###

# Target image size in pixels (the original images are 1365 x 2048)
img_size_x = 512
img_size_y = 512

# Cross-validation: when kfolding is True each model is trained once per
# K-fold split (FOLDS splits); otherwise a single hold-out split is used
kfolding = False
FOLDS = 5

# Hold-out split: fraction of the training data kept for validation, and the
# random seed used by both the hold-out split and the K-fold shuffling
TEST_SIZE = 0.15
SEED = 2020

# Number of different models to ensemble (up to 5)
number_of_models = 3

# Training settings; BATCH_SIZE is multiplied by the replica count later on
BATCH_SIZE = 8
EPOCHS = 35







### TPU detection and distribution strategy setup
def setUpTPU():
    """Detect the available hardware and return a matching distribution strategy.

    A TPU cluster resolver is created first; if that raises ``ValueError`` the
    default TensorFlow strategy is used instead. The replica count of the
    chosen strategy is printed before it is returned.

    Returns:
        The ``tf.distribute`` strategy to build and train models under.
    """
    try:
        # No parameters are necessary when the TPU_NAME environment variable
        # is set. On Kaggle this is always the case.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', resolver.master())
    except ValueError:
        resolver = None

    if not resolver:
        # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
        dist_strategy = tf.distribute.get_strategy()
    else:
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        dist_strategy = tf.distribute.experimental.TPUStrategy(resolver)

    print('Number of devices: {}'.format(dist_strategy.num_replicas_in_sync))
    return dist_strategy



def format_path(st):
    """Return the path of the JPEG image for the image id ``st``.

    The path is rooted at the module-level ``GCS_DS_PATH``.
    """
    # Local alternative root: "../input/plant-pathology-2020-fgvc7/images/"
    return ''.join((GCS_DS_PATH, '/images/', st, '.jpg'))



def format_paths():
    """Build the image path arrays and the training label matrix.

    Reads the module-level ``train`` and ``test`` frames.

    Returns:
        A ``(train_paths, test_paths, train_labels)`` tuple: the formatted
        image paths of the train and test frames, and the values of every
        ``train`` column from ``'healthy'`` onwards.
    """
    def to_paths(frame):
        # One formatted path per image id, as a plain array.
        return frame['image_id'].apply(format_path).values

    label_matrix = train.loc[:, 'healthy':].values
    return to_paths(train), to_paths(test), label_matrix




### Learning rate schedule - adapted from https://www.kaggle.com/tarunpaparaju/plant-pathology-2020-eda-models
def lrfn(epoch):
    """Return the learning rate to use for ``epoch``.

    The schedule ramps up linearly from the start rate to the peak rate,
    holds the peak for a few epochs, then decays exponentially towards the
    floor rate. The peak rate is scaled by ``strategy.num_replicas_in_sync``.
    """
    start_rate = 0.00001
    peak_rate = 0.0001 * strategy.num_replicas_in_sync  # maybe change this
    floor_rate = 0.00001
    rampup_epochs = 8
    sustain_epochs = 3
    decay_factor = .8

    # Linear warm-up phase.
    if epoch < rampup_epochs:
        return (peak_rate - start_rate) / rampup_epochs * epoch + start_rate

    # Plateau at the peak rate.
    decay_start = rampup_epochs + sustain_epochs
    if epoch < decay_start:
        return peak_rate

    # Exponential decay towards the floor rate.
    return (peak_rate - floor_rate) * decay_factor**(epoch - decay_start) + floor_rate



### Plotting the learning rate schedule
def plot_lr():
    """Plot the learning rate schedule over ``EPOCHS`` epochs and print a summary.

    The printed summary shows the first, the maximum and the last rate.
    """
    epochs = list(range(EPOCHS))
    rates = list(map(lrfn, epochs))
    plt.plot(epochs, rates)
    summary = "Learning rate schedule: {:.3g} to {:.3g} to {:.3g}"
    print(summary.format(rates[0], max(rates), rates[-1]))



def print_count(df,string):
    """Print how many rows of ``df`` have the value 1 in column ``string``."""
    is_set = df[string] == 1
    row_count = len(df[is_set])
    print(string, 'column has', row_count, 'values')


### Shared builder for every ensemble member
def _build_classifier(backbone):
    """Build and compile a softmax classifier on top of ``backbone``.

    Args:
        backbone: Callable returning a Keras model when called with
            ``input_shape``, ``weights`` and ``include_top`` keyword
            arguments (e.g. ``DenseNet201``).

    Returns:
        A compiled ``tf.keras.Sequential`` model: the ImageNet-initialised
        backbone without its top, global average pooling, and a dense softmax
        layer with one unit per column of the module-level ``train_labels``.
    """
    # Model creation and compilation both happen inside the strategy scope,
    # exactly as each individual getModelN function did before.
    with strategy.scope():
        model = tf.keras.Sequential([
            backbone(
                input_shape=(img_size_y, img_size_x, 3),
                weights='imagenet',
                include_top=False
            ),
            L.GlobalAveragePooling2D(),
            L.Dense(train_labels.shape[1], activation='softmax')
        ])

        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['categorical_accuracy']
        )

        return model


### EfficientNet Model
def getModel1():
    """Return a compiled classifier built on ``efn.EfficientNetB7``."""
    return _build_classifier(efn.EfficientNetB7)


### Inception ResNet Model
def getModel2():
    """Return a compiled classifier built on ``InceptionResNetV2``."""
    return _build_classifier(InceptionResNetV2)


### DenseNet Model
def getModel3():
    """Return a compiled classifier built on ``DenseNet201``."""
    return _build_classifier(DenseNet201)


### Xception Model
def getModel4():
    """Return a compiled classifier built on ``Xception``."""
    return _build_classifier(Xception)


### ResNet50 Model
def getModel5():
    """Return a compiled classifier built on ``ResNet50V2``."""
    return _build_classifier(ResNet50V2)


def _select_model(i):
    """Build ensemble member ``i`` (0-4); raise ValueError for any other index."""
    builders = (getModel1, getModel2, getModel3, getModel4, getModel5)
    if not 0 <= i < len(builders):
        raise ValueError(
            'model index must be between 0 and {}, got {!r}'.format(len(builders) - 1, i)
        )
    return builders[i]()


def _balanced_class_weights(labels):
    """Return a ``{class index: weight}`` dict of balanced weights for one-hot ``labels``."""
    class_ids = labels.argmax(axis=1)
    classes = np.unique(class_ids)
    # Keyword arguments: newer scikit-learn releases reject the positional form.
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=class_ids)
    # Keras documents class_weight as a dict mapping class indices to weights.
    return {int(cls): float(weight) for cls, weight in zip(classes, weights)}


def _fit_and_predict(i, fit_paths, valid_paths, fit_labels, valid_labels, show_summary):
    """Train model ``i`` on one train/validation split and predict the test set."""
    class_weights = _balanced_class_weights(fit_labels)
    print('class weights: ', class_weights)

    train_dataset, valid_dataset, test_dataset = get_datasets(
        fit_paths, valid_paths, fit_labels, valid_labels
    )

    model = _select_model(i)
    if show_summary:
        model.summary()

    # Steps are derived from the samples actually trained on in this split;
    # at least one step is kept so tiny splits still train.
    steps_per_epoch = max(1, fit_labels.shape[0] // BATCH_SIZE)

    model.fit(
        train_dataset,
        epochs=EPOCHS,
        verbose=2,
        callbacks=[lr_schedule],
        steps_per_epoch=steps_per_epoch,
        validation_data=valid_dataset,
        class_weight=class_weights
    )

    return model.predict(test_dataset)


### Splits data, grabs model, fits, and then computes and returns predictions
def modelFit(i,train_paths,test_paths,train_labels):
    """Train ensemble member ``i`` and return its test-set predictions.

    When the module-level ``kfolding`` flag is false, a single hold-out split
    (``TEST_SIZE``, ``SEED``) is used. Otherwise the model is rebuilt and
    trained once per ``KFold`` split (``FOLDS``, shuffled with ``SEED``) and
    the per-fold predictions are averaged.

    Args:
        i: Index of the model to build (0-4, see ``getModel1``..``getModel5``).
        train_paths: Array of training image paths.
        test_paths: Not used directly here; ``get_datasets`` is called without
            it and builds the test dataset itself.
        train_labels: One-hot label matrix aligned with ``train_paths``.

    Returns:
        The output of ``model.predict`` on the test dataset, averaged over
        folds when K-folding is enabled.

    Raises:
        ValueError: If ``i`` does not identify one of the five models.
    """
    if not kfolding:
        fit_paths, valid_paths, fit_labels, valid_labels = train_test_split(
            train_paths, train_labels, test_size=TEST_SIZE, random_state=SEED
        )
        return _fit_and_predict(
            i, fit_paths, valid_paths, fit_labels, valid_labels, show_summary=True
        )

    fold_predictions = []
    splitter = KFold(FOLDS, shuffle=True, random_state=SEED)

    for train_index, valid_index in splitter.split(train_paths):

        print('FOLDING')

        preds = _fit_and_predict(
            i,
            train_paths[train_index],
            train_paths[valid_index],
            train_labels[train_index],
            train_labels[valid_index],
            show_summary=False,
        )

        print(preds[0])

        fold_predictions.append(preds)

    return np.mean(fold_predictions, axis=0)

    
### Taken from https://www.kaggle.com/tarunpaparaju/plant-pathology-2020-eda-models
def decode_image(filename, label=None, image_size=(img_size_y, img_size_x)):
    """Read a JPEG file and return it as a resized float32 image.

    The decoded 3-channel image is cast to float32, divided by 255.0 and
    resized to ``image_size``. When ``label`` is given, ``(image, label)`` is
    returned instead of the image alone.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.cast(tf.image.decode_jpeg(raw_bytes, channels=3), tf.float32)
    resized = tf.image.resize(pixels / 255.0, image_size)

    if label is not None:
        return resized, label
    return resized


### Taken from https://www.kaggle.com/tarunpaparaju/plant-pathology-2020-eda-models
def data_augment(image, label=None):
    """Apply a random left/right flip followed by a random up/down flip.

    Returns the augmented image, or ``(image, label)`` when ``label`` is given.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(image)
    )

    if label is not None:
        return flipped, label
    return flipped



### Construct datasets - adapted from https://www.kaggle.com/tarunpaparaju/plant-pathology-2020-eda-models
def get_datasets(train_paths,valid_paths,train_labels,valid_labels):
    """Build the training, validation and test ``tf.data`` pipelines.

    Args:
        train_paths: Image paths of the training samples.
        valid_paths: Image paths of the validation samples.
        train_labels: Labels aligned with ``train_paths``.
        valid_labels: Labels aligned with ``valid_paths``.

    Returns:
        A ``(train_dataset, valid_dataset, test_dataset)`` tuple. All three
        are batched with the module-level ``BATCH_SIZE``.

    Note:
        ``test_paths`` is not a parameter; it is read from module scope.
    """
    slices = tf.data.Dataset.from_tensor_slices

    # Training: decode, cache the decoded images, augment, then repeat,
    # shuffle, batch and prefetch.
    train_dataset = slices((train_paths, train_labels))
    train_dataset = train_dataset.map(decode_image, num_parallel_calls=AUTO).cache()
    train_dataset = train_dataset.map(data_augment, num_parallel_calls=AUTO)
    train_dataset = train_dataset.repeat().shuffle(512)
    train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTO)

    # Validation: decode and batch, caching the batches; no augmentation.
    valid_dataset = slices((valid_paths, valid_labels))
    valid_dataset = valid_dataset.map(decode_image, num_parallel_calls=AUTO)
    valid_dataset = valid_dataset.batch(BATCH_SIZE).cache().prefetch(AUTO)

    # Test: unlabelled images, decoded and batched only.
    test_dataset = slices(test_paths)
    test_dataset = test_dataset.map(decode_image, num_parallel_calls=AUTO)
    test_dataset = test_dataset.batch(BATCH_SIZE)

    return train_dataset, valid_dataset, test_dataset
    

    
    
    
    
    

    
    

### Data access and other starting settings
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
AUTO = tf.data.experimental.AUTOTUNE
strategy = setUpTPU()

# Scale the batch size by the number of replicas of the strategy
BATCH_SIZE *= strategy.num_replicas_in_sync


### Read and store data
train = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/train.csv')
test = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/test.csv')
sub = pd.read_csv('/kaggle/input/plant-pathology-2020-fgvc7/sample_submission.csv')


### Report how many training rows are flagged in each label column
print('Total Train Data Counts:')
for label_column in ('healthy', 'multiple_diseases', 'rust', 'scab'):
    print_count(train, label_column)


# Uncomment to plot what the learning rate function looks like
#plot_lr()


### Build the path arrays and the label matrix
print('Formatting Paths...')
train_paths, test_paths, train_labels = format_paths()


### Learning rate schedule callback used by every fit
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)


### Train each model in turn and average the per-model predictions
predictions = [
    modelFit(model_index, train_paths, test_paths, train_labels)
    for model_index in range(number_of_models)
]
final_predictions = np.mean(predictions, axis=0)


### Submit results
sub.loc[:, 'healthy':] = final_predictions
sub.to_csv('submission.csv', index=False)
sub.head()






You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.


Using TensorFlow backend.


Running on TPU  grpc://10.0.0.2:8470
Number of devices: 8
Total Train Data Counts:
healthy column has 516 values
multiple_diseases column has 91 values
rust column has 622 values
scab column has 592 values
Formatting Paths...
FOLDING
class weights:  [0.88349515 4.91891892 0.73833671 0.76310273]
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b7_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Train for 7 steps, validate for 2 steps

Epoch 00001: LearningRateScheduler reducing learning rate to 1e-05.
Epoch 1/5
7/7 - 338s - loss: 1.3973 - categorical_accuracy: 0.2606 - val_loss: 1.4761 - val_categorical_accuracy: 0.0795

Epoch 00002: LearningRateScheduler reducing learning rate to 0.00010875.
Epoch 2/5
7/7 - 3s - loss: 1.3175 - categorical_accuracy: 0.4051 - val_loss: 1.3512 - val_categorical_accuracy: 0.2712

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0002075.
Epoch 3/5
7/7 - 3s - loss: 1.0498 -

KeyboardInterrupt: 