<a href="https://colab.research.google.com/github/heraclex12/My-Road-to-AI/blob/master/AIHackathon_NoisyEffecientNetB7_weighted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Prepare data

### Download external data (other)

In [None]:
# Fetch the ImageNet-A archive (~656 MB tar) that serves as the external data set.
!wget https://people.eecs.berkeley.edu/~hendrycks/imagenet-a.tar


--2020-11-27 01:37:07--  https://people.eecs.berkeley.edu/~hendrycks/imagenet-a.tar
Resolving people.eecs.berkeley.edu (people.eecs.berkeley.edu)... 128.32.189.73
Connecting to people.eecs.berkeley.edu (people.eecs.berkeley.edu)|128.32.189.73|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 687552512 (656M) [application/x-tar]
Saving to: ‘imagenet-a.tar’


2020-11-27 01:37:24 (39.3 MB/s) - ‘imagenet-a.tar’ saved [687552512/687552512]



### Download the challenge dataset

In [None]:
import gdown

!gdown https://drive.google.com/u/0/uc?id=1NfaFbUQ9HUnzo-Ah5-jZoGBad9ajgMWQ&export=download
!gdown https://drive.google.com/u/0/uc?id=16iIBC5IZc6l-LkzLwP6h3sXnuQckLo1E&export=download


Downloading...
From: https://drive.google.com/u/0/uc?id=1NfaFbUQ9HUnzo-Ah5-jZoGBad9ajgMWQ
To: /content/test_set_A_full.zip
1.65GB [00:16, 98.4MB/s]
Downloading...
From: https://drive.google.com/u/0/uc?id=16iIBC5IZc6l-LkzLwP6h3sXnuQckLo1E
To: /content/ai4vn_2020.zip
323MB [00:04, 70.3MB/s]


In [None]:
# Unpack the challenge training data and the test set (quiet mode).
!unzip -qq ai4vn_2020.zip

!unzip -qq test_set_A_full.zip

# Nest the test images one level down under test_data/ (the loader later reports
# them as a single class). `-p` keeps the cell from failing when the directory
# already exists on a re-run.
!mkdir -p test_data
!mv test_set_A_full/ test_data/

# Extract ImageNet-A without `-v`, which would print every file name into the
# notebook output.
!tar -xf imagenet-a.tar

# Rename the ImageNet-A folder to `0` and place it inside the training folder,
# presumably so the directory-based loader treats those images as class `0`.
!mv imagenet-a/ 0/
!mv 0/ ai4vn_2020/sample_data/


### Clone baseline source code

In [None]:

# Fetch the hackathon baseline repository and make it the working directory.
# Later cells depend on this: `from dataloader import ...` (presumably a module
# in the repo root - verify) and the relative paths `efficientnetb7_notop.h5`
# and `best_model.hdf5` all resolve against this directory.
!git clone https://github.com/hcmcaic/ai4vn-hackathon-2020.git
%cd ai4vn-hackathon-2020/

Cloning into 'ai4vn-hackathon-2020'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 23 (delta 3), reused 8 (delta 3), pack-reused 14[K
Unpacking objects: 100% (23/23), done.
/content/ai4vn-hackathon-2020


## Convert NoisyStudent EfficientNetB7 to a usable Keras model

In [None]:
# Download the Noisy Student EfficientNet-B7 TensorFlow checkpoint (~469 MB).
!wget https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/noisystudent/noisy_student_efficientnet-b7.tar.gz
# Fetch the checkpoint-to-Keras converter from a tagged TensorFlow release
# instead of the moving `master` branch, so the notebook keeps working when the
# file is moved or changed upstream.
!wget https://raw.githubusercontent.com/tensorflow/tensorflow/v2.4.0/tensorflow/python/keras/applications/efficientnet_weight_update_util.py
!tar -xf noisy_student_efficientnet-b7.tar.gz
# Write the converted weights without the classification head (`--notop`) to
# efficientnetb7_notop.h5, which build_model() loads by relative path.
!python efficientnet_weight_update_util.py --model b7 --notop --ckpt \
        noisy-student-efficientnet-b7/model.ckpt --o efficientnetb7_notop.h5

--2020-11-27 01:38:45--  https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/noisystudent/noisy_student_efficientnet-b7.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.204.128, 172.217.203.128, 74.125.139.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.204.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 491946084 (469M) [application/gzip]
Saving to: ‘noisy_student_efficientnet-b7.tar.gz’


2020-11-27 01:38:51 (92.8 MB/s) - ‘noisy_student_efficientnet-b7.tar.gz’ saved [491946084/491946084]



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

from dataloader import image_dataset_from_directory

# Data Loader

In [None]:
# Root folder of the labelled training images.
path_to_data = '/content/ai4vn_2020/sample_data'

# Loader settings shared by the training, validation and test datasets.
BATCH_SIZE = 10
IMG_SIZE = (600, 600)
NUM_CLASS = 8

# Training subset: the 80% share of the data left after the 0.2 validation split.
# The baseline loader returns the dataset together with its file names.
train_dataset, train_dataset_filenames = image_dataset_from_directory(
    path_to_data,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="training",
    shuffle=True,
    seed=505,
)

# Label names are reused at prediction time to turn class indices into labels.
class_names = train_dataset.class_names

Found 11000 files belonging to 8 classes.
Using 8800 files for training.


In [None]:
# Validation subset: the held-out 20% of the same folder. The split fraction and
# seed mirror the training call (presumably required so both subsets come from
# the same shuffle and do not overlap - verify against dataloader.py).
validation_dataset, validation_dataset_filenames = image_dataset_from_directory(
    path_to_data,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="validation",
    shuffle=True,
    seed=505,
)

Found 11000 files belonging to 8 classes.
Using 2200 files for validation.


In [None]:
# Unlabelled test images. Shuffling is off so that the order of the returned
# file names lines up with the order of the model's predictions.
test_dataset, test_dataset_filenames = image_dataset_from_directory(
    '/content/test_data/',
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

Found 19999 files belonging to 1 classes.


In [None]:
# Let tf.data choose the prefetch buffer size at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Prefetch both training-time datasets; the test dataset is left as it is.
train_dataset, validation_dataset = (
    split.prefetch(buffer_size=AUTOTUNE)
    for split in (train_dataset, validation_dataset)
)

In [None]:
# Random augmentation pipeline that build_model() places in front of the backbone.
# A Rescaling(1./255) layer is deliberately not part of it: per the Keras docs,
# EfficientNet models normalise their inputs internally and expect raw
# [0, 255] pixel values.
data_augmentation = tf.keras.Sequential()
data_augmentation.add(
    tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal')
)
data_augmentation.add(
    tf.keras.layers.experimental.preprocessing.RandomRotation(0.05)
)
data_augmentation.add(
    tf.keras.layers.experimental.preprocessing.RandomTranslation(
        height_factor=0.1, width_factor=0.1
    )
)
data_augmentation.add(
    tf.keras.layers.experimental.preprocessing.RandomContrast(factor=0.1)
)

# Fine-tune model

In [None]:
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.applications import EfficientNetB7


def build_model(num_classes, img_size=600, weights="efficientnetb7_notop.h5",
                learning_rate=1e-2, top_dropout_rate=0.2):
    """Build and compile an EfficientNetB7 classifier with a frozen backbone.

    The notebook-level `data_augmentation` pipeline is applied to the input,
    followed by an EfficientNetB7 backbone without its top, and a new head
    (global average pooling -> batch norm -> dropout -> softmax).

    Args:
        num_classes: Number of output classes of the softmax layer.
        img_size: Side length of the square RGB input images. This used to be
            a local `IMG_SIZE = 600` that shadowed the notebook-level
            `IMG_SIZE` tuple.
        weights: Weights argument forwarded to `EfficientNetB7`; defaults to
            the converted Noisy Student weights file.
        learning_rate: Adam learning rate for training the new head.
        top_dropout_rate: Dropout rate applied before the final dense layer.

    Returns:
        A compiled `tf.keras.Model` named "EfficientNet" whose backbone layers
        are not trainable.
    """
    inputs = tf.keras.layers.Input(shape=(img_size, img_size, 3))
    x = data_augmentation(inputs)
    backbone = EfficientNetB7(include_top=False, input_tensor=x, weights=weights)

    # Freeze the pretrained weights; only the layers added below are trained
    # until unfreeze_model() is called.
    backbone.trainable = False

    # Rebuild top
    x = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(backbone.output)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(top_dropout_rate, name="top_dropout")(x)
    outputs = tf.keras.layers.Dense(num_classes, activation="softmax", name="pred")(x)

    # Compile; labels are integer class indices, hence the sparse loss.
    model = tf.keras.Model(inputs, outputs, name="EfficientNet")
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model

### Freeze model

In [None]:
# Per-class loss weights, derived from NUM_CLASS instead of repeating the class
# count by hand. Every class weighs 1 except class 0, which is down-weighted to
# 1/14 (presumably because it is heavily over-represented once the external
# images are added - verify against the class counts).
class_weights = {class_index: 1. for class_index in range(NUM_CLASS)}
class_weights[0] = 1/14

model = build_model(num_classes=NUM_CLASS)

# Stage 1: train only the new classification head on top of the frozen backbone.
epochs = 25  # @param {type: "slider", min:8, max:80}
hist = model.fit(train_dataset, epochs=epochs, validation_data=validation_dataset,
                 class_weight=class_weights)


Epoch 1/25
Epoch 2/25
Epoch 3/25
  1/880 [..............................] - ETA: 0s - loss: 0.2296 - accuracy: 0.9000

### Unfreeze model

In [None]:
def unfreeze_model(model):
    """Make the last 20 layers trainable and recompile for fine-tuning.

    BatchNormalization layers among those 20 are skipped, so their `trainable`
    flag is left as it was. The model is modified in place and recompiled with
    Adam at a learning rate of 1e-4; nothing is returned.
    """
    batch_norm_cls = tf.keras.layers.BatchNormalization
    for top_layer in model.layers[-20:]:
        if isinstance(top_layer, batch_norm_cls):
            continue
        top_layer.trainable = True

    # Recompile so the changed `trainable` flags take effect.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )


# Stage 2: fine-tune the top layers of the backbone together with the head.
unfreeze_model(model)

# Stop once validation accuracy has not improved for 5 epochs.
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=0, mode='max')
# Keep only the checkpoint with the best validation accuracy; the prediction
# cell reloads it from 'best_model.hdf5'.
mcp_save = tf.keras.callbacks.ModelCheckpoint('best_model.hdf5', save_best_only=True, monitor='val_accuracy', mode='max')
# Cut the learning rate by 10x when the training loss plateaus for 3 epochs.
# `min_delta` is the current name of the deprecated `epsilon` argument.
reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, verbose=1, min_delta=1e-4, mode='min')

epochs = 25  # @param {type: "slider", min:8, max:50}
hist = model.fit(train_dataset, epochs=epochs, validation_data=validation_dataset,
                  class_weight=class_weights,
                  callbacks=[earlyStopping, mcp_save, reduce_lr_loss])


# Prediction

In [None]:
# Reload the checkpoint written by the ModelCheckpoint callback.
model = tf.keras.models.load_model('best_model.hdf5')

# Per-class scores for every test image, then the index of the top class.
predictions = model.predict(test_dataset)
predicted_class = predictions.argmax(axis=-1)

print('Predictions:\n', predicted_class)
# Optional (disabled): send low-confidence predictions to class 0.
# predicted_class[np.max(predictions, axis=-1) < 0.7] = 0

# One "<file name>\t<class name>" line per test image; the file names and the
# predictions are paired by position.
with open('/content/submission.txt', 'w') as submission_file:
    submission_file.writelines(
        '{}\t{}\n'.format(filename.split('/')[-1], class_names[predicted])
        for filename, predicted in zip(test_dataset_filenames, predicted_class)
    )