In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications.resnet_v2 import ResNet101V2
from glob import glob
import json
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Run configuration.
# TRAIN_MODE selects between fitting a new network (True) and loading a
# previously saved one (False) in the model cell further down.
TRAIN_MODE = False
# Dataset root; the train and test splits each live in their own sub-folder
# and each folder carries its own metadata file named META.
PATH = "/disk2/herbarium_data/nybg2020/"
TRAIN = f"{PATH}train/"
TEST = f"{PATH}test/"
META = "metadata.json"

In [3]:
# Parse the training metadata JSON into `meta`.
# errors='ignore' drops any bytes that are not valid UTF-8 instead of raising.
with open(TRAIN + META, encoding='utf8', errors='ignore') as f:
    meta = json.loads(f.read())

In [4]:
# Quick overview of the training metadata: first the section names, then the
# first element of each section.
print("Meta data keys:")
for key in meta:
    print(f"- {key}")
for key, section in meta.items():
    print(f"\nSample of {key}:")
    # NOTE(review): list() over a dict yields its keys, so a dict-valued
    # section (e.g. 'info') shows only its first key, not a key/value pair.
    print(list(section)[0])

Meta data keys:
- annotations
- categories
- images
- info
- licenses
- regions

Sample of annotations:
{'category_id': 15672, 'id': 354106, 'image_id': 354106, 'region_id': 1}

Sample of categories:
{'family': 'Orchidaceae', 'genus': 'Aa', 'id': 0, 'name': 'Aa mathewsii (Rchb.f.) Schltr.'}

Sample of images:
{'file_name': 'images/156/72/354106.jpg', 'height': 1000, 'id': 354106, 'license': 1, 'width': 661}

Sample of info:
contributor

Sample of licenses:
{'id': 1, 'name': 'Public Domain Dedication', 'url': 'http://creativecommons.org/publicdomain/zero/1.0/'}

Sample of regions:
{'id': 0, 'name': 'Mexico & Central America'}


In [5]:
# Build the training table: one row per annotation, joined with the record of
# the image it refers to (file name, height, width, license).
# Both inputs have an 'id' column, so the merge yields 'id_x' (annotation id)
# and 'id_y' (image id); the redundant 'image_id' join key is dropped.
SHUFFLE_SEED = 42  # fixed so the row order is identical on every run

train_img = pd.DataFrame(meta['images'])
train_ann = pd.DataFrame(meta['annotations'])
train_df = pd.merge(train_ann, train_img, left_on='image_id', right_on='id', how='left').drop('image_id', axis=1)
# The generators created later use validation_split on this frame, so the
# training/validation partition depends on row order. An unseeded shuffle gave
# a different partition after every kernel restart, which made validation
# losses from different runs incomparable.
train_df = shuffle(train_df, random_state=SHUFFLE_SEED)
# Largest class id; the classifier head is sized as max_class + 1 outputs.
max_class = train_df['category_id'].max()
#train_df['category_id'] = train_df['category_id'].astype('str')

In [None]:
# Disabled exploratory plot (bar chart of how many training rows each
# category_id has). It is kept as a bare string literal, so none of the code
# inside it runs; remove the triple quotes to re-enable it.
"""unique, counts = np.unique(train_df['category_id'], return_counts=True)
plt.bar(unique, counts, 1)
plt.title('Class Frequency')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()"""

In [None]:
# Disabled exploratory preview: opens one training image, then draws the first
# 60 images listed in the metadata as 28x28 thumbnails on a 12x12 subplot grid.
# It is kept as a bare string literal, so none of the code inside it runs.
# NOTE(review): train_df is shuffled above, so train_df['file_name'][0] selects
# the row whose index *label* is 0, not the first row of the shuffled frame.
"""Image.open(TRAIN+train_df['file_name'][0])

size_of_img = (28, 28)
fig=plt.figure(figsize=(72,72))
for i in range(60):
    ax=fig.add_subplot(12,12,i+1)
    img = cv2.imread(TRAIN + meta["images"][i]["file_name"])
    img = cv2.resize(img,size_of_img)
    ax.imshow(img)
plt.show()"""

In [None]:
# GPU setup: when at least one GPU is present, expose only the first one to
# TensorFlow and switch on memory growth, then report the device counts.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to the first physical GPU.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        # Memory growth is requested for every physical GPU that was found.
        for physical_gpu in gpus:
            tf.config.experimental.set_memory_growth(physical_gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Visible devices must be set before GPUs have been initialized;
        # the error is reported and the notebook carries on.
        print(err)

In [None]:
# Hyper-parameters shared by the data pipeline and the model cells.
NO_IMPROVE = 5                            # early-stopping patience (epochs)
IMG_HEIGHT = IMG_WIDTH = 224              # size every image is resized to
TRAIN_BATCH_SIZE = VAL_BATCH_SIZE = 64    # images per batch

image_count = len(train_df)

# Generator for the labelled data: scales pixels by 1/255, holds out 20% of
# the rows as the validation subset, and applies random horizontal flips and
# up to 10% zoom.
image_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    horizontal_flip=True,
    zoom_range=0.1,
)

In [None]:
# Training batches: augmented images from the 'training' subset of train_df,
# with the integer category_id passed through unchanged (class_mode='raw').
train_data_gen = image_generator.flow_from_dataframe(dataframe=train_df,
                                                     directory=TRAIN,
                                                     x_col='file_name',
                                                     y_col='category_id',
                                                     batch_size=TRAIN_BATCH_SIZE,
                                                     target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                     class_mode='raw',
                                                     subset='training')

# Validation batches must not be augmented. Drawing them from image_generator
# applied the random flips/zooms to the validation images too, so val_loss
# (which drives early stopping) was measured on distorted data.
# This generator only rescales. Its rescale and validation_split values must
# stay equal to the ones used for image_generator so that, on the same
# dataframe, the 'validation' subset is exactly the part that the 'training'
# subset leaves out.
val_image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255,
                                                                      validation_split=0.2)
val_data_gen = val_image_generator.flow_from_dataframe(dataframe=train_df,
                                                       directory=TRAIN,
                                                       x_col='file_name',
                                                       y_col='category_id',
                                                       batch_size=VAL_BATCH_SIZE,
                                                       target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                       class_mode='raw',
                                                       subset='validation')

In [None]:
def show_batch(image_batch, label_batch, max_images=2):
    """Draw the first images of a batch on a 5x5 subplot grid.

    Args:
        image_batch: Indexable, sized collection of images accepted by
            ``plt.imshow``.
        label_batch: Indexable collection of labels parallel to
            ``image_batch``; each label becomes the title of its image.
            Pass ``None`` to draw the images without titles.
        max_images: Upper bound on the number of images drawn. Defaults to 2,
            the number the function previously always drew.

    The number of images drawn is additionally capped by the batch length
    (a batch shorter than ``max_images`` used to raise ``IndexError``) and by
    the 25 cells available in the grid.
    """
    grid_rows, grid_cols = 5, 5
    n_shown = min(max_images, len(image_batch), grid_rows * grid_cols)
    plt.figure(figsize=(10,10))
    for n in range(n_shown):
        plt.subplot(grid_rows, grid_cols, n+1)
        plt.imshow(image_batch[n])
        if label_batch is not None:
            plt.title(str(label_batch[n]))
        plt.axis('off')

# Disabled demo: pulls one batch from the training generator, shows it with
# show_batch and prints the labels. It is kept as a bare string literal, so
# none of the code inside it runs.
"""image_batch, label_batch = next(train_data_gen)
show_batch(image_batch, label_batch)
print(label_batch)"""

In [None]:
# Either fine-tune a ResNet101V2-based classifier (TRAIN_MODE=True) or load a
# previously saved model from disk (TRAIN_MODE=False).
if TRAIN_MODE:
    # Backbone with its own top included (weights left at the Keras default).
    # The input shape is (height, width, channels); the width entry previously
    # reused IMG_HEIGHT, which only worked because both constants are equal.
    resnet = ResNet101V2(include_top=True, input_tensor=None, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3))
    # Take the output of the second-to-last layer, i.e. everything except the
    # backbone's own final layer, and attach a new head to it.
    output = resnet.layers[-2].output
    # Every backbone layer stays trainable, so the whole network is fine-tuned.
    for layer in resnet.layers:
        layer.trainable = True
    output = tf.keras.layers.Dropout(0.1)(output)
    output = tf.keras.layers.BatchNormalization()(output)
    # One softmax unit per class id in 0..max_class.
    output = tf.keras.layers.Dense(max_class+1, activation='softmax')(output)
    model = tf.keras.Model(inputs=resnet.input, outputs=output)
    # Stop once val_loss has not improved for NO_IMPROVE epochs and roll back
    # to the best weights seen.
    stop_when_no_improve = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', min_delta=0,
                                                            patience=NO_IMPROVE, restore_best_weights=True)
    # Labels are integer class ids (class_mode='raw'), hence the sparse loss.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.SparseCategoricalCrossentropy())
    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line.
    model.summary()

    # The epoch count is only an upper bound; early stopping ends training.
    model.fit(
        train_data_gen,
        epochs=10000,
        validation_data=val_data_gen,
        callbacks=[stop_when_no_improve]
    )

    model.save("./model.h5")
else:
    model = tf.keras.models.load_model("./model_0.17736.h5")

In [None]:
# Parse the test-split metadata JSON into `meta_test`.
# errors='ignore' drops any bytes that are not valid UTF-8 instead of raising.
with open(TEST + META, encoding='utf8', errors='ignore') as f:
    meta_test = json.loads(f.read())

In [None]:
# Quick overview of the test metadata: first the section names, then the
# first element of each section.
print("Meta data keys:")
for key in meta_test:
    print(f"- {key}")
for key, section in meta_test.items():
    print(f"\nSample of {key}:")
    # NOTE(review): list() over a dict yields its keys, so a dict-valued
    # section shows only its first key, not a key/value pair.
    print(list(section)[0])

In [None]:
# One row per test image record taken from the test metadata; the 'file_name'
# and 'id' columns are used by the prediction and submission cells below.
test_img = pd.DataFrame(data=meta_test['images'])

In [None]:
# Test-time pipeline: rescaling only, no augmentation and no labels.
test_image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_data_gen = test_image_generator.flow_from_dataframe(
    dataframe=test_img,
    directory=TEST,
    x_col='file_name',
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=64,
    class_mode=None,   # yield images only
    shuffle=False,     # keep the dataframe order for the submission step
)

In [None]:
# Run the model over every test batch, with a progress bar (verbose=1).
pred = model.predict(test_data_gen, verbose=1)

In [None]:
# Index of the highest-scoring output along axis 1, one value per prediction row.
result = pred.argmax(axis=1)

In [None]:
# Histogram of the predicted class ids, as a sanity check that the predictions
# are spread over many classes rather than collapsed onto a few.
# sns.distplot is deprecated in seaborn and slated for removal, so the plot
# uses matplotlib, which this notebook already imports.
fig, ax = plt.subplots()
ax.hist(result, bins=50)
ax.set(title='Predicted class distribution', xlabel='Predicted class id', ylabel='Number of test images')
plt.show()

In [None]:
# Build the submission table (Id, Predicted) and write it sorted by Id.
# Predictions are paired with image ids purely by position. The previous
# pd.concat(axis=1) padded silently with NaN whenever the number of predictions
# differed from the number of rows in test_img (e.g. if the generator left out
# files it could not find), producing a wrong submission without any error.
if len(result) != len(test_img):
    raise ValueError(
        "Got %d predictions for %d test images; cannot pair predictions with ids."
        % (len(result), len(test_img))
    )
output = pd.DataFrame({"Id": test_img['id'].to_numpy(), "Predicted": result})
output.set_index('Id').sort_index().to_csv('./submission.csv')