In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import json
import seaborn as sns

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare the major version numerically: comparing version strings
# lexicographically is wrong (e.g. "10.0" >= "2.0" evaluates to False).
assert int(tf.__version__.split(".")[0]) >= 2, "TensorFlow >= 2.0 is required"

# Query the GPUs once; print a friendly hint before the hard failure below.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if not physical_devices:
    print("No GPU was detected. CNNs can be very slow without a GPU.")
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"

# Let TensorFlow grow GPU memory on demand instead of reserving it up front.
# NOTE(review): set_memory_growth is documented to return None, so `config`
# carries no information; the name is kept only in case other cells read it.
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)


#from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.models import Sequential
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam, RMSprop

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# ---- Dataset locations -------------------------------------------------
# Kaggle paths, kept for reference when running on the hosted kernel:
#train_images_dir = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/train/'
#test_images_dir = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/test/'
#train_metadata_file_path = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/train/metadata.json'
#test_metadata_file_path = '/kaggle/input/herbarium-2020-fgvc7/nybg2020/test/metadata.json'

# Local copy of the data set.
train_images_dir, test_images_dir = './nybg2020/train/', './nybg2020/test/'
train_metadata_file_path, test_metadata_file_path = (
    './nybg2020/train/metadata.json',
    './nybg2020/test/metadata.json',
)

# ---- Training hyper-parameters ------------------------------------------
# Size of the softmax output layer (32093 + 1).
num_classes = 32093 + 1
batch_size = 16

# NOTE(review): steps_per_epoch is derived from the number of classes, not
# from the number of training samples, so one "epoch" does not cover the
# whole data set — confirm this is a deliberate way to shorten training.
steps_per_epoch = num_classes // batch_size

# Images are resized to (height, width) before being fed to the network.
img_height, img_width = 1000, 661

epochs_num = 2

In [4]:
# Parse the training metadata JSON. Bytes that are not valid UTF-8 are
# dropped (errors='ignore') instead of raising a decode error.
with open(train_metadata_file_path, mode='r', encoding='utf-8', errors='ignore') as train_metadata_file:
    train_metadata_json = json.loads(train_metadata_file.read())

In [5]:
# Inspect the top-level sections available in the training metadata.
train_metadata_json.keys()

dict_keys(['annotations', 'categories', 'images', 'info', 'licenses', 'regions'])

In [6]:
# Build one DataFrame per metadata section. The category / image / region
# tables are relabelled so their key columns line up with the annotation
# table ('category_id', 'image_id', 'region_id').
# NOTE(review): the relabelling is positional and therefore assumes a fixed
# column order in the JSON records — confirm against the metadata file.
train_metadata = pd.DataFrame(train_metadata_json['annotations'])

train_categories = pd.DataFrame(train_metadata_json['categories']).set_axis(
    ['family', 'genus', 'category_id', 'category_name'], axis='columns')

train_images = pd.DataFrame(train_metadata_json['images']).set_axis(
    ['file_name', 'height', 'image_id', 'license', 'width'], axis='columns')

train_regions = pd.DataFrame(train_metadata_json['regions']).set_axis(
    ['region_id', 'region_name'], axis='columns')

# Join everything onto the annotations, then discard rows with any missing
# value (the outer joins can introduce NaNs for unmatched keys).
train_data = (
    train_metadata
    .merge(train_categories, on='category_id', how='outer')
    .merge(train_images, on='image_id', how='outer')
    .merge(train_regions, on='region_id', how='outer')
    .dropna()
)

# Narrow the numeric columns to int32.
int32_columns = ['category_id', 'id', 'image_id', 'region_id',
                 'height', 'license', 'width']
train_data = train_data.astype(dict.fromkeys(int32_columns, 'int32'))

train_data.info()

# Persist the merged table so later sessions can skip the JSON parsing.
train_data.to_csv('train_data.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1030747 entries, 0 to 1030746
Data columns (total 12 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   category_id    1030747 non-null  int32 
 1   id             1030747 non-null  int32 
 2   image_id       1030747 non-null  int32 
 3   region_id      1030747 non-null  int32 
 4   family         1030747 non-null  object
 5   genus          1030747 non-null  object
 6   category_name  1030747 non-null  object
 7   file_name      1030747 non-null  object
 8   height         1030747 non-null  int32 
 9   license        1030747 non-null  int32 
 10  width          1030747 non-null  int32 
 11  region_name    1030747 non-null  object
dtypes: int32(7), object(5)
memory usage: 74.7+ MB


In [7]:
# Drop the references to the intermediate per-section tables.
del train_categories, train_images, train_regions

In [8]:
# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,category_id,id,image_id,region_id,family,genus,category_name,file_name,height,license,width,region_name
0,15672,354106,354106,1,Lecythidaceae,Lecythis,Lecythis retusa Spruce ex O.Berg,images/156/72/354106.jpg,1000,1,661,South America
1,15672,545181,545181,1,Lecythidaceae,Lecythis,Lecythis retusa Spruce ex O.Berg,images/156/72/545181.jpg,1000,1,661,South America
2,15672,449419,449419,1,Lecythidaceae,Lecythis,Lecythis retusa Spruce ex O.Berg,images/156/72/449419.jpg,1000,1,662,South America
3,15672,200223,200223,1,Lecythidaceae,Lecythis,Lecythis retusa Spruce ex O.Berg,images/156/72/200223.jpg,1000,1,661,South America
4,15672,5327,5327,1,Lecythidaceae,Lecythis,Lecythis retusa Spruce ex O.Berg,images/156/72/5327.jpg,1000,1,661,South America


In [9]:
# Parse the test metadata JSON. Bytes that are not valid UTF-8 are dropped
# (errors='ignore') instead of raising a decode error.
with open(test_metadata_file_path, mode='r', encoding='utf-8', errors='ignore') as test_metadata_file:
    test_metadata_json = json.loads(test_metadata_file.read())

In [10]:
# Inspect the top-level sections available in the test metadata.
test_metadata_json.keys()

dict_keys(['images', 'info', 'licenses'])

In [11]:
# Build the test table from the 'images' section and narrow its numeric
# columns to int32 in a single expression.
test_int32_columns = ('height', 'id', 'license', 'width')
test_data = pd.DataFrame(test_metadata_json['images']).astype(
    {column: 'int32' for column in test_int32_columns}
)

# Persist the table for later sessions.
test_data.to_csv('test_data.csv', index=False)

In [12]:
# Every pipeline multiplies pixel values by 1/255; only the training
# pipeline additionally applies random geometric augmentation.
datagen_without_augmentation = ImageDataGenerator(rescale=1./255)
datagen_with_augmentation = ImageDataGenerator(rescale=1./255,
                                               featurewise_center=False,
                                               samplewise_center=False,
                                               featurewise_std_normalization=False,
                                               samplewise_std_normalization=False,
                                               zca_whitening=False,
                                               rotation_range=10,
                                               zoom_range=0.1,
                                               width_shift_range=0.1,
                                               height_shift_range=0.1,
                                               horizontal_flip=True,
                                               vertical_flip=False)

# Options shared by the two labelled iterators. class_mode="raw" makes the
# iterator yield the 'category_id' values unchanged as targets.
labelled_flow_kwargs = dict(dataframe=train_data,
                            directory=train_images_dir,
                            x_col='file_name',
                            y_col='category_id',
                            class_mode="raw",
                            batch_size=batch_size,
                            color_mode='rgb',
                            target_size=(img_height, img_width))

train_datagen = datagen_with_augmentation.flow_from_dataframe(**labelled_flow_kwargs)

# NOTE(review): validation reads the same rows as training, so validation
# metrics do not measure generalisation — consider a held-out split.
val_datagen = datagen_without_augmentation.flow_from_dataframe(**labelled_flow_kwargs)

# shuffle=False is essential here: flow_from_dataframe shuffles by default,
# which would break the row-for-row correspondence between the predictions
# and test_data (and therefore the ids written to the submission file).
test_datagen = datagen_without_augmentation.flow_from_dataframe(dataframe=test_data,
                                                                directory=test_images_dir,
                                                                x_col='file_name',
                                                                color_mode='rgb',
                                                                class_mode=None,
                                                                target_size=(img_height, img_width),
                                                                shuffle=False)

Found 1030747 validated image filenames.
Found 1030747 validated image filenames.
Found 138292 validated image filenames.


In [13]:
def generator_wrapper(generator, num_of_classes):
    """Adapt a batch generator so that its labels become categorical matrices.

    Args:
        generator: iterable yielding ``(inputs, labels)`` batches.
        num_of_classes: value passed to ``to_categorical`` as ``num_classes``.

    Yields:
        ``(inputs, categorical_labels)`` where ``categorical_labels`` is the
        result of ``to_categorical(labels, num_classes=num_of_classes)``.
    """
    for batch_inputs, batch_labels in generator:
        yield batch_inputs, to_categorical(batch_labels, num_classes=num_of_classes)
        
# Wrap the labelled iterators so they emit categorical targets.
train_datagen_wrapper = generator_wrapper(generator=train_datagen,
                                          num_of_classes=num_classes)
val_datagen_wrapper = generator_wrapper(generator=val_datagen,
                                        num_of_classes=num_classes)

In [14]:
# Convolutional classifier: four stride-2 convolutions interleaved with
# max-pooling and dropout, then a narrow dense layer and a softmax output
# with one unit per class.
model = Sequential([
    Conv2D(64, kernel_size=5, activation='relu', padding='same', strides=2,
           input_shape=(img_height, img_width, 3)),
    Conv2D(64, kernel_size=5, activation='relu', padding='same', strides=2),
    MaxPooling2D(2, 2),
    Conv2D(128, kernel_size=3, activation='relu', padding='same', strides=2),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    Conv2D(128, kernel_size=3, activation='relu', padding='same', strides=2),
    MaxPooling2D(2, 2),
    Dropout(0.25),
    Flatten(),
    # Integer division: the unit count must be an int (the original passed
    # the float num_classes / 100 and relied on it being truncated).
    # NOTE(review): this layer has no activation, i.e. it is linear —
    # confirm that is intended.
    Dense(num_classes // 100),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = RMSprop(learning_rate=0.001)

# categorical_crossentropy expects categorical (one-hot style) targets.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
import time

start = time.time()

# Model.fit accepts generators directly; fit_generator is deprecated (the
# library itself logs "Please use Model.fit, which supports generators").
# Both step counts are required because the wrapped generators have no length.
history = model.fit(train_datagen_wrapper,
                    epochs=epochs_num,
                    validation_data=val_datagen_wrapper,
                    steps_per_epoch=steps_per_epoch,
                    validation_steps=steps_per_epoch)

# NOTE(review): '.h2' is not an HDF5 extension; the logged output
# ("Assets written to: cnn_model.h2/assets") shows a directory is written.
# If a single HDF5 file was intended, the name should end in '.h5'.
model.save('cnn_model.h2')
end = time.time()

# The reported duration covers both training and saving the model.
print(f"\nLearning took {end - start}")

Instructions for updating:
Please use Model.fit, which supports generators.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 2005 steps, validate for 2005 steps
Epoch 1/2
Epoch 2/2
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: cnn_model.h2/assets

Learning took 9357.211953639984


In [16]:
# Run inference on a fixed slice of the test set. Rows that are not
# predicted here are expected to be filled in when the submission is built.
# NOTE(review): this rebinds the global batch_size used by earlier cells.
batch_size = 8

# Inference must use the same preprocessing as training (1/255 rescaling);
# no augmentation is applied.
test_datagen_2 = ImageDataGenerator(rescale=1./255,
                                    featurewise_center=False,
                                    featurewise_std_normalization=False)

generator = test_datagen_2.flow_from_dataframe(
        dataframe=test_data.iloc[:1000],  # limit inference to the first 1,000 test images
        directory=test_images_dir,
        x_col='file_name',
        target_size=(img_height, img_width),  # must match the model's input_shape
        color_mode='rgb',
        batch_size=batch_size,
        class_mode=None,  # only data, no labels
        shuffle=False)  # keep predictions aligned with the rows of test_data

# Predict on the subset iterator built above (previously it was created but
# never used). Model.predict accepts generators; predict_generator is deprecated.
category = model.predict(generator, verbose=1)

Found 1000 validated image filenames.
Instructions for updating:
Please use Model.predict, which supports generators.


In [24]:
# Label assigned to every test row for which no prediction is available.
# NOTE(review): 23718 is an unexplained constant — document where it comes from.
fallback_category = 23718

# Most likely class per predicted row, padded to the length of the test set.
predicted_categories = np.argmax(category, axis=1)
missing_rows = len(test_data.id) - len(category)
padding = np.full(missing_rows, fallback_category)

sub = pd.DataFrame()
sub['Id'] = test_data.id.astype('int32')
sub['Predicted'] = np.concatenate([predicted_categories, padding]).astype('int32')
display(sub)
sub.to_csv('category_submission.csv', index=False)

Unnamed: 0,Id,Predicted
0,104891,23718
1,18029,23718
2,35151,23718
3,124144,23718
4,24649,23718
...,...,...
138287,32738,23718
138288,16804,23718
138289,113662,23718
138290,86100,23718
