### Necessary Imports and Installs

In [1]:
#!pip install opendatasets
#!pip install cartopy

In [3]:
# Start from a clean checkout: delete any previous copy of the GLC helper repository,
# then clone it again. The import cell below takes `load_patch` from this checkout.
# NOTE(review): the clone is not pinned to a commit or tag, so a re-run may pick up
# a different version of the helper code.
!rm -rf GLC
!git clone https://github.com/maximiliense/GLC

Cloning into 'GLC'...
remote: Enumerating objects: 383, done.[K
remote: Counting objects: 100% (228/228), done.[K
remote: Compressing objects: 100% (159/159), done.[K
remote: Total 383 (delta 119), reused 170 (delta 63), pack-reused 155[K
Receiving objects: 100% (383/383), 10.57 MiB | 35.14 MiB/s, done.
Resolving deltas: 100% (205/205), done.


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Add, GlobalAveragePooling2D, Conv2D, Dense, AveragePooling2D, \
BatchNormalization, Normalization, Dropout, Flatten, Lambda, Input, Activation, MaxPooling2D
from tensorflow.keras import Model
from tensorflow.keras.optimizers import schedules, SGD
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import backend as K
#import tensorflow_datasets as tfds

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import scale
import time
from collections import defaultdict
import math
import copy
import threading
import opendatasets as od

%pylab inline --no-import-all
from pathlib import Path
import pandas as pd
import sys

from GLC.data_loading.common import load_patch

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-uib8xiz1 because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


# Dataset

## Download Dataset

In [5]:
# only uncomment if you don't have the dataset stored on disk yet
# -> have your kaggle user credentials ready
# data = od.download("https://www.kaggle.com/competitions/geolifeclef-2022-lifeclef-2022-fgvc9")

## Load Dataset from file

Set the path to the competition dataset here.

In [6]:
# Change this path to adapt to where you downloaded the data
# Root folder of the competition dataset. The cells below read the "observations"
# and "metadata" sub-folders from it, and it is also handed to load_patch().
DATA_PATH = Path("./geolifeclef-2022-lifeclef-2022-fgvc9/")

Run the following two commands to verify that the data path is set correctly. They should print folder and file names.

In [7]:
ls -L $DATA_PATH

[0m[01;34mmetadata[0m/      [01;34mpatches-fr[0m/  [01;34mpatches_sample[0m/  [01;34mrasters[0m/
[01;34mobservations[0m/  [01;34mpatches-us[0m/  [01;34mpre-extracted[0m/   sample_submission.csv


In [8]:
ls $DATA_PATH/observations

observations_fr_test.csv   observations_us_test.csv
observations_fr_train.csv  observations_us_train.csv


Load the observation ids of the training dataset.

In [9]:
### Training Dataset ###
# Read the per-country training observations (";"-separated CSV files),
# using the observation id column as the index.
df_obs_fr = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_train.csv",
    sep=";",
    index_col="observation_id",
)
df_obs_us = pd.read_csv(
    DATA_PATH / "observations" / "observations_us_train.csv",
    sep=";",
    index_col="observation_id",
)

# Stack both frames (France first, then US) into one training frame.
df_obs = pd.concat([df_obs_fr, df_obs_us])

print("Number of observations for training: {}".format(len(df_obs)))

# Preview the first rows.
df_obs.head()

Number of observations for training: 1627475


Unnamed: 0_level_0,latitude,longitude,species_id,subset
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10561949,45.705116,1.424622,241,train
10131188,45.146973,6.416794,101,train
10799362,46.783695,-2.072855,700,train
10392536,48.604866,-2.825003,1456,train
10335049,48.815567,-0.161431,157,train


Load the observation ids of the test dataset.

In [10]:
### Test Dataset ###
# Same layout as the training files: ";"-separated, indexed by observation id.
df_obs_fr_test = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_test.csv",
    sep=";",
    index_col="observation_id",
)
df_obs_us_test = pd.read_csv(
    DATA_PATH / "observations" / "observations_us_test.csv",
    sep=";",
    index_col="observation_id",
)

# Stack both frames (France first, then US) into one test frame.
df_obs_test = pd.concat([df_obs_fr_test, df_obs_us_test])

print("Number of observations for testing: {}".format(len(df_obs_test)))

# Preview the first rows.
df_obs_test.head()

Number of observations for testing: 36421


Unnamed: 0_level_0,latitude,longitude
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10782781,43.601788,6.940195
10364138,46.241711,0.683586
10692017,45.181095,1.533459
10222322,46.93845,5.298678
10241950,45.017433,0.960736


Load suggested landcover alignment (only relevant if you're using landcover data later).

In [11]:
# Read the suggested landcover alignment table from the dataset's metadata folder.
df_suggested_landcover_alignment = pd.read_csv(
    DATA_PATH / "metadata" / "landcover_suggested_alignment.csv",
    sep=";",
)
print(df_suggested_landcover_alignment.head())

# Suggested codes as a plain array, in the row order of the CSV file.
# NOTE(review): presumably row i corresponds to landcover_code i, so the array can be
# used as a lookup table — confirm against the full file.
suggested_codes = df_suggested_landcover_alignment["suggested_landcover_code"]
landcover_mapping = suggested_codes.values
del suggested_codes

   landcover_code  suggested_landcover_code suggested_landcover_label
0               0                         0              Missing Data
1               1                        11          Cultivated Crops
2               2                        11          Cultivated Crops
3               3                         6       Broad-leaved Forest
4               4                         7         Coniferous Forest


### Train/Val Split Labels
Retrieve the train/val split provided, and load the labels of the train and val set elements.

In [12]:
# Boolean masks selecting the provided train / validation subsets.
train_mask = df_obs["subset"] == "train"
val_mask = df_obs["subset"] == "val"

obs_id_train = df_obs.index[train_mask].values
obs_id_val = df_obs.index[val_mask].values

# Select the labels with the same masks instead of looking the ids up again with
# df_obs.loc[ids]["species_id"]: this is a single pass over one column (no copy of the
# whole frame for ~1.6M labels) and it keeps ids and labels aligned row-for-row even if
# the concatenated index were to contain duplicate observation ids.
y_train = df_obs.loc[train_mask, "species_id"].values
y_val = df_obs.loc[val_mask, "species_id"].values

n_val = len(obs_id_val)
print("Training set size: {} ({:.1%} of train observations)".format(len(y_train), len(y_train) / len(df_obs)))
print("Validation set size: {} ({:.1%} of train observations)".format(n_val, n_val / len(df_obs)))

Training set size: 1587395 (97.5% of train observations)
Validation set size: 40080 (2.5% of train observations)


## Write input pipeline to load batches as we train.

### Custom Generator
Since the dataset is too large to load into memory all at once, we need to load it from disk in batches as we train. Such a generator can later be passed to `model.fit()` in place of an in-memory training and/or validation dataset.

In [13]:
class Patches_Generator(tf.keras.utils.Sequence) :
    """Keras ``Sequence`` that reads RGB patches from disk one batch at a time.

    Batch ``idx`` covers positions ``idx * batch_size`` up to (but excluding)
    ``(idx + 1) * batch_size`` of ``obs_ids`` / ``labels``. The final batch is
    shorter when the number of observations is not a multiple of ``batch_size``.
    """

    def __init__(self, obs_ids, labels, batch_size) :
        """Store the observation ids, their labels and the batch size.

        Args:
            obs_ids: indexable collection of observation ids; each id is passed to ``load_patch``.
            labels: indexable collection of labels, aligned position-by-position with ``obs_ids``.
            batch_size: number of observations per batch; must be positive.

        Raises:
            ValueError: if ``batch_size`` is not positive or if ``obs_ids`` and
                ``labels`` differ in length.
        """
        super().__init__()

        if batch_size <= 0:
            raise ValueError("batch_size must be positive, got {}".format(batch_size))
        if len(obs_ids) != len(labels):
            raise ValueError("obs_ids and labels must have the same length ({} != {})".format(
                len(obs_ids), len(labels)))

        self.obs_ids = obs_ids
        self.labels = labels
        self.batch_size = batch_size

        # Kept so that code accessing `.lock` keeps working. __getitem__ only reads
        # obs_ids / labels and builds local lists, so it does not need to take the lock.
        self.lock = threading.Lock()

    def __len__(self) :
        """Return the number of batches per epoch (rounded up, as a plain ``int``)."""
        return int(np.ceil(len(self.obs_ids) / float(self.batch_size)))

    # returns one batch
    def __getitem__(self, idx) :
        """Load and return batch number ``idx`` as a tuple ``(X_batch, y_batch)`` of arrays.

        Raises:
            IndexError: if ``idx`` is at or beyond the number of batches.
        """
        if idx >= len(self):
            raise IndexError("batch index {} out of range for {} batches".format(idx, len(self)))

        start = idx * self.batch_size
        # Clamp the end of the window: the last batch may be partial, and indexing a full
        # batch_size window there would run past the end of obs_ids / labels.
        stop = min(start + self.batch_size, len(self.obs_ids))

        X_batch = list()
        y_batch = list()

        for i in range(start, stop):
            # load_patch returns an indexable result; with data='rgb' the first element
            # is what is used as the model input here.
            patch = load_patch(self.obs_ids[i], DATA_PATH, data='rgb')
            X_batch.append(patch[0])
            y_batch.append(self.labels[i])

        return np.asarray(X_batch), np.array(y_batch)

# First Simple Neural Network
Let's create a first neural network as a baseline to see how it performs.

In [14]:
# for distributed training (that is, using multiple GPUs for data parallelization)
# https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit
# Created once at notebook level; simple_model() builds its layers inside this strategy's scope.
mirrored_strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [15]:
# returns a simple convolutional neural net
def simple_model(input_shape, learning_rate=0.1, num_classes=17038):
    """Build and compile a small convolutional classifier under ``mirrored_strategy``.

    The network rescales the inputs by 1/255, applies four 3x3 'valid' convolutions
    (32, 64, 128, 256 filters; max pooling after the first three), flattens, and ends
    with a 64-unit dense layer followed by a softmax output layer.

    Args:
        input_shape: shape of a single input sample without the batch axis,
            e.g. ``(256, 256, 3)``.
        learning_rate: learning rate handed to the Adam optimizer. The default is kept
            for backward compatibility; note that it is far above Adam's own default
            of 0.001.
        num_classes: number of units of the softmax output layer. Defaults to 17038,
            the value that used to be hard-coded.

    Returns:
        The compiled ``tf.keras`` Sequential model (sparse categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    # for distributed training: model creation AND compile both happen inside the
    # strategy scope, as recommended by the TensorFlow distributed training guide.
    with mirrored_strategy.scope():

        model = tf.keras.models.Sequential()

        # 0. Explicit input: declares the input shape on the first element of the
        # model. (Passing input_shape to a later layer has no effect in a Sequential.)
        model.add(tf.keras.Input(shape=input_shape))

        # 1. Preprocessing
        # rescale inputs
        model.add(tf.keras.layers.Rescaling(1./255))

        # 2. Convolutional Layers
        model.add(Conv2D(32, kernel_size=3, activation='relu', padding='valid'))
        model.add(MaxPooling2D())

        model.add(Conv2D(64, kernel_size=3, activation='relu', padding='valid'))
        model.add(MaxPooling2D())

        model.add(Conv2D(128, kernel_size=3, activation='relu', padding='valid'))
        model.add(MaxPooling2D())

        model.add(Conv2D(256, kernel_size=3, activation='relu', padding='valid'))

        # from convolutional layers to dense layers
        model.add(tf.keras.layers.Flatten())

        # 3. Dense Layers
        model.add(Dense(64, activation='relu'))

        # 4. Output Layer
        model.add(Dense(num_classes, activation='softmax'))

        # compile the model
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      metrics=['accuracy'])

    return model

In [16]:
# create the network
# The model is built for inputs of shape (256, 256, 3).
# NOTE(review): learning_rate=0.3 is very large for the Adam optimizer used by
# simple_model() (Keras' default for Adam is 0.001) — confirm this value is intended.
model = simple_model((256, 256, 3), learning_rate=0.3)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).


**Settings used when parallelizing the I/O Pipeline in model.fit().**

I used the setting values in the cell below for a high-cpu vm on GCP with the following specs:
- machine type: n1-highcpu-96 (96 CPU cores)
- vCPUs to core ratio: 2 vCPUs per core (making a theoretical max value for num_threads of 96 * 2 = 192)
- 4 x NVIDIA Tesla T4 GPUs

In [17]:
# Remember to tune the learning rate accordingly.
# Number of samples per batch handed to Patches_Generator.
BATCHSIZE = 512   

# The maximum value for num_threads is dependent on amount of CPU cores:
# amount of CPU cores * vCPUs to core ratio = theoretical max of NUM_THREADS
# Passed to model.fit() as `workers`.
NUM_THREADS = 140 

# The more batches we prefetch, the less idle the GPUs will be. 
# To check GPU usage:
# 1. Run nvidia-smi -l 1 from the terminal to monitor the GPU usage during training. 
# 2. Try to get close to 100% for all GPUs by adjusting the value below (and the two above). Due to the overhead
#    from tf.distribute.MirroredStrategy(), you won't be able to consistently get 100% for all GPUs. But try to 
#    get close.
# 3. Be aware that RAM limits the amount of batches you can prefetch.
# Passed to model.fit() as `max_queue_size`; evaluates to 350 with NUM_THREADS = 140.
PRE_FETCH_NUM_BATCHES = int(NUM_THREADS * 2.5) 

Create generators that will read training / validation data from disk during training.

In [18]:
# Serves the training observations and their labels in batches of BATCHSIZE;
# the patches are read from disk when a batch is requested.
train_generator = Patches_Generator(obs_id_train, y_train, BATCHSIZE)

In [19]:
#val_generator = Patches_Generator(obs_id_val, y_val, BATCHSIZE)

Train the network.

In [20]:
# Early-stopping callback for when the model converges: stop once the training
# 'accuracy' metric has failed to improve by at least 0.001 for 5 consecutive epochs,
# and restore the best weights seen so far.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='accuracy',
    min_delta=0.001,
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

In [None]:
# Train for up to 100 epochs (the early-stopping callback may end training sooner).
history = model.fit(
    train_generator,
    epochs=100,
    # floor division: a trailing partial batch is not counted as a step
    steps_per_epoch=len(y_train) // BATCHSIZE,
    callbacks=[early_stop],
    # for parallelization of reading from disk (I/O) pipeline
    max_queue_size=PRE_FETCH_NUM_BATCHES,
    workers=NUM_THREADS,
    use_multiprocessing=True,
)

Epoch 1/100
INFO:tensorflow:batch_all_reduce: 12 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:batch_all_reduce: 12 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/

In [None]:
# Persist the model to disk under the path 'first_simple_model'.
model.save('first_simple_model')