### Necessary Imports and Installs

In [29]:
#!pip install opendatasets
#!pip install cartopy

In [30]:
#!rm -rf GLC
#!git clone https://github.com/maximiliense/GLC

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Add, GlobalAveragePooling2D, Conv2D, Dense, AveragePooling2D, \
BatchNormalization, Normalization, Dropout, Flatten, Lambda, Input, Activation, MaxPooling2D
from tensorflow.keras import Model
from tensorflow.keras.optimizers import schedules, SGD
from tensorflow.keras.callbacks import Callback, LambdaCallback
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
#import tensorflow_datasets as tfds

import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import scale
import time
import random
from collections import defaultdict
import math
import copy
import threading
import opendatasets as od
import tempfile
from sklearn.utils import shuffle

%pylab inline --no-import-all
from pathlib import Path
import pandas as pd
import sys

from GLC.data_loading.common import load_patch

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-_oilyz_s because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


# Dataset

## Download Dataset

In [2]:
# only uncomment if you don't have the dataset stored on disk yet
# -> have your kaggle user credentials ready
#data = od.download("https://www.kaggle.com/competitions/geolifeclef-2022-lifeclef-2022-fgvc9")

## Load Dataset from file

Set path to competition dataset here.

In [2]:
# Root folder of the downloaded competition data (relative to the notebook's
# working directory). Edit this if the dataset lives somewhere else.
DATA_PATH = Path("geolifeclef-2022-lifeclef-2022-fgvc9")

Run the following two commands to verify that the data path is set correctly. They should print folder and file names.

In [3]:
ls -L $DATA_PATH

[0m[01;34mmetadata[0m/      [01;34mpatches-fr[0m/  [01;34mpatches_sample[0m/  [01;34mrasters[0m/
[01;34mobservations[0m/  [01;34mpatches-us[0m/  [01;34mpre-extracted[0m/   sample_submission.csv


In [4]:
ls $DATA_PATH/observations

observations_fr_test.csv   observations_us_test.csv
observations_fr_train.csv  observations_us_train.csv


Load the observation ids of the training dataset.

In [5]:
### Training Dataset ###
# Read the French and US training observations. Both files are
# semicolon-separated and are indexed by their "observation_id" column.
df_obs_fr = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_train.csv",
    sep=";",
    index_col="observation_id",
)
df_obs_us = pd.read_csv(
    DATA_PATH / "observations" / "observations_us_train.csv",
    sep=";",
    index_col="observation_id",
)

# stack both countries into one frame (French rows first)
df_obs = pd.concat([df_obs_fr, df_obs_us])

print("Number of observations for training: {}".format(df_obs.shape[0]))

# preview the first rows
df_obs.head()

Number of observations for training: 1627475


Unnamed: 0_level_0,latitude,longitude,species_id,subset
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10561949,45.705116,1.424622,241,train
10131188,45.146973,6.416794,101,train
10799362,46.783695,-2.072855,700,train
10392536,48.604866,-2.825003,1456,train
10335049,48.815567,-0.161431,157,train


Load the observation ids of the test dataset.

In [6]:
### Test Dataset ###
# Same layout as the training files: semicolon-separated CSVs indexed by
# "observation_id", one file per country.
df_obs_fr_test = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_test.csv",
    sep=";",
    index_col="observation_id",
)
df_obs_us_test = pd.read_csv(
    DATA_PATH / "observations" / "observations_us_test.csv",
    sep=";",
    index_col="observation_id",
)

# stack both countries into one frame (French rows first)
df_obs_test = pd.concat([df_obs_fr_test, df_obs_us_test])

print("Number of observations for testing: {}".format(df_obs_test.shape[0]))

# preview the first rows
df_obs_test.head()

Number of observations for testing: 36421


Unnamed: 0_level_0,latitude,longitude
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10782781,43.601788,6.940195
10364138,46.241711,0.683586
10692017,45.181095,1.533459
10222322,46.93845,5.298678
10241950,45.017433,0.960736


Load suggested landcover alignment (only relevant if you're using landcover data later).

In [7]:
# Table that pairs every raw landcover code with a suggested code and label.
df_suggested_landcover_alignment = pd.read_csv(
    DATA_PATH / "metadata" / "landcover_suggested_alignment.csv",
    sep=";",
)
print(df_suggested_landcover_alignment.head())

# Suggested codes as a plain array, in the row order of the CSV file.
landcover_mapping = df_suggested_landcover_alignment["suggested_landcover_code"].to_numpy()

   landcover_code  suggested_landcover_code suggested_landcover_label
0               0                         0              Missing Data
1               1                        11          Cultivated Crops
2               2                        11          Cultivated Crops
3               3                         6       Broad-leaved Forest
4               4                         7         Coniferous Forest


### Train/Val Split Labels
Retrieve the train/val split provided, and load the labels of the train and val set elements.

In [8]:
# The "subset" column carries the split shipped with the data ("train" / "val").
obs_id_train = df_obs.index[df_obs["subset"].eq("train")].values
obs_id_val = df_obs.index[df_obs["subset"].eq("val")].values

# species labels, looked up in the same order as the id arrays above
y_train = df_obs.loc[obs_id_train, "species_id"].values
y_val = df_obs.loc[obs_id_val, "species_id"].values

n_val = len(obs_id_val)
print("Training set size: {} ({:.1%} of train observations)".format(
    len(y_train), len(y_train) / len(df_obs)))
print("Validation set size: {} ({:.1%} of train observations)".format(
    n_val, n_val / len(df_obs)))

Training set size: 1587395 (97.5% of train observations)
Validation set size: 40080 (2.5% of train observations)


**Let's have a look at the training dataset.**

In [9]:
# Report how many distinct species occur in the training split and how many
# observations each one has on average.
n_train_labels = len(np.unique(y_train))

print("=== Training Dataset ===")
print("There are {} unique labels.".format(n_train_labels))
print("We have {:.2f} observations per label on average.".format(len(obs_id_train) / n_train_labels))

=== Training Dataset ===
There are 17031 unique labels.
We have 93.21 observations per label on average.


In [10]:
# label_count = defaultdict(lambda: 0)

# print("counting observation ids per label ...")
# for ob in df_obs.iterrows():
#     label_count[ob[1]['species_id']] += 1
# print("done")

In [11]:
# buckets = [0, 10, 100, 500, 1000, 2000, 3000, 10000, 100000]
# buckets_counts = {0: 0, 10: 0, 100: 0, 500: 0, 1000: 0, 2000: 0, 3000: 0, 10000: 0, 100000: 0}
# buckets_labels = {0: list(), 10: list(), 100: list(), 500: list(), 1000: list(), 
#                   2000: list(), 3000: list(), 10000: list(), 100000: list()}

# for label in label_count:
#     c = label_count[label]
    
#     i = 0
#     while c > buckets[i]:
#         i += 1
#     i -= 1
        
#     #print("c: ", c)
#     #print("buckets[i]: ", buckets[i])
    
#     buckets_counts[buckets[i]] += 1
#     buckets_labels[buckets[i]].append(label)
    
# print("=== Analysis of observation ids per label in training set ===")
# print("{:<15}: # of labels that have that range amount of obs ids".format("# of obs ids"))
# print()
# for i in range(0, len(buckets)-1):
#     print("{:<15}: {amount:>4} labels".format("{lower} to {upper}".format(lower=buckets[i], upper=buckets[i+1]), 
#                                               amount=buckets_counts[buckets[i]]))

# #print(buckets_contents)

Now only retrieve the data belonging to a subset of all possible labels.

In [12]:
# subset_size = 10
# obs_list = list()

# # iterate over a subset of the labels
# counter = 0
# for y in np.unique(y_train)[:subset_size]:
#     # for each label, retrieve all corresponding observation ids
#     obs = df_obs.index[df_obs["species_id"] == y].values
#     obs_list.append(obs)
    
# # we now have a numpy array of all observation ids corresponding to this subset of labels
# obs_id_train = np.concatenate(obs_list)

# # obtain the gps coordinates
# latitudes_train = df_obs.loc[obs_id_train]["latitude"].values
# longitudes_train = df_obs.loc[obs_id_train]["longitude"].values
# gps_train = np.column_stack((latitudes, longitudes))

# # obtain the labels in the right order 
# y_train = df_obs.loc[obs_id_train]["species_id"].values

In [13]:
# ### JUST TO TEST IF INPUT PIPELINE WORKS ###
# # load training dataset samples
# # factor = 1 means load full training dataset
# # factor = 100 means load 1/100 of the full dataset
# factor = 100

# obs_list = list()
# for obs_id in obs_id_train:
#     obs_list.append(obs_id)
    
#     if len(obs_list) >= (len(y_train)/factor):
#         break
        
# obs_id_train = np.array(obs_list)
# y_train = y_train[:len(obs_id_train)]

# print("obs_id array shape: ", np.shape(obs_id_train))
# print("label array shape: ", np.shape(y_train))
# print(np.min(y_train[:len(obs_id_train)]))
# print(np.max(y_train[:len(obs_id_train)]))

In [14]:
# ### Keep only the labels with lots of observations ###
# min_obs_count = 2000
# max_obs_count = 3000

# # obtain all relevant labels
# y_train = list()
# for b in buckets:
#     if b >= min_obs_count and b <= max_obs_count:
#         for l in buckets_labels[b]:
#             y_train.append(l)
# print("obtained all relevant labels")
            
# # obtain all corresponding obs ids
# obs_list = list()

# # iterate over a subset of the labels
# counter = 0
# for y in y_train:
#     # for each label, retrieve all corresponding observation ids
#     obs = df_obs.index[(df_obs["subset"] == "train") & (df_obs["species_id"] == y)].values
#     #print(len(obs))
#     obs_list.append(obs)
    
# # we now have a numpy array of all observation ids corresponding to this subset of labels
# obs_id_train_new = np.concatenate(obs_list)
# print("obtained all observations")

# # obtain the labels in the right order 
# y_train = df_obs.loc[obs_id_train_new]["species_id"].values

# print(len(obs_id_train_new))
# print(len(obs_id_train))

# # count how many ids exist in obs_id_train_new that don't exist in obs_id_train
# # should be 0
# print("should be 0: ", len(obs_id_train_new) - len(obs_id_train_new[np.in1d(obs_id_train_new, obs_id_train)]))

# print(len(np.unique(y_train)))
# print(np.unique(y_train))

In [15]:
### Kennedy's Train, Val and Test Split ###
# Keep the first 30 species (in ascending species_id order) that have between
# 2000 and 3000 observations overall. For each of them, hold out 10% of the
# observations of the official "train" subset as a test split and keep the
# remaining ones for training.
subset_size = 0
obs_list = list()
obs_test_list = list()

# running total of official-train observations over the selected species
m = 0
for y in np.unique(y_train):
    # the size filter counts every observation of the species (train and val)
    obs = df_obs.index[df_obs["species_id"] == y]

    if 2000 <= len(obs) <= 3000:
        t = set(df_obs.index[(df_obs["species_id"] == y) & (df_obs["subset"] == "train")].values)
        m += len(t)
        ten_perc = int(len(t) / 10)

        # re-seeded for every species so the held-out sample is reproducible
        random.seed(3)
        # random.sample() must be given a sequence: passing a set is deprecated
        # since Python 3.9 and raises TypeError from Python 3.11 on.
        test = random.sample(list(t), ten_perc)
        train = t - set(test)

        obs_test_list.append(list(test))
        obs_list.append(list(train))
        subset_size += 1

    if subset_size >= 30:
        break
print(m)

# flat arrays of the observation ids belonging to the selected species
obs_id_train = np.concatenate(obs_list)
obs_id_test = np.concatenate(obs_test_list)

# One (latitude, longitude) row per training observation, aligned with
# obs_id_train. (Concatenating the two columns end to end would give a flat
# array of length 2N in which row i no longer belongs to observation i.)
gps_train = df_obs.loc[obs_id_train, ["latitude", "longitude"]].values

# obtain the labels in the same order as the id arrays
y_train = df_obs.loc[obs_id_train]["species_id"].values
y_test = df_obs.loc[obs_id_test]["species_id"].values
print(y_train.size)

print()
print(y_test.size)
print(y_train[2])
print()

# Validation split: keep the official "val" observations of the selected species.
obs_list_1 = list()
print(y_val.size)

print(y_val[0])
for y in np.unique(y_val):
    # skip species that are not part of the reduced training set
    if y in y_train:
        v = df_obs.index[(df_obs["species_id"] == y) & (df_obs["subset"] == "val")].values
        obs_list_1.append(v)

# flat array of the validation observation ids, grouped by species
obs_id_val = np.concatenate(obs_list_1)

# obtain the labels and coordinates in the same order as obs_id_val
y_val = df_obs.loc[obs_id_val]["species_id"].values
gps_val = df_obs.loc[obs_id_val, ["latitude", "longitude"]].values

print(y_val.size)

# Sanity check: prints True if any validation id also appears among the
# training ids. (Comparing the two arrays with == is not an overlap test: they
# have different lengths, so numpy cannot compare them element by element.)
print(np.isin(obs_id_val, obs_id_train).any())

70294
63278

7016
5

40080
11
1831
False


  print(obs_id_val == obs_id_train)


### Remap the labels s.t. they go from 0 to n-1
(NAN fix is here)

In [16]:
# Give every distinct training label a consecutive class index 0..n-1,
# following the sorted order returned by np.unique.
map_labels = {label: index for index, label in enumerate(np.unique(y_train))}
print(map_labels)

{5: 0, 68: 1, 125: 2, 129: 3, 223: 4, 553: 5, 570: 6, 605: 7, 679: 8, 692: 9, 739: 10, 979: 11, 1075: 12, 1155: 13, 1916: 14, 2196: 15, 2524: 16, 2821: 17, 2950: 18, 2992: 19, 3253: 20, 4940: 21, 4991: 22, 4999: 23, 5025: 24, 5086: 25, 5168: 26, 5192: 27, 5207: 28, 5228: 29}


In [17]:
# apply the map to the training labels
# NOTE: this cell overwrites y_train with the remapped labels, so it must not
# be run a second time without re-running the split cell first.
y_train_normalized = np.array([map_labels[label] for label in y_train], dtype='int64')

# Shuffle ids and labels with the same permutation. The ids were concatenated
# species by species, so without shuffling consecutive batches would contain a
# single class. A fixed random_state keeps the order identical between runs.
obs_id_train, y_train = shuffle(obs_id_train, y_train_normalized, random_state=3)

print("# output neurons: ", len(np.unique(y_train)))

# output neurons:  30


In [18]:
# apply the map to the validation labels
# NOTE: this cell overwrites y_val with the remapped labels, so it must not be
# run a second time without re-running the split cell first.
y_val_normalized = np.array([map_labels[label] for label in y_val], dtype='int64')

# Shuffle ids and labels with the same permutation; the fixed random_state
# keeps the order identical between runs.
obs_id_val, y_val = shuffle(obs_id_val, y_val_normalized, random_state=3)

## Write input pipeline to load batches as we train.

### Custom Generator
Since the dataset is too large to load into memory all at once, we need to load it from disk in batches as we train. Such a generator can later be passed to model.fit() in place of a training and/or validation dataset.

In [19]:
class Patches_Generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads patches from disk one batch at a time.

    Parameters
    ----------
    obs_ids : indexable collection of observation ids, handed to ``load_patch``.
    labels : indexable collection of labels, aligned with ``obs_ids``.
    batch_size : number of observations per batch (the last batch may be smaller).
    """

    def __init__(self, obs_ids, labels, batch_size):
        self.obs_ids = obs_ids
        self.labels = labels
        self.batch_size = batch_size

        # lock held while a finished batch is handed back
        # (intended to make the generator thread safe)
        self.lock = threading.Lock()

    def __len__(self):
        """Return the number of batches, rounding up for a partial last batch."""
        n_batches = np.ceil(len(self.obs_ids) / float(self.batch_size))
        return n_batches.astype(int)

    def __getitem__(self, idx):
        """Load batch number ``idx`` and return it as ``(X, y)`` numpy arrays."""
        first = idx * self.batch_size
        # never read past the end of the id list (short final batch)
        last = min(first + self.batch_size, len(self.obs_ids))

        X_batch = list()
        y_batch = list()
        for position in range(first, last):
            # load_patch returns an indexable result; only its first element is
            # used here (presumably the RGB image, given data='rgb' - confirm
            # against the GLC loader)
            X_batch.append(load_patch(self.obs_ids[position], DATA_PATH, data='rgb')[0])
            y_batch.append(self.labels[position])

        with self.lock:
            return np.asarray(X_batch), np.array(y_batch)

In [18]:
# class GPS_Generator(tf.keras.utils.Sequence) :
  
#     def __init__(self, obs_ids, gps, labels, batch_size) :
#         self.obs_ids = obs_ids
#         self.gps = gps
#         self.labels = labels
#         self.batch_size = batch_size
        
#         # to make the generator thread safe 
#         self.lock = threading.Lock()

#     def __len__(self) :
#         return (np.ceil(len(self.gps) / float(self.batch_size))).astype(int)
  
#     # returns one batch
#     def __getitem__(self, idx) :
#         X_batch_patch = list()
#         X_batch_gps = list()
#         y_batch = list()

#         for i in range(idx * self.batch_size, (idx+1) * self.batch_size):
#             if i >= len(self.gps): break

#             patch = load_patch(self.obs_ids[i], DATA_PATH, data='rgb')
#             X_batch_patch.append(patch[0])
#             X_batch_gps.append(self.gps[i])
#             y_batch.append(self.labels[i])

#         with self.lock:
#             return [np.asarray(X_batch_patch), np.asarray(X_batch_gps)], np.array(y_batch)

# First Simple Neural Network
Let's create a first neural network as a baseline to see how it performs.

In [19]:
# # for distributed training (that is, using multiple GPUs for data parallelization)
# # https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit
# mirrored_strategy = tf.distribute.MirroredStrategy()

In [20]:
# # returns a simple convolutional neural net
# def non_simple_model(input_shape, learning_rate=0.1, output_neurons=100):
    
#     # for distributed training
#     with mirrored_strategy.scope():
    
#         model = tf.keras.models.Sequential()
        
#         he = tf.keras.initializers.HeNormal(seed=42)

#         # 1. Preprocessing
#         # rescale inputs to distribution with mean = 0 and variance = 1
#         model.add(tf.keras.layers.Normalization())

#         # 2. Convolutional Layers
#         model.add(Conv2D(32, kernel_size=4, activation='relu', padding='valid', input_shape=input_shape,
#                          kernel_initializer=he))
#         model.add(MaxPooling2D())

#         model.add(Conv2D(64, kernel_size=4, activation='relu', padding='valid',
#                          kernel_initializer=he))
#         model.add(MaxPooling2D())
        
#         model.add(Dropout(0.3))

#         model.add(Conv2D(128, kernel_size=4, activation='relu', padding='valid',
#                          kernel_initializer=he))
#         model.add(MaxPooling2D())

#         model.add(Conv2D(128, kernel_size=4, activation='relu', padding='valid',
#                          kernel_initializer=he))
#         model.add(Dropout(0.3))

#         # from convolutional layers to dense layers
#         model.add(tf.keras.layers.Flatten())

#         # 3. Dense Layers
#         model.add(Dense(64, activation='relu', kernel_initializer=he))
#         model.add(Dropout(0.3))
#         model.add(Dense(128, activation='relu', kernel_initializer=he))
#         model.add(Dropout(0.3))

#         # 4. Output Layer
#         model.add(Dense(output_neurons, activation='softmax'))
    
#     # compile the model
#     model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
#                   optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#                   metrics=['accuracy'])
    
#     return model

In [20]:
# returns a small CNN: three 5x5 ReLU convolutions followed by two dense ReLU layers
def simple_model(input_shape, learning_rate=0.0001, output_neurons=30):
    """Build and compile the baseline convolutional classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample without the batch dimension,
        e.g. ``(256, 256, 3)``.
    learning_rate : float or learning-rate schedule, optional
        Passed to the Adam optimizer. Defaults to ``0.0001``.
    output_neurons : int, optional
        Number of classes, i.e. units of the softmax output layer.
        Defaults to ``30``.

    Returns
    -------
    tf.keras.models.Sequential
        Model compiled with sparse categorical cross-entropy, Adam and the
        ``accuracy`` metric. Labels are therefore expected as integer class ids.
    """
    random.seed(4)
    # random.seed() only affects Python's own generator. Seed TensorFlow as
    # well so that the initial weights are the same on every run.
    tf.random.set_seed(4)

    model = tf.keras.models.Sequential()

    # Declare the input up front. An `input_shape` argument on a layer that is
    # not the first one is ignored by Sequential, which would leave the model
    # unbuilt (e.g. no model.summary()) until the first batch arrives.
    model.add(tf.keras.Input(shape=input_shape))

    # 1. Preprocessing
    # scale inputs by 1/255 (assumes 8-bit pixel values - confirm for new data)
    model.add(tf.keras.layers.Rescaling(1./255))

    # 2. Convolutional Layers
    # 'same' padding and no pooling: the spatial size is kept throughout
    model.add(Conv2D(32, kernel_size=5, activation='relu', padding='same'))
    #model.add(AveragePooling2D())

    model.add(Conv2D(64, kernel_size=5, activation='relu', padding='same'))
    #model.add(AveragePooling2D())

    model.add(Conv2D(128, kernel_size=5, activation='relu', padding='same'))

    # from convolutional layers to dense layers
    model.add(tf.keras.layers.Flatten())

    # 3. Dense Layers
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))

    # 4. Output Layer
    model.add(Dense(output_neurons, activation='softmax'))

    # compile the model
    model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])

    return model

In [22]:
# # returns a simple convolutional neural net
# def simple_model_with_gps(input_shapes, learning_rate=0.1, output_neurons=100):
    
#     # settings
#     he = tf.keras.initializers.HeNormal(seed=42)
    
#     # for distributed training
#     with mirrored_strategy.scope():
        
#         ### convolutional part ###
#         conv_inputs = tf.keras.Input(shape=input_shapes[0])
#         conv_processed = tf.keras.layers.Normalization()(conv_inputs)
        
#         conv1 = Conv2D(32, kernel_size=3, activation='relu', padding='valid', kernel_initializer=he)(conv_processed)
#         mconv1 = MaxPooling2D()(conv1)
        
#         conv2 = Conv2D(64, kernel_size=3, activation='relu', padding='valid', kernel_initializer=he)(mconv1)
#         mconv2 = MaxPooling2D()(conv2)
        
#         conv3 = Conv2D(128, kernel_size=3, activation='relu', padding='valid', kernel_initializer=he)(mconv2)
#         mconv3 = MaxPooling2D()(conv3)
        
#         conv4 = Conv2D(128, kernel_size=3, activation='relu', padding='valid', kernel_initializer=he)(mconv3)

#         # from convolutional layers to dense layers
#         conv_flat = tf.keras.layers.Flatten()(conv4)
        
#         ### gps part ###
#         gps_inputs = tf.keras.Input(shape=input_shapes[1])
#         gps_processed = tf.keras.layers.Normalization()(gps_inputs)
#         gps_dense1 = Dense(16, activation='relu', kernel_initializer=he)(gps_processed)
#         gps_dense2 = Dense(32, activation='relu', kernel_initializer=he)(gps_dense1)
#         gps_dense3 = Dense(32, activation='relu', kernel_initializer=he)(gps_dense2)
#         gps_flat = tf.keras.layers.Flatten()(gps_dense3)

#         ### together + output layer ###
#         merged = tf.keras.layers.Concatenate()([conv_flat, gps_flat])
#         dense_1 = Dense(64, activation='relu', kernel_initializer=he)(merged)
#         output = Dense(output_neurons, activation='softmax')(dense_1)
        
#         model = tf.keras.Model([conv_inputs, gps_inputs], output)
    
#     # compile the model
#     model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
#                   optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#                   metrics=['accuracy'])
    
#     return model

## Training Prep

### Settings used when parallelizing the I/O Pipeline in model.fit().

I used the setting values in the cell below for a high-cpu vm on GCP with the following specs:
- machine type: n1-highcpu-96 (96 CPU cores)
- vCPUs to core ratio: 2 vCPUs per core (making a theoretical max value for num_threads of 96 * 2 = 192)
- 4 x NVIDIA Tesla T4 GPUs

In [21]:
# Number of samples per batch. Remember to tune the learning rate accordingly.
BATCHSIZE = 64

# Number of workers that load batches in parallel.
# The maximum value is dependent on the amount of CPU cores:
# amount of CPU cores * vCPUs to core ratio = theoretical max of NUM_THREADS
NUM_THREADS = 11

# Number of batches to prefetch (20 per worker). The more batches we prefetch,
# the less idle the GPUs will be.
# To check GPU usage:
# 1. Run nvidia-smi -l 1 from the terminal to monitor the GPU usage during training.
# 2. Try to get close to 100% for all GPUs by adjusting the value below (and the
#    two above). Due to the overhead from tf.distribute.MirroredStrategy(), you
#    won't be able to consistently get 100% for all GPUs. But try to get close.
# 3. Be aware that RAM limits the amount of batches you can prefetch.
PRE_FETCH_NUM_BATCHES = NUM_THREADS * 20

### Generators
Create generators that will read training / validation data from disk during training.

In [22]:
# batches of training patches, read from disk on demand
train_generator = Patches_Generator(obs_ids=obs_id_train, labels=y_train, batch_size=BATCHSIZE)
#train_gps_generator = GPS_Generator(obs_id_train, gps_train, y_train, BATCHSIZE)

In [23]:
# batches of validation patches, read from disk on demand
val_generator = Patches_Generator(obs_ids=obs_id_val, labels=y_val, batch_size=BATCHSIZE)

### Cyclic learning rate
Tune the cyclic learning rate prior to training.

In [26]:
# # code source: https://pyimagesearch.com/2019/08/05/keras-learning-rate-finder/

# class LearningRateFinder:
#     def __init__(self, model, stopFactor=4, beta=0.98):
#         # store the model, stop factor, and beta value (for computing
#         # a smoothed, average loss)
#         self.model = model
#         self.stopFactor = stopFactor
#         self.beta = beta
        
#         # initialize our list of learning rates and losses,
#         # respectively
#         self.lrs = []
#         self.losses = []
        
#         # initialize our learning rate multiplier, average loss, best
#         # loss found thus far, current batch number, and weights file
#         self.lrMult = 1
#         self.avgLoss = 0
#         self.bestLoss = 1e9
#         self.batchNum = 0
#         self.weightsFile = None
        
#     def on_batch_end(self, batch, logs):
#         # grab the current learning rate and add log it to the list of
#         # learning rates that we've tried
#         lr = K.get_value(self.model.optimizer.lr)
#         self.lrs.append(lr)
        
#         # grab the loss at the end of this batch, increment the total
#         # number of batches processed, compute the average average
#         # loss, smooth it, and update the losses list with the
#         # smoothed value
#         l = logs["loss"]
#         self.batchNum += 1
#         self.avgLoss = (self.beta * self.avgLoss) + ((1 - self.beta) * l)
#         smooth = self.avgLoss / (1 - (self.beta ** self.batchNum))
#         self.losses.append(smooth)
        
#         # compute the maximum loss stopping factor value
#         stopLoss = self.stopFactor * self.bestLoss
        
#         # check to see whether the loss has grown too large
#         if self.batchNum > 1 and smooth > stopLoss:
#             # stop returning and return from the method
#             self.model.stop_training = True
#             return
        
#         # check to see if the best loss should be updated
#         if self.batchNum == 1 or smooth < self.bestLoss:
#             self.bestLoss = smooth
            
#         # increase the learning rate
#         lr *= self.lrMult
#         K.set_value(self.model.optimizer.lr, lr)
        
#     def find(self, trainData, startLR, endLR, epochs=None,
#         stepsPerEpoch=None, batchSize=64, sampleSize=2048):
            
#         # compute the total number of batch updates that will take
#         # place while we are attempting to find a good starting
#         # learning rate
#         numBatchUpdates = epochs * stepsPerEpoch
        
#         # derive the learning rate multiplier based on the ending
#         # learning rate, starting learning rate, and total number of
#         # batch updates
#         self.lrMult = (endLR / startLR) ** (1.0 / numBatchUpdates)
        
#         # grab the *original* learning rate (so we can reset it
#         # later), and then set the *starting* learning rate
#         origLR = K.get_value(self.model.optimizer.lr)
#         K.set_value(self.model.optimizer.lr, startLR)

#         # construct a callback that will be called at the end of each
#         # batch, enabling us to increase our learning rate as training
#         # progresses
#         callback = LambdaCallback(on_batch_end=lambda batch, logs:
#             self.on_batch_end(batch, logs))
        
#         # train our model using Keras' fit method
#         """self.model.fit(
#             trainData,
#             batch_size=batchSize,
#             epochs=epochs,
#             callbacks=[callback])"""
        
#         history = self.model.fit(trainData, 
#                             batch_size=batchSize,
#                             epochs=epochs, 
#                             steps_per_epoch=stepsPerEpoch,
#                             callbacks=[callback], 
#                             # for parallelization of reading from disk (I/O) pipeline
#                             max_queue_size=PRE_FETCH_NUM_BATCHES, 
#                             workers=NUM_THREADS, 
#                             use_multiprocessing=True)
        
#         # create a temporary file path for the model weights and
#         # then save the weights (so we can reset the weights when we
#         # are done)
#         self.weightsFile = tempfile.mkstemp()[1]
#         self.model.save_weights(self.weightsFile)
            
#         # restore the original model weights and learning rate
#         self.model.load_weights(self.weightsFile)
#         K.set_value(self.model.optimizer.lr, origLR)

#     def plot_loss(self, skipBegin=10, skipEnd=1, title="", show_candidates_only=True):
#         # grab the learning rate and losses values to plot
#         lrs = self.lrs[skipBegin:-skipEnd]
#         losses = self.losses[skipBegin:-skipEnd]
        
#         # extract the candidate losses
#         candidates = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e+0, 1e+1]

#         x_vals = list()
#         y_vals = list()
#         nextCandidate = 0
        
#         for i in range(len(lrs)):
#             if math.isclose(lrs[i], candidates[nextCandidate], rel_tol=0.09, abs_tol=0):
#                 x_vals.append(candidates[nextCandidate])
#                 y_vals.append(losses[i])
#                 nextCandidate += 1
            
#             if nextCandidate >= len(candidates): 
#                 break
                
#         x_vals.pop()
#         y_vals.pop()
#         x_vals.append(candidates[nextCandidate-1])
#         y_vals.append(losses[-1])
        
#         # plot the learning rate vs. loss
#         if show_candidates_only:
#             plt.title("10 Candidate Learning Rates")
#             plt.plot(x_vals, y_vals)
#         else:
#             plt.title("All Analyzed Learning Rates")
#             plt.plot(lrs, losses)
        
#         plt.xscale("log")
#         plt.xlabel("Learning Rate (Log Scale)")
#         plt.ylabel("Loss")
#         plt.grid()
#         plt.xticks(candidates)
        
#         # if the title is not empty, add it to the plot
#         if title != "":
#             plt.title(title)
            
#         plt.show()

In [30]:

# # uncomment if you want to analyze which upper and lower bound for the cyclical learning rate policy to use

# ###### initialize model
# model_lr = simple_model((256, 256, 3), learning_rate=1e-9)

# # initialize the learning rate finder and then train with learning
# # rates ranging from 1e-10 to 1e+1
# print("[INFO] finding learning rate...")
# lrf = LearningRateFinder(model_lr)
# lrf.find(train_generator,
#          1e-10, 1e+2,
#          stepsPerEpoch=np.ceil(len(y_train)/BATCHSIZE),
#          epochs=5, 
#          batchSize=BATCHSIZE)


In [28]:
# plot the loss for the candidate learning rates
#lrf.plot_loss()

In [29]:
# plot the loss for the various learning rates
#lrf.plot_loss(show_candidates_only=False)

#### From that figure identify the values of lrmin and lrmax.

I have identified them as follows:

    lr_min = 1e-5
    lr_max = 1e-2

### Training

Use the cyclical learning rate policy (with exponential decay).

In [31]:
# STEPS_PER_EPOCH = len(y_train)//BATCHSIZE

In [32]:
# # define cyclical learning rate policy
# lr_min = 1e-9
# lr_max = 1e-3

# clr = tfa.optimizers.CyclicalLearningRate(
#         initial_learning_rate=lr_min,
#         maximal_learning_rate=lr_max,
#         scale_fn=lambda x: 1/(2.**(x-1)), # exponential decay
#         step_size=2 * STEPS_PER_EPOCH
#       )

Create the model.

In [24]:
# create the network
#model = simple_model((256, 256, 3), learning_rate=clr, output_neurons=len(np.unique(y_train)))
#model = simple_model_with_gps([(256, 256, 3), np.shape(gps)], learning_rate=clr, output_neurons=len(np.unique(y_train)))

# baseline CNN for inputs of shape (256, 256, 3)
model = simple_model(input_shape=(256, 256, 3))


Train the network.

In [25]:
# # define an early stopping callback for when model converges
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta=0.001, patience=3, 
#                                               verbose=0, mode='auto', baseline=None, restore_best_weights=True)

In [26]:
# Stop once the training loss ('loss') has not improved by at least 0.001 for
# 5 consecutive epochs, then roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.001,
    patience=5,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)


In [None]:
# Train for at most 100 epochs; early_stop may end the run sooner.
history = model.fit(
    train_generator,
    epochs=100,
    callbacks=[early_stop],
    # steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=val_generator,
    # for parallelization of reading from disk (I/O) pipeline
    max_queue_size=PRE_FETCH_NUM_BATCHES,
    workers=NUM_THREADS,
    use_multiprocessing=True,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

##### model.save('simple_cnn_kennedydata')