### Necessary Imports and Installs

In [None]:
#!pip install opendatasets

In [None]:
#!pip install cartopy

In [1]:
!rm -rf GLC
!git clone https://github.com/maximiliense/GLC

Cloning into 'GLC'...
remote: Enumerating objects: 383, done.[K
remote: Counting objects: 100% (228/228), done.[K
remote: Compressing objects: 100% (159/159), done.[K
remote: Total 383 (delta 119), reused 170 (delta 63), pack-reused 155[K
Receiving objects: 100% (383/383), 10.57 MiB | 35.02 MiB/s, done.
Resolving deltas: 100% (205/205), done.


In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Add, GlobalAveragePooling2D, Conv2D, Dense, AveragePooling2D, \
BatchNormalization, Normalization, Dropout, Flatten, Lambda, Input, Activation
from tensorflow.keras import Model
from tensorflow.keras.optimizers import schedules, SGD
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import backend as K
#import tensorflow_datasets as tfds

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import scale
import time
from collections import defaultdict
import math
import copy
import threading
import opendatasets as od

%pylab inline --no-import-all
from pathlib import Path
import pandas as pd
import sys

from GLC.data_loading.common import load_patch

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-th2cd152 because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


# Dataset

## Download Dataset

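Kaggle API credentials are required for the download. Do not paste them into the notebook: put your own `kaggle.json` (created under *Kaggle → Account → Create New API Token*) next to this notebook, or enter the username and key when `opendatasets` prompts for them. The API key that was previously embedded here has been removed and should be revoked.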

In [None]:
#data = od.download("https://www.kaggle.com/competitions/geolifeclef-2022-lifeclef-2022-fgvc9")

## Load Dataset from file

In [2]:
# Change this path to adapt to where you downloaded the data
DATA_PATH = Path("./geolifeclef-2022-lifeclef-2022-fgvc9/")

In [3]:
ls -L $DATA_PATH

[0m[01;34mmetadata[0m/      [01;34mpatches-fr[0m/  [01;34mpatches_sample[0m/  [01;34mrasters[0m/
[01;34mobservations[0m/  [01;34mpatches-us[0m/  [01;34mpre-extracted[0m/   sample_submission.csv


In [4]:
ls $DATA_PATH/observations

observations_fr_test.csv   observations_us_test.csv
observations_fr_train.csv  observations_us_train.csv


In [5]:
### Training Dataset ###
# Read the French and the US training observations. Both CSV files are
# ';'-separated and are indexed by their `observation_id` column.
df_obs_fr = pd.read_csv(
    Path(DATA_PATH, "observations", "observations_fr_train.csv"),
    index_col="observation_id",
    sep=";",
)
df_obs_us = pd.read_csv(
    Path(DATA_PATH, "observations", "observations_us_train.csv"),
    index_col="observation_id",
    sep=";",
)

# Stack both regions into a single frame.
df_obs = pd.concat([df_obs_fr, df_obs_us])

print("Number of observations for training: {}".format(df_obs.shape[0]))

# Preview the first rows.
df_obs.head()

Number of observations for training: 1627475


Unnamed: 0_level_0,latitude,longitude,species_id,subset
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10561949,45.705116,1.424622,241,train
10131188,45.146973,6.416794,101,train
10799362,46.783695,-2.072855,700,train
10392536,48.604866,-2.825003,1456,train
10335049,48.815567,-0.161431,157,train


In [6]:
### Test Dataset ###
df_obs_fr_test = pd.read_csv(DATA_PATH / "observations" / "observations_fr_test.csv", sep=";", index_col="observation_id")
df_obs_us_test = pd.read_csv(DATA_PATH / "observations" / "observations_us_test.csv", sep=";", index_col="observation_id")

df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))

print("Number of observations for testing: {}".format(len(df_obs_test)))

df_obs_test.head()

Number of observations for testing: 36421


Unnamed: 0_level_0,latitude,longitude
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10782781,43.601788,6.940195
10364138,46.241711,0.683586
10692017,45.181095,1.533459
10222322,46.93845,5.298678
10241950,45.017433,0.960736


In [7]:
# Read the suggested land-cover alignment table and keep its
# `suggested_landcover_code` column as a plain array; it is passed to
# `load_patch` as `landcover_mapping` further down.
df_suggested_landcover_alignment = pd.read_csv(
    Path(DATA_PATH, "metadata", "landcover_suggested_alignment.csv"),
    sep=";",
)
print(df_suggested_landcover_alignment.head())
landcover_mapping = df_suggested_landcover_alignment.loc[:, "suggested_landcover_code"].values

   landcover_code  suggested_landcover_code suggested_landcover_label
0               0                         0              Missing Data
1               1                        11          Cultivated Crops
2               2                        11          Cultivated Crops
3               3                         6       Broad-leaved Forest
4               4                         7         Coniferous Forest


### Train/Val Split Labels
Retrieve the train/val split provided.

In [8]:
# Observation ids belonging to the provided "train" and "val" subsets.
obs_id_train, obs_id_val = (
    df_obs.index[df_obs["subset"] == subset_name].values
    for subset_name in ("train", "val")
)

# Only the first third of the training ids is kept.
obs_id_train = obs_id_train[:len(obs_id_train) // 3]

# Species labels aligned with the selected observation ids.
y_train = df_obs.loc[obs_id_train, "species_id"].values
y_val = df_obs.loc[obs_id_val, "species_id"].values

n_val = len(obs_id_val)
# Both percentages are relative to all rows of df_obs (train + val).
print("Training set size: {} ({:.1%} of train observations)".format(len(y_train), len(y_train) / len(df_obs)))
print("Validation set size: {} ({:.1%} of train observations)".format(n_val, n_val / len(df_obs)))

Training set size: 529131 (32.5% of train observations)
Validation set size: 40080 (2.5% of train observations)


In [9]:
print(len(obs_id_train))

529131


In [10]:
len(obs_id_train[int(1587395*0.99986):])

0

In [11]:
len(np.unique(y_train))

4810

In [14]:
# Load the RGB training patches into one pre-allocated in-memory array.
# factor = 1 means load the full training dataset
# factor = 100 means load 1/100 of the full dataset
factor = 1
last_print = time.time()

# Number of samples that are actually loaded. The labels belonging to them
# are y_train[:n_load].
n_load = math.ceil(len(obs_id_train) / factor)

X_train = None
for i in range(n_load):
    obs_id = obs_id_train[i]

    patch = load_patch(obs_id, DATA_PATH, landcover_mapping=landcover_mapping, data='rgb')
    rgb = np.asarray(patch[0])

    if X_train is None:
        # Allocate lazily with the patch's own shape and dtype. A float64
        # buffer of shape (N, 256, 256, 3) costs 8 bytes per value, which is
        # far more than the stored pixels need.
        # NOTE(review): presumably the RGB patches are uint8 -- confirm.
        X_train = np.zeros((n_load,) + rgb.shape, dtype=rgb.dtype)
    X_train[i] = rgb

    # Refresh the progress bar at most once per second.
    if time.time() - last_print > 1:
        percent_progress = (i + 1) / n_load * 100
        sys.stdout.write('\r')
        sys.stdout.write("[%-20s] %.3f%%" % ('='*int(percent_progress/5), percent_progress))
        sys.stdout.flush()
        last_print = time.time()
print("done with loading")

if X_train is None:
    # Nothing was loaded (empty id list): keep X_train a valid, empty array.
    X_train = np.zeros((0, 256, 256, 3))

print("sample array shape: ", np.shape(X_train))
print("label array shape: ", np.shape(y_train[:n_load]))

[=====               ] 29.271%

KeyboardInterrupt: 

In [None]:
print(len(X_train))

In [None]:
# Pair every loaded patch with its label and group the pairs into batches of 64.
# The labels are cut to the number of loaded samples so that both tensors
# always have the same first dimension.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train[:len(X_train)]))
train_ds = train_ds.batch(64)

### Write patches and labels to TFRecord
This only needs to be done once to obtain a TFRecord file. This file then can be reused during future runs.

In [None]:
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature (bytes_list)."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList cannot unpack the string held by an EagerTensor; extract it first.
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a single float / double value in a tf.train.Feature (float_list)."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint value in a tf.train.Feature (int64_list)."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

In [None]:
# Create a dictionary with features that may be relevant.
def image_tfexample(img, label):
    """Build a tf.train.Example holding one image array and its label.

    The first three entries of `img.shape` are stored as `height`, `width`
    and `depth`, `label` is stored as an int64 feature, and the array's raw
    buffer (`img.tobytes()`) is stored as `image_raw`.
    """
    dims = img.shape
    height, width, depth = dims[0], dims[1], dims[2]

    features = tf.train.Features(feature={
      'height': _int64_feature(height),
      'width': _int64_feature(width),
      'depth': _int64_feature(depth),
      'label': _int64_feature(label),
      'image_raw': _bytes_feature(img.tobytes()),
    })

    return tf.train.Example(features=features)

In [None]:
# Write the raw image files to a TFRecord file (record_file).
# First, process all images into `tf.train.Example` messages.
# Then, write to a `.tfrecords` file.   
def writeToTFRecord(obs_ids, labels, record_file, max_records=10001):
    """Write RGB patches and their labels to a GZIP-compressed TFRecord file.

    Args:
        obs_ids: observation ids whose patches are loaded with `load_patch`.
        labels: labels paired position-wise with `obs_ids`.
        record_file: path of the TFRecord file to create.
        max_records: upper bound on the number of records written. The
            default keeps the previously hard-coded limit of 10001 records;
            pass None to write every (obs_id, label) pair.

    Returns:
        The number of records written. Readers of the file must pass
        compression_type='GZIP'.
    """
    # zip() stops at the shorter input, so that is the real number of pairs.
    n_pairs = min(len(obs_ids), len(labels))
    total = n_pairs if max_records is None else min(n_pairs, max_records)

    counter = 0
    last_print = time.time()

    options = tf.io.TFRecordOptions(compression_type = 'GZIP')

    with tf.io.TFRecordWriter(record_file, options=options) as writer:
        for obs_id, label in zip(obs_ids, labels):
            if counter >= total:
                break

            # Only the RGB patch is serialized, so only that modality is loaded
            # (same call convention as the in-memory loading cell).
            patch = load_patch(obs_id, DATA_PATH, data='rgb')

            # convert single rgb image + label to tf example
            tf_example = image_tfexample(patch[0], label)

            # write to records file
            writer.write(tf_example.SerializeToString())
            counter += 1

            # Refresh the progress bar at most once per second; the percentage
            # is relative to the number of records that will be written.
            if time.time() - last_print > 1:
                percent_progress = counter/total * 100
                sys.stdout.write('\r')
                sys.stdout.write("[%-20s] %.3f%%" % ('='*int(percent_progress/5), percent_progress))
                sys.stdout.flush()
                last_print = time.time()

    return counter

#### Training Set

In [None]:
np.shape(y_train)

In [None]:
# check whether it has the same length as y_train
np.shape(obs_id_train)

In [None]:
# write training dataset samples to TFrecord
train_record_amount = writeToTFRecord(obs_id_train, y_train, record_file = 'training.tfrecords')

#### Validation set

In [None]:
# write validation dataset samples to TFrecord
val_record_amount = writeToTFRecord(obs_id_val, y_val, record_file = 'validation.tfrecords')

## Playing Around.

Convert data to TFRecords file.

In [None]:
# Create a dictionary with features that may be relevant.
def image_tfexample(img, label):
    """Build a tf.train.Example holding one image array and its label.

    This redefines (and therefore replaces) the `image_tfexample` defined
    earlier in the notebook, so both definitions must serialize the image the
    same way.

    The first three entries of `img.shape` are stored as `height`, `width`
    and `depth`, `label` is stored as an int64 feature, and the array's raw
    buffer is stored as `image_raw`.
    """
    image_shape = img.shape

    feature = {
      'height': _int64_feature(image_shape[0]),
      'width': _int64_feature(image_shape[1]),
      'depth': _int64_feature(image_shape[2]),
      'label': _int64_feature(label),
      # A BytesList holds bytes, not an ndarray: serialize the raw buffer.
      'image_raw': _bytes_feature(img.tobytes()),
    }

    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
# Write the in-memory training images to `X_train.tfrecords`.
# Each (image, label) pair is converted into a `tf.train.Example` message
# and appended to the `.tfrecords` file.

record_file = 'X_train.tfrecords'
with tf.io.TFRecordWriter(record_file) as writer:
    for img, label in zip(X_train, y_train):

        # convert single image + label to tf example
        tf_example = image_tfexample(img, label)

        # write to records file
        writer.write(tf_example.SerializeToString())

Read first few images from records file to ensure it worked.

In [None]:
# Schema of one serialized example: four scalar int64 features plus the
# scalar string feature holding the image bytes.
image_feature_description = {
    name: tf.io.FixedLenFeature([], tf.int64)
    for name in ('height', 'width', 'depth', 'label')
}
image_feature_description['image_raw'] = tf.io.FixedLenFeature([], tf.string)

def _parse_image_function(example_proto):
  """Parse one serialized tf.train.Example into a dict of tensors keyed like the schema above."""
  parsed_example = tf.io.parse_single_example(example_proto, image_feature_description)
  return parsed_example

In [None]:
image_records = tf.data.TFRecordDataset(record_file)
parsed_image_records = image_records.map(_parse_image_function)

In [None]:
# Show only the first few records instead of walking the whole file.
for record in parsed_image_records.take(3):
  shape = (int(record['height']), int(record['width']), int(record['depth']))
  # `image_raw` is filled from the array's raw buffer (`img.tobytes()` in the
  # first `image_tfexample`), not from an encoded JPEG/PNG, so it has to be
  # rebuilt with the dtype of the array it was written from (X_train).
  pixels = np.frombuffer(record['image_raw'].numpy(), dtype=X_train.dtype).reshape(shape)
  plt.figure()
  # NOTE(review): assumes pixel values lie in 0..255 -- confirm.
  plt.imshow(pixels.astype(np.uint8))
  plt.title("label: {}".format(int(record['label'])))
  plt.axis('off')
  plt.show()

Load entire dataset in batches.

In [None]:
# Pipeline settings (placeholder values -- tune as needed).
batch_size = 64
buffer_size = 1024
num_epoch = 1

def _decode_example(example_proto):
  """Parse one serialized example into an (image, label) pair."""
  parsed = _parse_image_function(example_proto)
  # The image bytes are the raw buffer of the array they were written from
  # (X_train), so they are decoded with that array's dtype.
  image = tf.io.decode_raw(parsed['image_raw'], tf.as_dtype(X_train.dtype))
  image = tf.reshape(image, tf.stack([parsed['height'], parsed['width'], parsed['depth']]))
  return image, parsed['label']

# read file
dataset = tf.data.TFRecordDataset(record_file)

# parse each instance
dataset = dataset.map(_decode_example, num_parallel_calls=tf.data.AUTOTUNE)

# shuffle
dataset = dataset.shuffle(buffer_size)

# form batch and epoch
dataset = dataset.batch(batch_size)
dataset = dataset.repeat(num_epoch)

# get a batch (TF2 datasets are plain Python iterables)
x_batch, y_batch = next(iter(dataset))


## Write input pipeline to load batches as we train.
The dataset is so large that it doesn't fit into memory. We have to load each batch as we're training.

Plan (do this once for train and once for val):
1. Store entire dataset as TFRecord.
    - Convert each image to a TFExample as we're loading it from file. (Done)
    - Store image in TFRecord of respective (train/val) dataset. (Done)
2. Write custom generator which loads the batches from the TFRecord. 
    - https://medium.com/@mrgarg.rajat/training-on-large-datasets-that-dont-fit-in-memory-in-keras-60a974785d71
3. Pass the custom generator to `model.fit` (`fit_generator` is deprecated in TensorFlow 2; `fit` accepts a `Sequence` directly).

Implement performance speed-up: https://linuxtut.com/en/a7c31b08d2f76c886a92/ 

### Custom Generator
Since the dataset is too large to load into memory all at once, we need to load it from file in batches as we train.

In [None]:
# Feature schema of the TFRecord files. This repeats the definition from the
# "Playing Around" section.
image_feature_description = dict(
    height=tf.io.FixedLenFeature([], tf.int64),
    width=tf.io.FixedLenFeature([], tf.int64),
    depth=tf.io.FixedLenFeature([], tf.int64),
    label=tf.io.FixedLenFeature([], tf.int64),
    image_raw=tf.io.FixedLenFeature([], tf.string),
)

def _parse_image_function(example_proto):
  """Decode one serialized tf.train.Example according to image_feature_description."""
  return tf.io.parse_single_example(
      example_proto,
      image_feature_description,
  )

In [None]:
class TFRecords_Generator(tf.keras.utils.Sequence) :
    """Keras Sequence that serves (images, labels) batches from a TFRecord file.

    The file is expected to contain examples following
    `image_feature_description`, with `image_raw` holding the raw bytes of
    the image array.

    Args:
        record_filename: path of the TFRecord file.
        record_amount: number of records to serve from the file.
        batch_size: number of records per batch (the last batch may be smaller).
        compression_type: compression of the file; defaults to 'GZIP', which
            is what `writeToTFRecord` uses.
        image_dtype: dtype of the serialized image bytes.
            NOTE(review): assumes the patches were written as uint8 -- confirm
            against the dtype returned by `load_patch`.
    """

    def __init__(self, record_filename, record_amount, batch_size,
                 compression_type='GZIP', image_dtype=tf.uint8) :
        super().__init__()
        self.record_filename = record_filename
        self.record_amount = record_amount
        self.batch_size = batch_size
        self.compression_type = compression_type
        self.image_dtype = image_dtype

    def __len__(self) :
        """Number of batches per epoch."""
        return int(np.ceil(self.record_amount / float(self.batch_size)))

    def __getitem__(self, idx) :
        """Return batch `idx` as (images, labels) numpy arrays."""
        n_batches = len(self)
        if idx < 0:
            idx += n_batches
        if not 0 <= idx < n_batches:
            raise IndexError("batch index {} out of range".format(idx))

        start = idx * self.batch_size
        count = min(self.batch_size, self.record_amount - start)

        # TFRecord files only support sequential reads, so every call scans
        # the file up to `start`; this is correct for any access order but
        # gets slower for batches late in the file.
        serialized = tf.data.TFRecordDataset(self.record_filename,
                                             compression_type=self.compression_type) \
          .skip(start) \
          .take(count) \
          .batch(count)

        for batch in serialized:
            parsed = tf.io.parse_example(batch, image_feature_description)
            flat = tf.io.decode_raw(parsed['image_raw'], self.image_dtype)
            # All images in a batch are assumed to share the shape of the first one.
            shape = (-1,
                     int(parsed['height'][0]),
                     int(parsed['width'][0]),
                     int(parsed['depth'][0]))
            images = tf.reshape(flat, shape)
            return images.numpy(), parsed['label'].numpy()

        # The file holds fewer records than `record_amount` promised.
        raise IndexError("no records found for batch index {}".format(idx))

In [None]:
class Raw_Generator(tf.keras.utils.Sequence) :
    """Keras Sequence that loads RGB patches from disk one batch at a time.

    Args:
        obs_ids: observation ids; each one is passed to `load_patch`.
        labels: labels paired position-wise with `obs_ids`.
        batch_size: number of samples per batch (the last batch may be smaller).
    """

    def __init__(self, obs_ids, labels, batch_size) :
        super().__init__()
        self.obs_ids = obs_ids
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self) :
        """Number of batches per epoch."""
        return int(np.ceil(len(self.obs_ids) / float(self.batch_size)))

    def __getitem__(self, idx) :
        """Return batch `idx` as (patches, labels) numpy arrays."""
        n_samples = len(self.obs_ids)
        start = idx * self.batch_size
        if idx < 0 or start >= n_samples:
            raise IndexError("batch index {} out of range".format(idx))
        # Clamp the end of the slice: the last batch is usually partial, and
        # indexing past the end of obs_ids would raise an IndexError.
        stop = min(start + self.batch_size, n_samples)

        # Only the RGB patch is used, so only that modality is loaded
        # (same call convention as the in-memory loading cell).
        X_batch = [load_patch(self.obs_ids[i], DATA_PATH, data='rgb')[0]
                   for i in range(start, stop)]
        y_batch = [self.labels[i] for i in range(start, stop)]

        return np.asarray(X_batch), np.array(y_batch)

## Preprocess

# First Simple Neural Network
Let's create a first neural network as a baseline to see how it performs.

In [None]:
# for distributed training
# https://www.tensorflow.org/guide/distributed_training#use_tfdistributestrategy_with_keras_modelfit
mirrored_strategy = tf.distribute.MirroredStrategy()

In [None]:
# Returns a small compiled CNN classifier with LeNet-5-like layer sizes
# (conv layers with 6, 16 and 120 filters, then a dense layer of 84 units).
def simple_model(input_shape, num_classes=17031):
    """Build and compile the baseline CNN under `mirrored_strategy`.

    Args:
        input_shape: shape of one input image, e.g. (256, 256, 3).
        num_classes: number of units of the softmax output layer. The default
            keeps the previously hard-coded value.
            NOTE(review): the labels are `species_id` values fed to a sparse
            categorical cross-entropy, so this must be larger than the biggest
            species_id; an earlier disabled draft of this model used 17038 --
            verify against the data.

    Returns:
        The compiled tf.keras Sequential model (Adam, learning rate 0.001,
        sparse categorical cross-entropy, accuracy metric).
    """
    # for distributed training: the model is both created and compiled inside
    # the strategy scope.
    with mirrored_strategy.scope():
        model = tf.keras.models.Sequential()
        model.add(Conv2D(6, 5, activation='tanh', input_shape=input_shape))
        model.add(AveragePooling2D(2))
        model.add(tf.keras.layers.Activation('sigmoid'))
        model.add(Conv2D(16, 5, activation='tanh'))
        model.add(AveragePooling2D(2))
        model.add(tf.keras.layers.Activation('sigmoid'))
        model.add(Conv2D(120, 5, activation='tanh'))
        model.add(Flatten())
        model.add(Dense(84, activation='tanh'))
        model.add(Dense(num_classes, activation='softmax'))

        # compile the model
        model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      metrics=['accuracy'])

    return model

In [None]:
# create the network
model = simple_model((256, 256, 3))

Train the network.

In [None]:
BATCHSIZE = 128

In [None]:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.001, patience=5, 
                                              verbose=0, mode='auto', baseline=None, restore_best_weights=True)

In [None]:
generator = Raw_Generator(obs_id_train, y_train, BATCHSIZE)

In [None]:
#history = model.fit(generator, epochs=100, callbacks=[early_stop])
history = model.fit(train_ds, epochs=100, callbacks=[early_stop])

In [None]:
model.save('first_simple_model')