In [1]:
#!/usr/bin/python

# Connect GCE service account to Earth engine API
# Note: Accessing EE Api through Cloud requires connecting your service account through a JSON Key
# https://gis.stackexchange.com/questions/350527/authentication-issue-earth-engine-python-using-ee-serviceaccountcredentials

# Google Cloud Project ID: ee-lakex055
# Google Cloud Project Name: Spurge EE New Cloud ProjectUMN

import ee
# Service-account e-mail and the JSON key file used to authenticate it.
service_account = 'spurge-demography-earthengine@ee-lakex055.iam.gserviceaccount.com'
key_file = '/home/moeller/lakex055/LeafySpurgeDemography/jsonKeys/ee-lakex055-527940b5071a.json'

# Build the credentials object, then start the Earth Engine session with it.
credentials = ee.ServiceAccountCredentials(service_account, key_file)
ee.Initialize(credentials)


In [2]:
# Other module imports

import os
import pandas as pd
import numpy as np
import datetime
import pprint
import time
from functools import reduce
from pprint import pprint
import geemap #advanced python function for GEE
import fsspec # file system specification
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [3]:
# Tensorflow & Keras setup.

# Tensorflow setup.
import tensorflow as tf
print(tf.__version__)

# Keras setup.
import keras
from keras import layers
from keras import backend as K
from keras import regularizers
from keras import optimizers
from keras.regularizers import l2
from keras.layers import Input, Dense, Activation, BatchNormalization, Dropout, Flatten, Lambda, SpatialDropout1D, Concatenate, Flatten
from keras.layers import Conv1D, Conv2D, AveragePooling1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.callbacks import Callback, ModelCheckpoint, History, EarlyStopping
from keras.models import Model, load_model
from keras.utils.np_utils import to_categorical


2.9.1


In [4]:
# File-name prefixes of the exported training, testing, and validation files.
TRAIN_FILE_PREFIX = 'Training_nlcd2019'
TEST_FILE_PREFIX = 'Testing_nlcd2019'
VALID_FILE_PREFIX = 'Validation_nlcd2019'

# Extension shared by the exported files.
file_extension = '.tfrecord.gz'

# Bands used for model training. Every name has the form
# '<year index>_<band><season><year>'; the list is ordered by year, then
# season, then band, which yields 3 * 3 * 7 = 63 names.
_BAND_YEARS = (2018, 2019, 2020)
_BAND_SEASONS = ('MarchApril', 'MayJune', 'JulyAug')
_BAND_NAMES = ('Blue', 'Green', 'Red', 'NIR', 'SWIR1', 'SWIR2', 'NDVI')

BANDS = [
    '{0}_{1}{2}{3}'.format(year_index, band, season, year)
    for year_index, year in enumerate(_BAND_YEARS)
    for season in _BAND_SEASONS
    for band in _BAND_NAMES
]

# Name of the property that holds the class label.
LABEL = 'class'

# Number of label values, i.e. number of classes in the classification.
N_CLASSES = 10

# Property names used in the export of training/testing data and for the
# name-to-data mapping when reading into TensorFlow datasets: all bands
# followed by the label. Built as a new list so BANDS itself is unchanged.
FEATURE_NAMES = BANDS + [LABEL]

Data preparation and pre-processing

Read data from the TFRecord file into a tf.data.Dataset. Pre-process the dataset to get it into a suitable format for input to the model.

Read into a tf.data.Dataset

Here we are going to read a file in Cloud Storage into a tf.data.Dataset. (these TensorFlow docs explain more about reading data into a Dataset). Check that you can read examples from the file. The purpose here is to ensure that we can read from the file without an error. The actual content is not necessarily human readable.

In [5]:

# Navigate to the location of the files used for training models.
filebase = '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/'

# Names of all entries found under `filebase`.
files_list = tf.io.gfile.listdir(filebase)

# Keep only the entries whose name contains the training prefix.
training_files_list = list(filter(lambda name: TRAIN_FILE_PREFIX in name, files_list))

def prepend(list, str):
    """Return a new list in which `str` is put in front of every item of `list`.

    Args:
      list: iterable of items (e.g. file names). Each item is rendered with
        string formatting, so non-string items are converted to text.
      str: the prefix (e.g. a directory path) placed before each item.

    Returns:
      A new list of strings; the input iterable is not modified.
    """
    # The parameter names shadow the `list`/`str` builtins; they are kept so
    # that callers passing them by keyword continue to work.
    # Prefix and item are formatted as two separate fields, so braces inside
    # the prefix are copied literally instead of being parsed as format fields.
    return ['{0}{1}'.format(str, item) for item in list]
  
# Turn every training file name into a full absolute path by putting the
# file base in front of it.
training_files_list = prepend(training_files_list, filebase)

# Preview the first five paths.
print(training_files_list[:5])

# Number of training files
print(len(training_files_list))


['/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Training_nlcd2019_1.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Training_nlcd2019_2.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Training_nlcd2019_3.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Training_nlcd2019_4.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Training_nlcd2019_5.tfrecord.gz']
59


In [6]:
# Create the testing and validation file-path lists from the training list.
#
# Each training path is mapped to its testing/validation counterpart by
# swapping the file-name prefix. The swap is applied to the file name only
# (not to the directory part) and uses the prefix constants, so a directory
# whose name happens to contain the training prefix is left untouched.

test_file_path_list = []
valid_file_path_list = []

for train_path in training_files_list:
  directory, file_name = os.path.split(train_path)
  test_name = file_name.replace(TRAIN_FILE_PREFIX, TEST_FILE_PREFIX, 1)
  valid_name = file_name.replace(TRAIN_FILE_PREFIX, VALID_FILE_PREFIX, 1)
  test_file_path_list.append(os.path.join(directory, test_name))
  valid_file_path_list.append(os.path.join(directory, valid_name))


# Verify the derived file paths look correct (testing paths shown).
print(test_file_path_list[0:5])
print(len(test_file_path_list))
    



['/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Testing_nlcd2019_1.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Testing_nlcd2019_2.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Testing_nlcd2019_3.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Testing_nlcd2019_4.tfrecord.gz', '/panfs/roc/groups/7/moeller/lakex055/LeafySpurgeDemography/landcoverPointSamples/NLCD_2019_points/Testing_nlcd2019_5.tfrecord.gz']
59


# Create a tensorflow record dataset from training file path list
The printed output will be hard to interpret because each record is still in its serialized TFRecord (binary) form.

In [7]:
# TFRecordDataset yields the records of the given files as raw bytes, exactly
# as they were written; it performs no parsing or decoding of its own (that
# happens in the following steps).
# https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset

train_dataset = tf.data.TFRecordDataset(
    filenames=training_files_list,
    compression_type='GZIP',
)

valid_dataset = tf.data.TFRecordDataset(
    filenames=valid_file_path_list,
    compression_type='GZIP',
)

# Print the first encoded record of the training dataset, e.g.
# tf.Tensor(b"\n\xfc\x0f\n \n\x140_BlueMarchApril2018\x12\x08\x12\x06\n\x04\xa1\xb9N=\n!\n\x1
print(next(iter(train_dataset)))

tf.Tensor(b'\n\xfc\x0f\n \n\x140_BlueMarchApril2018\x12\x08\x12\x06\n\x04\xa9jJ=\n!\n\x150_GreenMarchApril2018\x12\x08\x12\x06\n\x04r\xe1x=\n\x1f\n\x130_RedMarchApril2018\x12\x08\x12\x06\n\x04\xd6s\x82=\n\x1f\n\x130_NIRMarchApril2018\x12\x08\x12\x06\n\x04\xab\xb2/>\n!\n\x150_SWIR1MarchApril2018\x12\x08\x12\x06\n\x04\xfd\xbc\xf9=\n!\n\x150_SWIR2MarchApril2018\x12\x08\x12\x06\n\x04\x03[\xa9=\n \n\x140_NDVIMarchApril2018\x12\x08\x12\x06\n\x04\xb8W\xf4>\n\x1d\n\x110_BlueMayJune2018\x12\x08\x12\x06\n\x04\x80\x9f\xe1<\n\x1e\n\x120_GreenMayJune2018\x12\x08\x12\x06\n\x04\xd6\x8bA=\n\x1c\n\x100_RedMayJune2018\x12\x08\x12\x06\n\x04\x9d\x11==\n\x1c\n\x100_NIRMayJune2018\x12\x08\x12\x06\n\x04a\x89C>\n\x1e\n\x120_SWIR1MayJune2018\x12\x08\x12\x06\n\x04\x81\xcf\x03>\n\x1e\n\x120_SWIR2MayJune2018\x12\x08\x12\x06\n\x04\x08U\x9a=\n\x1d\n\x110_NDVIMayJune2018\x12\x08\x12\x06\n\x04"\xe8\x1a?\n\x1d\n\x110_BlueJulyAug2018\x12\x08\x12\x06\n\x04,\x0e\x0f=\n\x1e\n\x120_GreenJulyAug2018\x12\x08\x12\x06\n\x04a\x

Define the structure of your data

For parsing the exported TFRecord files, `features_dict` is a mapping between feature names (recall that `FEATURE_NAMES` contains the band and label names) and float32 tf.io.FixedLenFeature objects. This mapping is necessary for telling TensorFlow how to read data in a TFRecord file into tensors. Specifically, all numeric data exported from Earth Engine is exported as float32.

(Note: features in the TensorFlow context (i.e. tf.train.Feature) are not to be confused with Earth Engine features (i.e. ee.Feature), where the former is a protocol message type for serialized data input to the model and the latter is a geometry-based geographic data structure.)

In [8]:
# One fixed-length float32 feature spec (shape [1]) per entry of FEATURE_NAMES.
columns = [
  tf.io.FixedLenFeature(shape=[1], dtype=tf.float32)
  for _name in FEATURE_NAMES
]

# Pair each feature name with its spec: names are the keys, specs the values.
features_dict = {name: spec for name, spec in zip(FEATURE_NAMES, columns)}

pprint(features_dict)

{'0_BlueJulyAug2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_BlueMarchApril2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_BlueMayJune2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_GreenJulyAug2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_GreenMarchApril2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_GreenMayJune2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_NDVIJulyAug2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_NDVIMarchApril2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_NDVIMayJune2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_NIRJulyAug2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_NIRMarchApril2018': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 '0_NIRMayJune2018'

Parse the dataset

Now we need to make a parsing function for the data in the TFRecord files. The data comes in flattened 2D arrays per record and we want to use the first part of the array for input to the model and the last element of the array as the class label. The parsing function reads data from a serialized Example proto into a dictionary in which the keys are the feature names and the values are the tensors storing the value of the features for that example. (These TensorFlow docs explain more about reading Example protos from TFRecord files).

Note that each record of the parsed dataset contains a tuple. The first element of the tuple is a dictionary with bands for keys and the numeric value of the bands for values. The second element of the tuple is a class label.

In [13]:
def parse_tfrecord(example_proto):
  """Decode one serialized Example into predictors and a label.

  The record is read into the structure defined by the module-level
  `features_dict`; the entry stored under `LABEL` is then removed from the
  parsed dictionary and returned separately.

  Args:
    example_proto: a serialized Example.

  Returns:
    A `(predictors, label)` tuple: the dictionary of parsed features without
    the label entry, and the label cast to `int32`.
  """
  record = tf.io.parse_single_example(example_proto, features_dict)
  label = tf.cast(record.pop(LABEL), tf.int32)
  return record, label

# Apply the parsing function to every record of both datasets, running up to
# five calls in parallel.
parsed_training_dataset = train_dataset.map(
    map_func=parse_tfrecord, num_parallel_calls=5)

parsed_validation_dataset = valid_dataset.map(
    map_func=parse_tfrecord, num_parallel_calls=5)

# Show the first parsed validation record as a sanity check.
pprint(next(iter(parsed_validation_dataset)))

({'0_BlueJulyAug2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.05977875], dtype=float32)>,
  '0_BlueMarchApril2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.07429875], dtype=float32)>,
  '0_BlueMayJune2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.05517938], dtype=float32)>,
  '0_GreenJulyAug2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.08705188], dtype=float32)>,
  '0_GreenMarchApril2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.08727188], dtype=float32)>,
  '0_GreenMayJune2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.08237687], dtype=float32)>,
  '0_NDVIJulyAug2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.50607824], dtype=float32)>,
  '0_NDVIMarchApril2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.32166594], dtype=float32)>,
  '0_NDVIMayJune2018': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.52795774], dtype=float32)>,
  '0_NIRJulyAug2018': <tf.Tensor: shape=(1,

<ParallelMapDataset element_spec=({'0_BlueJulyAug2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_BlueMarchApril2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_BlueMayJune2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_GreenJulyAug2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_GreenMarchApril2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_GreenMayJune2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_NDVIJulyAug2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_NDVIMarchApril2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_NDVIMayJune2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_NIRJulyAug2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_NIRMarchApril2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_NIRMayJune2018': TensorSpec(shape=(1,), dtype=tf.float32, name=None), '0_RedJulyAug2018': TensorSpec(shape=(1,), dtype=tf.float32, name

Create the Keras model

Before we create the model, there's still a wee bit of pre-processing to get the data into the right input shape and a format that can be used with cross-entropy loss. Specifically, Keras expects a list of inputs and a one-hot vector for the class. (See the Keras loss function docs, the TensorFlow categorical identity docs and the tf.one_hot docs for details).

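Here we will use a 1D convolutional neural network: three dilated `Conv1D` layers (32, 64, and 128 filters), each followed by a ReLU activation and a dropout layer, then a flatten layer, a 512-node dense layer with dropout, and a softmax output layer. Once the dataset has been prepared, define the model, compile it, and fit it to the training data. See the Keras Sequential model guide for more details.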

In [21]:
from tensorflow import keras

# Keras requires inputs as a tuple, with the inputs in the right shape.
# To use the categorical_crossentropy loss, the label also has to be turned
# into a one-hot vector.
def to_tuple(inputs, label):
  """Convert a parsed record into a `(features, one-hot label)` tuple.

  Args:
    inputs: dictionary of predictor tensors, as returned by parse_tfrecord.
    label: integer class label tensor.

  Returns:
    A tuple of the transposed stack of all values in `inputs` and the
    one-hot encoding of `label` with depth `N_CLASSES`.
  """
  stacked_values = list(inputs.values())
  features = tf.transpose(stacked_values)
  one_hot_label = tf.one_hot(indices=label, depth=N_CLASSES)
  return (features, one_hot_label)

# Map the to_tuple function over each parsed dataset and group the result
# into batches of 8; drop_remainder=True discards a final batch that has
# fewer than 8 elements. (No shuffling is applied here.)
input_training_dataset = (
    parsed_training_dataset
    .map(to_tuple)
    .batch(8, drop_remainder=True)
)

input_validation_dataset = (
    parsed_validation_dataset
    .map(to_tuple)
    .batch(8, drop_remainder=True)
)

print(input_training_dataset, type(input_training_dataset))

# NOTE(review): this iterates over the whole training dataset and keeps every
# batch in memory before converting to a NumPy array.
train_np = np.array([batch for batch in input_training_dataset])


ImportError: cannot import name 'tfds' from 'tensorflow' (/home/moeller/lakex055/.local/lib/python3.8/site-packages/tensorflow/__init__.py)

In [12]:
from tensorflow import keras

# Keras requires inputs as a tuple, with the inputs in the right shape.
# To use the categorical_crossentropy loss, the label also has to be turned
# into a one-hot vector.
def to_tuple(inputs, label):
  """Convert a parsed record into a `(features, one-hot label)` tuple.

  Args:
    inputs: dictionary of predictor tensors, as returned by parse_tfrecord.
    label: integer class label tensor.

  Returns:
    A tuple of the transposed stack of all values in `inputs` and the
    one-hot encoding of `label` with depth `N_CLASSES`.
  """
  stacked_values = list(inputs.values())
  features = tf.transpose(stacked_values)
  one_hot_label = tf.one_hot(indices=label, depth=N_CLASSES)
  return (features, one_hot_label)

# Map the to_tuple function, align the label shape with the model output, and
# batch. (No shuffling is applied here.)
#
# Each parsed label has shape (1,), so to_tuple yields one-hot labels shaped
# (1, N_CLASSES) and batches of labels shaped (batch, 1, N_CLASSES). The model
# defined below ends in Flatten + Dense and therefore outputs
# (batch, nbclasses). Left as-is, the loss and the accuracy metric would
# broadcast the two shapes against each other and compare every prediction
# with every label in the batch. Dropping the length-1 label axis makes
# labels and predictions line up one-to-one.
def _drop_label_axis(features, one_hot_label):
  """Return the pair with the leading length-1 axis of the label removed."""
  return features, tf.squeeze(one_hot_label, axis=0)

input_training_dataset = (
    parsed_training_dataset
    .map(to_tuple)
    .map(_drop_label_axis)
    .batch(8)
)

input_validation_dataset = (
    parsed_validation_dataset
    .map(to_tuple)
    .map(_drop_label_axis)
    .batch(8)
)



#-- parameters of the architecture
l2_rate = 1.e-6        # L2 penalty applied to the kernel of every layer
dropout_rate = 0.10    # dropout applied after every activation
nbclasses = N_CLASSES  # size of the softmax output; kept in sync with the labels


def _conv_block(filters, dilation_rate):
  """Return the layers of one block: causal Conv1D -> ReLU -> Dropout."""
  return [
    tf.keras.layers.Conv1D(filters = filters, kernel_size = 3, strides = 1, padding = "causal", dilation_rate = dilation_rate, kernel_regularizer = l2(l2_rate), kernel_initializer = "he_normal"),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(dropout_rate),
  ]


# Define the layers in the model: three convolutional blocks with growing
# filter count and dilation rate, followed by a dense classification head.
# NOTE(review): to_tuple transposes a stack of shape-(1,) tensors, so each
# example reaches the first Conv1D as a single step with all bands as
# channels -- confirm that this is the intended input layout for the dilated
# causal convolutions.
model = tf.keras.models.Sequential(
  _conv_block(filters = 32, dilation_rate = 1)
  + _conv_block(filters = 64, dilation_rate = 2)
  + _conv_block(filters = 128, dilation_rate = 4)
  + [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units = 512, kernel_initializer="he_normal", kernel_regularizer=l2(l2_rate)),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dropout(dropout_rate),
    tf.keras.layers.Dense(nbclasses, activation = "softmax", kernel_initializer="he_normal", kernel_regularizer=l2(l2_rate)),
  ]
)


# Define Class Weights
# from sklearn.utils import class_weight
# class_weights = {0: 0.0001,
#                  1: 8.156186612576064,
#                  2: 3.5251315020455873,
#                  3: 30.983732876712327,
#                  4: 0.07975888744407467,
#                  5: 0.05826879417778994,
#                  6: 0.0388306490552271,
#                  7: 0.0370967576599387,
#                  8: 4.064353099730458,
#                  9: 13.428200371057514}


###
# Define Model Variables
###

# Model variables
n_epochs = 100
batch_size = 256
lr = 0.0001 #recommended in Allred et al., 2021
beta_1 = 0.9 #not used, but can be used to modify optimizer LR
beta_2 = 0.999
decay = 0.0

#Model Optimizer
# Only the learning rate is passed; beta_1, beta_2 and decay above are not
# forwarded to the optimizer. To use them:
#opt = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=beta_1, beta_2=beta_2, decay=decay)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=lr)

# Compile the model with the optimizer defined above, a mean-squared-error
# loss, and categorical accuracy as the reported metric.
model.compile(
    optimizer=opt,
    loss="mean_squared_error",
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

#model.summary()

# Model callbacks
#checkpoint = ModelCheckpoint(out_model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Early-stopping callback that monitors the validation loss with zero patience.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0,
    patience=0,
    verbose=1,
)


#Plot Loss and Accuracy Callback
class PlotLearning(keras.callbacks.Callback):
    """Keras callback that redraws loss and accuracy curves after every epoch.

    The history is kept on the instance: `x` (epoch indices), `losses`,
    `val_losses`, `f1` (training accuracy), `val_f1` (validation accuracy),
    `logs` (the raw log dictionary of every epoch) and `fig` (the most
    recently drawn figure, or None before the first epoch).
    """

    # Log keys tried, in order, for the accuracy curves. A model compiled with
    # tf.keras.metrics.CategoricalAccuracy() reports 'categorical_accuracy',
    # not 'accuracy', so both spellings are accepted.
    _ACC_KEYS = ('accuracy', 'categorical_accuracy')
    _VAL_ACC_KEYS = ('val_accuracy', 'val_categorical_accuracy')

    @staticmethod
    def _first_logged(logs, keys):
        """Return the value of the first of `keys` present in `logs`, else None."""
        for key in keys:
            if key in logs:
                return logs[key]
        return None

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of training."""
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.f1 = []
        self.val_f1 = []

        # No figure is opened here: one is drawn per epoch in on_epoch_end.
        self.fig = None

        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Record the metrics of the finished epoch and redraw both panels.

        Args:
          epoch: index of the epoch that just ended (unused; the callback
            keeps its own counter in `self.i`).
          logs: dictionary of metric values for the epoch; may be None.
        """
        logs = logs or {}

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.f1.append(self._first_logged(logs, self._ACC_KEYS))
        self.val_f1.append(self._first_logged(logs, self._VAL_ACC_KEYS))
        self.i += 1
        f, (ax1, ax2) = plt.subplots(1, 2, sharex=True)

        # Replace the previous epoch's output instead of stacking figures.
        clear_output(wait=True)

        ax1.set_yscale('log')
        ax1.plot(self.x, self.losses, label="loss")
        ax1.plot(self.x, self.val_losses, label="val loss")
        ax1.legend()

        ax2.plot(self.x, self.f1, label="Acc")
        ax2.plot(self.x, self.val_f1, label="val Acc ")
        ax2.legend()

        plt.show()

        # Close the figure once shown so figures do not accumulate in memory
        # over the epochs.
        plt.close(f)
        self.fig = f
        
plot_losses = PlotLearning()

# Callbacks passed to fit().
callback_list = [plot_losses]


start_train_time = time.time()

# Both inputs are already-batched tf.data datasets, so `batch_size` and
# `shuffle` are not passed to fit(): for dataset inputs Keras takes the batch
# size from the dataset itself and ignores `shuffle`. Batching (and any
# shuffling) has to be configured on the datasets.
hist = model.fit(x = input_training_dataset,
                 validation_data = input_validation_dataset,
                 epochs = n_epochs,
                 verbose=1,
                 callbacks = callback_list)

# Wall-clock training time in seconds, rounded to two decimals.
train_time = round(time.time()-start_train_time, 2)



KeyboardInterrupt: 