In [1]:
import azureml.core
from azureml.core import Experiment, Workspace, Dataset, Datastore, ScriptRunConfig
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies
import os
import shutil
import glob
from os.path import join
import tensorflow as tf
from sys import path
import numpy as np

# Report which Azure ML SDK version this kernel is running.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

In [2]:
# Connect to the workspace described by the config.json file in the current folder.
ws = Workspace.from_config()

# Summarize the workspace: name, location and resource group, tab-separated.
workspace_summary = (ws.name, ws.location, ws.resource_group)
print(*workspace_summary, sep='\t')

# Show the datastores registered with the workspace (rendered as the cell output).
ws.datastores

wetlands	eastus	cic_ai


{'wetlanddatastore': {
   "name": "wetlanddatastore",
   "container_name": "data",
   "account_name": "wetlands2489098639",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspacefilestore': {
   "name": "workspacefilestore",
   "container_name": "azureml-filestore-2640f222-8801-40ec-b413-83a7ec003a55",
   "account_name": "wetlands2489098639",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspaceblobstore': {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-2640f222-8801-40ec-b413-83a7ec003a55",
   "account_name": "wetlands2489098639",
   "protocol": "https",
   "endpoint": "core.windows.net"
 }}

In [3]:
# Fetch the registered datastore that holds the image data for this workspace.
datastore = Datastore.get(workspace = ws, datastore_name = 'wetlanddatastore')

# (datastore, relative path) pairs for the training, evaluation and prediction data.
train_path = (datastore, 'data/training/')
eval_path = (datastore, 'data/eval/')
test_path = (datastore, 'data/predict/test_aoi_David1_DE')

# Build one file dataset per location; each is created from a single-path list.
train_dataset, eval_dataset, test_dataset = (
    Dataset.File.from_files(path = [data_path])
    for data_path in (train_path, eval_path, test_path)
)

# when we combine datasets the selected directories and relative paths to the datastore are brought in
# mount folder
# |-cddatafilestore
# | |-GEE
# | | |-training
# | | |-eval
# | |-Onera
# | | |-training
# | | |-eval
# train_dataset = Dataset.File.from_files(path = [gee_train_path, onera_train_path])
# eval_dataset = Dataset.File.from_files(path = [gee_eval_path, onera_eval_path])



In [5]:
# Look up the registered training environment by name.
# The original `envs.get('wetland-training')` returned None when the name was
# missing, and that None was then handed to ScriptRunConfig as `environment`.
# Fail here with a clear message instead.
ENV_NAME = 'wetland-training'
envs = Environment.list(workspace = ws)
if ENV_NAME not in envs:
    raise KeyError(f"Environment '{ENV_NAME}' is not registered in workspace '{ws.name}'")
env = envs[ENV_NAME]

In [4]:
# Experiment that the training runs submitted from this notebook are attached to.
experiment_name = 'wetland-unet'
exp = Experiment(ws, experiment_name)

In [6]:
# define the compute targets
# Read the workspace's compute-target mapping once and reuse it. The original
# evaluated `ws.compute_targets` four times, the first as a bare mid-cell
# expression whose value was discarded.
available_targets = ws.compute_targets
wetlandGPU = available_targets['wetlandsGPU']
wetlandDSVM = available_targets['wetland-dsvm']
wetlandDSVM2 = available_targets['wetland-dsvm2']

## Project-specific code

In [7]:
# make sure we have the most current version of github repo
%cd Satellite_ComputerVision
!git pull
%cd ..

/mnt/batch/tasks/shared/LS_root/mounts/clusters/wetlandsbasiccpu/code/Users/mevans/Satellite_ComputerVision
Already up to date.
/mnt/batch/tasks/shared/LS_root/mounts/clusters/wetlandsbasiccpu/code/Users/mevans


In [8]:
# attach our utilities folder to the path to import modules
# Guarded so that re-running this cell does not keep appending duplicate entries.
UTILS_DIR = '/home/azureuser/cloudfiles/code/Users/mevans/Satellite_ComputerVision/utils'
if UTILS_DIR not in path:
    path.append(UTILS_DIR)

In [9]:
from model_tools import get_model, make_confusion_matrix
from processing import get_training_dataset

In [10]:
# Define some global variables

# Surface layers
lidar = ['lidar_intensity']
geomorphon = ["geomorphons"]

# Sentinel bands used as model inputs
opticalBands = ['B3', 'B4', 'B5', 'B6']
thermalBands = ['B8', 'B11', 'B12']
senBands = opticalBands + thermalBands

# One [summer, fall, spring] triple of band names per Sentinel band
seasonalBands = [
    [f'{band}_{season}' for season in ('summer', 'fall', 'spring')]
    for band in senBands
]

# NAIP bands
naipBands = ['R', 'G', 'B', 'N']

# Flatten the seasonal triples, then append the NAIP bands
BANDS = [name for triple in seasonalBands for name in triple] + naipBands
MORPHS = lidar + geomorphon
RESPONSE = 'wetland'
FEATURES = BANDS + MORPHS + [RESPONSE]
print(FEATURES)

# Size and shape of the patches expected by the model
KERNEL_SIZE = 256
KERNEL_SHAPE = [KERNEL_SIZE, KERNEL_SIZE]

# One fixed-length float32 feature spec per feature name, keyed by that name
COLUMNS = [
    tf.io.FixedLenFeature(shape=KERNEL_SHAPE, dtype=tf.float32) for _ in FEATURES
]
FEATURES_DICT = {name: spec for name, spec in zip(FEATURES, COLUMNS)}

['B3_summer', 'B3_fall', 'B3_spring', 'B4_summer', 'B4_fall', 'B4_spring', 'B5_summer', 'B5_fall', 'B5_spring', 'B6_summer', 'B6_fall', 'B6_spring', 'B8_summer', 'B8_fall', 'B8_spring', 'B11_summer', 'B11_fall', 'B11_spring', 'B12_summer', 'B12_fall', 'B12_spring', 'R', 'G', 'B', 'N', 'lidar_intensity', 'geomorphons', 'wetland']


### Calculate Training Data Stats (optional)

In [14]:
# Create the mount context for the training dataset and start it so the files are accessible.
train_mount = train_dataset.mount()
train_mount.start()

# Folder under which the mounted files appear
dataset_mount_folder = train_mount.mount_point
print(dataset_mount_folder)

# Walk the mount and keep every second file encountered (the 2nd, 4th, 6th, ...).
mounted_files = [
    join(folder, filename)
    for folder, _subdirs, filenames in os.walk(dataset_mount_folder)
    for filename in filenames
]
train_files = mounted_files[1::2]

/tmp/tmps0ppyrbs


In [15]:
# Number of files kept by the every-second-file sampling in the previous cell.
len(train_files)

2037

In [18]:

# Read in the training data so its size and class totals can be calculated.
# Reading data can be memory intensive, and class weights do not need all the
# predictor variables, so only the NAIP bands are requested (not BANDS + MORPHS).
# The one_hot = {'geomorphons': 11} option is not passed here.
training_kwargs = dict(
    files = train_files,
    ftDict = FEATURES_DICT,
    features = naipBands,
    response = RESPONSE,
    buff = 1,
    batch = 1,
    repeat = False,
    splits = None,
)
training = get_training_dataset(**training_kwargs)

In [19]:
# confirm the dataset looks like we expect
# Use the built-in next() rather than the iterator's Python-2-style .next() method,
# so this works with any iterator and does not depend on a compatibility shim.
iterator = iter(training)

print(next(iterator))

Downloaded path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/data/training/UNET_256wetlandDE100.tfrecord.gz is different from target path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/UNET_256wetlandDE100.tfrecord.gz
(<tf.Tensor: shape=(1, 256, 256, 4), dtype=float32, numpy=
array([[[[-6.4164937e-01, -2.4784486e-01, -8.0650425e-01,
           1.6959982e+00],
         [-7.0066410e-01, -3.1015149e-01, -6.9916737e-01,
           1.7099828e+00],
         [-7.4166703e-01, -2.8572398e-01, -6.7805219e-01,
           1.7054435e+00],
         ...,
         [-4.8748913e-01, -3.3851597e-01, -8.7280816e-01,
           1.6988133e+00],
         [-5.8821177e-01, -5.1748689e-02, -9.9275595e-01,
           1.6327169e+00],
         [-5.2300161e-01, -1.3393548e-01, -9.9303609e-01,
           1.6499727e+00]],

        [[-6.4283484e-01, -1.9179049e-01, -8.4793019e-01,
           1.6825558e+00],
         [-6.8737984e-01, -2.7924204e-01, -7.3782641e-01,
           1.7044485e+00],
         

In [20]:
# Calculate some summary statistics used in model training.
# Build a model whose input depth matches the NAIP-only features read above, then
# pass it with the training dataset to make_confusion_matrix.
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.999)
m = get_model(
    depth = len(naipBands),
    optim = optimizer,
    loss = 'mse',
    mets = [tf.keras.metrics.categorical_accuracy],
    bias = None,
)
train_con_mat = make_confusion_matrix(training, m)

Downloaded path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/data/training/UNET_256wetlandDE1025.tfrecord.gz is different from target path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/UNET_256wetlandDE1025.tfrecord.gz
Downloaded path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/data/training/UNET_256wetlandDE1035.tfrecord.gz is different from target path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/UNET_256wetlandDE1035.tfrecord.gz
Downloaded path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/data/training/UNET_256wetlandDE1045.tfrecord.gz is different from target path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/UNET_256wetlandDE1045.tfrecord.gz
Downloaded path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/data/training/UNET_256wetlandDE1050.tfrecord.gz is different from target path: /tmp/tmphbr4c_s9/b070b329-680b-4405-893d-c0444c77b74b/UNET_256wetlandDE1050.tfrecord.gz
Downloaded path: /tmp/tmphbr4c_s9/b070b329-680b-

In [21]:
# Derive class-balance statistics from the confusion matrix.
# NOTE(review): assumes each row of train_con_mat totals one class (0 = non-wetland,
# 1 = wetland) - confirm against make_confusion_matrix.
classums = train_con_mat.sum(axis = 1)
BIAS = np.log(classums[1]/classums[0])  # log of the class-1 to class-0 ratio
WEIGHT = classums[0]/classums[1]        # class-0 to class-1 ratio
# Total count divided by the number of pixels in one patch. Uses the shared
# KERNEL_SIZE constant instead of a hard-coded 256*256 so it stays in sync with
# the patch size defined with the other global variables.
TRAIN_SIZE = train_con_mat.sum()//(KERNEL_SIZE*KERNEL_SIZE)

print('size = ', TRAIN_SIZE)
print(f'bias = {BIAS}')
print(f'weight = {WEIGHT}')

size =  7207
bias = -0.7054408116732834
weight = 2.024739016826708


In [47]:
# Scratch calculation: the mean of four hand-entered values.
# NOTE(review): presumably these are bias estimates from four separate data
# subsets, with the commented-out lines holding the matching sizes and weights - confirm.
# 3507+3459+3507+3752
(0.0957+ -1.4 + -0.42 + -1.3) /4
# (0.91  + 4.02 + 1.5 + 3.7)/4

-0.756075

In [22]:
# Stop the training-dataset mount that was started for the statistics section.
train_mount.stop()

### Define Training Parameters

In [11]:
# create script run config
# use the azure folder as our script folder
source = 'Satellite_ComputerVision'  # folder of the local GitHub clone (pulled earlier in the notebook)
util_folder = 'utils'  # name of the utilities folder
script_folder = 'azure'  # passed to ScriptRunConfig as source_directory
script_file = 'train_wetland_{}.py'  # '{}' is filled with the feature-set name via .format(features)

# copy the training script from github repo to local folder
# shutil.copy(src = os.path.join(source, script_folder, script_file),
# dst = os.path.join(script_folder, script_file))

# Initially we need to copy the directories
# shutil.copytree(src = '/home/azureuser/cloudfiles/code/Users/mevans/Satellite_ComputerVision/azure', dst = '/home/azureuser/cloudfiles/code/Users/mevans/azure')
# shutil.copytree(src = '/home/azureuser/cloudfiles/code/Users/mevans/Satellite_ComputerVision/utils', dst = '/home/azureuser/cloudfiles/code/Users/mevans/azure/utils')



In [31]:
print(FEATURES)
# Named feature sets, derived from the global band lists rather than retyped by
# hand, so they cannot drift from BANDS / MORPHS / FEATURES. The resulting lists
# are element-for-element the same as the previous hard-coded versions.
basic = list(BANDS)                   # Sentinel seasonal bands + NAIP bands
wlidar = BANDS + lidar                # basic + lidar intensity
wgeomorphon = BANDS + MORPHS          # basic + lidar intensity + geomorphons
full = BANDS + MORPHS + [RESPONSE]    # every feature, including the response
# Name of the feature set to train with; used to pick the training script and tag the run.
features = 'basic'

['B3_summer', 'B3_fall', 'B3_spring', 'B4_summer', 'B4_fall', 'B4_spring', 'B5_summer', 'B5_fall', 'B5_spring', 'B6_summer', 'B6_fall', 'B6_spring', 'B8_summer', 'B8_fall', 'B8_spring', 'B11_summer', 'B11_fall', 'B11_spring', 'B12_summer', 'B12_fall', 'B12_spring', 'R', 'G', 'B', 'N', 'lidar_intensity', 'geomorphons', 'wetland']


In [13]:
# define the command line arguments to our training script
# Written as (flag, value) pairs and flattened, in order, into the `args` list.
arg_pairs = [
    ('--train_data', train_dataset.as_mount()),
    ('--eval_data', eval_dataset.as_mount()),
    ('--test_data', test_dataset.as_mount()),
    ('--weight', 2),
    ('--bias', -0.7),
    ('-lr', 0.001),
    ('--epochs', 100),
    ('--batch', 16),
    ('--size', 7207),  # 14225
    ('--kernel_size', 256),
    ('--response', 'wetland'),
]
args = [token for pair in arg_pairs for token in pair]
# Not passed at present:
#   '--bands', "B3_summer " "B3_fall " "B3_spring " "B4_summer " "B4_fall " "B4_spring " "B5_summer " "B5_fall " "B5_spring " "B6_summer " "B6_fall " "B6_spring " "B8_summer " "B8_fall " "B8_spring " "B11_summer " "B11_fall " "B11_spring " "B12_summer " "B12_fall " "B12_spring " "R " "G " "B " "N " "lidar_intensity " "geomorphons"
#   '--splits', None

# TODO: figure out how to pass lists as command line arguments



In [28]:
#  run the training job
# NOTE(review): `compute_target` is not assigned in any visible cell, so this cell
# raised NameError on a fresh kernel. It is bound here to wetlandGPU, the target
# used by the other 'basic' run - confirm this is the intended target.
compute_target = wetlandGPU
src = ScriptRunConfig(source_directory=script_folder,
                      script=script_file.format(features),
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)
# Tags record how this run was configured so runs can be told apart later.
run = exp.submit(config=src, tags={'splits': 'None', 'one_hot': 'geomorphons', 'model': 'Unet', 'normalization': 'pixel', 'features': features})
run

Experiment,Id,Type,Status,Details Page,Docs Page
wetland-unet,wetland-unet_1623790008_941cae1d,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [29]:
from azureml.widgets import RunDetails

# Show the progress widget for the submitted run.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [37]:
# Submit a training job for the 'basic' feature set on the wetlandGPU compute target.
features = 'basic'
basic_tags = {
    'splits': 'None',
    'one_hot': 'geomorphons',
    'model': 'Unet',
    'normalization': 'pixel',
    'features': features,
}
src_basic = ScriptRunConfig(
    source_directory=script_folder,
    script=script_file.format(features),
    arguments=args,
    compute_target=wetlandGPU,
    environment=env,
)
run_basic = exp.submit(config=src_basic, tags=basic_tags)
run_basic

Experiment,Id,Type,Status,Details Page,Docs Page
wetland-unet,wetland-unet_1623790695_d58ff016,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [None]:
# Show the progress widget for the 'basic' run.
basic_widget = RunDetails(run_basic)
basic_widget.show()

In [39]:
# Submit a training job for the 'wlidar' feature set on the wetlandDSVM compute target.
features = 'wlidar'
lidar_script = script_file.format(features)
src_lidar = ScriptRunConfig(source_directory=script_folder,
                            script=lidar_script,
                            arguments=args,
                            compute_target=wetlandDSVM,
                            environment=env)
lidar_tags = dict(splits='None', one_hot='geomorphons', model='Unet', normalization='pixel', features=features)
run_lidar = exp.submit(config=src_lidar, tags=lidar_tags)
run_lidar

Experiment,Id,Type,Status,Details Page,Docs Page
wetland-unet,wetland-unet_1623804375_e4b4909d,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [14]:
# Submit a training job for the 'wgeomorphon' feature set on the wetlandGPU compute target.
features = 'wgeomorphon'
geomorphon_config = dict(
    source_directory = script_folder,
    script = script_file.format(features),
    arguments = args,
    compute_target = wetlandGPU,
    environment = env,
)
src_geomorphon = ScriptRunConfig(**geomorphon_config)

geomorphon_tags = {'splits': 'None', 'one_hot': 'geomorphons', 'model': 'Unet', 'normalization': 'pixel'}
geomorphon_tags['features'] = features
# The variable name keeps its existing spelling because the widget cell refers to it.
run_geomporphon = exp.submit(config = src_geomorphon, tags = geomorphon_tags)

In [15]:
from azureml.widgets import RunDetails

# Show the progress widget for the geomorphon run.
geomorphon_widget = RunDetails(run_geomporphon)
geomorphon_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…