In [None]:
# Shell escape: print the directory the notebook kernel is running in.
!pwd

In [2]:
# Check core SDK version number
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Environment, Experiment, Model
from azureml.core.run import Run
from azureml.core.runconfig import RunConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute, RemoteCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.train.estimator import Estimator
from azureml.core.conda_dependencies import CondaDependencies
from azureml.data.datapath import DataPath

#Other stuff
import os
import glob
import shutil
from pathlib import Path
print("SDK version:", azureml.core.VERSION)

SDK version: 1.6.0


In [3]:
from azureml.core.workspace import Workspace

# Build the workspace handle (`ws`) that the later cells pass to every
# Datastore / Dataset / ComputeTarget / Experiment call.
ws = Workspace.from_config()

## IMPORTANT REMINDER about versions 
**The version of Detectron2 that runs on the AML cluster is behind the one that runs on the VM (Spearhead3).
Model weights are probably not transferable between the cluster and Spearhead3.**  I can't update the cluster version because _the cluster machines use CUDA 10.1._  I may be able to downgrade Spearhead3's CUDA version in the future, but it's still useful for other development (tiling, etc.).

**Differences between versions of Detectron2:**  
- This version uses TransformGen, which has been renamed to Augmentation in the newer version on Spearhead3.

## Data Preparation Steps

### Copy data from AWS to the Azure VM (if necessary)
See `trident_tz_on_vm.ipynb` for details.  

### Copy data from the VM to the workspace blobstore (if necessary)
See `trident_tz_on_vm.ipynb` for details.  Remember that you only need to copy tiled images and annotations; the originals don't matter for the model.

### Remake a Dataset (if necessary)

In [17]:
# REMAKE A DATASET
# Look up an existing datastore in the workspace by name.
datastore = Datastore.get(ws, 'workspaceblobstore')

# A "FileDataset" is really just a name for a blob; nothing happens until it
# is mounted, and once mounted you need other tools to get at its paths.
# Point the dataset at the `temp` directory (which has subdirectories).
datastore_paths = [(datastore, 'temp')]

# Define the FileDataset and register it with the workspace in one chain.
image_ds = Dataset.File.from_files(path=datastore_paths).register(
    workspace=ws,
    name='image_ds',
    description='tanzania main folder temp',
    create_new_version=True,
)

## Prepare for run

### Check workspace and datasets

In [None]:
# Summarise which workspace this notebook is connected to, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

In [None]:
# List all datastores registered in the current workspace.
# The loop variables are deliberately NOT named `datastore`: that name holds
# the blob datastore handle used by other cells, and reusing it here would
# silently replace it with whichever datastore happens to be iterated last.
datastores = ws.datastores
for ds_name, ds in datastores.items():
    print(ds_name, ds, ds.datastore_type)

### Check the image data
1. You can see what files are in the folder using Microsoft Storage Explorer
2. The following commands are slow to execute, but the number of filenames should match the size of the temp directory (in Storage Explorer, right-click on the folder and then "Selection Statistics").

In [13]:
# Warning -- these are SLOW to execute
# fs = image_ds.to_path()  # Get all filenames

# Count filenames. Only the LAST expression of a cell is displayed, so the
# count has to be printed explicitly or it is computed and thrown away.
print('Number of files:', len(image_ds.to_path()))  # 15602 when last checked

# Take a sample of 5 (displayed as the cell output)
image_ds.take(5).to_path()

['/annotations/TA25-RKE-20191128A/TA25-RKE-20191128A_L_4766.xml',
 '/annotations/TA25-RKE-20191128A/TA25-RKE-20191128A_L_4767.xml',
 '/annotations/TA25-RKE-20191128A/TA25-RKE-20191128A_L_4768.xml',
 '/annotations/TA25-RKE-20191128A/TA25-RKE-20191128A_L_4769.xml',
 '/annotations/TA25-RKE-20191128A/TA25-RKE-20191128A_L_4770.xml']

## Deploy a cluster
Note: this will check to see if a cluster is already running before creating one.  If it finds a cluster of the same name, it will use it instead of creating a new cluster.

Also, if you go to the Compute tab in Studio, you can edit the properties such as the minimum nodes (if 0, the cluster will power down when finished; if 1 it will stay alive).

In [None]:
#Get a list of supported VM sizes in your region (you'll need the name for the cluster)
# NOTE(review): the region is hard-coded to 'westus2' -- confirm it matches ws.location.
AmlCompute.supported_vmsizes(ws, location='westus2')

In [110]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the cluster to reuse, or to create if it does not exist yet.
cluster_name = "gpu-cluster"

try:
    # Look the cluster up by name; a ComputeTargetException sends us to the
    # creation branch below.
    mycompute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    # Other sizes tried previously: 'Standard_NC24r', 'STANDARD_NC12S_V2',
    # 'STANDARD_NC6' (also consider NC24r_Promo).
    provisioning_kwargs = dict(
        vm_size='Standard_NC24rs_v2',
        min_nodes=0,  # SET TO 0 FOR IT TO AUTO-STOP, OR TO 1 to keep alive
        max_nodes=1,  # max_nodes is the number of VMs you are creating
    )
    compute_config = AmlCompute.provisioning_configuration(**provisioning_kwargs)

    # Create the cluster, then block until provisioning has finished.
    # WAIT for it to say "Minimum number of nodes requested have been provisioned"
    mycompute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    mycompute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# For a detailed status of the current cluster:
# print(mycompute_target.get_status().serialize())

Creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Attach VM instead of cluster
**REMINDER -- Spearhead3 uses CUDA v10.2 whereas the containerized code uses CUDA v10.1, so Spearhead3 should not be used for model training because the weights may not be transferable.**

In [31]:
from azureml.core.compute import RemoteCompute, ComputeTarget

# Name under which the VM will be attached to the workspace.
compute_target_name = "spearhead3"

# Azure resource ID of the VM. Replace every <...> placeholder with your own
# values; the placeholders are kept as strings so the cell still parses.
resource_id = ("/subscriptions/<subscription_id>"
               "/resourceGroups/<resource_group>"
               "/providers/Microsoft.Compute/virtualMachines/<vm_name>")

# SSH-key based attach configuration (fill in the user name and key path).
attach_config = RemoteCompute.attach_configuration(
    resource_id=resource_id,
    ssh_port=22,
    username="<user>",
    password=None,
    private_key_file="<path/to/.ssh/azure_rsa>")
    #private_key_passphrase="<passphrase>")

# For password access instead of ssh
# attach_config = RemoteCompute.attach_configuration(resource_id='<resource_id>',
#                                                 ssh_port=22,
#                                                 username='<username>',
#                                                 password="<password>")

# Attach the compute and wait on the object that attach() returned
# (the original waited on an undefined name, `compute`).
spearhead3_compute_target = ComputeTarget.attach(ws, compute_target_name, attach_config)
spearhead3_compute_target.wait_for_completion(show_output=True)

## Create or get an experiment

In [None]:
# "Get OR create": if an experiment with this name already exists in the
# workspace, the existing one is returned instead of a new one.
experiment_name = 'aisurvey_expt4'
experiment = Experiment(workspace=ws, name=experiment_name)

# Display the experiment as the cell output.
experiment

## Create Environment

In [None]:
# Define the "trident" environment: a user-managed Python installation inside
# a custom Docker base image pulled from the workspace's container registry.
trident_env = Environment(name="trident") #Instantiate an Environment object
trident_env.python.user_managed_dependencies = True
trident_env.docker.enabled = True
trident_env.docker.base_image = 'tridentbase:v5'
# Replace the placeholder with your registry address; it is kept as a string
# so the cell parses (the bare <...> form was a syntax error).
trident_env.docker.base_image_registry.address = '<your_workspace_container_registry>'
#Get the existing conda dependencies
conda_dep = trident_env.python.conda_dependencies
#azureml-defaults seems to be added automatically, even if you don't include it anywhere

#Register the environment with the workspace
trident_env.register(ws)

In [112]:
# Fetch the already-registered "trident" environment from the workspace.
trident_env = Environment.get(workspace=ws, name="trident")

In [81]:
# Make sure the registered dataset is loaded (repeats the lookup done above).
# Alternative: Dataset.get_by_id(ws, id)
image_ds = Dataset.get_by_name(workspace=ws, name='image_ds')

In [140]:
# Command-line parameters handed to the main .py script when it is called.
# Notes:
#  1: as_named_input() looks for a dataset registered to the workspace under that name
#  2: "data_folder" at this stage is a "DatasetConsumptionConfig" object.
#     It gets converted during the run to a directory name.
#  3: dist_url should be set to "auto" for a single machine, or a master node
#     URL for multiple machines.

# Mount image_ds; the mount point name is passed as the data folder argument.
mounted_images = image_ds.as_named_input('image_ds').as_mount()

script_params = {
    '--data_folder': mounted_images,
    '--output_dir': './outputs',
    '--num_gpus_per_machine': 4,
    '--num_machines': 1,
    '--machine_rank': 0,
    '--dist_url': "auto",
    # '--eval_only': None,  # Will trigger evaluation if uncommented
    #                       # (yes, 'None' is correct; 'True' throws an error)
}

# Bundle the project folder, entry script, compute target and environment
# into the estimator that gets submitted to the experiment.
project_folder = './trident_project'
est = Estimator(
    source_directory=project_folder,
    entry_script='trident_run.py',
    script_params=script_params,
    compute_target=mycompute_target,
    environment_definition=Environment.get(workspace=ws, name="trident"),
)

### Did you copy the weights over AND double-check the filename in test_predict.py?
Using Azure Storage Explorer:   
- go to the usual `azureml-blobstore_etc`, 
- then to `azureml`, 
- then find the run you want (usually on pg 2 -- look for where the folders end), 
- then copy `model_final.pth` into `azureml-blobstore_etc/temp` (NOT outside `temp`!).  Moving doesn't work.

In [141]:
# Submit the estimator to the experiment; `run` is the handle used by the
# monitoring and model-registration cells below.
run = experiment.submit(est)

In [142]:
#Get the runId of the current run so you can load it later
# (pass it to Run(experiment, <runId>) to re-attach in a new session)
run.get_details()['runId']


'aisurvey_expt5_1596913142_d733d47c'

In [None]:
# Then come back tomorrow and see how it's doing:
# re-attach to a previously submitted run by its runId. The experiment handle
# in this notebook is called `experiment` (the original used an undefined
# name, `exp`).
run_id = 'mdv4_trial_1580850141_379ed8f0'  # replace with the runId you saved after submitting
run = Run(experiment, run_id)
run.get_status()

In [None]:
# Display the run object as the cell output.
run

In [None]:
#Live-stream logs (every 10-15 seconds)
# azureml.widgets is imported here rather than in the top import cell.
from azureml.widgets import RunDetails
RunDetails(run).show()

## Assess training progress

#### Understanding the Detectron2 output
Reminder: don't interpret the slash in the name as division: it just identifies a model component.  For example, `fast_rcnn/cls_accuracy` means the "class accuracy of the RCNN"
- `fast_rcnn/cls_accuracy` = num_accurate / num_instances (Want this to be 1)
- `fast_rcnn/fg_cls_accuracy` = fg_num_accurate / num_fg (Want this to be 1)
- `fast_rcnn/false_negative` = num_false_negative / num_fg (Want this to be zero)
- `roi_head/num_fg_samples` The number of fg samples that are selected for training ROI heads =mean(num_fg_samples)
- `roi_head/num_bg_samples` The number of bg samples that are selected for training ROI heads =mean(num_bg_samples)
- `rpn/num_pos_anchors` the number of positive anchors per-image used in training = num_pos_anchors/num_images
- `rpn/num_neg_anchors` the number of negative anchors per-image used in training = num_neg_anchors/num_images


In [16]:
#This doesn't show the training metrics because Detectron stores them in metrics.json,
#which is only available after the run.
run.get_metrics()

{'Data folder': '/tmp/tmpq86gb_ai',
 'annotationpath': "PosixPath('/tmp/tmpq86gb_ai/tiled_annotations')",
 'test': 100,
 'rootdir': "PosixPath('/tmp/tmpq86gb_ai')",
 'test_list': [1, 2, 3],
 'imagepath': "PosixPath('/tmp/tmpq86gb_ai/tiled_images')",
 'output_dir': './outputs',
 'Class names': ['giraffe',
  'building',
  'cow',
  'human',
  'impala',
  'buffalo',
  'elephant',
  'boma',
  'shoats',
  'zebra',
  'donkey',
  'wildebeest',
  'oryx',
  'charcoal sack',
  'charcoal mound',
  'eland',
  'kudu',
  'hartebeest',
  'gazelle'],
 'survey_valid': "Metadata(name='survey_valid', thing_classes=['giraffe', 'building', 'cow', 'human', 'impala', 'buffalo', 'elephant', 'boma', 'shoats', 'zebra', 'donkey', 'wildebeest', 'oryx', 'charcoal sack', 'charcoal mound', 'eland', 'kudu', 'hartebeest', 'gazelle'])"}

In [None]:
#Get the names of the files associated with the run
print(run.get_file_names())

## Where the hell are my output files?
Answer: use the Microsoft Azure Storage Explorer.  It's in:
- `azureml-blobstore-84d56c80-a8c3-4b14-a1fa-0dde9dadda0d/azureml/<run_results>/outputs`  
For `<run_results>`, choose one of the folders, noting that:   
1. The files are in backwards date order (oldest on top)
2. There are TWO folders for each run; one of them has a name that ends in '-setup' and the other is the results
3. There are also two files per run; one is a zip and the other has 0 bytes; I think the zip probably has everything but I haven't checked.

## Register the model
This makes the workspace track it and version it.  It's also a step necessary for deployment.  There's nice documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-train-models-with-aml) and in part 2 of the same tutorial they do deployment.

In [56]:
from azureml.core.model import Model

# Tip: a model trained OUTSIDE Azure ML can be registered from a local path
# instead. When model_path is a directory, the child_paths parameter lets you
# include only some of the files from it:
#   model = Model.register(model_path="./models",
#                          model_name="sentiment",
#                          description="Sentiment analysis model trained outside Azure Machine Learning",
#                          workspace=ws)

# Register the final weights written by this run.
model = run.register_model(
    model_name='tridentmodelv1',
    model_path='outputs/model_final.pth',
    description="First training on v2s 6Aug2020",
)

# Tab-separated name, id and version of the registered model.
print(model.name, model.id, model.version, sep='\t')

tridentmodelv1	tridentmodelv1:1	1


# Other Notes

### Upload files to workspace datastore
**Notes:**
1. The target_path parameter specifies the location in the file share (or blob container) to upload. It defaults to `None`, so the data is uploaded to root. If overwrite=True, any existing data at target_path is overwritten.
2. You can also upload a list of individual files to the datastore via the `upload_files()` method.

In [None]:
#Upload either a directory or individual files to the datastore by using the Python SDK:
# 'your source directory' / 'your target path' are placeholders -- replace both before running.
datastore = Datastore.get(ws, datastore_name='workspaceblobstore')
datastore.upload(src_dir='your source directory',
                 target_path='your target path',
                 overwrite=True,  # any existing data at target_path is overwritten
                 show_progress=True)

### Try to access Howard's DataShare (testing)

In [159]:
# List the dataset registrations in the workspace.
# NOTE(review): called through the image_ds instance; presumably equivalent to Dataset.get_all(ws) -- confirm.
image_ds.get_all(ws)

{'image_ds': DatasetRegistration(id='3f3064ba-5016-4255-90b7-72d4e528e94a', name='image_ds', version=2, description='tanzania main folder temp', tags={})}

In [6]:
# Reload the registered 'image_ds' dataset by name.
image_ds = Dataset.get_by_name(workspace=ws, name='image_ds')

In [None]:
# NOTE(review): `name` is an empty string -- fill in the registered dataset name before running.
tanz_ds = Dataset.get_by_name(workspace=ws, name='')

In [162]:
# Fetch the 'workspaceblobstore' datastore handle by name.
datastore = Datastore.get(ws, 'workspaceblobstore')

In [163]:
# Display the datastore's properties as the cell output.
datastore

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-84d56c80-a8c3-4b14-a1fa-0dde9dadda0d",
  "account_name": "jpworkspace3957796892",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [154]:
#datastore_paths = 'https://datashare.hosting.portal.azure.net/account/receivedshares/sharesubscriptiondetails#'

#[(datastore, 'surveyimagery_hlf')] #[(datastore, 'sde-images')]

In [155]:
# NOTE(review): hidden state -- the assignments to `datastore_paths` in the cell
# above are commented out, so on a fresh kernel this uses whatever value an
# earlier cell left in `datastore_paths`. Set it explicitly before running.
tanz_ds = Dataset.File.from_files(path=datastore_paths)

In [158]:
# Show the paths of the first 5 files in tanz_ds.
tanz_ds.take(5).to_path()

['/sharesubscriptiondetails#']

In [None]:
# Reference examples of the different ways to build a Dataset.
#https://datashare.hosting.portal.azure.net/account/receivedshares/sharesubscriptiondetails#
#from azureml.core import Workspace, Datastore, Dataset

# retrieve an existing datastore in the workspace by name
# (the workspace handle in this notebook is `ws`, and the datastore name must
# be a string; the original passed two undefined names)
datastore = Datastore.get(ws, 'workspaceblobstore')

# create a TabularDataset from 3 file paths in datastore (note multiple sources: one dataset!)
datastore_paths = [(datastore, 'weather/2018/11.csv'),
                   (datastore, 'weather/2018/12.csv'),
                   (datastore, 'weather/2019/*.csv')] #note wildcard
weather_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

# Create a FileDataset from a single source (root directory)
datastore_paths = [(datastore, 'TA25')]
animal_ds = Dataset.File.from_files(path=datastore_paths)

#Create a FileDataset in one step, using wildcards
dataset = Dataset.File.from_files('https://dprepdata.blob.core.windows.net/demo/green-small/*.csv')

# Create a FileDataset from public image and label files (i.e., data at public web urls)
web_paths = ['https://azureopendatastorage.blob.core.windows.net/mnist/train-images-idx3-ubyte.gz',
             'https://azureopendatastorage.blob.core.windows.net/mnist/train-labels-idx1-ubyte.gz']
mnist_ds = Dataset.File.from_files(path=web_paths)

# Load a dataset already registered in the workspace under the name 'sde-images'
hdata = Dataset.get_by_name(ws,name='sde-images')

### How I added AdaBound optimizer
"As described in the paper, AdaBound is an optimizer that behaves like Adam at the beginning of training, and gradually transforms to SGD at the end. The final_lr parameter indicates AdaBound would transforms to an SGD with this learning rate. **In common cases, a default final learning rate of 0.1 can achieve relatively good and stable results on unseen data. It is not very sensitive to its hyperparameters.** See Appendix G of the paper for more details."

To add AdaBound, I added the following to `test_predict.py`:
1. imports for the `adabound` package
2. imports for some supporting packages in `detectron2.solver.build`
3. A new class method (`build_optimizer`) added to the Trainer class.

Using conda_dep to import the package failed, so I added it to the Docker image instead, which worked, and the changes moved the Docker image on to v5.  
`conda_dep.add_pip_package("adabound") #FAILED`  
See aml-pipeline_tanzania.ipynb for Dockerfile details.

## Training loop

- `detectron2/tools/plain_train_net.py` is a fairly fleshed-out version of a training script that you might want to customize.
- `train_net.py` is a simpler version with more default behavior baked in.

The `DefaultTrainer` class has a good layout of everything you'd want to override.  It's in `detectron2/engine/defaults.py`.  

`DefaultTrainer` calls its superclass `SimpleTrainer`, which is:   
"A simple trainer for the most common type of task: single-cost single-optimizer single-data-source iterative optimization.  It assumes that every step, you:  
1. Compute the loss with a data from the data_loader.  
2. Compute the gradients with the above loss.  
3. Update the model with the optimizer.  
`SimpleTrainer` includes the method `run_step(),` which runs a _single_ step.

All other tasks during training (checkpointing, logging, evaluation, LR schedule) are maintained by hooks, which can be registered by `TrainerBase.register_hooks`."

The actual training _loop_ function is in its superclass, `TrainerBase`.  All `train()` does is step from `start_iter` to `max_iter`, calling the hooks (`before_step`, `after_step`, etc.) around each `run_step()`.
```python
def train(self, start_iter, max_iter):
    for self.iter in range(start_iter, max_iter):
        self.before_step()
        self.run_step()
        self.after_step()
```

