In [4]:
# Report which version of the core Azure ML SDK this kernel has loaded.
import azureml.core

print(f"SDK version: {azureml.core.VERSION}")

In [None]:
from azureml.telemetry import set_diagnostics_collection

# Opt in to diagnostics collection by passing send_diagnostics=True.
# NOTE(review): presumably this sends diagnostic data off the machine --
# confirm that is wanted before sharing or re-running this notebook.
set_diagnostics_collection(send_diagnostics=True)

In [None]:
from azureml.core.workspace import Workspace

# Obtain the workspace through Workspace.from_config() and echo its
# identifying fields, one per line. `ws` is reused by the cells below.
ws = Workspace.from_config()
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

In [None]:
# Disabled step: the registration call below is wrapped in a triple-quoted
# string, so nothing is executed -- the cell merely evaluates the string.
# If re-enabled, it would register the blob container "gpt2training" as a
# datastore of the same name in workspace `ws`.
# NOTE(review): `Datastore` is not imported by any cell visible above this
# one, and account_key is a placeholder -- load the real key from a secret
# store or environment variable rather than pasting it here.
'''
datastore = Datastore.register_azure_blob_container(workspace=ws, 
                                                      datastore_name="gpt2training",
                                                        account_name="amherstwstorageinnganzr",
                                                    container_name="gpt2training", 
                                                      account_key='<MY ACCOUNT KEY>',
                                                      create_if_not_exists=True)
'''

In [None]:
# Disabled step: the upload below is wrapped in a triple-quoted string, so
# nothing is executed -- the cell merely evaluates the string.
# If re-enabled, it would upload the local directory ./pytorch-gpt2 to the
# 'pytorch-gpt2' path on `datastore`, overwriting existing files.
# NOTE(review): the two datastore classes imported inside the string are not
# referenced by the upload call -- confirm whether those imports are needed.
'''
import azureml.data
from azureml.data.azure_storage_datastore import AzureFileDatastore, AzureBlobDatastore

datastore.upload(src_dir='./pytorch-gpt2',
                 target_path='pytorch-gpt2',
                 overwrite=True,
                 show_progress=True)
'''

In [None]:
import azureml.core
from azureml.core import Workspace, Datastore

# List every datastore attached to the workspace `ws` (created in an earlier
# cell) as "<name> <datastore_type>", one per line. `Datastore` imported here
# is also used by the following cell.
print("These are your available datastores...")
datastores = ws.datastores
for name, ds in datastores.items():
    print(name, ds.datastore_type)

In [None]:
# Fetch the datastore named 'gpt2training' from the current workspace, then
# make that same datastore the workspace default.
# Relies on `ws` and `Datastore` defined/imported by earlier cells.
datastore = Datastore.get(ws, datastore_name='gpt2training')
ws.set_default_datastore('gpt2training')

In [None]:
# Call as_mount() on the datastore. The return value is not assigned, so in
# a notebook it is only echoed as this cell's output.
datastore.as_mount()

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name under which the cluster is looked up and, failing that, created.
cluster_name = "gpu-cluster"

try:
    # Try to bind to a compute target of this name in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster with the settings below.
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC24r',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Wait for provisioning to finish, showing progress output meanwhile.
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# Print the serialized detailed status of whichever cluster we ended up with.
print(compute_target.get_status().serialize())

In [None]:
from azureml.core import Experiment

# Bind an Experiment named 'pytorch-gpt2' to the workspace `ws`; the training
# run is submitted through this object in a later cell.
experiment_name = 'pytorch-gpt2'
experiment = Experiment(ws, name=experiment_name)

In [None]:
import os
import shutil

# Stage the files for the run in a local project folder; the same folder is
# passed to the estimator as its source_directory in a later cell.
project_folder = './train-on-amlcompute'
# exist_ok=True lets this cell be re-run when the folder already exists.
os.makedirs(project_folder, exist_ok=True)
# Copy the training script and the CSV (paths are relative to the current
# working directory) into the project folder.
shutil.copy('train.py', project_folder)
shutil.copy('processed_data_final.csv', project_folder)

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Environment used for the training run: Docker is switched on and the
# Python dependencies consist of the pip packages listed below.
# NOTE(review): 'argparse' is part of the Python 3 standard library and
# 'utils' is a very generic PyPI name -- confirm both entries are intended.
myenv = Environment("myenv")
myenv.docker.enabled = True
myenv.python.conda_dependencies = CondaDependencies.create(
    pip_packages=[
        'utils',
        'torch',
        'tensorflow',
        'azureml-sdk',
        'argparse',
        'pandas',
        'numpy',
        'transformers',
        'scikit-learn',
    ]
)

In [None]:
from azureml.train.dnn import PyTorch

# Command-line arguments handed to train.py: --data_dir is the datastore
# path '/pytorch-gpt2' requested via as_mount(); --output_dir is ./outputs.
script_params = {}
script_params['--data_dir'] = datastore.path('/pytorch-gpt2').as_mount()
script_params['--output_dir'] = './outputs'

# Estimator that runs train.py from the staged project folder on the
# compute target selected earlier.
estimator = PyTorch(
    source_directory=project_folder,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    inputs=[
        datastore.path('/gpt2training').as_download(),
        datastore.as_mount(),
    ],
    pip_packages=[
        'pillow==5.4.1',
        'git+https://github.com/huggingface/transformers',
    ],
)

# Replace the estimator's environment with the custom one built earlier.
# NOTE(review): this assignment happens after the estimator was constructed
# with use_gpu/pip_packages; presumably it supersedes the environment derived
# from those arguments (so 'pillow' and the git build of transformers may not
# be installed) -- confirm, and consider the estimator's
# environment_definition parameter instead.
estimator.run_config.environment = myenv

In [None]:
# Submit the estimator through the experiment and print the returned run
# object; `run` is inspected by the cells that follow.
run = experiment.submit(estimator)
print(run)

In [None]:
# Print whatever get_details() reports for the submitted run.
print(run.get_details())

In [None]:
from azureml.widgets import RunDetails

# Display the submitted run in the RunDetails notebook widget.
RunDetails(run).show()