In [1]:
# !az login

In [1]:
subscription_id = !az account show --query id --output tsv
subscription_id = subscription_id[0]

In [2]:
# Get the resource group name and workspace name for the workspace from the config.conf file using pyhocon
from pyhocon import ConfigFactory
# Parse config.conf with pyhocon, then pull out the three names used by the
# rest of the notebook.
config = ConfigFactory.parse_file("config.conf")
resource_group, workspace_name, compute_name = (
    config.get_string(setting)
    for setting in ("RESOURCE_GROUP", "WORKSPACE_NAME", "COMPUTE_NAME")
)
print(config)

ConfigTree([('RESOURCE_GROUP', 'rg-llm-experiments'), ('WORKSPACE_NAME', 'yaol-llm-experiments'), ('REGION', 'eastus'), ('COMPUTE_VM', 'Standard_NC6s_v3'), ('COMPUTE_NAME', 'teslav100')])


In [3]:

from azure.ai.ml import MLClient, Input, Output
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# NOTE: It is very important to always set resource_group_name and
# workspace_name when creating the MLClient.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)
print(ml_client)

from azureml.core import Workspace, Datastore

# Workspace handle from the azureml.core SDK, built from the same
# subscription / resource group / workspace name as the MLClient above.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f939709a830>,
         subscription_id=ea54fca2-3a3e-4b3b-91e7-a7bf971e0443,
         resource_group_name=rg-llm-experiments,
         workspace_name=yaol-llm-experiments)


## Get Params / Setup

In [4]:
import json
# Read the debugging run configuration; the file only needs to stay open
# while the JSON is being parsed.
with open("../configs/azure_configs_debugging.json") as config_file:
    params = json.load(config_file)

# Split the configuration into the three sections used further down.
train_params = params['train_params']
sample_params = params['sample_params']
dataset_params = params['dataset_params']

## Define Environment

In [5]:
from azure.ai.ml.entities import Environment

# Environment for the jobs below: the base image given here plus the conda
# specification in ../environment.yml.
custom_env = Environment(
    image="mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04",
    conda_file="../environment.yml",
    name="nano_gpt_env",
    description="Custom environment for nano gpt training",
)
# Explicit registration is currently disabled; custom_env is passed directly
# to the job definitions instead.
# ml_client.environments.create_or_update(custom_env)

## Register Dataset

In [6]:
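# NOTE: earlier draft, kept commented out for reference. It registers separate
# train/val `uri_file` assets; the active version is `create_dataset` in the
# following cell.
# import subprocess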
# from azure.ai.ml.entities import Data
# import os

# def build_cmd(script, param_dict):
#     return f"python {script}" + "".join(f" --{k}" if v is True else f" --{k}={v}" for k, v in param_dict.items())

# def get_or_create_dataset(dataset_name):
#     try:
#         # Try to get the existing dataset
#         dataset_train = ml_client.data.get(name=dataset_name + '_train', version="latest")
#         dataset_val = ml_client.data.get(name=dataset_name + '_val', version="latest")
#         print(f"Dataset {dataset_name} already exists. Using existing version.")
#         return dataset_train, dataset_val
#     except Exception:
#         print(f"Dataset {dataset_name} doesn't exist or not registered.")
        
#         dataset_path = './data/' + dataset_name
        
#         if os.path.exists(dataset_path):
#             print(f"Dataset {dataset_name} already exists at {dataset_path}.")
#         else:
#             print(f"Dataset {dataset_name} doesn't exist yet locally, create it...")
#             cmd = build_cmd('../data/prepare.py', dataset_params)
#             subprocess.run(cmd, shell=True)
        
#             # Create the dataset
#         dataset_train = Data(
#             name=dataset_name + '_train',
#             description=f"train dataset for {dataset_name}",
#             path=dataset_path + '/train.bin',
#             type="uri_file",
#             version="latest"
#         )
        
#         dataset_val = Data(
#             name=dataset_name + '_val',
#             description=f"val dataset for {dataset_name}",
#             path=dataset_path + '/val.bin',
#             type="uri_file",
#             version="latest"
#         )

#         ml_client.data.create_or_update(dataset_train)
#         ml_client.data.create_or_update(dataset_val)
#         print(f"Dataset {dataset_name} has been registered/updated.")
#         return dataset_train, dataset_val

# dataset_train, dataset_val = get_or_create_dataset(dataset_params['dataset_id'])


In [11]:
import subprocess
from azure.ai.ml.entities import Data

def build_cmd(script, param_dict):
    """Build a ``python <script> ...`` shell command line from a parameter dict.

    Args:
        script: Path of the Python script to run.
        param_dict: Mapping of option name to value. A value that is exactly
            ``True`` becomes a bare ``--name`` flag; every other value
            (including ``False`` and ``None``) is rendered as ``--name=value``.

    Returns:
        The command as a single string, e.g.
        ``python train.py --compile --lr=0.001``.
    """
    # Local import so the function carries its own dependency.
    import shlex

    parts = [f"python {script}"]
    for key, value in param_dict.items():
        if value is True:
            parts.append(f"--{key}")
        else:
            # The result is executed through a shell, so quote the value:
            # spaces or metacharacters must stay inside a single argument.
            # Plain values (numbers, simple words, paths) come out unchanged.
            parts.append(f"--{key}={shlex.quote(str(value))}")
    return " ".join(parts)

def create_dataset(dataset_name):
    """Make sure a data asset called ``dataset_name`` is registered.

    Looks the asset up through the notebook-level ``ml_client``. When the
    lookup fails, runs ``../src/data/prepare.py`` with the notebook-level
    ``dataset_params`` and registers ``../src/data/<folder>`` as a
    ``uri_folder`` asset.

    Args:
        dataset_name: Name of the data asset. The local folder name is its
            last ``/``-separated component with ``-`` replaced by ``_``.

    Raises:
        subprocess.CalledProcessError: If the preparation script exits with a
            non-zero status.
    """
    try:
        # Existence probe only; the returned asset itself is not needed.
        ml_client.data.get(name=dataset_name, version="latest")
    except Exception:
        # NOTE(review): every failure (including auth or network errors) is
        # treated as "asset missing" -- consider narrowing this to the SDK's
        # not-found exception.
        asset_exists = False
    else:
        asset_exists = True

    if asset_exists:
        print(f"Dataset {dataset_name} already exists. Using existing version.")
        return

    print(f"Dataset {dataset_name} doesn't exist. Creating new dataset.")
    cmd = build_cmd('../src/data/prepare.py', dataset_params)
    # check=True: stop here when preparation fails instead of going on to
    # register a missing or half-written folder.
    subprocess.run(cmd, shell=True, check=True)

    # presumably prepare.py writes its output into this folder -- TODO confirm
    dataset_folder_name = dataset_name.split("/")[-1].replace("-", "_")
    dataset_path = '../src/data/' + dataset_folder_name

    # Create the dataset
    dataset = Data(
        name=dataset_name,
        description=f"Dataset for {dataset_name}",
        path=dataset_path,
        type="uri_folder",
        version="latest"
    )

    ml_client.data.create_or_update(dataset)
    print(f"Dataset {dataset_name} has been registered/updated.")

# Look up the data asset named by dataset_params['dataset_id'] and, if the
# lookup fails, prepare and register it.
create_dataset(dataset_params['dataset_id'])


Dataset openwebtext-10k already exists. Using existing version.


## Pipeline

In [12]:
from azure.ai.ml import command

# Command line for train.py, built from the train_params section.
train_cmd = build_cmd('train.py', train_params)

# Training job definition: code comes from ../src, and the experiment name
# from the configuration doubles as the display name.
train_job = command(
    experiment_name=train_params["experiment_name"],
    display_name=train_params["experiment_name"],
    compute=compute_name,
    environment=custom_env,
    code="../src",
    command=train_cmd,
)


In [13]:
# Submit the training job through the MLClient and keep the returned job object.
returned_job = ml_client.create_or_update(train_job)
# studio_url is read from the returned job and printed so the run can be followed.
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

[32mUploading src (22.56 MBs): 100%|██████████| 22563007/22563007 [00:15<00:00, 1430255.18it/s]
[39m



In [None]:

# NOTE(review): this is still a draft job definition. Open points to confirm:
#   * it launches "python train.py" although it is named sample_job, and
#     sample_params is not used in the visible cells;
#   * the Input has no path, and the command references neither the input
#     nor the "model" output;
#   * the input key contains a space -- verify that it is an accepted name;
#   * display/experiment names are those of the training job.

# dataset_name previously existed only as the parameter of create_dataset;
# bind it here to the same id that was passed to create_dataset.
dataset_name = dataset_params['dataset_id']

sample_job = command(
    code="../src",
    command="python train.py",
    inputs={
        f"{dataset_name} dataset": Input(type="uri_folder"),
    },
    outputs={
        "model": Output(type="uri_folder", mode="rw_mount"),
    },  # this comma was missing, which made the whole cell a SyntaxError
    environment=custom_env,
    compute=compute_name,
    display_name=train_params["experiment_name"],
    experiment_name=train_params["experiment_name"],
)