In [16]:
# Interactive Azure CLI sign-in; the next cell queries this CLI session via `az account show`.
# NOTE(review): the captured output of this cell contains the subscription id, tenant id
# and account e-mail — clear the output before sharing or committing the notebook.
!az login

[33mTo sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code CXZESVAZX to authenticate.[0m
[
  {
    "cloudName": "AzureCloud",
    "id": "ea54fca2-3a3e-4b3b-91e7-a7bf971e0443",
    "isDefault": true,
    "name": "Azure subscription 1",
    "state": "Enabled",
    "tenantId": "1fdb6d07-480b-4c31-a913-e847d51e7446",
    "user": {
      "name": "Philliplakaschus@gmail.com",
      "type": "user"
    }
  }
]
[0m

In [18]:
subscription_id = !az account show --query id --output tsv
subscription_id = subscription_id[0]

In [27]:
# Read the resource group, workspace and compute names for this workspace
# from config.conf, parsed with pyhocon.
from pyhocon import ConfigFactory

config = ConfigFactory.parse_file("config.conf")
resource_group, workspace_name, compute_name = (
    config.get_string(key)
    for key in ("RESOURCE_GROUP", "WORKSPACE_NAME", "COMPUTE_NAME")
)

In [28]:

from azure.ai.ml import MLClient, Input, Output
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# NOTE: It is very important to always pass the resource group and the workspace
# name when creating the MLClient, so the client is bound to this workspace.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)
print(ml_client)

MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f994c3c6cb0>,
         subscription_id=ea54fca2-3a3e-4b3b-91e7-a7bf971e0443,
         resource_group_name=rg-llm-experiments,
         workspace_name=yaol-llm-experiments)


## Define Environment

In [29]:
from azure.ai.ml.entities import Environment, Data

# Training environment: the conda specification from ../environment.yml on top
# of the Azure ML OpenMPI / CUDA 11.8 base image.
custom_env = Environment(
    image="mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04",
    conda_file="../environment.yml",
    name="nano_gpt_env",
    description="Custom environment for nano gpt training",
)

# Register the environment in the workspace; as the last expression of the
# cell, the returned Environment is shown as the cell output.
ml_client.environments.create_or_update(custom_env)

Environment({'arm_type': 'environment_version', 'latest_version': None, 'image': 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04', 'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'nano_gpt_env', 'description': 'Custom environment for nano gpt training', 'tags': {}, 'properties': {'azureml.labels': 'latest'}, 'print_as_yaml': False, 'id': '/subscriptions/ea54fca2-3a3e-4b3b-91e7-a7bf971e0443/resourceGroups/rg-llm-experiments/providers/Microsoft.MachineLearningServices/workspaces/yaol-llm-experiments/environments/nano_gpt_env/versions/3', 'Resource__source_path': '', 'base_path': '/home/lakaschus/python/nanoGPTExperiments/azure_deployment', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f995c20c880>, 'serialize': <msrest.serialization.Serializer object at 0x7f995db06410>, 'version': '3', 'conda_file': {'channels': ['nvidia/label/cuda-11.8.0', 'pytorch', 'conda-fo

## Get Params / Setup

In [30]:
import json
# Load the run configuration and split it into its three sections.
with open("../configs/azure_configs_debugging.json") as f:
    params = json.load(f)

train_params, sample_params, dataset_params = (
    params[section]
    for section in ("train_params", "sample_params", "dataset_params")
)

In [31]:
def build_cmd(script, param_dict):
    """Build a ``python <script>.py --key=value ...`` command line.

    Args:
        script: Script path, given with or without the ``.py`` suffix
            (``"train"`` and ``"train.py"`` produce the same command).
        param_dict: Mapping of option names to values. Each entry is rendered
            as `` --name=value`` in the mapping's iteration order; values are
            formatted with ``str()`` and are not shell-quoted.

    Returns:
        The complete command as a single string.
    """
    script_file = str(script)
    # Only add the suffix when it is missing, so "data/prepare.py" does not
    # turn into "data/prepare.py.py".
    if not script_file.endswith(".py"):
        script_file = f"{script_file}.py"
    options = "".join(f" --{name}={value}" for name, value in param_dict.items())
    return f"python {script_file}{options}"


## Register Dataset

In [None]:
def get_or_create_dataset(
) -> Data:
    """Return the registered data asset ``dataset_name``, registering it if needed.

    Reads the notebook globals ``ml_client``, ``dataset_name``, ``dataset_path``
    and ``dataset_params``.

    NOTE(review): ``dataset_path`` is not assigned in any visible cell and
    ``dataset_name`` is only assigned in a later cell — both must be defined
    before this function is called.

    Returns:
        The ``Data`` asset as stored in the workspace: either the existing
        latest version or the one registered by this call.
    """
    try:
        # "latest" is a label, not a version string, so it must be passed via
        # `label=`; `version="latest"` would look up a version literally named
        # "latest" and miss every existing asset.
        dataset = ml_client.data.get(name=dataset_name, label="latest")
        print(f"Dataset {dataset_name} already exists. Using existing version.")
        return dataset
    except Exception:
        # NOTE(review): every failure (not only "asset not found") is treated
        # as "missing" here — consider narrowing the exception type.
        print(f"Dataset {dataset_name} doesn't exist. Creating new dataset.")

    # TODO: the preprocessing command is only built here, it is not executed yet.
    # The script name is passed without ".py" because build_cmd adds the suffix.
    build_cmd('data/prepare', dataset_params)

    # Describe the data asset to register.
    dataset = Data(
        name=dataset_name,
        description=f"Dataset for {dataset_name}",
        path=dataset_path,
        type="uri_file"
    )

    # Return what the service stored so both branches hand back a registered asset.
    registered_dataset = ml_client.data.create_or_update(dataset)
    print(f"Dataset {dataset_name} has been registered/updated.")
    return registered_dataset

# TODO: CONTINUE HERE

## Training

In [None]:
def get_or_create_dataset(
    dataset_name: str,
    dataset_id: str,
    num_proc: int,
    test_size: float,
    seed: int,
    encoding: str
) -> Data:
    """Return the data asset ``dataset_name``, building and registering it if missing.

    Uses the notebook globals ``ml_client`` and ``custom_env``.

    Args:
        dataset_name: Name of the data asset to look up or register.
        dataset_id: Passed to ``create_dataset.py`` as ``--dataset_id``.
        num_proc: Passed to ``create_dataset.py`` as ``--num_proc``.
        test_size: Passed to ``create_dataset.py`` as ``--test_size``.
        seed: Passed to ``create_dataset.py`` as ``--seed``.
        encoding: Passed to ``create_dataset.py`` as ``--encoding``.

    Returns:
        The ``Data`` asset as stored in the workspace: either the existing
        latest version or the one registered by this call.
    """
    try:
        # "latest" is a label, not a version string, so it must be passed via
        # `label=`; `version="latest"` would miss every existing asset.
        dataset = ml_client.data.get(name=dataset_name, label="latest")
        print(f"Dataset {dataset_name} already exists. Using existing version.")
        return dataset
    except Exception:
        # NOTE(review): every failure (not only "asset not found") is treated
        # as "missing" here — consider narrowing the exception type.
        print(f"Dataset {dataset_name} doesn't exist. Creating new dataset.")

    # Define dataset creation component.
    # NOTE(review): `command` is not imported in any cell above this one, and
    # azure.ai.ml's `command` is a job builder rather than a function
    # decorator — confirm which decorator was intended before relying on this.
    @command(
        name="create_dataset",
        display_name="Create Dataset",
        description="Create and preprocess the dataset",
        environment=custom_env,
    )
    def create_dataset(
        dataset_id: str,
        num_proc: int,
        test_size: float,
        seed: int,
        encoding: str
    ) -> Output(type="uri_folder"):
        import subprocess

        script_path = "create_dataset.py"
        cmd = [
            "python", script_path,
            "--dataset_id", dataset_id,
            "--num_proc", str(num_proc),
            "--test_size", str(test_size),
            "--seed", str(seed),
            "--encoding", encoding
        ]

        # check=True makes a failing preprocessing script fail the step.
        subprocess.run(cmd, check=True)

        return Output(path="./")

    # Create the dataset
    job = create_dataset(
        dataset_id=dataset_id,
        num_proc=num_proc,
        test_size=test_size,
        seed=seed,
        encoding=encoding
    )

    submitted_job = ml_client.jobs.create_or_update(job)
    # stream() only tails the logs until the job ends and returns None, so its
    # result must not replace the job object; re-fetch the job afterwards.
    ml_client.jobs.stream(submitted_job.name)
    finished_job = ml_client.jobs.get(submitted_job.name)

    # Register the dataset as a data asset.
    # Assumes the job exposes an output named `output` — TODO confirm.
    dataset = Data(
        name=dataset_name,
        description=f"Processed dataset from {dataset_id}",
        path=finished_job.outputs.output.path,
        type="uri_folder"
    )
    # Return what the service stored so both branches hand back a registered asset.
    registered_dataset = ml_client.data.create_or_update(dataset)

    print(f"Dataset {dataset_name} has been created and registered.")
    return registered_dataset

# Resolve the processed dataset for this run. Each preprocessing option is read
# from train_params and falls back to the default listed next to it.
dataset_name = f"{train_params['dataset']}_processed"
dataset = get_or_create_dataset(
    dataset_name=dataset_name,
    **{
        option: train_params.get(option, default)
        for option, default in (
            ("dataset_id", "stas/openwebtext-10k"),
            ("num_proc", 1),
            ("test_size", 0.1),
            ("seed", 2351),
            ("encoding", "gpt2"),
        )
    },
)

Before running the job, remove all unnecessary files from the `src` folder; the whole folder is uploaded with the job.

In [23]:
# TODO: Convert the below command into a pipeline
# Step 1: Create and register a dataset based on the dataset name; only update the dataset if it is not already registered or has changed
# Step 2: Load and preview the dataset
# Step 3: Run the training script with the given parameters
# Step 4: Create a sample from the trained model

In [26]:
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import load_component

# Load the training component from its YAML definition. load_component() is a
# plain loader function that returns a callable component; it is not a
# decorator, so the component is bound by assignment.
# NOTE(review): the pipeline in this cell calls the component with out_dir,
# dataset, batch_size, learning_rate, max_iters and experiment_name and reads
# `outputs.model_output`, so train.yml must declare matching inputs/outputs.
# The recorded cell output reports "No such file or directory: train.yml" —
# confirm the path relative to the notebook.
train_model = load_component(source="train.yml")

# Load the sampling component from its YAML definition. load_component() is a
# plain loader function that returns a callable component; it is not a
# decorator, so the component is bound by assignment.
# NOTE(review): the pipeline in this cell calls the component with out_dir,
# num_samples and max_new_tokens and reads `outputs.samples`, so sample.yml
# must declare matching inputs/outputs — confirm against the YAML file.
generate_sample = load_component(source="sample.yml")

# Define the pipeline: run the training component, then feed its output into
# the sampling component. Every pipeline parameter is forwarded unchanged to
# one of the two component calls below.
@pipeline(name="nano-gpt-pipeline", description="Pipeline for training and sampling from a nano GPT model")
def nano_gpt_pipeline(
    out_dir: str,
    dataset: str,
    batch_size: int,
    learning_rate: float,
    max_iters: int,
    experiment_name: str,
    num_samples: int,
    max_new_tokens: int
):
    # Training step: receives all training-related pipeline inputs.
    train_job = train_model(
        out_dir=out_dir,
        dataset=dataset,
        batch_size=batch_size,
        learning_rate=learning_rate,
        max_iters=max_iters,
        experiment_name=experiment_name
    )
    
    # Sampling step: its `out_dir` input is wired to the training step's
    # `model_output` output rather than to the pipeline's `out_dir` string.
    sample_job = generate_sample(
        out_dir=train_job.outputs.model_output,
        num_samples=num_samples,
        max_new_tokens=max_new_tokens
    )
    
    # Expose the outputs of both steps as pipeline outputs.
    return {"train_output": train_job.outputs.model_output, "sample_output": sample_job.outputs.samples}


ValidationException: No such file or directory: train.yml

In [None]:

# Instantiate the pipeline with values taken from the loaded configuration:
# training inputs come from train_params, sampling inputs from sample_params.
pipeline_job = nano_gpt_pipeline(
    **{
        key: train_params[key]
        for key in (
            "out_dir",
            "dataset",
            "batch_size",
            "learning_rate",
            "max_iters",
            "experiment_name",
        )
    },
    **{key: sample_params[key] for key in ("num_samples", "max_new_tokens")},
)

# Submit the pipeline under the experiment named in the training parameters.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job,
    experiment_name=train_params["experiment_name"],
)

In [None]:
from azure.ai.ml import command

# Resolve the experiment name. The other cells read it from train_params, so
# fall back to that section when the top-level config has no
# "experiment_name" key instead of failing with a KeyError.
experiment_name = (
    params["experiment_name"]
    if "experiment_name" in params
    else train_params["experiment_name"]
)

# configure job: upload ../src and run pipeline.py in the custom environment
# on the configured compute
job = command(
    code="../src",
    command="python pipeline.py",
    environment=custom_env,
    compute=compute_name,
    display_name=experiment_name,
    experiment_name=experiment_name,
)

# submit job and print the studio link for monitoring
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)



Uploading src (27.77 MBs): 100%|##########| 27773706/27773706 [00:08<00:00, 3117742.11it/s]




Monitor your job at https://ml.azure.com/runs/calm_muscle_lwb12zg6gc?wsid=/subscriptions/ea54fca2-3a3e-4b3b-91e7-a7bf971e0443/resourcegroups/rg-nano-gpt/workspaces/ml-nano-gpt&tid=1fdb6d07-480b-4c31-a913-e847d51e7446


In [None]:
# Show the submitted job's summary as the cell output.
returned_job

Experiment,Name,Type,Status,Details Page
nano-gpt-training,heroic_spinach_qslcj4s1d6,command,Starting,Link to Azure Machine Learning studio
