In [45]:
import boto3
import importlib
import json
import os
import pathlib
import sys

import awswrangler as wr

from botocore.exceptions import ClientError
from IPython.display import display

# Make the project's shared modules importable. They are looked up in
# ../01_modules (dev layout) and ./01_modules (prod layout), relative to the
# directory this code runs from.
def add_module_path(path):
    """Append `path` to sys.path if it exists and is not already listed.

    Returns True when the path was appended, False otherwise. Skipping paths
    that are already present keeps sys.path free of duplicates when this cell
    is re-run in the same kernel.
    """
    if os.path.exists(path) and path not in sys.path:
        sys.path.append(path)
        return True
    return False


# __file__ is only defined when this code runs as a script; otherwise (e.g. in
# a notebook kernel) fall back to the current working directory.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for modules_path in (modules_path_in_dev, modules_path_in_prod):
    add_module_path(modules_path)


# Jupyter only reads a local module the first time it is imported after a
# kernel start. Re-running a cell that imports the module again does not pick
# up changes made to the module file since then.
# As a workaround, import the module and then force a re-read with
# importlib.reload. The value returned by reload is bound to `_` and is not
# used afterwards.
# The statement order is kept as is: each module is reloaded right after it is
# imported, utils before config.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 

# Scratch note kept as a string bound to `_` (never read by the code): one line
# per ml.g6 instance type.
# NOTE(review): the numbers look like the per-instance-type training job quotas
# computed further down; the meaning of "250GB" is not evident here — confirm.
_ = """
ml.g6.xlarge:   6     250GB
ml.g6.2xlarge:  3
ml.g6.4xlarge:  6
ml.g6.8xlarge:  6
ml.g6.12xlarge: 6
ml.g6.16xlarge: 6
ml.g6.24xlarge: 6
ml.g6.48xlarge: 6
"""
# Last expression of the cell: shows the directory the module paths were
# resolved against.
script_dir

utils.py loaded: v0.2.12
config.py loaded: v0.1


PosixPath('/home/sagemaker-user/sagemaker_research_classification/src/05_tuning_basic')

In [10]:
# Service Quotas client, bound to the AWS region set in the project config.
client = boto3.client('service-quotas', region_name=config.AWS_REGION)

In [9]:
# Collect every service known to Service Quotas. The boto3 paginator follows
# NextToken itself, so no manual first-page / next-token bookkeeping is needed.
# PageSize is forwarded to the API as MaxResults.
services = []
service_pages = client.get_paginator('list_services').paginate(
    PaginationConfig={'PageSize': 100}
)
for page in service_pages:
    # extend() grows the list in place instead of rebuilding it for each page.
    services.extend(page['Services'])
services

[{'ServiceCode': 'AWSCloudMap', 'ServiceName': 'AWS Cloud Map'},
 {'ServiceCode': 'access-analyzer', 'ServiceName': 'Access Analyzer'},
 {'ServiceCode': 'acm', 'ServiceName': 'AWS Certificate Manager (ACM)'},
 {'ServiceCode': 'acm-pca',
  'ServiceName': 'AWS Private Certificate Authority'},
 {'ServiceCode': 'airflow',
  'ServiceName': 'Amazon Managed Workflows for Apache Airflow'},
 {'ServiceCode': 'amplify', 'ServiceName': 'AWS Amplify'},
 {'ServiceCode': 'amplifyuibuilder', 'ServiceName': 'Amplify UI Builder'},
 {'ServiceCode': 'aoss', 'ServiceName': 'Amazon OpenSearch Serverless'},
 {'ServiceCode': 'apigateway', 'ServiceName': 'Amazon API Gateway'},
 {'ServiceCode': 'app-integrations',
  'ServiceName': 'Amazon Connect Application Integrations'},
 {'ServiceCode': 'appconfig', 'ServiceName': 'AWS AppConfig'},
 {'ServiceCode': 'appflow', 'ServiceName': 'Amazon AppFlow'},
 {'ServiceCode': 'application-autoscaling',
  'ServiceName': 'Application Auto Scaling'},
 {'ServiceCode': 'applicat

In [13]:
# Build a lookup of SageMaker quota name -> quota value as reported by
# Service Quotas. The boto3 paginator follows NextToken itself; PageSize is
# forwarded to the API as MaxResults.
quotas = {}
quota_pages = client.get_paginator('list_service_quotas').paginate(
    ServiceCode='sagemaker',
    PaginationConfig={'PageSize': 100},
)
# The loop variable is deliberately called `page`: the last page stays in the
# namespace afterwards and is displayed by a later cell.
for page in quota_pages:
    for quota in page['Quotas']:
        quota_name = quota['QuotaName']
        # Quota names are used as dictionary keys, so a repeated name would
        # silently overwrite an earlier value. Fail loudly and say which one.
        if quota_name in quotas:
            raise ValueError(f'Quota name already exists: {quota_name}')
        quotas[quota_name] = quota['Value']
quotas

{'Studio CodeEditor Apps running on ml.r6id.large instances': 2.0,
 'ml.g4dn.4xlarge for transform job usage': 0.0,
 'Studio CodeEditor Apps running on ml.r5.4xlarge instances': 1.0,
 'ml.r5d.xlarge for training job usage': 0.0,
 'ml.g6.16xlarge for endpoint usage': 0.0,
 'ml.t3.large for processing job usage': 10.0,
 'ml.g4dn.12xlarge for notebook instance usage': 0.0,
 'ml.r5d.24xlarge for spot training job usage': 0.0,
 'ml.r7i.16xlarge for cluster usage': 0.0,
 'ml.r6gd.4xlarge for endpoint usage': 2.0,
 'ml.r5.4xlarge for endpoint usage': 1.0,
 'RSessionGateway Apps running on ml.c5.18xlarge instance': 1.0,
 'ml.r5.4xlarge for notebook instance usage': 1.0,
 'Studio JupyterLab Apps running on ml.m7i.2xlarge instances': 3.0,
 'ml.g5.12xlarge for spot training job usage': 0.0,
 'Rate of ListLabelingJobsForWorkteam requests': 2.0,
 'ml.c4.8xlarge for training warm pool usage': 0.0,
 'ml.r7i.large for spot training job usage': 20.0,
 'Studio CodeEditor Apps running on ml.c6i.xlarge in

In [15]:
# Display the raw response page left in the namespace by the most recently
# executed pagination loop, to inspect the full structure of its records.
# NOTE(review): this depends on kernel state from an earlier cell; on a fresh
# kernel that cell has to run first.
page

{'Quotas': [{'ServiceCode': 'sagemaker',
   'ServiceName': 'Amazon SageMaker',
   'QuotaArn': 'arn:aws:servicequotas:eu-west-2:762595428873:sagemaker/L-B2B3BA64',
   'QuotaCode': 'L-B2B3BA64',
   'QuotaName': 'ml.inf1.xlarge for endpoint usage',
   'Value': 2.0,
   'Unit': 'None',
   'Adjustable': True,
   'GlobalQuota': False,
   'UsageMetric': {'MetricNamespace': 'AWS/Usage',
    'MetricName': 'ResourceCount',
    'MetricDimensions': {'Class': 'None',
     'Resource': 'endpoint/ml.inf1.xlarge',
     'Service': 'SageMaker',
     'Type': 'Resource'},
    'MetricStatisticRecommendation': 'Maximum'},
   'Period': {'PeriodValue': 5, 'PeriodUnit': 'MINUTE'},
   'QuotaAppliedAtLevel': 'ACCOUNT',
   'Description': 'ml.inf1.xlarge for endpoint usage'},
  {'ServiceCode': 'sagemaker',
   'ServiceName': 'Amazon SageMaker',
   'QuotaArn': 'arn:aws:servicequotas:eu-west-2:762595428873:sagemaker/L-40F00B06',
   'QuotaCode': 'L-40F00B06',
   'QuotaName': 'ml.c5.24xlarge for cluster usage',
   'Value

In [28]:
# ml.g6 instance types whose training job quota is tracked.
instances = [
    'ml.g6.xlarge',
    'ml.g6.2xlarge',
    'ml.g6.4xlarge',
    'ml.g6.8xlarge',
    'ml.g6.12xlarge',
    'ml.g6.16xlarge',
    'ml.g6.24xlarge',
    'ml.g6.48xlarge'
]
# Look up the training job quota of each instance type as a whole number.
training_job_quota = {
    instance_type: int(quotas[f'{instance_type} for training job usage'])
    for instance_type in instances
}
# Every instance type starts with no recorded usage, so all of its quota is
# available. Each entry is its own dict because a later cell updates it.
instance_capacity = {
    instance_type: {'quota': limit, 'usage': 0, 'available': limit}
    for instance_type, limit in training_job_quota.items()
}

In [30]:
# Count how many instances of each tracked type are currently taken up by
# running training jobs, and derive what is left of each quota.

# Use the same region as the Service Quotas client so that quotas and usage
# refer to the same region.
sagemaker_client = boto3.client('sagemaker', region_name=config.AWS_REGION)

# Start from a clean slate so that re-running this cell does not add the same
# jobs to the usage counters a second time.
for capacity in instance_capacity.values():
    capacity['usage'] = 0
    capacity['available'] = capacity['quota']

training_jobs_paginator = sagemaker_client.get_paginator('list_training_jobs')
# Guards against counting a job twice if it moves from InProgress to Stopping
# between the two listings below.
counted_job_names = set()
# Filter by status on the server side instead of paging through the whole job
# history. StatusEquals accepts a single status, hence one listing per status.
for status in ('InProgress', 'Stopping'):
    for page in training_jobs_paginator.paginate(StatusEquals=status):
        for training_job_summary in page['TrainingJobSummaries']:
            job_name = training_job_summary['TrainingJobName']
            if job_name in counted_job_names:
                continue
            counted_job_names.add(job_name)

            training_job = sagemaker_client.describe_training_job(TrainingJobName=job_name)
            resource_config = training_job['ResourceConfig']
            instance = resource_config.get('InstanceType')
            # Jobs on instance types that are not tracked here are ignored
            # rather than raising a KeyError.
            if instance not in instance_capacity:
                continue
            # A single job can occupy several instances of its type.
            instance_count = resource_config.get('InstanceCount', 1)
            instance_capacity[instance]['usage'] += instance_count
            instance_capacity[instance]['available'] -= instance_count

instance_capacity

{'ml.g6.xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.2xlarge': {'quota': 3, 'usage': 2, 'available': 1},
 'ml.g6.4xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.8xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.12xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.16xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.24xlarge': {'quota': 6, 'usage': 4, 'available': 2},
 'ml.g6.48xlarge': {'quota': 6, 'usage': 0, 'available': 6}}

In [49]:
# Call the helper from the project's utils module and display its result.
# NOTE(review): the returned structure has the same shape as the
# `instance_capacity` dict built by hand above, so this looks like the packaged
# version of those cells — confirm against utils.
utils.get_available_training_quotas()

{'ml.g6.xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.2xlarge': {'quota': 3, 'usage': 0, 'available': 3},
 'ml.g6.4xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.8xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.12xlarge': {'quota': 6, 'usage': 2, 'available': 4},
 'ml.g6.16xlarge': {'quota': 6, 'usage': 4, 'available': 2},
 'ml.g6.24xlarge': {'quota': 6, 'usage': 6, 'available': 0},
 'ml.g6.48xlarge': {'quota': 6, 'usage': 1, 'available': 5}}