In [2]:
import boto3
import importlib
import json
import os
import pathlib
import sys
import wandb

import awswrangler as wr
import numpy as np
import pandas as pd

from botocore.exceptions import ClientError
from IPython.display import display

# Make the shared project modules importable. They are looked up in
# ../01_modules and in ./01_modules relative to this file (or to the working
# directory when running inside a notebook kernel, where __file__ is undefined).
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Skip directories that are already registered, so that re-running this
    # cell does not keep growing sys.path with duplicate entries.
    if os.path.exists(modules_path) and modules_path not in sys.path:
        sys.path.append(modules_path)


# Jupyter only executes a local module the first time it is imported after
# kernel start; re-running the import later does not pick up edits made to
# the module in the meantime.
# As a workaround, each local module is imported and then refreshed with
# importlib.reload. The module object returned by reload is assigned to `_`
# because it is not needed.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 

utils.py loaded: v0.2.12
config.py loaded: v0.1


In [3]:
# Look up the Weights & Biases credentials through utils.get_secret and log
# this kernel in; the API key itself is never written into the notebook.
wandb_secret = utils.get_secret(secret_name='WeightsAndBiases', region_name=config.AWS_REGION)
wandb_api_key = wandb_secret['api_key']
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/sagemaker-user/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msteve-attila-kopias[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
# Create the public-API client in this cell so that it does not depend on a
# variable left over from an earlier kernel session.
wandb_api = wandb.Api()
# Sanity check: resolve a single known run by its full path.
print(wandb_api.from_path('steve-attila-kopias/sagemaker_research_classification/runs/bigbird-topic-title-s1-0918024259-tyu32b-algo-1'))

<Run steve-attila-kopias/sagemaker_research_classification/bigbird-topic-title-s1-0918024259-tyu32b-algo-1 (finished)>


In [19]:
# Debugging aid: shows the file the imported wandb package was loaded from.
wandb.__file__

'/home/sagemaker-user/.conda/envs/python_311/lib/python3.11/site-packages/wandb/__init__.py'

In [13]:
# Debugging aid: looks up the `runs` attribute of the API client without calling it.
# NOTE(review): relies on `wandb_api` already existing in the kernel; this cell does not create it.
wandb_api.runs

<bound method Api.runs of <wandb.apis.public.api.Api object at 0x7fcd0227f650>>

In [6]:
class WandbRuns:
    """Collect the per-epoch evaluation history of Weights & Biases runs.

    Constructing an instance queries the runs of ``entity/project`` that match
    the filters and immediately downloads the history of each of them.

    Args:
        entity: W&B entity that owns the project.
        project: W&B project name.
        and_tags: optional list of tags; when given, only runs carrying all of
            them are selected (sent as a ``{'$all': [...]}`` filter on ``tags``).
        filters: optional dict of additional run filters passed to
            ``wandb.Api().runs``. ``state`` defaults to ``'finished'`` when the
            dict does not set it. The dict is copied, never modified.

    Attributes set on the instance:
        runs: the object returned by ``wandb.Api().runs`` for the selection.
        runs_history: one DataFrame with a row per run and completed epoch,
            or ``None`` when no run produced any such row.
        runs_details: list with one (currently empty) dict per selected run.
        skipped_run_ids: ids of runs left out of ``runs_history`` because their
            history lacked a required column or had no completed epoch with
            evaluation results.
    """

    # History columns kept for every run (in this order) when the run logged them.
    NEEDED_KEYS = (
        '_step',
        '_runtime',
        '_timestamp',

        'train/epoch',
        'train/global_step',

        'train/loss',
        'eval/loss',

        'eval/f1',
        'eval/accuracy',
        'eval/matthews_correlation',
    )

    # Columns the per-epoch processing reads directly and cannot work without.
    REQUIRED_KEYS = (
        '_step',
        '_runtime',
        '_timestamp',
        'train/epoch',
        'train/loss',
        'eval/loss',
        'eval/f1',
    )

    def __init__(self, entity=config.WANDB_ENTITY, project=config.WANDB_PROJECT, and_tags=None, filters=None):
        self.entity = entity
        self.project = project
        self.and_tags = and_tags
        # Work on a private copy: mutating a shared default (or the caller's
        # dict) would leak the 'state' and 'tags' filters into other instances.
        self.filters = dict(filters) if filters else {}
        self.filters.setdefault('state', 'finished')

        if self.and_tags is not None:
            self.filters['tags'] = {'$all': self.and_tags}

        self.root = f'{self.entity}/{self.project}'
        self.runs = None
        self.runs_history = None
        self.runs_details = []
        self.skipped_run_ids = []

        self.api = wandb.Api()

        self.get_runs()
        self.get_runs_history()

    def get_runs(self):
        """Query the runs of the project that match ``self.filters``."""
        self.runs = self.api.runs(
            self.root,
            filters=self.filters,
        )

    def get_runs_history(self):
        """Download every selected run's history and rebuild ``runs_history``.

        All results are rebuilt from scratch on every call, so calling this
        method again refreshes the data instead of appending duplicate rows.
        """
        frames = []
        runs_details = []
        skipped_run_ids = []
        for run in self.runs:
            # Placeholder for per-run metadata (name, url, config, summary, ...).
            runs_details.append({})

            history_df = self._epoch_history(run)
            if history_df is None:
                skipped_run_ids.append(run.id)
            else:
                frames.append(history_df)

        # A single concat at the end avoids re-copying the accumulated frame
        # once per run.
        self.runs_history = pd.concat(frames, ignore_index=True) if frames else None
        self.runs_details = runs_details
        self.skipped_run_ids = skipped_run_ids

    def _epoch_history(self, run):
        """Return one row per completed epoch of ``run``, or None if there is none.

        The rows carry the run id, one column per ``'key: value'`` tag of the
        run (the ``instance`` tag is left out), the available ``NEEDED_KEYS``
        columns and an ``is_best_epoch`` flag on the epoch with the highest
        ``eval/f1``.
        """
        # NOTE(review): '_epoch' is not among the columns seen in the returned
        # histories; confirm that x_axis has any effect when no keys are passed.
        history_df_full = run.history(
            pandas=True,
            samples=500,
            x_axis='_epoch',
        )
        needed_cols = [key for key in self.NEEDED_KEYS if key in history_df_full.columns]
        if any(key not in needed_cols for key in self.REQUIRED_KEYS):
            # E.g. a run that never logged an evaluation step.
            return None

        # Explicit copies keep the column assignments below from acting on a
        # view of another frame.
        history_df = history_df_full[needed_cols].copy()
        history_df = history_df.sort_values(by=['_step', '_runtime', '_timestamp'])

        # The train loss is logged on other rows than the eval metrics; carry
        # the latest value forward so the epoch-end rows have one too.
        history_df['train/loss'] = history_df['train/loss'].ffill()

        # Keep only the rows logged exactly at the end of an epoch ...
        epochs = history_df['train/epoch']
        history_df = history_df[epochs.notna() & (epochs == epochs.round())].copy()
        history_df['train/epoch'] = history_df['train/epoch'].astype('Int64')
        # ... and of those only the ones that hold evaluation results.
        history_df = history_df[history_df['eval/loss'].notna()].copy()
        if history_df.empty:
            return None

        history_df['run_id'] = run.id
        tag_keys = []
        for tag in run.tags:
            # partition() tolerates tags without the ': ' separator (skipped)
            # and values that themselves contain ': '.
            tag_key, separator, tag_value = tag.partition(': ')
            if not separator or tag_key == 'instance':
                continue
            history_df[tag_key] = tag_value
            if tag_key not in tag_keys:
                tag_keys.append(tag_key)

        history_df = history_df[['run_id'] + tag_keys + needed_cols].copy()

        history_df['is_best_epoch'] = False
        eval_f1 = history_df['eval/f1']
        if eval_f1.notna().any():
            history_df.loc[eval_f1.idxmax(), 'is_best_epoch'] = True

        return history_df

wandb_log = WandbRuns()

In [7]:
# Best epoch of every run with label 'subfield' and sample '100%', leaving out
# the runs whose text is 'title', ranked by eval/f1 (highest first).
history = wandb_log.runs_history
selection = (
    history['is_best_epoch'].eq(True)
    & history['text'].ne('title')
    & history['label'].eq('subfield')
    & history['sample'].eq('100%')
)
history[selection].sort_values(by='eval/f1', ascending=False)

Unnamed: 0,run_id,label,model,sample,text,_step,_runtime,_timestamp,train/epoch,train/global_step,train/loss,eval/loss,eval/f1,eval/accuracy,eval/matthews_correlation,is_best_epoch
531,ModernBERT-subfield-abstract-s100-0918004625-y...,subfield,ModernBERT,100%,abstract,20,15670.092521,1758172000.0,2,9712,0.4008,0.519818,0.832227,0.832227,0.785402,True
446,scibert-subfield-abstract-s100-0917214740-r1eg...,subfield,scibert,100%,abstract,20,9178.743561,1758155000.0,2,9712,0.4343,0.533883,0.829818,0.829818,0.783129,True
712,deberta-subfield-abstract-s100-0918144955-5j81...,subfield,deberta,100%,abstract,16,13383.401662,1758235000.0,3,7284,0.4145,0.548704,0.828998,0.828998,0.781508,True
423,roberta-subfield-abstract-s100-0917145723-evca...,subfield,roberta,100%,abstract,22,10157.192408,1758153000.0,4,9712,0.3255,0.576277,0.828152,0.828152,0.779875,True
617,bigbird-subfield-abstract-s100-0918020348-h2km...,subfield,bigbird,100%,abstract,16,15335.3866,1758186000.0,3,7284,0.3987,0.549849,0.827357,0.827357,0.77863,True
633,longformer-subfield-abstract-s100-0918102352-t...,subfield,longformer,100%,abstract,41,64509.34739,1758256000.0,4,19424,0.3151,0.591924,0.825307,0.825307,0.776819,True
297,distilbert-subfield-abstract-s100-0917114954-i...,subfield,distilbert,100%,abstract,31,7338.748381,1758136000.0,3,14568,0.3539,0.576341,0.825256,0.825256,0.776974,True
72,bert-subfield-abstract-s100-0917031638-t5h3je-...,subfield,bert,100%,abstract,31,13757.907113,1758093000.0,3,14568,0.337,0.57666,0.825076,0.825076,0.777063,True
706,ModernBERT-subfield-fulltext-s100-0918144655-5...,subfield,ModernBERT,100%,fulltext,10,9392.073418,1758226000.0,2,4856,0.4667,0.571083,0.816029,0.816029,0.764666,True
436,scibert-subfield-fulltext-s100-0917214139-3zdq...,subfield,scibert,100%,fulltext,20,9680.182557,1758155000.0,2,9712,0.5074,0.570129,0.815568,0.815568,0.764787,True


In [92]:
# Every column of the first history except the per-parameter 'gradients...' ones.
first_history = manual_run_histories[0]['history']
all_cols = [
    column
    for column in first_history.columns.to_list()
    if not column.startswith('gradients')
]
all_cols

['eval/steps_per_second',
 'eval/runtime',
 'test/samples_per_second',
 'eval/accuracy',
 'test/accuracy',
 'eval/loss',
 'test/loss',
 'test/f1',
 'eval/f1',
 'test/runtime',
 '_timestamp',
 'test/matthews_correlation',
 'eval/samples_per_second',
 'train/learning_rate',
 'eval/matthews_correlation',
 '_step',
 'train/loss',
 'train/epoch',
 'train/grad_norm',
 '_runtime',
 'test/steps_per_second',
 'train/global_step']

In [101]:
# Columns used to follow training and evaluation progress of a run.
eval_cols = [
    # step and time bookkeeping
    '_step', '_runtime', '_timestamp',
    # position within training
    'train/epoch', 'train/global_step',
    # losses
    'train/loss', 'eval/loss',
    # evaluation metrics
    'eval/f1', 'eval/accuracy', 'eval/matthews_correlation',
]

In [103]:
# Show the eval_cols columns of the first entry of manual_run_histories.
# NOTE(review): relies on `manual_run_histories` and `eval_cols` already existing in the kernel.
manual_run_histories[0]['history'][eval_cols]

Unnamed: 0,_step,_runtime,_timestamp,train/epoch,train/global_step,train/loss,eval/loss,eval/f1,eval/accuracy,eval/matthews_correlation
0,0,922.999144,1758147000.0,0.102965,500,0.9481,,,,
1,1,1376.782473,1758147000.0,0.205931,1000,0.6958,,,,
2,2,1830.593171,1758147000.0,0.308896,1500,0.6659,,,,
3,3,2285.707001,1758148000.0,0.411862,2000,0.6449,,,,
4,4,2740.864968,1758148000.0,0.514827,2500,0.6241,,,,
5,5,3197.293095,1758149000.0,0.617792,3000,0.6183,,,,
6,6,3652.151142,1758149000.0,0.720758,3500,0.6265,,,,
7,7,4105.667298,1758150000.0,0.823723,4000,0.6003,,,,
8,8,4560.085087,1758150000.0,0.926689,4500,0.5984,,,,
9,9,5066.403098,1758151000.0,1.0,4856,,0.604459,0.807469,0.807469,0.753558


In [115]:
# Reduce the first history to one row per completed epoch.
first_history = manual_run_histories[0]['history']
ordered = first_history[eval_cols].sort_values(by=['_step', '_runtime', '_timestamp'])

# Carry the last logged train loss forward onto the rows that lack one.
ordered.loc[:, 'train/loss'] = ordered.loc[:, 'train/loss'].ffill()

# Rows logged exactly at an epoch boundary have a whole-numbered train/epoch.
whole_epoch = ordered['train/epoch'] == np.round(ordered['train/epoch']).astype('Int64')
at_epoch_end = ordered[whole_epoch]
at_epoch_end['train/epoch'] = at_epoch_end['train/epoch'].astype('Int64')

# Of those, keep the rows that hold evaluation results.
hist_sorted = at_epoch_end[at_epoch_end['eval/loss'].notna()]
hist_sorted

Unnamed: 0,_step,_runtime,_timestamp,train/epoch,train/global_step,train/loss,eval/loss,eval/f1,eval/accuracy,eval/matthews_correlation
9,9,5066.403098,1758151000.0,1,4856,0.5984,0.604459,0.807469,0.807469,0.753558
20,20,9680.182557,1758155000.0,2,9712,0.5074,0.570129,0.815568,0.815568,0.764787
31,31,14299.845061,1758160000.0,3,14568,0.3839,0.624009,0.809545,0.809545,0.757252
41,41,18905.327119,1758165000.0,4,19424,0.254,0.721726,0.805111,0.805111,0.750082
52,52,23532.013626,1758169000.0,5,24280,0.1513,0.842704,0.800472,0.800472,0.744961


In [96]:
# Metrics requested from the histories of all runs at once. 'accuracy' and
# 'matthews_correlation' are currently not requested, and the result is left
# in its default format (format='pandas' is not passed).
# NOTE(review): relies on `runs` already existing in the kernel; this cell does not create it.
history_keys = ['loss', 'f1']
runs.histories(samples=500, keys=history_keys, x_axis='_epoch')

[]

In [51]:
# Step counter plus the metrics logged under the 'test/' prefix.
test_metric_names = ('loss', 'f1', 'accuracy', 'matthews_correlation')
test_cols = ['_step'] + [f'test/{metric}' for metric in test_metric_names]

In [68]:
# Columns used to follow training and evaluation progress of a run.
# Every name is listed exactly once: a repeated name would select the same
# column twice when the list is used to index a DataFrame.
traineval_cols = [
    '_step',
    '_runtime',
    '_timestamp',

    'train/epoch',
    'train/global_step',

    'train/loss',
    'eval/loss',

    'train/f1',
    'eval/f1',
    'eval/accuracy',
    'eval/matthews_correlation',
]

In [69]:
# Train/eval columns first, then the test metrics. dict.fromkeys drops repeated
# names while keeping their order, so no column is selected twice.
cols_needed = list(dict.fromkeys(traineval_cols + test_cols))

# Not every wanted metric is necessarily logged; selecting a name the history
# does not have would raise a KeyError, so keep only the columns it contains.
first_history = manual_run_histories[0]['history']
cols_available = [c for c in cols_needed if c in first_history.columns]

# Independent copy, so that filling the train loss below does not act on a
# slice of the stored history.
hist = first_history[cols_available].copy()
hist.loc[:, 'train/loss'] = hist.loc[:, 'train/loss'].ffill()
hist

In [44]:
# Rows of the history for which a test loss was logged.
has_test_loss = hist['test/loss'].notna()
hist_test = hist[has_test_loss]
hist_test

Unnamed: 0,_step,_runtime,_timestamp,train/epoch,train/global_step,train/loss,eval/loss,test/loss,test/f1,eval/f1,...,eval/matthews_correlation,test/matthews_correlation,train/global_step.1,train/learning_rate,eval/samples_per_second,test/samples_per_second,eval/steps_per_second,test/steps_per_second,eval/runtime,test/runtime
7,7,128.425751,1758063000.0,,975,,,1.501861,0.46,,...,,0.329423,975,,,304.195,,12.168,,0.3287


In [28]:
# Training instance types whose capacity is tracked.
instances = [
    f'ml.g6.{size}'
    for size in ('xlarge', '2xlarge', '4xlarge', '8xlarge', '12xlarge', '16xlarge', '24xlarge', '48xlarge')
]

# Seed the capacity table from `quotas`: nothing is in use yet, so the whole
# quota of every instance type is available.
instance_capacity = {}
for instance in instances:
    quota = int(quotas[f'{instance} for training job usage'])
    instance_capacity[instance] = dict(quota=quota, usage=0, available=quota)

In [30]:
sagemaker_client = boto3.client('sagemaker')

# Start every instance type from "nothing in use", so that re-running this
# cell recounts the active jobs instead of adding them on top of the previous
# count.
for capacity in instance_capacity.values():
    capacity['usage'] = 0
    capacity['available'] = capacity['quota']

# The paginator follows the NextToken of every response page for us.
training_job_pages = sagemaker_client.get_paginator('list_training_jobs').paginate()
for page in training_job_pages:
    for training_job_summary in page['TrainingJobSummaries']:
        # Only jobs that still occupy an instance count against the quota.
        if training_job_summary['TrainingJobStatus'] not in ('InProgress', 'Stopping'):
            continue
        # The summary does not carry the instance type; fetch the full job description.
        training_job = sagemaker_client.describe_training_job(
            TrainingJobName=training_job_summary['TrainingJobName']
        )
        instance = training_job['ResourceConfig'].get('InstanceType')
        if instance not in instance_capacity:
            # Job running on an instance type that is not tracked here.
            continue
        # NOTE(review): each job is counted as one instance; confirm whether
        # ResourceConfig['InstanceCount'] should be used for multi-instance jobs.
        instance_capacity[instance]['usage'] += 1
        instance_capacity[instance]['available'] -= 1

instance_capacity

{'ml.g6.xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.2xlarge': {'quota': 3, 'usage': 2, 'available': 1},
 'ml.g6.4xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.8xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.12xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.16xlarge': {'quota': 6, 'usage': 0, 'available': 6},
 'ml.g6.24xlarge': {'quota': 6, 'usage': 4, 'available': 2},
 'ml.g6.48xlarge': {'quota': 6, 'usage': 0, 'available': 6}}

In [4]:
# NOTE(review): presumably the packaged version of the quota/usage cells in
# this notebook (its output has the same structure) — confirm in utils.
utils.get_available_training_quotas()

{'ml.g6.xlarge': {'quota': 6, 'usage': 1, 'available': 5},
 'ml.g6.2xlarge': {'quota': 3, 'usage': 2, 'available': 1},
 'ml.g6.4xlarge': {'quota': 6, 'usage': 2, 'available': 4},
 'ml.g6.8xlarge': {'quota': 6, 'usage': 2, 'available': 4},
 'ml.g6.12xlarge': {'quota': 6, 'usage': 4, 'available': 2},
 'ml.g6.16xlarge': {'quota': 6, 'usage': 6, 'available': 0},
 'ml.g6.24xlarge': {'quota': 6, 'usage': 4, 'available': 2},
 'ml.g6.48xlarge': {'quota': 6, 'usage': 3, 'available': 3}}