In [2]:
import sys                                                                             # stdlib; used below to extend the module search path
import numpy as np                                                                     # numerical array computations
import pandas as pd                                                                    # DataFrame-based data handling
import seaborn as sns                                                                  # statistical plotting on top of matplotlib
import matplotlib.pyplot as plt                                                        # general plotting
import os                                                                              # stdlib; used below to change the working directory

import sagemaker                                                                       # SageMaker SDK used to configure and launch training jobs
import boto3                                                                           # AWS SDK for Python, used for direct AWS API calls

from sagemaker.huggingface import HuggingFace                                           # estimator class used to run the training job
from sklearn.metrics import precision_recall_fscore_support, accuracy_score             # metrics for judging model performance

# Order matters in the next lines: the path must be extended before the local
# modules can be imported, and the chdir needs PROJECT_DIR from gdsc_utils.
sys.path.append('../src')                                                               # make the local source directory importable (path is relative to the current working directory)
from gdsc_utils import create_encrypted_bucket, download_and_extract_model, PROJECT_DIR # local helpers for creating S3 buckets and downloading models, plus the project root path
from gdsc_eval import plot_confusion_matrix                                             # local helper for drawing a confusion matrix
from config import DEFAULT_BUCKET, DEFAULT_REGION, MY_BUCKET        # bucket names and the default AWS region from the local config module
os.chdir(PROJECT_DIR)                                                                   # switch the working directory to the project root; later relative paths resolve from there

In [3]:
# Training script to run. The experiment name is derived from the script's
# file name because AWS does not allow '.' or '_' in experiment names:
# keep everything before the first '.', then turn underscores into hyphens.
entry_point = 'train_x.py'
script_stem = entry_point.partition('.')[0]
exp_name = '-'.join(script_stem.split('_'))
exp_name

'train-x'

In [4]:
# Look up the AWS account id of the current caller through STS.
sts_client = boto3.client('sts')
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity.get('Account')

# IAM execution role the training job will run under.
role = sagemaker.get_execution_role()

In [5]:
# Input channels handed to the estimator's fit(): a single "data" channel
# pointing at the root of our bucket.
input_channels = dict(data="s3://{}".format(MY_BUCKET))
input_channels

{'data': 's3://gdsc.sh19700520'}

In [6]:
# Bucket that receives the training output. It is created up front in case it
# does not exist yet (create_encrypted_bucket is expected to cope with an
# already existing bucket -- TODO confirm in gdsc_utils).
# Alternative naming scheme, currently disabled:
#   sagemaker_bucket = f"sagemaker-{DEFAULT_REGION}-{account_id}"
sagemaker_bucket = MY_BUCKET
create_encrypted_bucket(sagemaker_bucket)

# Output of this experiment is stored under <bucket>/<experiment name>.
s3_output_location = "s3://{}/{}".format(sagemaker_bucket, exp_name)
s3_output_location

's3://gdsc.sh19700520/train-x'

In [7]:
# Hyperparameters forwarded to the training script.
hyperparameters = dict(
    # --- audio / optimisation settings ---
    sampling_rate=16000,                                  # sampling rate without downsampling
    learning_rate=3.5e-7,                                 # learning rate
    epochs=100,                                           # number of training epochs
    patience=10,                                          # early stopping: epochs without improvement before training stops
    train_batch_size=4,                                   # training batch size
    # --- pretrained model from HuggingFace ---
    model_name="MIT/ast-finetuned-audioset-10-10-0.4593",
    # --- data folder names ---
    train_dir="train",                                    # training data
    val_dir="val",                                        # validation data
    test_dir="test",                                      # test data
    # --- spectrogram normalisation statistics of our resampled data ---
    train_dataset_mean=-8.141991150530815,                # mean
    train_dataset_std=4.095692486358449,                  # standard deviation
)

In [8]:
# Rules for extracting metrics from the training-job logs. Every rule looks for
# a log fragment shaped like  'name': value  (as in a printed Python dict) and
# reports the first capture group as the metric value.
#
# The value pattern accepts integers, decimals and scientific notation
# (e.g. 2, 0.6931, 3.5e-07). The exponent has to be part of the capture:
# otherwise a learning rate logged as 3.5e-07 would be recorded as 3.5.
# A raw string is used so the regex escapes are not interpreted by Python.
_METRIC_VALUE = r"([0-9]+\.?[0-9]*(e[-+]?[0-9]+)?)"

metric_definitions = [
    {'Name': metric, 'Regex': f"'{metric}': {_METRIC_VALUE},?"}
    for metric in (
        'loss',
        'learning_rate',
        'eval_loss',
        'eval_accuracy',
        'eval_f1',
        'eval_precision',
        'eval_recall',
        'epoch',
    )
]

In [9]:
# Custom training image that ships the libraries the training script needs.
# NOTE(review): the ':latest' tag means the image contents can change between runs.
image_uri = '954362353459.dkr.ecr.us-east-1.amazonaws.com/sm-training-custom:latest'

# Estimator configuration, collected in one place and unpacked into the constructor.
estimator_settings = dict(
    entry_point=entry_point,                 # fine-tuning script executed by the training job
    source_dir="./src",                      # directory containing the script; it is downloaded to the training instance
    instance_type="ml.g4dn.xlarge",          # GPU instance type, so training runs faster
    output_path=s3_output_location,          # S3 location where the trained model is stored
    instance_count=1,                        # number of instances (we are limited to 1)
    role=role,                               # IAM role the training job uses to access AWS resources (S3)
    image_uri=image_uri,                     # custom image with the required libraries
    py_version="py310",                      # Python version
    hyperparameters=hyperparameters,         # hyperparameters passed to the training job
    metric_definitions=metric_definitions,   # metrics extracted from the logs; shown in the SageMaker training job UI
)

huggingface_estimator = HuggingFace(**estimator_settings)

After creating the estimator, we call its *fit* method to start the training job. Training can take a while, so we pass ```wait=False```: the call returns as soon as the job has been submitted and the notebook stays free for other work. Set ```wait=True``` instead if you want the cell to block until the training job has finished.

In [10]:
# Submit the training job; wait=False returns right after submission instead of
# blocking until the job has finished.
huggingface_estimator.fit(inputs=input_channels, wait=False)

INFO:sagemaker:Creating training-job with name: sm-training-custom-2023-07-16-11-31-14-964


Using provided s3_resource
