In [2]:
%%capture
import sys
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
!{sys.executable} -m pip install -U pip sagemaker-studio-image-build

In [6]:
%%writefile train.py
import os
import json
import boto3
import json
import warnings
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

warnings.filterwarnings("ignore", category=DeprecationWarning)
prefix = "/opt/ml"
input_path = os.path.join(prefix, "input/data")
output_path = os.path.join(prefix, "output")
model_path = os.path.join(prefix, "model")
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')


def train(params):
    label = params["label"]
    channel_name = "training"
    training_path = os.path.join(input_path, channel_name)
    training_dataset = TabularDataset(os.path.join(training_path, "training.csv"))
    predictor = TabularPredictor(label=label, path=model_path).fit(training_dataset)
    with open(os.path.join(model_path, "Fit_Summary.txt"), "w") as f:
        print(predictor.fit_summary(), file=f)
    return predictor
    

def test(params, predictor):
    label = params["label"]
    channel_name = "testing"
    testing_path = os.path.join(input_path, channel_name)
    testing_dataset = TabularDataset(os.path.join(testing_path, "testing.csv"))
    ground_truth = testing_dataset[label]
    testing_data = testing_dataset.drop(columns=label)
    predictions = predictor.predict(testing_data)
    with open(os.path.join(model_path, "Model_Evaluation.txt"), "w") as f:
        print(
            json.dumps(
                predictor.evaluate_predictions(
                    y_true=ground_truth,
                    y_pred=predictions,
                    auxiliary_metrics=True
                ),
                indent=4
            ),
            file=f
        )
    leaderboard = predictor.leaderboard(testing_dataset, silent=True)
    leaderboard.to_csv(os.path.join(model_path, "Leaderboard.csv"))


if __name__ == "__main__":
    print("Loading Parameters\n")
    with open(param_path) as f:
        params = json.load(f)
    print("Training Models\n")
    predictor = train(params)
    print("Testing Models\n")
    test(params, predictor)
    print("AutoGluon Job Complete")

Overwriting train.py


In [11]:
%%writefile Dockerfile
ARG REGION
FROM 763104351884.dkr.ecr.${REGION}.amazonaws.com/autogluon-training:0.3.1-cpu-py37-ubuntu18.04
RUN pip install -U pip
RUN pip install bokeh==2.0.1
RUN mkdir -p /opt/program
RUN mkdir -p /opt/ml
COPY train.py /opt/program
WORKDIR /opt/program
ENTRYPOINT ["python", "train.py"]

Overwriting Dockerfile


In [12]:
import boto3
import sagemaker

aws_region = sagemaker.Session().boto_session.region_name
!sm-docker build --build-arg REGION={aws_region} .

...[Container] 2023/01/08 06:32:09 going inside waitForAgent

[Container] 2023/01/08 06:32:09 Waiting for agent ping
[Container] 2023/01/08 06:32:10 Waiting for DOWNLOAD_SOURCE
[Container] 2023/01/08 06:32:12 Phase is DOWNLOAD_SOURCE
[Container] 2023/01/08 06:32:12 finished waitForAgent
[Container] 2023/01/08 06:32:12 CODEBUILD_SRC_DIR=/codebuild/output/src503828207/src
[Container] 2023/01/08 06:32:12 YAML location is /codebuild/output/src503828207/src/buildspec.yml
[Container] 2023/01/08 06:32:12 Setting HTTP client timeout to higher timeout for S3 source
[Container] 2023/01/08 06:32:12 Processing environment variables
[Container] 2023/01/08 06:32:12 No runtime version selected in buildspec.
[Container] 2023/01/08 06:32:13 Moving to directory /codebuild/output/src503828207/src
[Container] 2023/01/08 06:32:13 Configuring ssm agent with target id: codebuild:f458e01c-f72c-4b93-a33b-f40a0af8f23c
[Container] 2023/01/08 06:32:13 Successfully updated ssm agent configuration
[Container] 2023/

In [13]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

column_names = ["sex", "length", "diameter", "height", "whole_weight", "shucked_weight", "viscera_weight", "shell_weight", "rings"]
abalone_data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", names=column_names)
training_data, testing_data = train_test_split(abalone_data, test_size=0.1)
training_data.to_csv("training.csv")
testing_data.to_csv("testing.csv")

In [15]:
import sagemaker
import datetime

image_uri = "623127157773.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-studio-d-h0kay08tqcv4:herley"
role = sagemaker.get_execution_role()
session = sagemaker.session.Session()
bucket = session.default_bucket()
job_version = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')[:-3]
job_name = f"abalone-autogluon-{job_version}"

In [19]:
from sagemaker.estimator import Estimator

autogluon = Estimator(
    image_uri=image_uri,
    role=role,
    output_path=f"s3://{bucket}/{job_name}",
    base_job_name=job_name,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    hyperparameters={
        "label": "rings",
        "bucket": bucket,
        "training_job": job_name
    },
    volume_size=20
)

In [20]:
autogluon.fit(
    inputs={
        "training": session.upload_data(
            "training.csv",
            bucket=bucket,
            key_prefix=f"{job_name}/input"
        ),
        "testing": session.upload_data(
            "testing.csv",
            bucket=bucket,
            key_prefix=f"{job_name}/input"
        )
    }
)

2023-01-08 06:52:26 Starting - Starting the training job...
2023-01-08 06:52:40 Starting - Preparing the instances for trainingProfilerReport-1673160745: InProgress
......
2023-01-08 06:53:53 Downloading - Downloading input data...
2023-01-08 06:54:25 Training - Training image download completed. Training in progress..[34mLoading Parameters[0m
[34mTraining Models[0m
[34mBeginning AutoGluon training ...[0m
[34mAutoGluon will save models to "/opt/ml/model/"[0m
[34mAutoGluon Version:  0.3.1[0m
[34mTrain Data Rows:    3759[0m
[34mTrain Data Columns: 9[0m
[34mPreprocessing data ...[0m
[34mAutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).[0m
[34m#011First 10 (of 27) unique label values:  [9, 8, 7, 11, 12, 6, 19, 10, 14, 16][0m
[34m#011If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['bin

In [21]:
!mkdir extract
sagemaker.s3.S3Downloader.download(autogluon.model_data, "./")
!tar xfz ./model.tar.gz -C extract

In [22]:
df = pd.read_csv("./extract/Leaderboard.csv")
df = df.filter(["model","score_test", "score_val"]).sort_values(by="score_val", ascending=False).reset_index().drop(columns="index")
df

Unnamed: 0,model,score_test,score_val
0,WeightedEnsemble_L2,0.294258,0.32998
1,NeuralNetMXNet,0.282297,0.309859
2,NeuralNetFastAI,0.272727,0.305835
3,LightGBMXT,0.263158,0.299799
4,LightGBM,0.255981,0.297787
5,XGBoost,0.255981,0.295775
6,LightGBMLarge,0.258373,0.277666
7,ExtraTreesEntr,0.248804,0.277666
8,CatBoost,0.26555,0.275654
9,RandomForestEntr,0.241627,0.27163


In [23]:
import IPython
IPython.display.HTML(filename="./extract/SummaryOfModels.html")