In [1]:
import json

import boto3
import nltk
import sagemaker
from sagemaker.serializers import JSONSerializer

In [2]:
# SageMaker session used below for the default bucket and for uploading data
sess = sagemaker.Session()

# This is the role that SageMaker would use to leverage AWS resources
# (S3, CloudWatch) on your behalf
role = sagemaker.get_execution_role()
print(role)

# Replace with your own bucket name if needed
bucket = sess.default_bucket()
print(bucket)

# Replace with the prefix under which you want to store the data if needed
prefix = "blazingtext/supervised"

arn:aws:iam::962148432162:role/service-role/AmazonSageMaker-ExecutionRole-20201126T084214
sagemaker-us-east-1-962148432162


In [3]:
# Copy the DBpedia training and validation files from the course bucket
# into the local ./data/ directory using the AWS CLI.
! aws s3 cp s3://aws-mls-c01/sagemaker/blazingtext/dbpedia.train ./data/
! aws s3 cp s3://aws-mls-c01/sagemaker/blazingtext/dbpedia.validation ./data/

download: s3://aws-mls-c01/sagemaker/blazingtext/dbpedia.train to data/dbpedia.train
download: s3://aws-mls-c01/sagemaker/blazingtext/dbpedia.validation to data/dbpedia.validation


In [4]:
# Region of the default boto3 session; used when looking up the algorithm image
boto_session = boto3.Session()
region_name = boto_session.region_name

In [5]:
# Resolve the image URI of the built-in BlazingText algorithm for this region.
# Version "1" is requested explicitly: the SDK reports it as the only supported
# version and merely ignores "latest" with a "Defaulting to ..." warning.
container = sagemaker.image_uris.retrieve("blazingtext", region_name, version="1")

# Show which image will be used so the choice is visible in the cell output
print("Using SageMaker BlazingText container: {} ({})".format(container, region_name))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:1 (us-east-1)


In [6]:
# --- train channel: key prefix, upload of the local file, resulting S3 location ---
train_channel = f"{prefix}/train"
sess.upload_data(path="./data/dbpedia.train", bucket=bucket, key_prefix=train_channel)
s3_train_data = f"s3://{bucket}/{train_channel}"

# --- validation channel: same three steps ---
validation_channel = f"{prefix}/validation"
sess.upload_data(path="./data/dbpedia.validation", bucket=bucket, key_prefix=validation_channel)
s3_validation_data = f"s3://{bucket}/{validation_channel}"

In [7]:
# S3 location handed to the estimator as its output_path
s3_output_location = f"s3://{bucket}/{prefix}/output"

In [8]:
# BlazingText hyperparameters, kept in a named dict so they are easy to inspect and tweak.
# NOTE(review): "epochs" (1) is lower than "min_epochs" (5), so the early-stopping
# settings look like they cannot take effect — confirm the intended epoch count.
bt_hyperparameters = {
    "mode": "supervised",
    "epochs": 1,
    "min_count": 2,
    "learning_rate": 0.05,
    "vector_dim": 10,
    "early_stopping": True,
    "patience": 4,
    "min_epochs": 5,
    "word_ngrams": 2,
}

# Estimator describing the training job: algorithm image, execution role,
# a single ml.c4.4xlarge instance, File input mode, and where to write the output.
bt_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.c4.4xlarge",
    volume_size=30,
    max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    hyperparameters=bt_hyperparameters,
)

In [9]:
# Settings shared by both input channels: plain-text objects under an S3 prefix,
# fully replicated to the training instance.
channel_options = {
    "distribution": "FullyReplicated",
    "content_type": "text/plain",
    "s3_data_type": "S3Prefix",
}

train_data = sagemaker.inputs.TrainingInput(s3_train_data, **channel_options)
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, **channel_options)

# Channel name -> input definition, as passed to Estimator.fit
data_channels = {"train": train_data, "validation": validation_data}

In [10]:
%%time
# Start the training job on the train/validation channels; logs=True streams the
# job's log output into this cell while it runs.
bt_model.fit(inputs=data_channels, logs=True)

2021-10-16 03:29:11 Starting - Starting the training job...
2021-10-16 03:29:35 Starting - Launching requested ML instancesProfilerReport-1634354951: InProgress
......
2021-10-16 03:30:41 Starting - Preparing the instances for training.........
2021-10-16 03:32:10 Downloading - Downloading input data......
2021-10-16 03:33:11 Training - Downloading the training image...
2021-10-16 03:33:37 Uploading - Uploading generated training model
Arguments: train
[10/16/2021 03:33:27 INFO 140199767618944] nvidia-smi took: 0.025306224822998047 secs to identify 0 gpus
[10/16/2021 03:33:27 INFO 140199767618944] Running single machine CPU BlazingText training using supervised mode.
Number of CPU sockets found in instance is  1
[10/16/2021 03:33:27 INFO 140199767618944] Processing /opt/ml/input/data/train/dbpedia.train . File size: 34.98068809509277 MB
[10/16/2021 03:33:27 INFO 140199767618944] Processing /opt/ml/input/data/validation/dbpedia.validation

In [11]:
%%time

from sagemaker.serializers import JSONSerializer

text_classifier = bt_model.deploy(
    initial_instance_count=1, instance_type="ml.m5.xlarge", serializer=JSONSerializer()
)

-----------!CPU times: user 173 ms, sys: 11.2 ms, total: 184 ms
Wall time: 5min 32s


In [12]:
# Fetch the "punkt" data used by nltk.word_tokenize below; when the package is
# already present the call just reports that it is up to date.
# NOTE(review): recent NLTK releases may look for "punkt_tab" instead — confirm
# against the installed NLTK version.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Sample inputs to classify with the deployed endpoint
sentences = [
    "Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft.",
    "Berwick secondary college is situated in the outer melbourne metropolitan suburb of berwick .",
    "Sparky is a dog of the canis familiaris family. He was lives in our house. Which is in Australia.",
]

# Tokenize with the same nltk tokenizer that was used during data preparation
# for training, then re-join the tokens with single spaces.
tokenized_sentences = []
for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    tokenized_sentences.append(" ".join(tokens))

# Request body: all tokenized sentences under the "instances" key
payload = {"instances": tokenized_sentences}
response = text_classifier.predict(payload)

# Decode the JSON response and pretty-print the predicted labels and probabilities
predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

[
  {
    "label": [
      "__label__Artist"
    ],
    "prob": [
      0.4722236096858978
    ]
  },
  {
    "label": [
      "__label__EducationalInstitution"
    ],
    "prob": [
      0.4115302264690399
    ]
  },
  {
    "label": [
      "__label__Animal"
    ],
    "prob": [
      0.2677112817764282
    ]
  }
]


In [14]:
# Clean up: remove the endpoint that bt_model.deploy(...) created for the predictions above
text_classifier.delete_endpoint()