In [1]:
import sagemaker
from sagemaker import KMeans

# Get a SageMaker-compatible role used by this Notebook Instance.
role = sagemaker.get_execution_role()

# Session object used to manage the interaction with the SageMaker API.
# It is passed to the estimator below so that a single, explicit session
# is used instead of an implicitly created default one.
sagemaker_session = sagemaker.Session()

# S3 locations. The training object key uses an underscore
# ('iris_train.csv'), matching the key that is downloaded from this same
# bucket when the datasets are loaded with boto3.
input_location = 's3://awsml-sagemaker-source/iris_train.csv'
output_location = 's3://awsml-sagemaker-results'

# Estimator that will create a training job for a KMeans model using
# Amazon SageMaker's own implementation of the k-means algorithm,
# with hyperparameter k = 3.
kmeans_estimator = KMeans(role=role,
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          output_path=output_location,
                          k=3,
                          sagemaker_session=sagemaker_session)

In [2]:
import boto3
import io
import pandas as pd
import numpy as np

# Load the training and validation datasets from Amazon S3.
s3_client = boto3.client('s3')
s3_bucket_name = 'awsml-sagemaker-source'


def _read_csv_from_s3(client, bucket, key):
    """Download one S3 object and parse it as a comma-separated file.

    Parameters
    ----------
    client : boto3 S3 client used to issue ``get_object``.
    bucket : name of the S3 bucket that holds the object.
    key : object key of the CSV file inside ``bucket``.

    Returns
    -------
    pandas.DataFrame built from the object's bytes; the first line of the
    file is used as the header row.
    """
    response = client.get_object(Bucket=bucket, Key=key)
    response_body = response["Body"].read()
    # index_col=False is applied to every file so that the first column is
    # always kept as data and never promoted to the index; the train and
    # test files are therefore parsed with identical settings.
    return pd.read_csv(io.BytesIO(response_body), header=0, index_col=False,
                       delimiter=",", low_memory=False)


df_iris_train = _read_csv_from_s3(s3_client, s3_bucket_name, 'iris_train.csv')
df_iris_test = _read_csv_from_s3(s3_client, s3_bucket_name, 'iris_test.csv')

# Convert the target variable 'species' from strings into integers.
from sklearn.preprocessing import LabelEncoder

# Fit the encoder exactly once, on the labels of BOTH datasets.
# A second call to fit() replaces whatever the first call learned, so
# fitting on the training labels and then on the test labels leaves an
# encoder that only knows the test labels, and transform() then raises a
# ValueError for any label that appears only in the training data.
# Fitting on the combined labels yields the same integer codes whenever the
# two datasets contain the same set of species.
labelEncoder = LabelEncoder()
labelEncoder.fit(pd.concat([df_iris_train['species'], df_iris_test['species']]))

df_iris_train['species'] = labelEncoder.transform(df_iris_train['species'])
df_iris_test['species'] = labelEncoder.transform(df_iris_test['species'])

# Split each dataset into a target series (first column) and a feature
# frame (every remaining column). This assumes that the first column of
# iris_train.csv and iris_test.csv contains the target attribute.
#
# Training a k-means clustering model does not require labelled data, so
# df_iris_target_train is not used for training.

df_iris_target_train, df_iris_features_train = df_iris_train.iloc[:, 0], df_iris_train.iloc[:, 1:]
df_iris_target_test, df_iris_features_test = df_iris_test.iloc[:, 0], df_iris_test.iloc[:, 1:]

# Create the training job: cast the training features to a float32 array,
# convert it with the estimator's record_set(), and fit on the result.
train_data = df_iris_features_train.values.astype(np.float32)
record_set = kmeans_estimator.record_set(train_data)
kmeans_estimator.fit(record_set)

INFO:sagemaker:Creating training-job with name: kmeans-2019-05-02-19-58-39-816


2019-05-02 19:58:40 Starting - Starting the training job...
2019-05-02 19:58:42 Starting - Launching requested ML instances......
2019-05-02 19:59:43 Starting - Preparing the instances for training.........
2019-05-02 20:01:41 Downloading - Downloading input data
2019-05-02 20:01:41 Training - Training image download completed. Training in progress..
[31mDocker entrypoint called with argument(s): train[0m
[31m[05/02/2019 20:01:43 INFO 140157414549312] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metri


2019-05-02 20:01:54 Uploading - Uploading generated training model
2019-05-02 20:01:54 Completed - Training job completed
Billable seconds: 25


In [3]:
# Deploy the trained model to a single ml.m4.xlarge prediction instance
# and create the prediction endpoint; keep the returned predictor.
predictor = kmeans_estimator.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1
)

INFO:sagemaker:Creating model with name: kmeans-2019-05-02-20-03-26-006
INFO:sagemaker:Creating endpoint with name kmeans-2019-05-02-19-58-39-816


---------------------------------------------------------------------------!

In [4]:
# Send the test features (as float32) to the endpoint and print the raw
# response returned by the predictor.
test_data = df_iris_features_test.values.astype('float32')
predictions = predictor.predict(test_data)

print(predictions)

[label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 0.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.6348797678947449
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 1.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.14434637129306793
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 2.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.655796468257904
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 1.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.5183477401733398
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 1.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
 

In [5]:
# Terminate the prediction instance and associated HTTPS endpoint.
# The endpoint is deleted through the predictor returned by deploy():
# the SageMaker Python SDK deprecated Estimator.delete_endpoint() in
# favour of calling delete_endpoint() on the predictor.
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint with name: kmeans-2019-05-02-19-58-39-816
