In [None]:
### Importing libraries

In [None]:
import sagemaker
from sagemaker import get_execution_role
import boto3

# IAM execution role that the estimator below is constructed with.
role = get_execution_role()

# Default S3 bucket of a freshly created SageMaker session; the data and
# output locations further down are built from it.
bucket = sagemaker.Session().default_bucket()

# Key prefix for this demo. NOTE(review): no other cell visible in this
# notebook reads `prefix` -- confirm whether it is still needed.
prefix = "sagemaker/DEMO-linear-mnist"

In [None]:
# Display the value returned by get_execution_role() in the setup cell.
# NOTE(review): presumably an IAM role ARN, which would embed the AWS account
# id -- consider clearing this cell's output before sharing the notebook.
role

In [None]:
### Importing the MNIST data set

In [None]:
%%time
import pickle, gzip, numpy, urllib.request, json

# Local file name the MNIST archive is saved under (and then read back from).
mnist_archive = 'mnist.pkl.gz'

# Download the archive straight to disk instead of reading the whole object
# into a kernel variable first and writing it out by hand.
boto3.client('s3').download_file(
    Bucket='sagemaker-sample-files',
    Key='datasets/image/MNIST/mnist.pkl.gz',
    Filename=mnist_archive,
)

# Load the dataset
# SECURITY: pickle.load can execute arbitrary code embedded in the file. This
# is acceptable only as long as the bucket/key above is a trusted source; do
# not point this cell at an archive of unknown origin.
with gzip.open(mnist_archive, "rb") as f:
    # The pickle holds three splits; later cells index each split as
    # split[0][i] for an image and split[1][i] for its caption value.
    train_set, valid_set, test_set = pickle.load(f, encoding="latin1")

In [None]:
### Checking values present in the data set

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (2,10)


def show_digit(img, caption='', subplot=None):
    """Draw a single digit image in grayscale with its axes hidden.

    Args:
        img: Object exposing ``reshape``; it is reshaped to ``(28, 28)``, so
            it must hold exactly 784 values.
        caption: Title shown above the image. Defaults to no title.
        subplot: Axes to draw on. When omitted, a new one-axes figure is
            created with ``plt.subplots(1, 1)``.

    Returns:
        None. The image is drawn as a side effect.
    """
    if subplot is None:
        _, subplot = plt.subplots(1, 1)
    imgr = img.reshape((28, 28))
    subplot.axis('off')
    subplot.imshow(imgr, cmap='gray')
    # Title the axes that was actually drawn on. plt.title() targets pyplot's
    # *current* axes, which is not necessarily `subplot` when the caller passes
    # one panel of a larger grid.
    subplot.set_title(caption)

# Preview entry 30 of the training split, captioned with the matching entry of
# train_set[1] (presumably its label).
show_digit(
    train_set[0][30],
    'This is a {}'.format(train_set[1][30]),
)

In [None]:
### Transform our data into RecordIO protobuf format; SageMaker takes care of it for us

In [None]:
from sagemaker import KMeans

# S3 destination for the training data, reported as soon as it is built.
data_location = 's3://{}/kmeans_highlevel_example/data'.format(bucket)
print('training data will be uploaded to: {}'.format(data_location))

# S3 destination for the training artifacts.
output_location = 's3://{}/kmeans_highlevel_example/output'.format(bucket)
print('training artifacts will be uploaded to: {}'.format(output_location))

# Estimator configuration, grouped by concern.
kmeans = KMeans(
    role=role,
    # Where inputs are staged and where outputs are written.
    data_location=data_location,
    output_path=output_location,
    # Training fleet.
    train_instance_type='ml.c4.8xlarge',
    train_instance_count=2,
    # Algorithm settings; k matches the 10 clusters the display loop walks.
    k=10,
    epochs=25,
)

In [None]:
### Training our model

In [None]:
%%time

# Wrap the training images (train_set[0]) in the record set the estimator
# consumes, then launch training on it.
train_records = kmeans.record_set(train_set[0])
kmeans.fit(train_records)

In [None]:
### Deploy our model

In [None]:
%%time

# Deploy the trained model on a single ml.m4.xlarge instance and keep the
# returned predictor for the cells below.
# NOTE(review): no cell visible in this notebook deletes the endpoint; it
# presumably keeps running (and billing) until it is removed -- confirm and
# add a cleanup step.
kmeans_predictor = kmeans.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
### Validate our model

In [None]:
%%time 

# Send the first 100 validation images to the endpoint in one request.
validation_batch = valid_set[0][0:100]
result = kmeans_predictor.predict(validation_batch)

# Pull the closest-cluster value out of every returned record, in order.
clusters = []
for record in result:
    closest = record.label['closest_cluster'].float32_tensor.values[0]
    clusters.append(closest)

In [None]:
# Number of images per row in each cluster's grid.
width = 5

for cluster in range(10):
    print('\n\n\nCluster {}:'.format(int(cluster)))

    # Images whose predicted cluster is this one. zip() stops at the shorter
    # input, so only as many images as there are predictions are considered.
    digits = [img for label, img in zip(clusters, valid_set[0]) if int(label) == cluster]

    if not digits:
        # An empty cluster would give height == 0, and plt.subplots(0, width)
        # raises ValueError; report it and move on instead of crashing.
        print('(no samples assigned to this cluster)')
        continue

    # Rows needed to fit every image at `width` per row (ceiling division).
    height = ((len(digits) - 1) // width) + 1

    # NOTE: this changes the global default, so figures created by later
    # cells inherit the last size set here.
    plt.rcParams["figure.figsize"] = (width, height)
    _, subplots = plt.subplots(height, width)
    subplots = subplots.flatten()

    for subplot, image in zip(subplots, digits):
        show_digit(image, subplot=subplot)
    # Hide the axes of any unused panels in the last row.
    for subplot in subplots[len(digits):]:
        subplot.axis('off')

    plt.show()

In [None]:
### Check for a particular value

In [None]:
# Predict a single validation image; the slice 230:231 (rather than index 230)
# keeps the leading dimension, so the input is still a batch of one.
single_sample = valid_set[0][230:231]
result = kmeans_predictor.predict(single_sample)
print(result)

In [None]:
# Show validation entry 230 -- the same sample sent to the endpoint in the
# previous cell -- captioned with the matching entry of valid_set[1].
show_digit(
    valid_set[0][230],
    'This is a {}'.format(valid_set[1][230]),
)

In [None]:
### For pricing information, see https://aws.amazon.com/sagemaker/pricing/