In [49]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

import os
import boto3
import re
from sagemaker import get_execution_role

In [None]:
# Local CSV file names: the raw input plus the two splits written later in the notebook.
source_data, training_file, validation_file = (
    'iris.data.csv',
    'iris_data_train.csv',
    'iris_data_test.csv',
)

# S3 destination. Create the bucket first, then put its name here.
s3_bucket = 'chandra-ml-sagemaker'
s3_prefix = 'iris/xgboost-classifier'

# Execution role (passed as RoleArn to the training job) and the session's AWS region.
role = get_execution_role()
region = boto3.Session().region_name

# HTTPS-style base URL of the bucket; the training job's input and output
# locations are built from it.
s3_bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, s3_bucket)

In [20]:
# Load the raw Iris CSV from the working directory into a DataFrame.
iris_df = pd.read_csv(filepath_or_buffer='iris.data.csv')

In [21]:
# Preview the first five rows of the loaded data.
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [22]:
# List the distinct species labels present in the "class" column.
iris_df.loc[:, "class"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [23]:
# Build an encoder that turns the three species names into integer labels.
# `fit` stays the last expression so the cell still echoes the fitted encoder.
le = preprocessing.LabelEncoder()
le.fit([
    'Iris-setosa',
    'Iris-versicolor',
    'Iris-virginica',
])

LabelEncoder()

In [24]:
# Display the class labels stored on the fitted encoder.
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], 
      dtype='<U15')

In [25]:
# Append the integer-encoded species as a new "class_integer" column.
iris_df = iris_df.assign(class_integer=le.transform(iris_df["class"]))

In [26]:
# Check the last five rows to confirm the encoded column was added.
iris_df.tail(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,class_integer
145,6.7,3.0,5.2,2.3,Iris-virginica,2
146,6.3,2.5,5.0,1.9,Iris-virginica,2
147,6.5,3.0,5.2,2.0,Iris-virginica,2
148,6.2,3.4,5.4,2.3,Iris-virginica,2
149,5.9,3.0,5.1,1.8,Iris-virginica,2


In [27]:
# Randomize the dataset: shuffle the index values with a fixed seed so the
# resulting row order (and therefore the train/test split) is reproducible.
np.random.seed(5)
l = iris_df.index.tolist()
np.random.shuffle(l)

In [28]:
# Reorder the rows positionally according to the shuffled list `l`.
# Note: re-running this cell permutes the already-shuffled frame again.
iris_df = iris_df.iloc[l, :]

In [29]:
# Preview the first five rows after shuffling.
iris_df.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,class_integer
82,5.8,2.7,3.9,1.2,Iris-versicolor,1
134,6.1,2.6,5.6,1.4,Iris-virginica,2
114,5.8,2.8,5.1,2.4,Iris-virginica,2
42,4.4,3.2,1.3,0.2,Iris-setosa,0
109,7.2,3.6,6.1,2.5,Iris-virginica,2


In [39]:
# Size of the 70/30 split. The files are written as iris_df[:train] and
# iris_df[train:], so the test size is whatever remains after the training
# rows. Deriving it that way keeps train + test == rows even when 0.7 * rows
# is not a whole number.
rows = iris_df.shape[0]
train = int(.7 * rows)
test = rows - train

In [40]:
# Show the number of training and test rows.
(train, test)

(105, 45)

In [None]:
# NOTE(review): leftover scratch cell. It only references the `to_csv` method
# without calling it, so nothing is written here; the actual exports happen in
# the following cells. Candidate for deletion.
iris_df.to_csv

In [47]:
# Write Training Set: the first `train` rows of the shuffled frame, with the
# integer label as the first column and no header or index column.
iris_df.iloc[:train].to_csv(
    training_file,
    columns=['class_integer', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    header=False,
    index=False,
    index_label='Row',
)

In [48]:
# Write Test Set: the remaining rows after the first `train`, in the same
# column layout as the training file (label first, no header or index column).
iris_df.iloc[train:].to_csv(
    validation_file,
    columns=['class_integer', 'sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
    header=False,
    index=False,
    index_label='Row',
)

In [None]:
# Upload to S3
def upload_to_s3(bucket, prefix, filename):
    """Upload a local file to ``s3://<bucket>/<prefix>/<filename>``.

    Args:
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix; joined to ``filename`` with a single ``/``.
        filename: Path of the local file to read. It is also used verbatim as
            the final part of the object key.

    Returns:
        The return value of the S3 object's ``upload_fileobj`` call.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
    """
    with open(filename, 'rb') as f:
        key = prefix + '/' + filename
        url = 's3://{}/{}'.format(bucket, key)
        print('Writing to {}'.format(url))
        # Pass the handle opened above; the previous code passed an undefined
        # name (`fobj`) and raised NameError on every call.
        return boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(f)

## Upload To S3

In [None]:
# Push both CSV splits to the configured bucket/prefix.
upload_to_s3(bucket=s3_bucket, prefix=s3_prefix, filename=training_file)
upload_to_s3(bucket=s3_bucket, prefix=s3_prefix, filename=validation_file)

## Train XGBoost Classifier

In [None]:
# Region -> XGBoost training image URI. Only these four regions are listed;
# any other session region raises KeyError on the lookup below.
containers = dict([
    ('us-west-2', '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest'),
    ('us-east-1', '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'),
    ('us-east-2', '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest'),
    ('eu-west-1', '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'),
])
# Pick the image that matches the current session's region.
container = containers[boto3.Session().region_name]

In [None]:
%%time
from time import gmtime, strftime

# Job name = fixed prefix + current UTC timestamp, so each run gets a distinct name.
job_name = ''.join(['xgboost-single-machine-iris-', strftime("%Y-%m-%d-%H-%M-%S", gmtime())])
print("Training job", job_name)

# Request payload for the SageMaker training job.
# Ensure that the training and validation files uploaded above are the ones
# referenced in the "InputDataConfig" channels below.

create_training_params = \
{
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "RoleArn": role,
    "OutputDataConfig": {
        "S3OutputPath": s3_bucket_path + "/" + s3_prefix + "/model"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.m4.4xlarge",
        "VolumeSizeInGB": 5
    },
    "TrainingJobName": job_name,
    "HyperParameters": {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "silent":"0",
        # The label column holds three classes (0, 1, 2), so a multiclass
        # objective is required; "reg:logistic" only accepts labels in [0, 1].
        "objective":"multi:softmax",
        # Must equal the number of distinct encoded class labels.
        "num_class":"3",
        "num_round":"50"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 3600
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_bucket_path + "/" + s3_prefix + "/" + training_file,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "csv",
            "CompressionType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    # Held-out split; the variable is `validation_file`
                    # (`test_file` was never defined and raised NameError).
                    "S3Uri": s3_bucket_path + "/" + s3_prefix + "/" + validation_file,
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "ContentType": "csv",
            "CompressionType": "None"
        }
    ]
}


import time

# Submit the training job described by `create_training_params`.
client = boto3.client('sagemaker')
client.create_training_job(**create_training_params)

# Poll once a minute until the job reaches a terminal state. 'Stopped' is
# included so a manually stopped or timed-out job does not leave this loop
# spinning forever.
terminal_states = ('Completed', 'Failed', 'Stopped')
status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
print(status)
while status not in terminal_states:
    time.sleep(60)
    status = client.describe_training_job(TrainingJobName=job_name)['TrainingJobStatus']
    print(status)

# On failure, show the service-reported reason so the cause is visible here.
if status == 'Failed':
    print(client.describe_training_job(TrainingJobName=job_name).get('FailureReason', ''))