In [1]:
import os
import boto3
import re
from sagemaker import get_execution_role

In [2]:
role = get_execution_role()

In [3]:
bucket = 'sagemaker-clever-babbage'# enter your s3 bucket where you will copy data and model artifacts
prefix = 'clever-babbage' # place to upload training files within the bucket

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

In [24]:
data_location = 's3://{}/{}'.format(bucket, 'Period.csv')

data = pd.read_csv(data_location, header = None)

# specify columns extracted from wbdc.names
data.columns = ["id","start_date", "end_date", "User_id"] 

# save the data
data.to_csv("periods-raw.csv", sep=',', index=False)

# print the shape of the data file
print(data.shape)

# show the top few rows
display(data.head())

# describe the data object
display(data.describe())

(34943, 4)


Unnamed: 0,id,start_date,end_date,User_id
0,id,start_date,end_date,User_id
1,352,5/4/15,10/4/15,252
2,353,23/3/15,27/3/15,253
3,354,6/4/15,11/4/15,254
4,355,21/3/15,24/3/15,255


Unnamed: 0,id,start_date,end_date,User_id
count,34943,34940,32169,34943
unique,34943,1585,1572,6704
top,4747,1/1/16,30/7/18,2386
freq,1,53,45,60


In [26]:
data['start_date'] = pd.to_datetime(data['start_date'],errors='coerce')
sorted_data = data.sort_values(by=['start_date'])
sorted_data

Unnamed: 0,id,start_date,end_date,User_id
137,503,1997-04-17,24/4/97,366
2027,2950,2014-02-11,7/11/14,1374
1201,2052,2014-02-12,8/12/14,1020
67,430,2014-03-16,21/3/14,309
181,898,2014-04-08,6/8/14,447
2026,2949,2014-04-12,9/12/14,1374
1354,2224,2014-05-11,9/11/14,1083
176,893,2014-05-12,9/12/14,447
238,981,2014-07-09,13/9/14,468
1200,2051,2014-07-11,13/11/14,1020


In [107]:
from itertools import islice
historical_period_list = []

for user in sorted_data['User_id'].unique():
    userdf = sorted_data[sorted_data['User_id'] == user]
    prev = userdf.iloc[0]
    if userdf.shape[0] > 1:
        current = userdf.iloc[1]
        for index, row in islice(userdf.iterrows(), 2, None):
            prev = current
            current = row
            historical_period_list.append((prev['start_date'], current['start_date'], user))
        
historical_period = pd.DataFrame(historical_period_list, columns=['previous_start_date', 'next_start_date', 'user_id'])
historical_period

Unnamed: 0,previous_start_date,next_start_date,user_id
0,2014-04-12,2014-12-31,1374
1,2014-12-31,2015-01-27,1374
2,2015-01-27,2015-02-24,1374
3,2015-02-24,2015-03-27,1374
4,2015-03-27,2015-04-26,1374
5,2015-04-26,2015-05-23,1374
6,2015-05-23,2015-06-24,1374
7,2015-06-24,2015-07-23,1374
8,2015-07-23,2015-08-23,1374
9,2015-08-23,2015-09-19,1374


In [145]:
rand_split = np.random.rand(len(historical_period))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = historical_period[train_list]
data_val = historical_period[val_list]
data_test = historical_period[test_list]

train_y = data_train.iloc[:,1].as_matrix();
train_X = data_train.iloc[:,:1].as_matrix();

val_y = data_val.iloc[:,1].as_matrix();
val_X = data_val.iloc[:,:1].as_matrix();

test_y = data_test.iloc[:,1].as_matrix();
test_X = data_test.iloc[:,:1].as_matrix();
test_X

array([['2015-09-19T00:00:00.000000000'],
       ['2015-01-26T00:00:00.000000000'],
       ['2015-02-21T00:00:00.000000000'],
       ...,
       ['2018-12-16T00:00:00.000000000'],
       ['2059-06-25T00:00:00.000000000'],
       ['2060-04-25T00:00:00.000000000']], dtype='datetime64[ns]')

In [151]:
train_file = 'linear_train.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_file)).upload_fileobj(f)

In [152]:
validation_file = 'linear_validation.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', validation_file)).upload_fileobj(f)

In [153]:
# See 'Algorithms Provided by Amazon SageMaker: Common Parameters' in the SageMaker documentation for an explanation of these values.
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [154]:
linear_job = 'clever-babbage-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())



print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {
        "TrainingImage": container,
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "HyperParameters": {
        "feature_dim": "1",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "32",
        "loss": "absolute_loss"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    }
}

Job name is: clever-babbage-2019-03-31-05-07-27


In [155]:
%%time

region = boto3.Session().region_name
sm = boto3.client('sagemaker')

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=linear_job)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

InProgress
CPU times: user 72.9 ms, sys: 412 µs, total: 73.3 ms
Wall time: 4min


In [156]:
linear_hosting_container = {
    'Image': container,
    'ModelDataUrl': sm.describe_training_job(TrainingJobName=linear_job)['ModelArtifacts']['S3ModelArtifacts']
}

create_model_response = sm.create_model(
    ModelName=linear_job,
    ExecutionRoleArn=role,
    PrimaryContainer=linear_hosting_container)

print(create_model_response['ModelArn'])

arn:aws:sagemaker:ap-southeast-1:422808909929:model/clever-babbage-2019-03-31-05-07-27


In [157]:
linear_endpoint_config = 'clever-babbage-endpoint-config-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[{
        'InstanceType': 'ml.m4.xlarge',
        'InitialInstanceCount': 1,
        'ModelName': linear_job,
        'VariantName': 'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

clever-babbage-endpoint-config-2019-03-31-05-18-31
Endpoint Config Arn: arn:aws:sagemaker:ap-southeast-1:422808909929:endpoint-config/clever-babbage-endpoint-config-2019-03-31-05-18-31


In [159]:
%%time

linear_endpoint = 'clever-babbage-endpoint-' + time.strftime("%Y%m%d%H%M", time.gmtime())
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint,
    EndpointConfigName=linear_endpoint_config)
print(create_endpoint_response['EndpointArn'])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Status: " + status)

sm.get_waiter('endpoint_in_service').wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

if status != 'InService':
    raise Exception('Endpoint creation did not succeed')

clever-babbage-endpoint-201903310519
arn:aws:sagemaker:ap-southeast-1:422808909929:endpoint/clever-babbage-endpoint-201903310519
Status: Creating
Arn: arn:aws:sagemaker:ap-southeast-1:422808909929:endpoint/clever-babbage-endpoint-201903310519
Status: InService
CPU times: user 185 ms, sys: 8.31 ms, total: 193 ms
Wall time: 6min 32s


In [165]:
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=',', fmt='%g')
    return csv.getvalue().decode().rstrip()

In [171]:
runtime= boto3.client('runtime.sagemaker')

payload = np2csv(test_X.astype('float32'))
response = runtime.invoke_endpoint(EndpointName=linear_endpoint,
                                   ContentType='text/csv',
                                   Body=payload)
result = json.loads(response['Body'].read().decode())
test_pred = np.array([r['score'] for r in result['predictions']])

In [174]:
test_pred

array([1.43757201e+18, 1.44008068e+18, 1.45251451e+18, ...,
       2.84070539e+18, 2.84692203e+18, 2.81142979e+18])

In [178]:
test_mae_linear = np.mean(np.abs(test_y.astype('float32') - test_pred))
test_mae_baseline = np.mean(np.abs(test_y.astype('float32') - np.median(train_y.astype('float32')))) ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear,3))

Test MAE Baseline : 3.2343934e+16
Test MAE Linear: 1998938127548429.5


In [179]:
c = (test_pred > 0.5)+0;
test_pred_baseline = np.repeat(np.median(train_y.astype('float32')), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class))*100
baseline_accuracy = np.mean((test_y == test_pred_baseline))*100

print("Prediction Accuracy:", round(prediction_accuracy,1), "%")
print("Baseline Accuracy:", round(baseline_accuracy,1), "%")

Prediction Accuracy: 0.0 %
Baseline Accuracy: 0.0 %




In [180]:
test_pred_baseline

array([1.499472e+18, 1.499472e+18, 1.499472e+18, ..., 1.499472e+18,
       1.499472e+18, 1.499472e+18], dtype=float32)