In [1]:
%%writefile aws_sklearn_split_data.py 
import datetime
import time
import tarfile

import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston


sm_boto3 = boto3.client('sagemaker')

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = sess.default_bucket()  # this could also be a hard-coded bucket name

print('Using bucket ' + bucket)



s3 = boto3.client('s3')
obj = s3.get_object(Bucket='hamzatestbucket', Key='original_data/testsensor6_all.csv')

data = pd.read_csv(obj['Body']) # 'Body' is a key word

data = data.sample(frac=1).reset_index(drop=True)
train = data[:-3]
deploy_test = data[-3:]

train.to_csv('train.csv')
deploy_test.to_csv('deploy_test.csv')




Overwriting aws_sklearn_split_data.py


In [2]:
!python aws_sklearn_split_data.py

Using bucket sagemaker-ap-southeast-2-819846678795


In [3]:
%%writefile aws_sklearn_upload_train_data.py

import boto3
bucket = 'sagemaker-learning-to-deploy-scikitlearn-hamza'
region = 'ap-southeast-2'
s3_session = boto3.Session().resource('s3')
s3_session.create_bucket(Bucket=bucket, 
                         CreateBucketConfiguration=
                         {'LocationConstraint': region})
s3_session.Bucket(bucket).Object('train/train.csv').upload_file('train.csv')



Overwriting aws_sklearn_upload_train_data.py


In [4]:
!python aws_sklearn_upload_train_data.py

Traceback (most recent call last):
  File "aws_sklearn_upload_train_data.py", line 8, in <module>
    {'LocationConstraint': region})
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/boto3/resources/factory.py", line 520, in do_action
    response = action(self, *args, **kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/boto3/resources/action.py", line 83, in __call__
    response = getattr(parent.meta.client, operation_name)(*args, **params)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/botocore/client.py", line 357, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/botocore/client.py", line 676, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.errorfactory.BucketAlreadyOwnedByYou: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Yo

In [5]:
%%writefile aws_sklearn_training_draft.py

from xgboost import XGBRegressor
import argparse
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.metrics import mean_squared_error as mse
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from matplotlib import pyplot as PLT
from matplotlib.pyplot import cm
from sklearn.model_selection import train_test_split
from io import StringIO, BytesIO # python3;  BytesIO for images StringIO for files
import boto3

if __name__ =='__main__':

    # Create a parser object to collect the environment variables that are in the
    # default AWS Scikit-learn Docker container.
    parser = argparse.ArgumentParser()

    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))

    args = parser.parse_args()
    
    # Load data from the location specified by args.train (In this case, an S3 bucket).
    data = pd.read_csv(os.path.join(args.train,'train.csv'), engine="python")
    data = data.sample(frac=1).reset_index(drop=True)
    X = data[['SiPM1','SiPM2','SiPM3','SiPM4','SiPM5','SiPM6']]
    y = data[['X','Y']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    
    # train
    print('training model')
    model = XGBRegressor(objective='reg:squarederror', learning_rate=0.2) 
    model = MultiOutputRegressor(model, n_jobs=-1)
    
    model.fit(X_train, y_train)

    # print abs error
    print('validating model')
    abs_err = np.sqrt(mse(y_test, model.predict(X_test)))
    
    # print couple perf metrics
    for q in [10, 50, 90]:
        print('AE-at-' + str(q) + 'th-percentile: '
              + str(np.percentile(a=abs_err, q=q)))
        
    pred = model.predict(X_test)
    rmse_manual = (abs(pred - y_test)**2)
    print(rmse_manual.shape)
    rmse_manual = rmse_manual.iloc[:,0] + rmse_manual.iloc[:,1]
    print(rmse_manual)
    
    
    x = y_test.iloc[:,0]
    y = y_test.iloc[:,1]
    z = np.sqrt(rmse_manual)
    c = pd.concat([y_test, z.rename('RMSE')], ignore_index = False, axis=1) #This is for exporting the csv
    print(c)

    PLT.show() 
    
    gridsize=100
    PLT.figure(figsize=(10, 8 ))
    PLT.subplot(111)
    PLT.xlabel("X")
    PLT.ylabel("Y")
    PLT.title("SENSOR 6 RMSE HEATMAP")

    PLT.hexbin(x, y, C=z, gridsize=gridsize, cmap=cm.rainbow, reduce_C_function= np.mean, bins='log')

    PLT.axis([x.min(), x.max(), y.min(), y.max()])

    cb = PLT.colorbar( )
    cb.set_label('RMSE')



    PLT.show() 

    
    img_data = BytesIO() #This is for images
    PLT.savefig(img_data, format='png')
    bucket = 'sagemaker-learning-to-deploy-scikitlearn-hamza'# already created on S3
    img_data.seek(0)
    image = img_data.read()


    # put the image into S3
    s3 = boto3.resource('s3')
    s3.Object(bucket, 'predictions/results.png').put(ACL='public-read', Body=image)


    
    csv_buffer = StringIO()
    c.to_csv(csv_buffer)
    s3_resource = boto3.resource('s3')
    s3_resource.Object(bucket, 'predictions/results.csv').put(Body=csv_buffer.getvalue())
        
    pickle.dump(model, open(os.path.join(args.model_dir, "model.joblib"), 'wb'))


def model_fn(model_dir):
    
    model = pickle.load(open(os.path.join(model_dir, "model.joblib"), 'rb'))
    return model

def input_fn(request_body, request_content_type):
    if request_content_type == 'text/csv':
        samples = []
        for r in request_body.split('|'):
            samples.append(list(map(float,r.split(','))))
        return np.array(samples)
    else:
        raise ValueError("Thie model only supports text/csv input")
        
        
def predict_fn(input_data, model):
    return model.predict(input_data)



Overwriting aws_sklearn_training_draft.py


In [6]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.xgboost.estimator import XGBoost

role = 'AmazonSageMaker-ExecutionRole-20201012T123014'

# Create the XGBoost Object by directing it to the aws_sklearn_main.py script
aws_sklearn = XGBoost(entry_point='aws_sklearn_training_draft.py',
                      train_instance_type='local',
                      role=role,
                     framework_version= '1.0-1',
                     train_instance_count = 1)

# Train the model using by passing the path to the S3 bucket with the training data
aws_sklearn.fit({'train': 's3://sagemaker-learning-to-deploy-scikitlearn-hamza/train'})


train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Creating tmpuwjmtz1k_algo-1-ugj0v_1 ... 
[1BAttaching to tmpuwjmtz1k_algo-1-ugj0v_12mdone[0m
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker_xgboost_container.training:Invoking user training script.
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker-containers:Module aws_sklearn_training_draft does not provide a setup.py. 
[36malgo-1-ugj0v_1  |[0m Generating setup.py
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker-containers:Generating setup.cfg
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker-containers:Generating MANIFEST.in
[36malgo-1-ugj0v_1  |[0m INFO:sagemaker-containers:Installing module with the following command:
[36malgo-1-ugj0v_1  |[0m /miniconda3/bin/python -m pip install . 
[36malgo-1-ugj0v_1  |[0m Processing /opt/ml/code
[36malgo-1-ugj0v_1  |[0m Building wheels for collected package

[36mtmpuwjmtz1k_algo-1-ugj0v_1 exited with code 0
[0mAborting on container exit...
===== Job Complete =====


In [7]:
# Deploy model
aws_sklearn_predictor = aws_sklearn.deploy(instance_type='local', 
                                           initial_instance_count=1)

# Print the endpoint to test in next step
print(aws_sklearn_predictor.endpoint)


Attaching to tmpoib75ixz_algo-1-1ny1n_1
[36malgo-1-1ny1n_1  |[0m [2020-11-17:05:41:48:INFO] No GPUs detected (normal if no gpus installed)
[36malgo-1-1ny1n_1  |[0m [2020-11-17:05:41:48:INFO] No GPUs detected (normal if no gpus installed)
[36malgo-1-1ny1n_1  |[0m [2020-11-17:05:41:48:INFO] nginx config: 
[36malgo-1-1ny1n_1  |[0m worker_processes auto;
[36malgo-1-1ny1n_1  |[0m daemon off;
[36malgo-1-1ny1n_1  |[0m pid /tmp/nginx.pid;
[36malgo-1-1ny1n_1  |[0m error_log  /dev/stderr;
[36malgo-1-1ny1n_1  |[0m 
[36malgo-1-1ny1n_1  |[0m worker_rlimit_nofile 4096;
[36malgo-1-1ny1n_1  |[0m 
[36malgo-1-1ny1n_1  |[0m events {
[36malgo-1-1ny1n_1  |[0m   worker_connections 2048;
[36malgo-1-1ny1n_1  |[0m }
[36malgo-1-1ny1n_1  |[0m 
[36malgo-1-1ny1n_1  |[0m http {
[36malgo-1-1ny1n_1  |[0m   include /etc/nginx/mime.types;
[36malgo-1-1ny1n_1  |[0m   default_type application/octet-stream;
[36malgo-1-1ny1n_1  |[0m   access_log /dev/stdout combined;
[36malgo-1-1ny1n_1  

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[36malgo-1-1ny1n_1  |[0m 172.18.0.1 - - [17/Nov/2020:05:41:57 +0000] "GET /ping HTTP/1.1" 200 0 "-" "-"
sagemaker-xgboost-2020-11-17-05-41-43-352


In [15]:
from sagemaker.predictor import csv_serializer

aws_sklearn_predictor.serializer = csv_serializer

import boto3
import pandas as pd
import numpy as np

# Load in the deploy_test data
deploy_test = pd.read_csv("deploy_test.csv", header= None).iloc[1:,1:]
deploy_test = deploy_test.values.tolist()

# Format the deploy_test data features
request_body = ""
for sample in deploy_test:
    request_body += ",".join([str(n) for n in sample[2:]]) + "|"

request_body = request_body[:-1]
print(request_body)
prediction = aws_sklearn_predictor.predict(request_body)
print(prediction)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


103,177,120,236,123,141|134,337,83,106,141,146|355,345,326,376,340,423
[['-17.505434', '-73.20314'], ['76.36983', '-86.70647'], ['2.8188188', '1.1041712']]
[36malgo-1-1ny1n_1  |[0m 172.18.0.1 - - [17/Nov/2020:05:48:19 +0000] "POST /invocations HTTP/1.1" 200 60 "-" "-"


In [16]:

aws_sklearn_predictor.delete_model()
aws_sklearn_predictor.delete_endpoint()

Gracefully stopping... (press Ctrl+C again to force)


In [17]:
a = np.array(prediction)
a = pd.DataFrame(a[:,:], columns = ['pred_X','pred_Y']).astype(float)

In [18]:
a

Unnamed: 0,pred_X,pred_Y
0,-17.505434,-73.20314
1,76.36983,-86.70647
2,2.818819,1.104171
