In [1]:
import boto3
import os
import sagemaker
import tensorflow as tf

sess = sagemaker.session.Session()
bucket = sess.default_bucket() 
region = boto3.Session().region_name

data_dir = os.path.join(os.getcwd(), 'data')
os.makedirs(data_dir, exist_ok=True)
train_dir = os.path.join(os.getcwd(), 'data/train')
os.makedirs(train_dir, exist_ok=True)
test_dir = os.path.join(os.getcwd(), 'data/test')
os.makedirs(test_dir, exist_ok=True)
raw_dir = os.path.join(os.getcwd(), 'data/raw')
os.makedirs(raw_dir, exist_ok=True)




In [2]:
data_dir

'/home/ec2-user/SageMaker/data'

In [3]:
train_dir

'/home/ec2-user/SageMaker/data/train'

In [4]:
test_dir

'/home/ec2-user/SageMaker/data/test'

In [5]:
raw_dir

'/home/ec2-user/SageMaker/data/raw'

In [6]:
import numpy as np
from tensorflow.python.keras.datasets import boston_housing
from sklearn.preprocessing import StandardScaler

(x_train, y_train), (x_test, y_test) = boston_housing.load_data()

np.save(os.path.join(raw_dir, 'x_train.npy'), x_train)
np.save(os.path.join(raw_dir, 'x_test.npy'), x_test)
np.save(os.path.join(raw_dir, 'y_train.npy'), y_train)
np.save(os.path.join(raw_dir, 'y_test.npy'), y_test)

In [7]:
bucket

'sagemaker-us-east-1-058528764918'

In [8]:
s3_prefix = 'vpc'
rawdata_s3_prefix = '{}/victor/raw'.format(s3_prefix)
rawdata_s3_prefix

'vpc/victor/raw'

In [9]:
raw_s3 = sess.upload_data(path='./data/raw/', key_prefix=rawdata_s3_prefix)
print(raw_s3)

s3://sagemaker-us-east-1-058528764918/vpc/victor/raw


In [10]:
%%writefile preprocessing.py

import glob
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

if __name__=='__main__':
    
    input_files = glob.glob('{}/*.npy'.format('/opt/ml/processing/input'))    # /opt/ml/processing/input
    print('\nINPUT FILE LIST: \n{}\n'.format(input_files))
    scaler = StandardScaler()
    for file in input_files:
        raw = np.load(file)
        # only transform feature columns
        if 'y_' not in file:
            transformed = scaler.fit_transform(raw)
        if 'train' in file:
            if 'y_' in file:
                output_path = os.path.join('/opt/ml/processing/train', 'y_train.npy')   # /opt/ml/processing/train
                np.save(output_path, raw)
                print('SAVED LABEL TRAINING DATA FILE\n')
            else:
                output_path = os.path.join('/opt/ml/processing/train', 'x_train.npy')  # /opt/ml/processing/train
                np.save(output_path, transformed)
                print('SAVED TRANSFORMED TRAINING DATA FILE\n')
        else:
            if 'y_' in file:
                output_path = os.path.join('/opt/ml/processing/test', 'y_test.npy')  # /opt/ml/processing/test
                np.save(output_path, raw)
                print('SAVED LABEL TEST DATA FILE\n')
            else:
                output_path = os.path.join('/opt/ml/processing/test', 'x_test.npy')  # /opt/ml/processing/test
                np.save(output_path, transformed)
                print('SAVED TRANSFORMED TEST DATA FILE\n')

Overwriting preprocessing.py


In [11]:
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

processing_tags = [{'Key': 'cost-center', 'Value': 'TF2WorkflowProcessing'}]

sklearn_processor1 = SKLearnProcessor(framework_version='0.23-1',
                                     role=get_execution_role(),
                                     instance_type='ml.m5.xlarge',
                                     instance_count=1,
                                     tags=processing_tags)

In [12]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

In [13]:
help(ProcessingOutput)

Help on class ProcessingOutput in module sagemaker.processing:

class ProcessingOutput(builtins.object)
 |  Accepts parameters that specify an Amazon S3 output for a processing job.
 |  
 |  It also provides a method to turn those parameters into a dictionary.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, source=None, destination=None, output_name=None, s3_upload_mode='EndOfJob', app_managed=False, feature_store_output=None)
 |      Initializes a ``ProcessingOutput`` instance.
 |      
 |      ``ProcessingOutput`` accepts parameters that specify an Amazon S3 output for a
 |      processing job and provides a method to turn those parameters into a dictionary.
 |      
 |      Args:
 |          source (str): The source for the output.
 |          destination (str): The destination of the output. If a destination
 |              is not provided, one will be generated:
 |              "s3://<default-bucket-name>/<job-name>/output/<output-name>".
 |          output_name (str): The 

In [14]:
from time import gmtime, strftime 

processing_job_name = "pruebitas_noturnas-workflow-{}".format(strftime("%d-%H-%M-%S", gmtime()))
output_destination = 's3://{}/{}/victor/result'.format(bucket, s3_prefix)
output_destination

's3://sagemaker-us-east-1-058528764918/vpc/victor/result'

In [15]:
processing_job_name

'pruebitas_noturnas-workflow-16-00-37-06'

In [16]:
rawdata_s3_prefix

'vpc/victor/raw'

In [17]:
raw_s3

's3://sagemaker-us-east-1-058528764918/vpc/victor/raw'

In [18]:
from time import gmtime, strftime 

processing_job_name = "tf-2-workflow-{}".format(strftime("%d-%H-%M-%S", gmtime()))
output_destination = 's3://{}/{}/data'.format(bucket, s3_prefix)

sklearn_processor1.run(code='preprocessing.py',
                      job_name=processing_job_name,
                      inputs=[ProcessingInput(
                        source=raw_s3,
                        destination='/opt/ml/processing/input',
                        s3_data_distribution_type='ShardedByS3Key')],
                      outputs=[ProcessingOutput(output_name='train',
                                                destination='{}/train'.format(output_destination),
                                                source='/opt/ml/processing/train'),
                               ProcessingOutput(output_name='test',
                                                destination='{}/test'.format(output_destination),
                                                source='/opt/ml/processing/test')])

preprocessing_job_description = sklearn_processor1.jobs[-1].describe()


Job Name:  tf-2-workflow-16-00-37-06
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-058528764918/vpc/victor/raw', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'ShardedByS3Key', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-058528764918/tf-2-workflow-16-00-37-06/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-058528764918/vpc/data/train', 'LocalPath': '/opt/ml/processing/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-058528764918/vpc/data/test', 'LocalPa

In [19]:
type(output_destination)

str

In [20]:
output_destination

's3://sagemaker-us-east-1-058528764918/vpc/data'