In [14]:
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Session object the SDK uses for its AWS calls (created with default settings).
sagemaker_session = sagemaker.Session()

# IAM execution role handed to the processor below.
role = "arn:aws:iam::863397112005:role/service-role/AmazonSageMaker-ExecutionRole-20231109T153131"

# Scikit-learn processor: one ml.t3.xlarge instance on the 0.23-1 framework
# version. Job names are derived from `base_job_name`.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=1,
    instance_type="ml.t3.xlarge",
    base_job_name="sklearn-fraud-detection-job",
    sagemaker_session=sagemaker_session,
)

# The preprocessing script is referenced by its S3 URI.
# (Earlier alternative kept for reference: the bare file name
#  'preprocess-fraud-dataset-feature-store.py' instead of the S3 URI.)
preprocessing_script_uri = "s3://udacity-nano-degree-project/code/preprocess-fraud-dataset-feature-store.py"

# Single processing input: the fraud dataset CSV in S3, with
# /opt/ml/processing/input as its destination.
input_data_config = [
    ProcessingInput(
        input_name="input-1",
        source="s3://sagemaker-us-east-1-863397112005/data/online_fraud_dataset.csv",
        destination="/opt/ml/processing/input",
    ),
]

# One ProcessingOutput per dataset split (train, validation, test). Every
# output uses the "EndOfJob" upload mode; only the name and the source
# directory differ between them.
output_data_config = [
    ProcessingOutput(
        output_name=split_name,
        s3_upload_mode="EndOfJob",
        source=split_dir,
    )
    for split_name, split_dir in (
        ("fraud-train", "/opt/ml/processing/output/fraud/train"),
        ("fraud-validation", "/opt/ml/processing/output/fraud/validation"),
        ("fraud-test", "/opt/ml/processing/output/fraud/test"),
    )
]

# Launch the processing job. The split values are passed to the script as
# command-line strings (0.7 / 0.15 / 0.15, which sum to 1.0); how they are
# interpreted is up to the preprocessing script.
sklearn_processor.run(
    code=preprocessing_script_uri,
    inputs=input_data_config,
    outputs=output_data_config,
    arguments=[
        "--train-split-percentage", "0.7",
        "--validation-split-percentage", "0.15",
        "--test-split-percentage", "0.15",
    ],
)

# Once submitted, SageMaker runs the job on managed infrastructure; its status
# can also be monitored in the SageMaker console.
# NOTE(review): with the default arguments this call appears to stream the
# container logs into the cell output and wait for the job — confirm against
# the SDK documentation.


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/zenysisaccount/Library/Application Support/sagemaker/config.yaml


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker:Creating processing-job with name sklearn-fraud-detection-job-2023-11-19-11-13-21-307


........................[34mCollecting fsspec
  Downloading fsspec-2023.1.0-py3-none-any.whl (143 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.0/143.0 kB 15.4 MB/s eta 0:00:00[0m
[34mInstalling collected packages: fsspec[0m
[34mSuccessfully installed fsspec-2023.1.0[0m
[34m[notice] A new release of pip is available: 23.0 -> 23.3.1[0m
[34m[notice] To update, run: pip install --upgrade pip[0m
[34mCollecting s3fs
  Downloading s3fs-2023.1.0-py3-none-any.whl (27 kB)[0m
[34mCollecting aiohttp!=4.0.0a0,!=4.0.0a1
  Downloading aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (987 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 988.0/988.0 kB 49.3 MB/s eta 0:00:00[0m
[34mCollecting aiobotocore~=2.4.2
  Downloading aiobotocore-2.4.2-py3-none-any.whl (66 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.8/66.8 kB 12.9 MB/s eta 0:00:00[0m
[34mCollecting wrapt>=1.10.10
  Downloading wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.man