# Running Spark applications with SageMaker Processing

In [None]:
# Import the SageMaker Python SDK and print its version so the environment used for this run is recorded
import sagemaker
print(sagemaker.__version__)

## Set up S3 bucket locations and roles

In [2]:
import logging
from time import gmtime, strftime

# Send SageMaker SDK log messages at INFO level and above to the cell output.
sagemaker_logger = logging.getLogger("sagemaker")
sagemaker_logger.setLevel(logging.INFO)
# Attach the console handler only once: re-running this cell would otherwise stack
# another StreamHandler on the same logger and print every message several times.
if not any(type(handler) is logging.StreamHandler for handler in sagemaker_logger.handlers):
    sagemaker_logger.addHandler(logging.StreamHandler())

# Session, default S3 bucket and execution role reused by every example below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

## Example 1: Running a basic PySpark application

In [4]:
# Download the abalone dataset (CSV) from a public S3 URL into ./data, creating the directory if needed
!mkdir -p ./data
!wget https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv -O ./data/abalone.csv

--2021-01-22 05:08:16--  https://s3-us-west-2.amazonaws.com/sparkml-mleap/data/abalone/abalone.csv
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.236.168
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.236.168|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191873 (187K) [binary/octet-stream]
Saving to: ‘./data/abalone.csv’


2021-01-22 05:08:18 (317 KB/s) - ‘./data/abalone.csv’ saved [191873/191873]



## Write the PySpark script

In [6]:
# Directory that will hold the Spark application scripts written by the following cells
!mkdir -p ./code

In [7]:
%%writefile ./code/preprocess.py
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import csv
import os
import shutil
import sys
import time

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    OneHotEncoder,
    StringIndexer,
    VectorAssembler,
    VectorIndexer,
)
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    StringType,
    StructField,
    StructType,
)


def csv_line(data):
    """Format a ``(label, features)`` pair as one comma-separated text line.

    ``data[0]`` is written first, followed by every element obtained by
    iterating over ``data[1]``; each value is converted with ``str``.
    """
    label, features = data[0], data[1]
    feature_text = ",".join(map(str, features))
    return ",".join([str(label), feature_text])


def _s3_uri(bucket, *key_parts):
    """Build an ``s3://bucket/key`` URI from a bucket name and key components.

    S3 keys always use "/" as separator, so the pieces are joined explicitly
    rather than with ``os.path.join``: that function discards everything before
    a component that starts with "/" and uses the local OS path separator.
    Empty components and leading/trailing slashes on each piece are dropped.
    """
    segments = [bucket.strip("/")]
    segments.extend(part.strip("/") for part in key_parts if part and part.strip("/"))
    return "s3://" + "/".join(segments)


def main():
    """Preprocess the abalone CSV with Spark ML and write train/validation CSV splits to S3.

    Command-line arguments (all required):
        --s3_input_bucket / --s3_input_key_prefix: location of ``abalone.csv``.
        --s3_output_bucket / --s3_output_key_prefix: location under which the
            ``train`` and ``validation`` outputs are written.

    Each output line is ``rings`` followed by the assembled feature vector
    (one-hot encoded ``sex`` plus the seven numeric measurements).
    """
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    # All four locations are mandatory; argparse now reports a missing one up front
    # instead of the script failing later while building the S3 path.
    parser.add_argument("--s3_input_bucket", type=str, required=True, help="s3 input bucket")
    parser.add_argument("--s3_input_key_prefix", type=str, required=True, help="s3 input key prefix")
    parser.add_argument("--s3_output_bucket", type=str, required=True, help="s3 output bucket")
    parser.add_argument("--s3_output_key_prefix", type=str, required=True, help="s3 output key prefix")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()

    # This is needed to save RDDs, which is the only way to write nested Dataframes into CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter",
    )

    # Schema of the input data; the input file does not contain a header row
    schema = StructType([StructField("sex", StringType(), True),
                         StructField("length", DoubleType(), True),
                         StructField("diameter", DoubleType(), True),
                         StructField("height", DoubleType(), True),
                         StructField("whole_weight", DoubleType(), True),
                         StructField("shucked_weight", DoubleType(), True),
                         StructField("viscera_weight", DoubleType(), True),
                         StructField("shell_weight", DoubleType(), True),
                         StructField("rings", DoubleType(), True)])

    # Read the raw CSV from S3 into a Dataframe
    input_uri = _s3_uri(args.s3_input_bucket, args.s3_input_key_prefix, "abalone.csv")
    total_df = spark.read.csv(input_uri, header=False, schema=schema)

    # StringIndexer on the sex column, which holds categorical values
    sex_indexer = StringIndexer(inputCol="sex", outputCol="indexed_sex")

    # One-hot encoding of the string-indexed sex column (indexed_sex)
    sex_encoder = OneHotEncoder(inputCol="indexed_sex", outputCol="sex_vec")

    # VectorAssembler gathers all features into a single vector column so each row can be saved as one CSV line
    assembler = VectorAssembler(inputCols=["sex_vec",
                                           "length",
                                           "diameter",
                                           "height",
                                           "whole_weight",
                                           "shucked_weight",
                                           "viscera_weight",
                                           "shell_weight"],
                                outputCol="features")

    # The pipeline comprises the steps defined above
    pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler])

    # Fit the feature transformers on the full dataset
    model = pipeline.fit(total_df)

    # Transform the dataset with the information obtained from the fit
    transformed_total_df = model.transform(total_df)

    # Split the overall dataset into 80-20 training and validation.
    # No seed is passed, so the split differs from run to run.
    (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2])

    # Convert each split to an RDD of CSV lines (label first) and save it under its own S3 prefix
    for split_name, split_df in (("train", train_df), ("validation", validation_df)):
        split_lines = split_df.rdd.map(lambda row: (row.rings, row.features)).map(csv_line)
        split_lines.saveAsTextFile(
            _s3_uri(args.s3_output_bucket, args.s3_output_key_prefix, split_name)
        )


# Run the preprocessing job only when this file is executed as the entry-point script, not when imported
if __name__ == "__main__":
    main()

Writing ./code/preprocess.py


## Run the SageMaker Processing Job

In [8]:
from sagemaker.spark.processing import PySparkProcessor

# Each run uploads the raw dataset under its own timestamped S3 prefix
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
prefix = f"sagemaker/spark-preprocess-demo/{timestamp_prefix}"
input_prefix_abalone = f"{prefix}/input/raw/abalone"
input_preprocessed_prefix_abalone = f"{prefix}/input/preprocessed/abalone"

sagemaker_session.upload_data(
    path="./data/abalone.csv",
    bucket=bucket,
    key_prefix=input_prefix_abalone,
)

# Two-instance Spark 2.4 processing job, capped at 20 minutes of runtime
spark_processor = PySparkProcessor(
    base_job_name="sm-spark",
    framework_version="2.4",
    role=role,
    instance_count=2,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=1200,
)

# Submit preprocess.py; Spark event logs are written next to the data under the same prefix
spark_processor.run(
    submit_app="./code/preprocess.py",
    arguments=[
        "--s3_input_bucket", bucket,
        "--s3_input_key_prefix", input_prefix_abalone,
        "--s3_output_bucket", bucket,
        "--s3_output_key_prefix", input_preprocessed_prefix_abalone,
    ],
    spark_event_logs_s3_uri=f"s3://{bucket}/{prefix}/spark_event_logs",
    logs=False,
)

Creating processing-job with name sm-spark-2021-01-22-05-11-54-716



Job Name:  sm-spark-2021-01-22-05-11-54-716
Inputs:  [{'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sm-spark-2021-01-22-05-11-54-716/input/code/preprocess.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'S3Output': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sagemaker/spark-preprocess-demo/2021-01-22-05-11-54/spark_event_logs', 'LocalPath': '/opt/ml/processing/spark-events/', 'S3UploadMode': 'Continuous'}}]
......................................................................!

## Validate Data Processing Results

In [20]:
print("Top 5 rows from s3://{}/{}/train/".format(bucket, input_preprocessed_prefix_abalone))
!aws s3 ls s3://$bucket/$input_preprocessed_prefix_abalone/train/

!aws s3 cp --quiet s3://$bucket/$input_preprocessed_prefix_abalone/train/part-00000 - | head -n5

Top 5 rows from s3://sagemaker-cn-north-1-876820548815/sagemaker/spark-preprocess-demo/2021-01-22-05-11-54/input/preprocessed/abalone/train/
2021-01-22 05:17:18          0 _SUCCESS
2021-01-22 05:17:18     177556 part-00000
5.0,0.0,0.0,0.275,0.195,0.07,0.08,0.031,0.0215,0.025
6.0,0.0,0.0,0.29,0.21,0.075,0.275,0.113,0.0675,0.035
5.0,0.0,0.0,0.29,0.225,0.075,0.14,0.0515,0.0235,0.04
7.0,0.0,0.0,0.305,0.225,0.07,0.1485,0.0585,0.0335,0.045
7.0,0.0,0.0,0.305,0.23,0.08,0.156,0.0675,0.0345,0.048


In [10]:
## View the Spark UI

In [11]:
# Start the Spark history server for the most recent processor (the recorded run pulls a Docker image for it).
# NOTE(review): presumably it serves the event logs written to spark_event_logs_s3_uri by the run above — confirm.
spark_processor.start_history_server()

Pulling spark history server image...
docker command: docker pull 671472414489.dkr.ecr.cn-north-1.amazonaws.com.cn/sagemaker-spark-processing:2.4-cpu
image pulled: 671472414489.dkr.ecr.cn-north-1.amazonaws.com.cn/sagemaker-spark-processing:2.4-cpu
History server terminated
Starting history server...
History server failed to start. Please run 'docker logs history_server' to see logs


In [None]:
# Stop the history server started in the previous cell once you are done with the Spark UI
spark_processor.terminate_history_server()

## Example 2: Specify additional python and jar file dependencies

In [12]:
%%writefile ./code/hello_py_spark_app.py
import argparse
import time

# Import local module to test spark-submit --py-files dependencies
import hello_py_spark_udfs as udfs
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import time

if __name__ == "__main__":
    # Entry point: read a JSON sales data set, add a doubled "sale" column via an
    # imported UDF, and write the result as JSON (or show it when no output is given).
    print("Hello World, this is PySpark!")

    parser = argparse.ArgumentParser(description="inputs and outputs")
    # The input path is always read below, so make argparse reject a missing value up front
    parser.add_argument("--input", type=str, required=True, help="path to input data")
    parser.add_argument("--output", required=False, type=str, help="path to output data")
    args = parser.parse_args()
    spark = SparkSession.builder.appName("SparkTestApp").getOrCreate()

    # Load test data set
    inputPath = args.input
    outputPath = args.output
    salesDF = spark.read.json(inputPath)
    salesDF.printSchema()

    salesDF.createOrReplaceTempView("sales")

    # Define a UDF that doubles an integer column
    # The UDF function is imported from a local module to test spark-submit --py-files dependencies
    double_udf_int = udf(udfs.double_x, IntegerType())

    doubledDF = salesDF.select("date", "sale", double_udf_int("sale").alias("sale_double"))

    if outputPath:
        # Save transformed data set to the requested location
        doubledDF.write.json(outputPath)
    else:
        # --output is optional: without it there is nowhere to write, so display the rows instead
        doubledDF.show()

Writing ./code/hello_py_spark_app.py


In [13]:
%%writefile ./code/hello_py_spark_udfs.py
def double_x(x):
    """Return ``x + x``.

    A ``None`` input (how a null column value reaches a Python UDF) is passed
    through as ``None`` instead of raising ``TypeError``.
    """
    if x is None:
        return None
    return x + x

Writing ./code/hello_py_spark_udfs.py


In [15]:
# Build per-run input/output S3 locations for the sales data
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
prefix = f"sagemaker/spark-preprocess-demo/{timestamp_prefix}"
input_prefix_sales = f"{prefix}/input/sales"
output_prefix_sales = f"{prefix}/output/sales"
input_s3_uri = f"s3://{bucket}/{input_prefix_sales}"
output_s3_uri = f"s3://{bucket}/{output_prefix_sales}"

# NOTE(review): ./data/data.jsonl is not created by any cell visible in this notebook —
# presumably it ships alongside the notebook; confirm before running.
sagemaker_session.upload_data(
    path="./data/data.jsonl",
    bucket=bucket,
    key_prefix=input_prefix_sales,
)

spark_processor = PySparkProcessor(
    base_job_name="sm-spark-udfs",
    framework_version="2.4",
    role=role,
    instance_count=2,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=1200,
)

# The UDF module is shipped next to the main script through submit_py_files
spark_processor.run(
    submit_app="./code/hello_py_spark_app.py",
    submit_py_files=["./code/hello_py_spark_udfs.py"],
    arguments=[
        "--input", input_s3_uri,
        "--output", output_s3_uri,
    ],
    logs=False,
)

Copying dependency from local path ./code/hello_py_spark_udfs.py to tmpdir /tmp/tmphnqo6_b8
Uploading dependencies from tmpdir /tmp/tmphnqo6_b8 to S3 s3://sagemaker-cn-north-1-876820548815/sm-spark-udfs-2021-01-22-05-31-48-206/input/py-files
Creating processing-job with name sm-spark-udfs-2021-01-22-05-31-48-206



Job Name:  sm-spark-udfs-2021-01-22-05-31-48-206
Inputs:  [{'InputName': 'py-files', 'S3Input': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sm-spark-udfs-2021-01-22-05-31-48-206/input/py-files', 'LocalPath': '/opt/ml/processing/input/py-files', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sm-spark-udfs-2021-01-22-05-31-48-206/input/code/hello_py_spark_app.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []
..................................................................!

In [17]:
print('Output files in {}'.format(output_s3_uri))
!aws s3 ls $output_s3_uri/

Output files in s3://sagemaker-cn-north-1-876820548815/sagemaker/spark-preprocess-demo/2021-01-22-05-31-48/output/sales
2021-01-22 05:36:56          0 _SUCCESS
2021-01-22 05:36:56      51313 part-00000-ea69e7fb-431d-4ada-9552-30268e848dc1-c000.json


In [21]:
!aws s3 cp --quiet $output_s3_uri/part-00000-ea69e7fb-431d-4ada-9552-30268e848dc1-c000.json - | head -n5

{"date":"2020-01-04","sale":283,"sale_double":566}
{"date":"2020-01-06","sale":140,"sale_double":280}
{"date":"2020-01-05","sale":820,"sale_double":1640}
{"date":"2020-01-04","sale":452,"sale_double":904}
{"date":"2020-01-06","sale":495,"sale_double":990}


## Example 3: Run a Java/Scala Spark application

In [22]:
from sagemaker.spark.processing import SparkJarProcessor

# Build per-run input/output S3 locations and upload the raw sales data
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
prefix = f"sagemaker/spark-preprocess-demo/{timestamp_prefix}"
input_prefix_sales = f"{prefix}/input/sales"
output_prefix_sales = f"{prefix}/output/sales"
input_s3_uri = f"s3://{bucket}/{input_prefix_sales}"
output_s3_uri = f"s3://{bucket}/{output_prefix_sales}"

sagemaker_session.upload_data(
    path="./data/data.jsonl",
    bucket=bucket,
    key_prefix=input_prefix_sales,
)

spark_processor = SparkJarProcessor(
    base_job_name="sm-spark-java",
    framework_version="2.4",
    role=role,
    instance_count=2,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=1200,
)

# NOTE(review): ./code/spark-test-app.jar is not produced by any cell visible in this
# notebook — presumably it has to be built or provided beforehand; confirm.
spark_processor.run(
    submit_app="./code/spark-test-app.jar",
    submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
    arguments=[
        "--input", input_s3_uri,
        "--output", output_s3_uri,
    ],
    logs=False,
)

Creating processing-job with name sm-spark-java-2021-01-22-06-10-17-930



Job Name:  sm-spark-java-2021-01-22-06-10-17-930
Inputs:  [{'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sm-spark-java-2021-01-22-06-10-17-930/input/code/spark-test-app.jar', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []
.............................................................!

## Validate Data Processing Results

In [23]:
print('Output files in {}'.format(output_s3_uri))
!aws s3 ls $output_s3_uri/

Output files in s3://sagemaker-cn-north-1-876820548815/sagemaker/spark-preprocess-demo/2021-01-22-06-10-17/output/sales
2021-01-22 06:15:02          0 _SUCCESS
2021-01-22 06:15:01      51313 part-00000-71e850dd-dbd6-4159-9292-d037276a17ef-c000.json


In [24]:
!aws s3 cp --quiet $output_s3_uri/part-00000-71e850dd-dbd6-4159-9292-d037276a17ef-c000.json - | head -n5

{"date":"2020-01-01","sale":5,"sale_double":10}
{"date":"2020-01-01","sale":7,"sale_double":14}
{"date":"2020-01-01","sale":15,"sale_double":30}
{"date":"2020-01-01","sale":15,"sale_double":30}
{"date":"2020-01-01","sale":23,"sale_double":46}


## Example 4: Specifying additional Spark configuration

In [25]:
# Each run uploads the raw dataset under its own timestamped S3 prefix
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
prefix = f"sagemaker/spark-preprocess-demo/{timestamp_prefix}"
input_prefix_abalone = f"{prefix}/input/raw/abalone"
input_preprocessed_prefix_abalone = f"{prefix}/input/preprocessed/abalone"

sagemaker_session.upload_data(
    path="./data/abalone.csv",
    bucket=bucket,
    key_prefix=input_prefix_abalone,
)

spark_processor = PySparkProcessor(
    base_job_name="sm-spark",
    framework_version="2.4",
    role=role,
    instance_count=2,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=1200,
)

# Extra Spark settings, passed as a "spark-defaults" classification:
# 2g of memory and a single core per executor
configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {
            "spark.executor.memory": "2g",
            "spark.executor.cores": "1",
        },
    }
]

spark_processor.run(
    submit_app="./code/preprocess.py",
    arguments=[
        "--s3_input_bucket", bucket,
        "--s3_input_key_prefix", input_prefix_abalone,
        "--s3_output_bucket", bucket,
        "--s3_output_key_prefix", input_preprocessed_prefix_abalone,
    ],
    configuration=configuration,
    logs=False,
)

Creating processing-job with name sm-spark-2021-01-22-07-03-43-145



Job Name:  sm-spark-2021-01-22-07-03-43-145
Inputs:  [{'InputName': 'conf', 'S3Input': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sm-spark-2021-01-22-07-03-43-145/input/conf/configuration.json', 'LocalPath': '/opt/ml/processing/input/conf', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-cn-north-1-876820548815/sm-spark-2021-01-22-07-03-43-145/input/code/preprocess.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []
.......................................................................!

In [None]:
print("Top 5 rows from s3://{}/{}/train/".format(bucket, input_preprocessed_prefix_abalone))
!aws s3 ls s3://$bucket/$input_preprocessed_prefix_abalone/train/

!aws s3 cp --quiet s3://$bucket/$input_preprocessed_prefix_abalone/train/part-00000 - | head -n5