# PySpark Scaling Demo

## SDSC Summer Institute

Mai H. Nguyen - UC San Diego

--- 

## Setup

In [1]:
# Import modules
import os

# Initialize Spark

import pyspark
from pyspark.sql import SparkSession, Row

# Build the Spark configuration for this demo and start (or reuse) a session.
#
# Scaling experiment: change N in 'local[N]' to change the number of local
# worker threads, then re-run the notebook and note the change in execution
# time. getOrCreate() returns an already-running session if one exists, so
# stop the current session (or restart the kernel) before changing settings.
conf = pyspark.SparkConf().setAll([
    ('spark.master', 'local[1]'),
    ('spark.app.name', 'Spark Demo'),
    ('spark.driver.memory', '3G'),
    # Spark property names are case-sensitive. The documented key is
    # 'spark.driver.maxResultSize'; an all-lowercase spelling is not a
    # recognized property, so the limit would silently stay at its default.
    ('spark.driver.maxResultSize', '2G'),
    ('spark.executor.memory', '2G'),
])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Report the PySpark version so recorded timings can be tied to a release.
print(pyspark.version.__version__)

3.1.2


In [2]:
# Show the default parallelism Spark reports for this session. With the master
# set to 'local[1]' above, the recorded output is 1 -- presumably this tracks
# N in local[N]; confirm by re-running with a different N.
print(spark.sparkContext.defaultParallelism)

1


In [3]:
# Record starting time.
# start_time holds the wall-clock timestamp returned by time.time(); a later
# cell subtracts it from a fresh time.time() to report the elapsed seconds for
# the read + processing steps.

import time
start_time = time.time()

## Read data

In [4]:
# Load the book-review text file into a Spark DataFrame.
# Data source: https://jmcauley.ucsd.edu/data/amazon/

# The file lives under a per-user, per-job scratch directory, so both pieces
# are taken from the environment (a missing variable raises KeyError).
USER = os.environ['USER']
SLURM_JOBID = os.environ['SLURM_JOBID']

data_path = f'/scratch/{USER}/job_{SLURM_JOBID}'
dataFileName = f'{data_path}/BookReviews_5M.txt'
print(dataFileName)

# Read the file as text and mark the resulting DataFrame for caching so that
# later actions can reuse it.
reader = spark.read
textDF = reader.text(dataFileName).cache()

/scratch/mhnguyen/job_14831921/BookReviews_5M.txt


## Process data

In [5]:
%%time

# Count number of rows.
# This is the first action run on textDF in this notebook, so the wall time
# reported for this cell presumably includes reading the file, not just
# counting -- compare this number across different local[N] settings.
textDF.count()

CPU times: user 2 ms, sys: 127 µs, total: 2.12 ms
Wall time: 7.84 s


5000000

In [6]:
# Show first few rows.
# Called with no arguments, show() prints the top 20 rows and abbreviates long
# strings with '...', as seen in the output below.

textDF.show()

+--------------------+
|               value|
+--------------------+
|This was the firs...|
|Also after going ...|
|As with all of Ms...|
|I've not read any...|
|This romance nove...|
|Carolina Garcia A...|
|Not only can she ...|
|Once again Garcia...|
|The timing is jus...|
|Engaging. Dark. R...|
|Set amid the back...|
|This novel is a d...|
|If readers are ad...|
| Reviewed by Phyllis|
|      APOOO BookClub|
|A guilty pleasure...|
|In the tradition ...|
|Beryl Unger, top ...|
|What follows is a...|
|The book flap say...|
+--------------------+
only showing top 20 rows



In [7]:
# Print time since execution start.
# The value is in seconds (difference of two time.time() readings) and is
# measured from the cell that set start_time, so it covers reading the data
# and the processing cells above.

print(time.time() - start_time)

9.365109205245972


## Stop Spark Session

In [8]:
# Shut down the Spark session created in the setup cell. Stopping it is also
# what allows a later getOrCreate() call to build a session with new settings.
spark.stop()