# Caribbean Spark

### *Demonstration of in-memory distributed data processing with Apache Spark*
PyCaribbean 2019

In [1]:
# A SparkContext represents the connection to a Spark cluster

import re
from pyspark import SparkConf, SparkContext

FILE = "pycaribbean.html"  # input document whose words are counted

# Describe the application in a SparkConf and let the getOrCreate() classmethod
# return the context that is already running, if there is one. Constructing
# SparkContext(...) directly raises "Cannot run multiple SparkContexts at once"
# when this cell is executed a second time in the same kernel, so chaining
# .getOrCreate() onto a freshly built instance never made the cell re-runnable.
spark_conf = (
    SparkConf()
    .setAppName("PyCaribbeanWordCount")
    .setMaster("local[*]")
)
sc = SparkContext.getOrCreate(spark_conf)

In [2]:
# Read the text file into a Resilient Distributed Dataset of lines spread over
# the Spark workers, then flatten every line into its lower-cased word tokens.

def _tokenize(line):
    """Return every run of word characters in `line`, lower-cased."""
    return re.findall(r'\w+', line.lower())


# Request at least 8 partitions: the local machine has 8 cores.
linesRDD = sc.textFile(FILE, minPartitions=8)
wordsRDD = linesRDD.flatMap(_tokenize)
print(
    wordsRDD.take(5), "\n\n",
    type(wordsRDD), wordsRDD.getNumPartitions(),
)

['pycaribbean', '2019', 'santo', 'domingo', 'dominican'] 

 <class 'pyspark.rdd.PipelinedRDD'> 8


In [3]:
# RDD Transformation: Map
# Turn each word into a (word, 1) pair so the counts can be summed per word.

def _pair_with_one(word):
    """Return the tuple (word, 1)."""
    return (word, 1)


wordsMap = wordsRDD.map(_pair_with_one)
print(
    wordsMap.take(5), "\n\n",
    type(wordsMap), wordsMap.getNumPartitions(),
)

[('pycaribbean', 1), ('2019', 1), ('santo', 1), ('domingo', 1), ('dominican', 1)] 

 <class 'pyspark.rdd.PipelinedRDD'> 8


In [4]:
# RDD Transformation: Reduce
# Combine all pairs that share a key (the word) by adding their counts.

def _add_counts(left, right):
    """Return the sum of two partial counts for the same word."""
    return left + right


wordsReduce = wordsMap.reduceByKey(_add_counts)
print(
    wordsReduce.take(5), "\n\n",
    type(wordsReduce), wordsReduce.getNumPartitions(),
)

[('python', 2), ('of', 16), ('16', 2), ('february', 2), ('book', 1)] 

 <class 'pyspark.rdd.PipelinedRDD'> 8


In [5]:
# RDD Action: Collect to Spark Driver
# Order the (word, count) pairs by count, largest first, then pull every pair
# back to the driver as a plain Python list.

sorted_by_count = wordsReduce.sortBy(lambda pair: pair[1], ascending=False)
output = sorted_by_count.collect()
print(output[:5], "\n\n", type(output))

[('the', 27), ('of', 16), ('in', 9), ('and', 9), ('pycaribbean', 8)] 

 <class 'list'>


In [6]:
# Shut down the SparkContext now that the results have been collected.
sc.stop()