# Getting Started with Spark Streaming with Python and Kafka

- Performance Tuning
  - Reference: https://spark.apache.org/docs/latest/streaming-programming-guide.html#performance-tuning
  - Reducing the Batch Processing Times
    - Level of Parallelism in Data Receiving
    - Level of Parallelism in Data Processing
    - Data Serialization
    - Task Launching Overheads
  - Setting the Right Batch Interval
  - Memory Tuning
- Integration with Kafka
  - Introduction to Kafka
  - Why integrate with Kafka
  - DEMO: Demo


In [None]:
import findspark

# TODO: point this at your own Spark installation. The path below is specific
# to the original author's machine (note the 'matthew' home directory) and
# will almost certainly differ on yours.
spark_home = '/home/matthew/spark-2.1.0-bin-hadoop2.7'
findspark.init(spark_home)

In [1]:
import os

# Ask PySpark to pull in the Kafka 0.8 streaming connector when it launches.
# The connector artifact version is kept in step with the Spark distribution
# used by this notebook (spark-2.1.0); the original pinned 2.0.2, which does
# not match that install. Change spark_version if your Spark install differs.
# This is set before the SparkContext is created in a later cell.
spark_version = '2.1.0'
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:%s pyspark-shell'
    % spark_version
)

In [2]:
# Spark
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json

In [3]:
# Create the SparkContext for this demo and raise the log threshold to WARN
# so the streaming output below is not buried in log messages.
app_name = "PythonSparkStreamingKafka"
sc = SparkContext(appName=app_name)
sc.setLogLevel("WARN")

In [4]:
# Batch duration in seconds: each micro-batch covers one minute of data.
batch_interval_seconds = 60
ssc = StreamingContext(sc, batch_interval_seconds)

In [5]:
# Open a Kafka input stream on the streaming context.
zk_quorum = 'cdh57-01-node-01.moffatt.me:2181'   # ZooKeeper host:port
consumer_group = 'spark-streaming'               # Kafka consumer group id
topic_partitions = {'twitter': 1}                # topic name -> partitions to consume
kafkaStream = KafkaUtils.createStream(ssc, zk_quorum, consumer_group, topic_partitions)

In [6]:
# Decode the second element of every Kafka record as JSON.
# (Presumably records are (key, value) pairs and index 1 is the message
# value -- confirm against the KafkaUtils.createStream documentation.)
def _decode_value(record):
    return json.loads(record[1])


parsed = kafkaStream.map(_decode_value)

In [7]:
# Report how many tweets arrived in each batch.
batch_sizes = parsed.count()
batch_sizes.map(lambda n: 'Tweets in this batch: %s' % n).pprint()

In [8]:
# Pull the author's screen name out of every parsed tweet.
def _screen_name(tweet):
    return tweet['user']['screen_name']


authors_dstream = parsed.map(_screen_name)

In [9]:
# Count the number of tweets per author within each batch.
# The resulting elements are indexed below as x[0] (author) and x[1] (count).
author_counts = authors_dstream.countByValue()
# Print a sample of the per-author counts for each batch.
author_counts.pprint()

In [10]:
# Order each batch's (author, count) pairs by descending count.
# Negating the count turns sortBy's ascending order into "largest first".
# An alternative key left (disabled) by the original author sorted by
# lower-cased author name and then by descending count:
#   lambda x: (x[0].lower(), -x[1])
def _by_count_desc(rdd):
    return rdd.sortBy(lambda pair: -pair[1])


author_counts_sorted_dstream = author_counts.transform(_by_count_desc)

In [11]:
# Print the leading entries of each batch's sorted author counts.
author_counts_sorted_dstream.pprint()

In [12]:
# Get top 5 authors by tweet count.
# take(5) hands back a plain Python list, so it is turned back into an RDD
# for transform(). The RDD's own context is used for that rather than the
# notebook-global `sc`, so the function depends only on its argument and does
# not capture the SparkContext in its closure.
top_five_authors = author_counts_sorted_dstream.transform(
    lambda rdd: rdd.context.parallelize(rdd.take(5)))
top_five_authors.pprint()

In [13]:
# Keep authors with more than one tweet in the batch, or whose username
# starts with 'rm' (compared case-insensitively).
def _is_repeat_or_rm_author(author_count):
    if author_count[1] > 1:
        return True
    return author_count[0].lower().startswith('rm')


filtered_authors = author_counts.filter(_is_repeat_or_rm_author)

In [14]:
# Show the filtered authors, highest tweet count first.
filtered_sorted = filtered_authors.transform(
    lambda batch: batch.sortBy(lambda pair: -pair[1]))
filtered_sorted.pprint()

In [15]:
# List the most common words in the tweets
# Most common words in the tweets, most frequent first.
# Tweet text is split on single spaces only, exactly as in the original.
words = parsed.flatMap(lambda tweet: tweet['text'].split(" "))
word_counts = words.countByValue()
word_counts_sorted = word_counts.transform(
    lambda rdd: rdd.sortBy(lambda pair: -pair[1]))
word_counts_sorted.pprint()

In [16]:
# Start the streaming computation and wait for it with a 180-second timeout.
# NOTE(review): nothing here stops the context once the timeout expires; if
# the job is meant to end after the demo, an explicit ssc.stop() is probably
# needed -- confirm the intended behaviour.
run_seconds = 180
ssc.start()
ssc.awaitTermination(timeout=run_seconds)

-------------------------------------------
Time: 2017-01-11 15:34:00
-------------------------------------------
Tweets in this batch: 188

-------------------------------------------
Time: 2017-01-11 15:34:00
-------------------------------------------
(u'jenniekmz', 1)
(u'SpamNewton', 1)
(u'ShawtieMac', 1)
(u'agathatochetti', 1)
(u'Tommyguns_____', 1)
(u'zwonderwomanzzz', 1)
(u'Blesschubstin', 1)
(u'Prikes5', 1)
(u'MayaParms', 1)
...

-------------------------------------------
Time: 2017-01-11 15:34:00
-------------------------------------------
(u'RitaBezerra12', 3)
(u'xKYLN', 2)
(u'yourmydw', 2)
(u'wintersheat', 2)
(u'biebercuzou', 2)
(u'pchrin_', 2)
(u'uslaybieber', 2)
(u'rowblanchsrd', 2)
(u'__Creammy__', 2)
(u'jenniekmz', 1)
...

-------------------------------------------
Time: 2017-01-11 15:34:00
-------------------------------------------
(u'RitaBezerra12', 3)
(u'xKYLN', 2)
(u'yourmydw', 2)
(u'wintersheat', 2)
(u'biebercuzou', 2)

-------------------------------------------