# Kafka Consumer

In [1]:
import os

# Package coordinate handed to spark-submit through the --packages flag.
KAFKA_CONNECTOR_PACKAGE = 'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2'

# Assemble the submit arguments read from the PYSPARK_SUBMIT_ARGS environment variable.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(
    ['--packages', KAFKA_CONNECTOR_PACKAGE, 'pyspark-shell']
)

### Import dependencies


In [2]:
#Stream processing
import json

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [3]:
#Text processing
import preprocessor as p
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

### Text cleaning

In [4]:
# Stop-word set: NLTK's English list plus a few tweet-specific tokens,
# with "not" taken out so that negations survive the cleaning step.
# {"not"} is a one-element set; set("not") would build the character set
# {'n', 'o', 't'} and leave the word "not" among the stop words.
stop_words = (set(stopwords.words("english")) | {"im", "thats", "rt"}) - {"not"}

# Elements that p.clean() strips from the text: URLs, mentions, emojis, smileys.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)

def preprocess(text):
    """Clean a raw tweet and return its kept tokens joined by single spaces.

    Steps, in order:
      1. ``p.clean`` strips whatever was configured through ``p.set_options``.
      2. The text is lower-cased.
      3. Every character in ``string.punctuation`` is deleted.
      4. The result is tokenized with ``word_tokenize`` and tokens found in
         ``stop_words`` are dropped.

    No stemming or part-of-speech filtering is applied.
    """
    lowered = p.clean(text).lower()

    # Translation table that maps each punctuation character to "deleted".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    without_punctuation = lowered.translate(strip_punctuation)

    kept_tokens = (
        token
        for token in word_tokenize(without_punctuation)
        if token not in stop_words
    )
    return ' '.join(kept_tokens)

### Create Spark context


In [5]:
# Reuse the active SparkContext when one already exists, otherwise create it.
# Calling the SparkContext constructor directly always tries to build a new
# context and fails when the cell is re-run in a live kernel; the classmethod
# getOrCreate(conf) is the supported way to make this cell idempotent.
spark_conf = SparkConf().setAppName("PythonSparkStreamingKafka_RM_01")
sc = SparkContext.getOrCreate(spark_conf)

# Keep the notebook output readable: only warnings and errors from Spark.
sc.setLogLevel("WARN")

### Create Streaming Context


In [6]:
# Batch duration handed to the StreamingContext (seconds between micro-batches).
BATCH_INTERVAL_SECONDS = 10

ssc = StreamingContext(sparkContext=sc, batchDuration=BATCH_INTERVAL_SECONDS)

# Checkpoint directory (relative path) used by the streaming context.
ssc.checkpoint('ssc_checkpoint')

### Connect to Kafka


In [7]:
# Kafka broker host; the author also noted 35.228.250.247 as an alternative.
ip = 'localhost'
broker_address = ip + ':9092'

kafkaParams = {
    "metadata.broker.list": broker_address,
    "auto.offset.reset": 'smallest',
}

# Direct stream over the 'DIC' topic.
myStream = KafkaUtils.createDirectStream(ssc, ['DIC'], kafkaParams)

In [8]:
def _parse_message(record):
    """Decode the JSON document held in the second element of a stream record."""
    return json.loads(record[1])


def _key_with_tokens(tweet):
    """Turn a tweet dict into ((tag, date), [tokens of the cleaned tweet text])."""
    return ((tweet['tag'], tweet['date']), preprocess(tweet['tweet']).split())


def _word_count_pair(pair):
    """Flatten ((tag, date), word) into ((tag, date, word), 1)."""
    (tag, date), word = pair
    return ((tag, date, word), 1)


# JSON payload -> ((tag, date), tokens)
tweets = myStream.map(_parse_message).map(_key_with_tokens)

# One ((tag, date), word) record per token.
pairs = tweets.flatMapValues(lambda tokens: tokens)

# ((tag, date, word), 1) records, ready to be counted per key.
words = pairs.map(_word_count_pair)

In [9]:
def updateFunction(newValues, runningCount):
    """Return the updated running total for one key.

    Args:
        newValues: iterable of the values that arrived for the key in the
            current batch (may be empty).
        runningCount: total accumulated so far, or None when the key has no
            previous state.

    Returns:
        The previous total (0 when there was none) plus the sum of newValues.
    """
    previous_total = 0 if runningCount is None else runningCount
    return sum(newValues, previous_total)

# Keep a running total per (tag, date, word) key across batches, folding each
# batch's new values into the previous state with updateFunction.
counts = words.updateStateByKey(updateFunction)

In [10]:
# Upper bound, in seconds, on how long this cell blocks waiting for the stream.
RUN_SECONDS = 60

# Print a sample of the running counts for every batch.
counts.pprint()

# Launch the streaming job, then wait for it to terminate or for the timeout
# to elapse. Note that this cell never calls ssc.stop().
ssc.start()
ssc.awaitTermination(timeout=RUN_SECONDS)

-------------------------------------------
Time: 2019-10-19 11:35:00
-------------------------------------------

-------------------------------------------
Time: 2019-10-19 11:35:10
-------------------------------------------
(('#UFCBoston', '2019-10-19', 'gets'), 1)
(('#UFCBoston', '2019-10-19', 'first'), 1)
(('#UFCBoston', '2019-10-19', 'dominick'), 1)
(('#UFCBoston', '2019-10-19', 'chris'), 1)
(('#UFCBoston', '2019-10-19', '33'), 1)
(('#UFCBoston', '2019-10-19', 'round'), 1)
(('#UFCBoston', '2019-10-19', 'home'), 1)
(('#UFCBoston', '2019-10-19', 'lakes'), 1)
(('#UFCBoston', '2019-10-19', 'minutes'), 1)
(('#UFCBoston', '2019-10-19', 'move'), 1)
...

-------------------------------------------
Time: 2019-10-19 11:35:20
-------------------------------------------
(('#UFCBoston', '2019-10-19', 'gets'), 1)
(('#UFCBoston', '2019-10-19', 'first'), 1)
(('#UFCBoston', '2019-10-19', 'dominick'), 1)
(('#UFCBoston', '2019-10-19', 'chris'), 1)
(('#UFCBoston', '2019-10-19', '33'), 1)
(('#UFCBo