# Kafka Consumer

In [1]:
import os
# Request the spark-streaming-kafka-0-8 connector (Scala 2.11 build, version 2.0.2)
# through the submit arguments PySpark picks up from the environment.
# NOTE(review): presumably this has to run before the SparkContext is created - confirm.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell',
})

### Import dependencies


In [2]:
#Stream processing
import json

from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [3]:
#Text processing
import preprocessor as p
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

### Text cleaning

In [4]:
# Start from NLTK's English stop-word list and tailor it for tweets.
stop_words = set(stopwords.words("english"))

# Extra tokens to drop: "im" and "thats" (apostrophe-less forms) and the "rt" retweet marker.
stop_words |= {"im", "thats", "rt"}

# Keep the word "not" in the cleaned text by taking it out of the stop-word set.
# The operand must be a one-element set: set("not") iterates the string and yields
# {'n', 'o', 't'}, which would leave "not" filtered out and instead un-filter the
# single-letter stop words "o" and "t".
stop_words -= {"not"}

# Tell the tweet preprocessor which artefacts p.clean() should strip:
# URLs, @mentions, emojis and smileys.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY)

def preprocess(text):
    """Clean a raw tweet and return its remaining words as one space-separated string.

    Steps, in order:
      1. ``p.clean`` strips whatever was configured via ``p.set_options``.
      2. The text is lower-cased.
      3. Every character in ``string.punctuation`` (ASCII punctuation only) is deleted.
      4. The text is tokenized with NLTK's ``word_tokenize``.
      5. Tokens present in the module-level ``stop_words`` set are dropped.

    No stemming is performed. Characters outside ``string.punctuation``
    (for example the ellipsis character) are left in place.

    Args:
        text: the raw tweet text (a ``str``).

    Returns:
        A ``str`` with the surviving tokens joined by single spaces.
    """
    lowered = p.clean(text).lower()

    # Translation table that maps every ASCII punctuation character to "deleted".
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = word_tokenize(lowered.translate(punctuation_table))

    # A noun-only variant based on nltk.pos_tag (tags starting with 'NN') was
    # tried here previously and is intentionally not active.
    kept = filter(lambda token: token not in stop_words, tokens)

    return ' '.join(kept)

### Create Spark context


In [5]:
# Reuse the active SparkContext if this cell is re-run, otherwise create one with
# the application name below. Calling the SparkContext constructor directly raises
# when a context already exists; getOrCreate is the classmethod meant for this.
spark_conf = SparkConf().setAppName("PythonSparkStreamingKafka_RM_01")
sc = SparkContext.getOrCreate(spark_conf)

# Only show warnings and errors so the streamed output stays readable.
sc.setLogLevel("WARN")

### Create Streaming Context


In [6]:
# Streaming context on top of the Spark context, with a batch duration of 10 (seconds).
ssc = StreamingContext(sparkContext=sc, batchDuration=10)

# Checkpoint data is written to the relative directory 'ssc_checkpoint'.
ssc.checkpoint('ssc_checkpoint')

### Connect to Kafka


In [7]:
# Consumer settings handed to the Kafka direct stream.
# NOTE(review): 'largest' presumably means "start from the newest offsets" - confirm
# against the Kafka 0.8 consumer configuration.
kafkaParams = {
    "metadata.broker.list": '35.228.250.247:9092',
    "auto.offset.reset": 'largest',
}

# Direct (receiver-less) stream subscribed to the single topic 'DIC'.
myStream = KafkaUtils.createDirectStream(
    ssc=ssc,
    topics=['DIC'],
    kafkaParams=kafkaParams,
)

In [8]:
def _decode_record(item):
    """Parse the JSON document stored in the second element of a stream record."""
    return json.loads(item[1])


def _key_with_tokens(tweet):
    """Map a decoded tweet to ((tag, date), [cleaned tokens])."""
    key = (tweet['tag'], tweet['date'])
    return (key, preprocess(tweet['tweet']).split())


# One decoded dict per incoming record.
tweets = myStream.map(_decode_record)

# Key each tweet by (tag, date) and attach the list of cleaned words.
pairs = tweets.map(_key_with_tokens)

# Flatten so that every word becomes its own ((tag, date), word) element.
trial = pairs.flatMapValues(lambda tokens: tokens)

trial.pprint()

# Disabled alternative: sliding-window word count over the cleaned tweets.
#tweets = myStream.map(lambda x: preprocess(x[1]))
##words = tweets.flatMap(lambda x: x.split(" "))

##pairs = words.map(lambda x: (x, 1))
##counts = pairs.reduceByKey(lambda n1, n2: n1+n2)
##cumulated_count = counts.reduceByKeyAndWindow(lambda n1, n2: n1 + n2, lambda n1,n2: n1-n2, windowDuration = 30, slideDuration = 10)
##cumulated_count.pprint()

In [9]:
# Kick off the streaming computation defined above.
ssc.start()

# Block this cell for at most 60 (seconds) while batches are processed.
# NOTE(review): the context is not stopped here, so it presumably keeps running
# after the wait returns - confirm whether an explicit ssc.stop() is wanted.
ssc.awaitTermination(60)

-------------------------------------------
Time: 2019-10-18 19:37:20
-------------------------------------------
(('#MakeItRight', '2019-10-18 17:37:04'), 'always')
(('#MakeItRight', '2019-10-18 17:37:04'), 'stream')
(('#MakeItRight', '2019-10-18 17:37:04'), 'new')
(('#MakeItRight', '2019-10-18 17:37:04'), 'collab')
(('#MakeItRight', '2019-10-18 17:37:04'), '▶️')
(('#MakeItRight', '2019-10-18 17:37:04'), 'https…')
(('#MakeItRight', '2019-10-18 17:37:05'), 'thank')
(('#MakeItRight', '2019-10-18 17:37:05'), 'u')
(('#MakeItRight', '2019-10-18 17:37:05'), 'guys')
(('#MakeItRight', '2019-10-18 17:37:05'), 'thank')
...

-------------------------------------------
Time: 2019-10-18 19:37:30
-------------------------------------------
(('#MakeItRight', '2019-10-18 17:37:14'), 'compiled')
(('#MakeItRight', '2019-10-18 17:37:14'), 'animation')
(('#MakeItRight', '2019-10-18 17:37:14'), 'scenes')
(('#MakeItRight', '2019-10-18 17:37:14'), 'makes')
(('#MakeItRight', '2019-10-18 17:37:14'), 'sense')
