# Spark Streaming - Process Input

In [2]:
from time import sleep

import pyspark.sql.types as tp
from pyspark import SparkConf, SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext

## Manual Input From Netcat

This section reads every line typed into a terminal that is running Netcat as a server on port 9879. Start it with `nc -lk 9879` before running the cells below.

In [2]:
# Reuse the running SparkContext when this cell is re-executed: constructing a
# second SparkContext in the same kernel raises ValueError.
spark_conf = SparkConf().setMaster("local[2]").setAppName("Spark Streaming Input")
sc = SparkContext.getOrCreate(spark_conf)
ssc = StreamingContext(sc, 1)  # batch interval of 1 second

In [3]:
# Connect to the Netcat server; the stream carries the text lines it sends.
netcat_host, netcat_port = "localhost", 9879
lines = ssc.socketTextStream(netcat_host, netcat_port)

In [None]:
# Tokenize each line on any run of whitespace. split() with no argument drops
# the empty strings that split(" ") produces for repeated or trailing spaces,
# so "" is never counted as a word.
words = lines.flatMap(lambda line: line.split())

# Pair every word with 1 and sum the pairs per word within each batch.
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

# Register the per-batch printout; nothing runs until the context is started.
wordCounts.pprint()

In [None]:
ssc.start()  # Start the computation
try:
    # Blocks until the stream is stopped; interrupt the kernel to end it.
    ssc.awaitTermination()
except KeyboardInterrupt:
    print("Interrupted - stopping the streaming context.")
finally:
    # Always release both contexts so the next section can create fresh ones
    # instead of failing because a context is still running.
    ssc.stop(stopSparkContext=True, stopGraceFully=False)

## Read a File as a Stream

In [33]:
# A StreamingContext left running by an earlier cell would prevent a new one
# from starting, so shut it (and its SparkContext) down first.
active_ssc = StreamingContext.getActive()
if active_ssc is not None:
    active_ssc.stop(stopSparkContext=True, stopGraceFully=False)

# getOrCreate reuses a still-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setMaster("local[2]").setAppName("Spark Streaming Input")
sc = SparkContext.getOrCreate(spark_conf)
ssc = StreamingContext(sc, 1)  # batch interval of 1 second
spark = SparkSession(sc)

In [34]:
# Load the sample tweets; the first CSV row supplies the column names.
df = spark.read.option("header", True).csv("sample.csv")

# Preview the rows, then print the column schema.
df.show()
df.printSchema()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+

root
 |-- id: string (nullable = true)
 |-- label: string (nullable = true)
 |-- tweet: string (nullable = true)



In [35]:
# Build one single-element RDD per CSV row so the queue stream can hand out
# the tweets one batch at a time. Label and tweet travel together as a tuple
# rather than as one concatenated string, so a label longer than one
# character (or a null field) no longer corrupts or crashes the formatting.
rddQueue = [
    sc.parallelize([(row['label'], row['tweet'])])
    for row in df.rdd.collect()
]

inputStream = ssc.queueStream(rddQueue)
inputStream.map(
    lambda pair: "Label: {}, Tweet: {}".format(pair[0], pair[1])
).pprint()

ssc.start()
try:
    # Wait about one second per queued RDD so every row gets its own batch
    # before shutdown. Assumes the 1-second batch interval used when ssc was
    # created; stopping earlier may leave rows unprocessed.
    sleep(len(rddQueue))
finally:
    # Stop even if the wait is interrupted, so no context is left running.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2019-12-22 00:10:00
-------------------------------------------
Label: 0, Tweet:  @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run

-------------------------------------------
Time: 2019-12-22 00:10:01
-------------------------------------------
Label: 0, Tweet: @user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked

-------------------------------------------
Time: 2019-12-22 00:10:02
-------------------------------------------
Label: 0, Tweet:   bihday your majesty

-------------------------------------------
Time: 2019-12-22 00:10:03
-------------------------------------------
Label: 0, Tweet: #model   i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦  

-------------------------------------------
Time: 2019-12-22 00:10:04
-------------------------------------------
Label: 0, Tweet:  facts