In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp

### Load Data

In [2]:
# Create the Spark entry points shared by the rest of the notebook.
# "local[n]" runs Spark locally with n threads available; here n = 2.
sc = SparkContext(master="local[2]", appName="spark streaming twitter sentiment")
ssc = StreamingContext(sparkContext=sc, batchDuration=1)  # batch interval of 1 second
spark = SparkSession(sparkContext=sc)

In [3]:
# All three columns are declared as nullable strings.
schema = tp.StructType(
    [tp.StructField(name, tp.StringType(), True) for name in ('id', 'label', 'tweet')]
)

# Read the sample file with the explicit schema; the first line is a header row.
sample_df = spark.read.csv('sample.csv', schema=schema, header=True)

# Preview the rows, then confirm the column types.
sample_df.show()
sample_df.printSchema()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+

root
 |-- id: string (nullable = true)
 |-- label: string (nullable = true)
 |-- tweet: string (nullable = true)



### Basic Sentiment Analysis

I'm in my hometown at the moment, where the network connection is poor and most websites are unreachable, so the resources I can look up are very limited. Meanwhile, Spark's machine learning library has many limitations when you want to do sentiment analysis yourself with machine learning algorithms. Therefore, I'm using a very basic pre-trained sentiment analysis tool (NLTK's VADER) here to show how Spark Streaming works.

In [4]:
from time import sleep
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Build one single-record RDD per row; queueStream (with its default
# oneAtATime behaviour) then consumes one RDD from the queue per batch.
# Each record is a (label, tweet) pair instead of one concatenated string, so:
#   * labels longer than one character are not split apart, and
#   * a null label or tweet (both columns are nullable) no longer raises a
#     TypeError; a null label is shown as "None" and a null tweet is scored
#     as empty text.
rddQueue = [
    sc.parallelize([(str(row['label']), row['tweet'] or "")])  # parallelize() makes the record a distributable RDD
    for row in sample_df.collect()
]

inputStream = ssc.queueStream(rddQueue)
inputStream.map(
    lambda rec: "Predicted Results: " + str(sia.polarity_scores(rec[1])) + ",  Label: " + rec[0]
).pprint()

ssc.start()
try:
    # The sleep decides how long the stream runs before shutdown; stopping
    # too early may leave some of the queued data unprocessed.
    sleep(4)
finally:
    # Always shut the streaming context down, even if the wait is interrupted
    # (e.g. the cell is stopped by the user). This also stops the SparkContext.
    ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2020-01-05 17:15:38
-------------------------------------------
Predicted Results: {'neg': 0.385, 'neu': 0.615, 'pos': 0.0, 'compound': -0.8296},  Label: 0

-------------------------------------------
Time: 2020-01-05 17:15:39
-------------------------------------------
Predicted Results: {'neg': 0.0, 'neu': 0.744, 'pos': 0.256, 'compound': 0.6705},  Label: 0

-------------------------------------------
Time: 2020-01-05 17:15:40
-------------------------------------------
Predicted Results: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},  Label: 0

-------------------------------------------
Time: 2020-01-05 17:15:41
-------------------------------------------
Predicted Results: {'neg': 0.0, 'neu': 0.663, 'pos': 0.337, 'compound': 0.7249},  Label: 0

-------------------------------------------
Time: 2020-01-05 17:15:42
-------------------------------------------
Predicted Results: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 