In [1]:

import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from textblob import TextBlob

In [2]:
def preprocessing(lines):
    words = lines.select(explode(split(lines.value, "t_end")).alias("word"))
    words = words.na.replace('', None)
    words = words.na.drop()
    words = words.withColumn('word', F.regexp_replace('word', r'http\S+', ''))
    words = words.withColumn('word', F.regexp_replace('word', '@\w+', ''))
    words = words.withColumn('word', F.regexp_replace('word', '#', ''))
    words = words.withColumn('word', F.regexp_replace('word', 'RT', ''))
    words = words.withColumn('word', F.regexp_replace('word', ':', ''))
    return words

In [3]:
# text classification
def polarity_detection(text):
    return TextBlob(text).sentiment.polarity
def subjectivity_detection(text):
    return TextBlob(text).sentiment.subjectivity
def text_classification(words):
    # polarity detection
    polarity_detection_udf = udf(polarity_detection, StringType())
    words = words.withColumn("polarity", polarity_detection_udf("word"))
    # subjectivity detection
    subjectivity_detection_udf = udf(subjectivity_detection, StringType())
    words = words.withColumn("subjectivity", subjectivity_detection_udf("word"))
    return words

In [4]:
if __name__ == "__main__":
    # create Spark session
    spark = SparkSession.builder.appName("TwitterSentimentAnalysis").getOrCreate()
    # read the tweet data from socket
    lines = spark.readStream.format("socket").option("host", "0.0.0.0").option("port", 5555).load()
    # Preprocess the data
    words = preprocessing(lines)
    # text classification to define polarity and subjectivity
    words = text_classification(words)
    words = words.repartition(1)
    query = words.writeStream.queryName("all_tweets")\
        .outputMode("append").format("parquet")\
        .option("path", "./parc")\
        .option("checkpointLocation", "./check")\
        .trigger(processingTime='60 seconds').start()
    query.awaitTermination()

StreamingQueryException: Connection refused
=== Streaming Query ===
Identifier: all_tweets [id = cb2ffd5b-76e6-4c5b-bbe3-e144f96ff228, runId = 53c0eaa3-fa92-4895-90d2-fc79ff7fb803]
Current Committed Offsets: {}
Current Available Offsets: {TextSocketV2[host: 0.0.0.0, port: 5555]: -1}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Repartition 1, true
+- Project [word#17, polarity#20, subjectivity_detection(word#17) AS subjectivity#24]
   +- Project [word#17, polarity_detection(word#17) AS polarity#20]
      +- Project [regexp_replace(word#15, :, ) AS word#17]
         +- Project [regexp_replace(word#13, RT, ) AS word#15]
            +- Project [regexp_replace(word#11, #, ) AS word#13]
               +- Project [regexp_replace(word#9, @\w+, ) AS word#11]
                  +- Project [regexp_replace(word#6, http\S+, ) AS word#9]
                     +- Filter AtLeastNNulls(n, word#6)
                        +- Project [CASE WHEN (word#3 = ) THEN cast(null as string) ELSE word#3 END AS word#6]
                           +- Project [word#3]
                              +- Generate explode(split(value#0, t_end, -1)), false, [word#3]
                                 +- StreamingDataSourceV2Relation [value#0], org.apache.spark.sql.execution.streaming.sources.TextSocketTable$$anon$1@495a0f2e, TextSocketV2[host: 0.0.0.0, port: 5555]
