In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row

### Load Data

In [2]:
# Initialise Spark through getOrCreate() so this cell is safe to re-run:
# calling SparkContext(...) directly raises "Cannot run multiple SparkContexts
# at once" when a context already exists in the running kernel.
spark = SparkSession.builder \
    .appName("spark_streaming_twitter_sentiment") \
    .getOrCreate()

# Keep the low-level context available as `sc`; the streaming cell further
# down builds its StreamingContext from it.
sc = spark.sparkContext

In [3]:
# Read the labelled tweets; the first row of the file holds the column names.
df = spark.read.option('header', True).csv('twitter_sentiment.csv')
df.show(n=5)

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+
only showing top 5 rows



In [4]:
# Column types straight after loading the CSV.
print(df.dtypes)

# Re-type the label column as an integer, leaving the other columns untouched.
label_as_int = df['label'].cast(tp.IntegerType())
df = df.withColumn('label', label_as_int)

# Column types after the cast.
print(df.dtypes)

[('id', 'string'), ('label', 'string'), ('tweet', 'string')]
[('id', 'string'), ('label', 'int'), ('tweet', 'string')]


In [5]:
# Read the small sample file; the first row of the file holds the column names.
sample_df = spark.read.option('header', True).csv('sample.csv')
sample_df.show(n=5)

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+



In [6]:
# Column types straight after loading the CSV.
print(sample_df.dtypes)

# Re-type the label column as an integer, leaving the other columns untouched.
sample_label_as_int = sample_df['label'].cast(tp.IntegerType())
sample_df = sample_df.withColumn('label', sample_label_as_int)

# Column types after the cast.
print(sample_df.dtypes)

[('id', 'string'), ('label', 'string'), ('tweet', 'string')]
[('id', 'string'), ('label', 'int'), ('tweet', 'string')]


### Build Model Pipeline - Logistic Regression

In [7]:
# define stage 1: tokenize the tweet text by splitting on non-word characters
stage_1 = RegexTokenizer(inputCol= 'tweet' , outputCol= 'tokens', pattern= '\\W')

# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol= 'tokens', outputCol= 'filtered_words')

# define stage 3: create a word vector of the size 200
# Word2Vec training is randomised; an explicit seed keeps the embeddings (and
# therefore the fitted classifier) reproducible across kernel restarts.
stage_3 = Word2Vec(inputCol= 'filtered_words', outputCol= 'vector', vectorSize= 200, seed= 42)

# define stage 4: Logistic Regression Model trained on the Word2Vec output
model = LogisticRegression(featuresCol= 'vector', labelCol= 'label')

In [8]:
# Chain the four stages (tokenizer, stop-word remover, Word2Vec, classifier)
# into a single estimator, in the order they must run.
pipeline = Pipeline(
    stages=[
        stage_1,
        stage_2,
        stage_3,
        model,
    ]
)

# Train the whole chain on the training DataFrame; the fitted pipeline is
# kept in `pipelineFit`.
pipelineFit = pipeline.fit(df)

### Process Streaming

In [None]:
host = 'localhost'
port = 9879

# initialize the streaming context with a 3-second batch interval
ssc = StreamingContext(sc, batchDuration=3)

# Every line received on the socket is split on the "TWEET APP" delimiter;
# the resulting pieces are counted per batch and the counts are printed.
lines = ssc.socketTextStream(host, port)
counts = lines.flatMap(lambda line: line.split("TWEET APP"))\
                  .map(lambda word: (word, 1))\
                  .reduceByKey(lambda a, b: a+b)
counts.pprint()

ssc.start()  # start the computation
try:
    ssc.awaitTermination()  # block until the stream ends or the kernel is interrupted
finally:
    # Always stop the streaming context, otherwise an interrupted run leaves it
    # active and this cell cannot be started again (only one StreamingContext
    # may be active per JVM). The shared SparkContext is kept alive so `sc`
    # and `spark` remain usable afterwards.
    ssc.stop(stopSparkContext=False)

-------------------------------------------
Time: 2019-12-18 23:46:27
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:30
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:33
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:36
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:39
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:42
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:45
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:48
-------------------------------------------

-------------------------------------------
Time: 2019-12-18 23:46:51
----------