In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as tp
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row

## Load Data

In [2]:
# Create the Spark entry points shared by the rest of the notebook.
# "local[2]" runs Spark locally with two threads (local[n] -> n threads).
sc = SparkContext(master="local[2]", appName="spark streaming twitter sentiment")
ssc = StreamingContext(sc, batchDuration=1)  # streaming batch interval of 1
spark = SparkSession(sc)

In [3]:
# Read the labelled tweets. No schema is supplied to the reader, so the
# numeric columns are cast to integers explicitly (label first, then id).
df = spark.read.csv('twitter_sentiment.csv', header=True)
integer_type = tp.IntegerType()
for numeric_column in ("label", "id"):
    df = df.withColumn(numeric_column, df[numeric_column].cast(integer_type))

# Preview a few rows and confirm the resulting column types.
df.show(5)
df.printSchema()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+
only showing top 5 rows

root
 |-- id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- tweet: string (nullable = true)



In [4]:
# Size of the full data set.
print(df.count())

# Split on `id` so the two subsets are complementary by construction.
# `limit()` without an explicit ordering does not promise which rows it returns,
# so combining it with an id-based test filter could put a row in both sets.
# NOTE(review): assumes ids are unique and start at 1, in which case the
# training count is unchanged; rows with a null id match neither filter.
TRAIN_MAX_ID = 31000  # ids up to and including this value are used for training

training_df = df.filter(df['id'] <= TRAIN_MAX_ID)
print(training_df.count())

testing_df = df.filter(df['id'] > TRAIN_MAX_ID)
print(testing_df.count())
# Sanity check: the held-out ids should start right after TRAIN_MAX_ID.
print(testing_df.groupBy().min('id').collect(), testing_df.groupBy().max('id').collect())

31962
31000
962
[Row(min(id)=31001)] [Row(max(id)=31962)]


## Train with Logistic Regression

In [13]:
# Seed for the stochastic Word2Vec stage, so the learned embedding (and the
# predictions built on it) can be reproduced from one run to the next.
WORD2VEC_SEED = 42

# Stage 1: split the tweet text into tokens on non-word characters.
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# Stage 2: remove stop words from the token list.
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# Stage 3: embed the remaining words as a vector of size 100.
stage_3 = Word2Vec(inputCol='filtered_words', outputCol='vector',
                   vectorSize=100, seed=WORD2VEC_SEED)
# Stage 4: logistic regression on the word vector, predicting `label`.
model_lg = LogisticRegression(featuresCol='vector', labelCol='label')

In [14]:
# Chain the tokenizer, stop-word filter, embedding and classifier in order.
pipeline_stages = [stage_1, stage_2, stage_3, model_lg]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the training split; the result is the fitted pipeline model.
pipelineFit = pipeline.fit(training_df)

In [15]:
def get_prediction(tweet_text):
    """Print the predicted sentiment for a batch of tweets.

    Builds a Spark DataFrame from ``tweet_text``, runs it through the fitted
    pipeline (``pipelineFit``) and shows the ``tweet`` and ``prediction``
    columns.

    Args:
        tweet_text: Input accepted by ``spark.createDataFrame``. It must
            produce a ``tweet`` column, since that is what the pipeline and
            the final ``select`` read. Presumably an RDD of rows handed over
            by a stream -- TODO confirm against the caller.

    Returns:
        None. Results (or a "No data" message) are printed.
    """
    try:
        # Create a Spark DataFrame from the incoming batch.
        words_df = spark.createDataFrame(tweet_text)
        # Run the fitted pipeline and display the predicted sentiment.
        pipelineFit.transform(words_df).select('tweet', 'prediction').show()
    except Exception as exc:
        # Best effort: a batch that cannot be scored must not stop the caller,
        # but the cause is reported instead of being silently discarded.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the notebook can still be interrupted.
        print('No data ({}: {})'.format(type(exc).__name__, exc))

In [5]:
from time import sleep

# Load a small sample of tweets to replay as a simulated stream.
test_df = spark.read.csv('sample.csv', header=True)

# Tokenize each tweet on non-word characters.
regexTokenizer = RegexTokenizer(inputCol="tweet", outputCol="words", pattern="\\W")
regexTokenized = regexTokenizer.transform(test_df)

# Build one single-element RDD per tweet (parallelize() makes it distributable).
# Each element is a (label, words) tuple rather than one concatenated string,
# so the two parts can be read back by position whatever the label's length,
# and a missing label no longer raises while the queue is being built.
rddQueue = [
    sc.parallelize([(row['label'], row['words'])])
    for row in regexTokenized.rdd.collect()
]

inputStream = ssc.queueStream(rddQueue)
# Two printers on the same stream: first the label, then the token list.
inputStream.map(lambda record: "Output " + str(record[0])).pprint()
inputStream.map(lambda record: "Output " + str(record[1])).pprint()

ssc.start()
# How long the stream runs before it is stopped; if this is too short,
# not all of the queued data may be processed.
sleep(4)
ssc.stop(stopSparkContext=True, stopGraceFully=True)

-------------------------------------------
Time: 2019-12-25 10:06:09
-------------------------------------------
Output 0

-------------------------------------------
Time: 2019-12-25 10:06:09
-------------------------------------------
Output ['user', 'when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', 'run']

-------------------------------------------
Time: 2019-12-25 10:06:10
-------------------------------------------
Output 0

-------------------------------------------
Time: 2019-12-25 10:06:10
-------------------------------------------
Output ['user', 'user', 'thanks', 'for', 'lyft', 'credit', 'i', 'can', 't', 'use', 'cause', 'they', 'don', 't', 'offer', 'wheelchair', 'vans', 'in', 'pdx', 'disapointed', 'getthanked']

-------------------------------------------
Time: 2019-12-25 10:06:11
-------------------------------------------
Output 0

-------------------------------------------
Time: 2019