Importing pyspark libraries

In [0]:
# Imports.
from pyspark.sql.functions import lit
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import random

random.seed(42)
spark.conf.set("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation","true")

We have three separate files each for negative, positive and neutral sentiments with tweets as columns.
In the below block, we have transposed the data into a single column named 'tweets' and created another column named 'sentiment'. We have assigned 0 to negative tweets, 1 to positive tweets and 2 to neutral tweets.

In [0]:
from functools import partial
from pyspark.sql import Row

def flatten_table(column_names, column_values):
    row = zip(column_names, column_values)
    return [
        Row(ColumnValue=value)
        for column, value in row
    ]

negative_tweets = spark.read.format("csv") \
  .option("inferSchema", "true") \
  .option("header", "false") \
  .load("/FileStore/tables/processedNegative.csv")
negative_tweets = negative_tweets.rdd.flatMap(partial(flatten_table, negative_tweets.columns)).toDF()
negative_tweets = negative_tweets.selectExpr("ColumnValue as tweets")
negative_tweets = negative_tweets.withColumn("sentiment", lit(0))
# display(negative_tweets)

positive_tweets = spark.read.format("csv") \
  .option("inferSchema", "true") \
  .option("header", "false") \
  .load("/FileStore/tables/processedPositive.csv")
positive_tweets = positive_tweets.rdd.flatMap(partial(flatten_table, positive_tweets.columns)).toDF()
positive_tweets = positive_tweets.selectExpr("ColumnValue as tweets")
positive_tweets = positive_tweets.withColumn("sentiment", lit(1))
# display(positive_tweets)

neutral_tweets = spark.read.format("csv") \
  .option("inferSchema", "true") \
  .option("header", "false") \
  .load("/FileStore/tables/processedNeutral.csv")
neutral_tweets = neutral_tweets.rdd.flatMap(partial(flatten_table, neutral_tweets.columns)).toDF()
neutral_tweets = neutral_tweets.selectExpr("ColumnValue as tweets")
neutral_tweets = neutral_tweets.withColumn("sentiment", lit(2))
# display(neutral_tweets)

We have merged the three separate dataframes into 1 single dataframe

In [0]:
import functools
  
# explicit function
def unionAll(dfs):
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)

tweets_combined = unionAll([negative_tweets, neutral_tweets, positive_tweets])
print((tweets_combined.count(), len(tweets_combined.columns)))
tweets_combined = tweets_combined.dropna()
print((tweets_combined.count(), len(tweets_combined.columns)))
tweets_combined.head()

(3873, 2)
(3868, 2)
Out[3]: Row(tweets='How unhappy  some dogs like it though', sentiment=0)

In [0]:
display(tweets_combined.groupBy('sentiment').count())

sentiment,count
0,1116
2,1569
1,1183


Here we have created a permanent table 'tweets_table' on the dataframe

In [0]:
# Creating a permanent tweets_table.
# tweets_combined = spark.read.format("csv") \
#   .option("inferSchema", "true") \
#   .option("header", "true") \
#   .load("/FileStore/tables/tweets-3.csv")

permanent_table_name = "tweets_table"
tweets_combined.write.mode("overwrite").format("parquet").saveAsTable(permanent_table_name)

In [0]:
%sql
select * From tweets_table limit 5

tweets,sentiment
Pak PM survives removal scare,2
but court orders further probe into corruption charge.,2
Supreme Court quashes criminal complaint against cricketer for allegedly depicting himself as on magazine cover.,2
Art of Living's fights back over Yamuna floodplain damage,2
livid.,2


In [0]:
tweets_df = spark.table('tweets_table')
tweets_df.head()

Out[23]: Row(tweets='Pak PM survives removal scare', sentiment=2)

Here we have splitted the data into train and test sets by a 80:20 ratio

In [0]:
# Train and Test split.
(tweets_train, tweets_test) = tweets_combined.randomSplit([0.8, 0.2])

In [0]:
display(tweets_train.groupBy('sentiment').count())

sentiment,count
0,914
2,1244
1,958


In [0]:
tweets_train.show(n=5)

+--------------------+---------+
|              tweets|sentiment|
+--------------------+---------+
|  i was going to ...|        0|
|        2010 Week 11|        0|
|    2017 at 08:45AM)|        0|
| Another ATBB on ...|        0|
| Hi Tharakaram! T...|        0|
+--------------------+---------+
only showing top 5 rows



Data cleaning and pre-processing steps: we have tokenized the tweets and removed any stop words and then used the IDF function to assign weightage to the words

In [0]:
# Data cleaning and pre-processing (Used in ML Pipeline).
tokenizer = Tokenizer(inputCol="tweets", outputCol="tokenized_words")

stop_words_remover = StopWordsRemover(inputCol="tokenized_words", outputCol="stop_words_removed")

hashing_tf = HashingTF(inputCol="stop_words_removed", outputCol="tf_features")

idf = IDF(inputCol="tf_features", outputCol="features", minDocFreq=3)

Defining NaiveBayes classification model

In [0]:
# Naive Bayes Classification Model.
nbc = NaiveBayes(labelCol="sentiment", featuresCol="features")

In [0]:
pipeline_nbc = Pipeline(stages=[tokenizer, stop_words_remover, hashing_tf, idf, nbc])

# Fit the pipeline to training data.
model_nbc = pipeline_nbc.fit(tweets_train)

# Tranform for the test data.
prediction_nbc = model_nbc.transform(tweets_test)

In [0]:
prediction_nbc.write.mode("overwrite").format("parquet").saveAsTable("prediction_nbc")

In [0]:
%sql
select * from prediction_nbc

tweets,sentiment,tokenized_words,stop_words_removed,tf_features,features,rawPrediction,probability,prediction
1 short of two-thirds majority.,2,"List(, 1, short, of, two-thirds, majority.)","List(, 1, short, two-thirds, majority.)","Map(vectorType -> sparse, length -> 262144, indices -> List(81662, 92651, 215543, 249180, 261901), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(81662, 92651, 215543, 249180, 261901), values -> List(0.0, 5.211412935711121, 0.0, 0.9971090584100414, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(-58.62026728835531, -54.800573508869896, -53.70677752357365))","Map(vectorType -> dense, length -> 3, values -> List(0.005473338815922423, 0.24953086040286174, 0.7449958007812157))",2.0
3 switch to Selvam's side.,2,"List(, 3, switch, to, selvam's, side.)","List(, 3, switch, selvam's, side.)","Map(vectorType -> sparse, length -> 262144, indices -> List(122915, 124884, 168590, 188087, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(122915, 124884, 168590, 188087, 249180), values -> List(0.0, 0.0, 4.677330449780863, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-50.1160888463819, -52.26245871441274, -45.36883485273467))","Map(vectorType -> dense, length -> 3, values -> List(0.008592229191281244, 0.0010044984320469873, 0.9904032723766718))",2.0
6 injured,2,"List(, 6, injured)","List(, 6, injured)","Map(vectorType -> sparse, length -> 262144, indices -> List(124674, 244179, 249180), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(124674, 244179, 249180), values -> List(6.252866810539282, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-73.94287003467647, -85.96940492744062, -65.27559831077377))","Map(vectorType -> dense, length -> 3, values -> List(1.7209845395512862E-4, 1.029720190869102E-9, 0.9998279005163246))",2.0
BSP's is keeping her rivals edgy.,2,"List(, bsp's, is, keeping, her, rivals, edgy.)","List(, bsp's, keeping, rivals, edgy.)","Map(vectorType -> sparse, length -> 262144, indices -> List(25228, 49528, 179770, 211195, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(25228, 49528, 179770, 211195, 249180), values -> List(0.0, 0.0, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-7.918412079031931, -7.533971228556859, -7.245563594456858))","Map(vectorType -> dense, length -> 3, values -> List(0.22580475576395748, 0.3316604033071079, 0.4425348409289345))",2.0
Bihar Jharkhand for nice try filling police vacancie,2,"List(, bihar, , jharkhand, for, nice, try, filling, police, vacancie)","List(, bihar, , jharkhand, nice, try, filling, police, vacancie)","Map(vectorType -> sparse, length -> 262144, indices -> List(22346, 48110, 92748, 122265, 210212, 213605, 233077, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(22346, 48110, 92748, 122265, 210212, 213605, 233077, 249180), values -> List(5.4796769223058, 0.0, 5.965184738087501, 0.0, 5.4796769223058, 4.712421769592133, 0.0, 1.9942181168200828))","Map(vectorType -> dense, length -> 3, values -> List(-245.41145014512603, -250.08462346567288, -228.41091588881358))","Map(vectorType -> dense, length -> 3, values -> List(4.137726348844048E-8, 3.865702060914704E-10, 0.9999999582361663))",2.0
CM assures impartial probe,2,"List(, cm, assures, impartial, probe)","List(, cm, assures, impartial, probe)","Map(vectorType -> sparse, length -> 262144, indices -> List(49002, 91588, 178423, 226642, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(49002, 91588, 178423, 226642, 249180), values -> List(6.435188367333237, 4.953583826409021, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-150.73893108975543, -138.55757568408762, -106.33895087881584))","Map(vectorType -> dense, length -> 3, values -> List(5.2159521398568537E-20, 1.017721694978828E-14, 0.9999999999999898))",2.0
Election Commission on plea seeking scrapping of and other benefits given to,2,"List(, election, commission, on, plea, seeking, scrapping, of, and, other, benefits, given, to)","List(, election, commission, plea, seeking, scrapping, benefits, given)","Map(vectorType -> sparse, length -> 262144, indices -> List(6702, 49553, 155149, 178607, 217316, 219879, 224031, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(6702, 49553, 155149, 178607, 217316, 219879, 224031, 249180), values -> List(6.252866810539282, 0.0, 0.0, 5.847401702431117, 5.646731006968967, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-219.2244330323213, -230.15083584081825, -163.3322121684651))","Map(vectorType -> dense, length -> 3, values -> List(5.324966715656212E-25, 9.57283229690688E-30, 1.0))",2.0
Gandhi Champaran,2,"List(, gandhi, , champaran)","List(, gandhi, , champaran)","Map(vectorType -> sparse, length -> 262144, indices -> List(14400, 114438, 249180), values -> List(1.0, 1.0, 2.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(14400, 114438, 249180), values -> List(0.0, 0.0, 1.9942181168200828))","Map(vectorType -> dense, length -> 3, values -> List(-14.61048050673158, -13.888565815389352, -13.57275028120472))","Map(vectorType -> dense, length -> 3, values -> List(0.1700341014598239, 0.3499931927813333, 0.47997270575884277))",2.0
India versus Australia is Holyfield versus Tyson,2,"List(, india, versus, australia, is, holyfield, versus, tyson)","List(, india, versus, australia, holyfield, versus, tyson)","Map(vectorType -> sparse, length -> 262144, indices -> List(2284, 40551, 60825, 140220, 249180, 249629), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(2284, 40551, 60825, 140220, 249180, 249629), values -> List(0.0, 0.0, 4.6434288981051814, 12.197432261424048, 0.9971090584100414, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(-195.2047539453051, -194.87839516735116, -156.1614836575766))","Map(vectorType -> dense, length -> 3, values -> List(1.1059185904138929E-17, 1.5327064145184982E-17, 1.0))",2.0
India's air pollution,2,"List(, india's, air, pollution)","List(, india's, air, pollution)","Map(vectorType -> sparse, length -> 262144, indices -> List(98424, 199351, 211143, 249180), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(98424, 199351, 211143, 249180), values -> List(6.098716130712024, 5.100187300600896, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-148.3578881877838, -148.01207098696017, -102.44444829534794))","Map(vectorType -> dense, length -> 3, values -> List(1.1482763218905332E-20, 1.6226801053417978E-20, 1.0))",2.0


NaiveBayes model evaluation

In [0]:
# Test accuracy Evaluation (Naive Bayes Classifier).
evaluator_nbc = MulticlassClassificationEvaluator(
    labelCol="sentiment", 
    predictionCol="prediction", 
    metricName="accuracy"
)
accuracy_nbc = evaluator_nbc.evaluate(prediction_nbc)
print("Test Accuracy (Naive Bayes Classifier) = %g " % (accuracy_nbc * 100))

Test Accuracy (Naive Bayes Classifier) = 83.9096 


Defining decision tree classification model

In [0]:
dtc = DecisionTreeClassifier(labelCol="sentiment", featuresCol="features", maxDepth=5)

In [0]:
pipeline_dtc = Pipeline(stages=[tokenizer, stop_words_remover, hashing_tf, idf, dtc])

# Fit the pipeline to training data.
model_dtc = pipeline_dtc.fit(tweets_train)

# Tranform for the test data.
prediction_dtc = model_dtc.transform(tweets_test)

In [0]:
prediction_dtc.write.mode("overwrite").format("parquet").saveAsTable("prediction_dtc")

In [0]:
%sql
select * from prediction_dtc

tweets,sentiment,tokenized_words,stop_words_removed,tf_features,features,rawPrediction,probability,prediction
1 short of two-thirds majority.,2,"List(, 1, short, of, two-thirds, majority.)","List(, 1, short, two-thirds, majority.)","Map(vectorType -> sparse, length -> 262144, indices -> List(81662, 92651, 215543, 249180, 261901), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(81662, 92651, 215543, 249180, 261901), values -> List(0.0, 5.211412935711121, 0.0, 0.9971090584100414, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
3 switch to Selvam's side.,2,"List(, 3, switch, to, selvam's, side.)","List(, 3, switch, selvam's, side.)","Map(vectorType -> sparse, length -> 262144, indices -> List(122915, 124884, 168590, 188087, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(122915, 124884, 168590, 188087, 249180), values -> List(0.0, 0.0, 4.677330449780863, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
6 injured,2,"List(, 6, injured)","List(, 6, injured)","Map(vectorType -> sparse, length -> 262144, indices -> List(124674, 244179, 249180), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(124674, 244179, 249180), values -> List(6.252866810539282, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
BSP's is keeping her rivals edgy.,2,"List(, bsp's, is, keeping, her, rivals, edgy.)","List(, bsp's, keeping, rivals, edgy.)","Map(vectorType -> sparse, length -> 262144, indices -> List(25228, 49528, 179770, 211195, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(25228, 49528, 179770, 211195, 249180), values -> List(0.0, 0.0, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
Bihar Jharkhand for nice try filling police vacancie,2,"List(, bihar, , jharkhand, for, nice, try, filling, police, vacancie)","List(, bihar, , jharkhand, nice, try, filling, police, vacancie)","Map(vectorType -> sparse, length -> 262144, indices -> List(22346, 48110, 92748, 122265, 210212, 213605, 233077, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(22346, 48110, 92748, 122265, 210212, 213605, 233077, 249180), values -> List(5.4796769223058, 0.0, 5.965184738087501, 0.0, 5.4796769223058, 4.712421769592133, 0.0, 1.9942181168200828))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
CM assures impartial probe,2,"List(, cm, assures, impartial, probe)","List(, cm, assures, impartial, probe)","Map(vectorType -> sparse, length -> 262144, indices -> List(49002, 91588, 178423, 226642, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(49002, 91588, 178423, 226642, 249180), values -> List(6.435188367333237, 4.953583826409021, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
Election Commission on plea seeking scrapping of and other benefits given to,2,"List(, election, commission, on, plea, seeking, scrapping, of, and, other, benefits, given, to)","List(, election, commission, plea, seeking, scrapping, benefits, given)","Map(vectorType -> sparse, length -> 262144, indices -> List(6702, 49553, 155149, 178607, 217316, 219879, 224031, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(6702, 49553, 155149, 178607, 217316, 219879, 224031, 249180), values -> List(6.252866810539282, 0.0, 0.0, 5.847401702431117, 5.646731006968967, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
Gandhi Champaran,2,"List(, gandhi, , champaran)","List(, gandhi, , champaran)","Map(vectorType -> sparse, length -> 262144, indices -> List(14400, 114438, 249180), values -> List(1.0, 1.0, 2.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(14400, 114438, 249180), values -> List(0.0, 0.0, 1.9942181168200828))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
India versus Australia is Holyfield versus Tyson,2,"List(, india, versus, australia, is, holyfield, versus, tyson)","List(, india, versus, australia, holyfield, versus, tyson)","Map(vectorType -> sparse, length -> 262144, indices -> List(2284, 40551, 60825, 140220, 249180, 249629), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(2284, 40551, 60825, 140220, 249180, 249629), values -> List(0.0, 0.0, 4.6434288981051814, 12.197432261424048, 0.9971090584100414, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0
India's air pollution,2,"List(, india's, air, pollution)","List(, india's, air, pollution)","Map(vectorType -> sparse, length -> 262144, indices -> List(98424, 199351, 211143, 249180), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(98424, 199351, 211143, 249180), values -> List(6.098716130712024, 5.100187300600896, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(229.0, 293.0, 1242.0))","Map(vectorType -> dense, length -> 3, values -> List(0.12981859410430838, 0.1660997732426304, 0.7040816326530612))",2.0


Evaluating Decision tree classification model

In [0]:
# Test accuracy Evaluation (Decision Tree Classifier).
evaluator_dtc = MulticlassClassificationEvaluator(
    labelCol="sentiment", 
    predictionCol="prediction", 
    metricName="accuracy"
)
accuracy_dtc = evaluator_dtc.evaluate(prediction_dtc)
print("Test Accuracy (Decision Tree Classifier) = %g " % (accuracy_dtc * 100))

Test Accuracy (Decision Tree Classifier) = 81.6489 


Defining logistic regression model

In [0]:
lr = LogisticRegression(maxIter=15, regParam=0.01, elasticNetParam=0.5, labelCol="sentiment", featuresCol="features")

In [0]:
pipeline_lr = Pipeline(stages=[tokenizer, stop_words_remover, hashing_tf, idf, lr])

# Fit the pipeline to training data.
model_lr = pipeline_lr.fit(tweets_train)

# Tranform for the test data.
prediction_lr = model_lr.transform(tweets_test)

In [0]:
prediction_lr.write.mode("overwrite").format("parquet").saveAsTable("prediction_lr")

In [0]:
%sql
select * from prediction_lr

tweets,sentiment,tokenized_words,stop_words_removed,tf_features,features,rawPrediction,probability,prediction
1 short of two-thirds majority.,2,"List(, 1, short, of, two-thirds, majority.)","List(, 1, short, two-thirds, majority.)","Map(vectorType -> sparse, length -> 262144, indices -> List(81662, 92651, 215543, 249180, 261901), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(81662, 92651, 215543, 249180, 261901), values -> List(0.0, 5.211412935711121, 0.0, 0.9971090584100414, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.2883325450934082))","Map(vectorType -> dense, length -> 3, values -> List(0.09942524437650718, 0.1379730139841086, 0.7626017416393841))",2.0
3 switch to Selvam's side.,2,"List(, 3, switch, to, selvam's, side.)","List(, 3, switch, selvam's, side.)","Map(vectorType -> sparse, length -> 262144, indices -> List(122915, 124884, 168590, 188087, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(122915, 124884, 168590, 188087, 249180), values -> List(0.0, 0.0, 4.677330449780863, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.2883325450934082))","Map(vectorType -> dense, length -> 3, values -> List(0.09942524437650718, 0.1379730139841086, 0.7626017416393841))",2.0
6 injured,2,"List(, 6, injured)","List(, 6, injured)","Map(vectorType -> sparse, length -> 262144, indices -> List(124674, 244179, 249180), values -> List(1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(124674, 244179, 249180), values -> List(6.252866810539282, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.2883325450934082))","Map(vectorType -> dense, length -> 3, values -> List(0.09942524437650718, 0.1379730139841086, 0.7626017416393841))",2.0
BSP's is keeping her rivals edgy.,2,"List(, bsp's, is, keeping, her, rivals, edgy.)","List(, bsp's, keeping, rivals, edgy.)","Map(vectorType -> sparse, length -> 262144, indices -> List(25228, 49528, 179770, 211195, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(25228, 49528, 179770, 211195, 249180), values -> List(0.0, 0.0, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.2883325450934082))","Map(vectorType -> dense, length -> 3, values -> List(0.09942524437650718, 0.1379730139841086, 0.7626017416393841))",2.0
Bihar Jharkhand for nice try filling police vacancie,2,"List(, bihar, , jharkhand, for, nice, try, filling, police, vacancie)","List(, bihar, , jharkhand, nice, try, filling, police, vacancie)","Map(vectorType -> sparse, length -> 262144, indices -> List(22346, 48110, 92748, 122265, 210212, 213605, 233077, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(22346, 48110, 92748, 122265, 210212, 213605, 233077, 249180), values -> List(5.4796769223058, 0.0, 5.965184738087501, 0.0, 5.4796769223058, 4.712421769592133, 0.0, 1.9942181168200828))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.6247632796962295))","Map(vectorType -> dense, length -> 3, values -> List(0.07618813758491975, 0.10572674010858293, 0.8180851223064974))",2.0
CM assures impartial probe,2,"List(, cm, assures, impartial, probe)","List(, cm, assures, impartial, probe)","Map(vectorType -> sparse, length -> 262144, indices -> List(49002, 91588, 178423, 226642, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(49002, 91588, 178423, 226642, 249180), values -> List(6.435188367333237, 4.953583826409021, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.9684737153656888))","Map(vectorType -> dense, length -> 3, values -> List(0.057046085570538446, 0.07916319855706912, 0.8637907158723924))",2.0
Election Commission on plea seeking scrapping of and other benefits given to,2,"List(, election, commission, on, plea, seeking, scrapping, of, and, other, benefits, given, to)","List(, election, commission, plea, seeking, scrapping, benefits, given)","Map(vectorType -> sparse, length -> 262144, indices -> List(6702, 49553, 155149, 178607, 217316, 219879, 224031, 249180), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(6702, 49553, 155149, 178607, 217316, 219879, 224031, 249180), values -> List(6.252866810539282, 0.0, 0.0, 5.847401702431117, 5.646731006968967, 0.0, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.9906092262030708))","Map(vectorType -> dense, length -> 3, values -> List(0.05596409592482334, 0.07766171497053058, 0.8663741891046461))",2.0
Gandhi Champaran,2,"List(, gandhi, , champaran)","List(, gandhi, , champaran)","Map(vectorType -> sparse, length -> 262144, indices -> List(14400, 114438, 249180), values -> List(1.0, 1.0, 2.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(14400, 114438, 249180), values -> List(0.0, 0.0, 1.9942181168200828))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 1.4063224820800784))","Map(vectorType -> dense, length -> 3, values -> List(0.09075764079595125, 0.12594492798313192, 0.7832974312209168))",2.0
India versus Australia is Holyfield versus Tyson,2,"List(, india, versus, australia, is, holyfield, versus, tyson)","List(, india, versus, australia, holyfield, versus, tyson)","Map(vectorType -> sparse, length -> 262144, indices -> List(2284, 40551, 60825, 140220, 249180, 249629), values -> List(1.0, 1.0, 1.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(2284, 40551, 60825, 140220, 249180, 249629), values -> List(0.0, 0.0, 4.6434288981051814, 12.197432261424048, 0.9971090584100414, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 2.911475278772557))","Map(vectorType -> dense, length -> 3, values -> List(0.02423219105553534, 0.03362715833727401, 0.9421406506071905))",2.0
India's air pollution,2,"List(, india's, air, pollution)","List(, india's, air, pollution)","Map(vectorType -> sparse, length -> 262144, indices -> List(98424, 199351, 211143, 249180), values -> List(1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(98424, 199351, 211143, 249180), values -> List(6.098716130712024, 5.100187300600896, 0.0, 0.9971090584100414))","Map(vectorType -> dense, length -> 3, values -> List(-0.7489973371564365, -0.42134527095030166, 2.475854828779502))","Map(vectorType -> dense, length -> 3, values -> List(0.03631403543987623, 0.050393206986668826, 0.9132927575734548))",2.0


Evaluating logistic regression model

In [0]:
# Test accuracy Evaluation (Logistic Regression).
evaluator_lr = MulticlassClassificationEvaluator(
    labelCol="sentiment", 
    predictionCol="prediction", 
    metricName="accuracy"
)
accuracy_lr = evaluator_lr.evaluate(prediction_lr)
print("Test Accuracy (Logistic Regression) = %g " % (accuracy_lr * 100))

Test Accuracy (Logistic Regression) = 86.1702 


Inference:
    Among the above three models, 
    NaiveBayes model is 83.9% accurate.
    Decision Tree model is 81.65% accurate.
    Logistic regression model is 86.17% accurate.
    Hence,logistic regression model is best suited to analyse the sentiment of the tweets.

Interpretation:
    We can use the logistic regression model to understand the sentiment behind tweets and we will be about 86% accuarate in predicting the sentiments correctly. 
    This will be specifically useful in understanding overall public opinion about ceratin trending topics on twitter.