# UCR ASSIGNMENT

**PROJECT 1**

NLP: Perform text classification on the attached Coronavirus tweets file using PySpark. You can
use any algorithm of your choice. The example we learnt during class can be used for reference.

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook (or reuse a running one),
# registered under the application name 'corona'.
spark = (
    SparkSession.builder
    .appName('corona')
    .getOrCreate()
)

In [None]:
# Load the tweets CSV: the first row is the header, column types are inferred,
# and fields are comma-separated.
# NOTE(review): the path argument is an empty string here — supply the
# location of the Coronavirus tweets CSV before running.
# NOTE(review): later cells drop many rows with nulls and filter out
# unexpected Sentiment values, which may mean multi-line tweets are being
# split across rows — consider the reader's multiLine/escape options; confirm
# against the raw file.
corona = spark.read.csv(
    '',
    header=True,
    inferSchema=True,
    sep=',',
)

In [None]:
# Peek at the first five rows of the raw data.
corona.show(n=5)

In [None]:
# Remove exact duplicate rows, then report the remaining shape as
# "<rows> , <columns>".
corona = corona.dropDuplicates()
print(corona.count(), len(corona.columns), sep=" , ")

31647 , 6


In [None]:
# Discard every row that has a null in any column, then report the
# remaining shape as "<rows> , <columns>".
corona = corona.dropna()
print(corona.count(), len(corona.columns), sep=" , ")

10696 , 6


In [None]:
# Print the column names and inferred types of the cleaned DataFrame.
corona.printSchema()

root
 |-- UserName: string (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- TweetAt: string (nullable = true)
 |-- OriginalTweet: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [None]:
# List the column names as a plain Python list.
corona.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [None]:
# The five sentiment classes accepted as valid labels; rows carrying any
# other value in the Sentiment column are filtered out in the next step.
sentiments = [
    'Positive',
    'Negative',
    'Neutral',
    'Extremely Positive',
    'Extremely Negative',
]

In [None]:
# Keep only the rows whose Sentiment is one of the known classes.
corona = corona.where(corona['Sentiment'].isin(sentiments))

In [None]:
# Count the distinct sentiment labels left after filtering.
(
    corona
    .select('Sentiment')
    .distinct()
    .count()
)

5

In [None]:
# Display the distinct sentiment labels themselves.
(
    corona
    .select('Sentiment')
    .distinct()
    .show()
)

+------------------+
|         Sentiment|
+------------------+
|Extremely Negative|
|           Neutral|
|          Positive|
|          Negative|
|Extremely Positive|
+------------------+



In [None]:
from pyspark.sql.functions import length

In [None]:
# Add a 'length' column holding the character count of each tweet.
corona = corona.withColumn('length', length(corona.OriginalTweet))

In [None]:
# Preview ten rows, now including the new 'length' column.
corona.show(n=10)

+--------+----------+--------------------+----------+--------------------+------------------+------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         Sentiment|length|
+--------+----------+--------------------+----------+--------------------+------------------+------+
|    4244|     49196|           worldwide|16-03-2020|#Amazon #delivery...|          Negative|   110|
|    4441|     49393|      Staying humble|16-03-2020|I miss going shop...|          Positive|   245|
|    4517|     49469|                Kcmo|17-03-2020|Looking for toile...|          Positive|    50|
|    5273|     50225|    Toronto, Ontario|17-03-2020|@JackieKarmatica ...|Extremely Positive|   255|
|    5478|     50430|    Johnson City, TN|17-03-2020|I'm joining a con...|           Neutral|   134|
|    5640|     50592|       Southern Ohio|17-03-2020|T.p. aside I dont...|          Negative|   272|
|    5774|     50726|              London|17-03-2020|Got verbally abus...|Extremely Negativ

In [None]:
# Rename the label column to lower case; the StringIndexer stage defined
# later reads it as "sentiment".
corona = corona.withColumnRenamed(existing="Sentiment", new="sentiment")

In [None]:
# Average of the numeric columns per sentiment class (here: tweet length).
# The column was renamed to lower-case "sentiment" in the previous cell, so
# it is referenced by that exact name; the old capitalised name only resolves
# when Spark's case-insensitive column matching is enabled.
corona.groupby('sentiment').mean().show()

+------------------+------------------+
|         Sentiment|       avg(length)|
+------------------+------------------+
|Extremely Negative| 211.4938590820944|
|          Positive|197.22226148409894|
|           Neutral|152.10291777188328|
|          Negative| 195.5119139123751|
|Extremely Positive|          218.0775|
+------------------+------------------+



In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

# Text-feature stages, chained by column name:
#   OriginalTweet -> token_text -> stop_tokens -> c_vec -> tf_idf
tokenizer = Tokenizer(
    inputCol="OriginalTweet",
    outputCol="token_text",
)
stopremove = StopWordsRemover(
    inputCol="token_text",
    outputCol="stop_tokens",
)
count_vec = CountVectorizer(
    inputCol="stop_tokens",
    outputCol="c_vec",
)
idf = IDF(
    inputCol="c_vec",
    outputCol="tf_idf",
)

# The classifiers need numeric targets, so the string sentiment is indexed
# into a numeric 'label' column.
# NOTE(review): the variable name looks like a leftover from a ham/spam
# example; it is kept because the pipeline cell refers to it.
ham_samp_to_num = StringIndexer(
    inputCol="sentiment",
    outputCol='label',
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [None]:
# Combine the TF-IDF vector and the tweet length into one 'features' vector.
clean_up = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

In [None]:
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, DecisionTreeClassifier

# Candidate classifiers, each left on its default feature/label columns.
dtc = DecisionTreeClassifier(maxDepth=15)  # single tree, depth capped at 15
rf = RandomForestClassifier(numTrees=200)  # ensemble of 200 trees
nb = NaiveBayes()                          # default settings

In [None]:
from pyspark.ml import Pipeline
# Preparation pipeline: index the label, build the text features, then
# assemble the final 'features' vector.
data_prep_pipeline = Pipeline(
    stages=[
        ham_samp_to_num,
        tokenizer,
        stopremove,
        count_vec,
        idf,
        clean_up,
    ]
)

In [None]:
# Fit the preparation pipeline (label indexer, vocabulary, IDF weights) on
# the cleaned tweets. The DataFrame built in the cells above is `corona`;
# no `df` is defined in this notebook.
clean = data_prep_pipeline.fit(corona)

In [None]:
# Apply the fitted pipeline to the cleaned tweets to add the 'label' and
# 'features' columns. The DataFrame built in the cells above is `corona`;
# no `df` is defined in this notebook.
data = clean.transform(corona)

In [None]:
# Inspect ten transformed rows, including every intermediate pipeline column.
data.show(n=10)

+--------+----------+--------------------+----------+--------------------+------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|UserName|ScreenName|            Location|   TweetAt|       OriginalTweet|         sentiment|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+--------+----------+--------------------+----------+--------------------+------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    4244|     49196|           worldwide|16-03-2020|#Amazon #delivery...|          Negative|   110|  1.0|[#amazon, #delive...|[#amazon, #delive...|(38477,[8,96,550,...|(38477,[8,96,550,...|(38478,[8,96,550,...|
|    4441|     49393|      Staying humble|16-03-2020|I miss going shop...|          Positive|   245|  0.0|[i, miss, going, ...|[miss, going, sho...|(384

In [None]:
# Keep only the two columns the classifiers consume.
data = data.select('label', 'features')

In [None]:
# Preview ten rows of the model-ready data.
data.show(n=10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(38478,[8,96,550,...|
|  0.0|(38478,[0,13,15,1...|
|  0.0|(38478,[3,4,19,30...|
|  3.0|(38478,[2,6,15,58...|
|  2.0|(38478,[2,5,8,212...|
|  1.0|(38478,[3,4,25,44...|
|  4.0|(38478,[0,13,34,4...|
|  0.0|(38478,[2,9,17,20...|
|  0.0|(38478,[0,17,18,2...|
|  0.0|(38478,[0,5,6,24,...|
+-----+--------------------+
only showing top 10 rows



In [None]:
# Split into 75% training / 25% testing. A fixed seed makes the split, and
# therefore the reported score, repeatable across runs.
training, testing = data.randomSplit([0.75, 0.25], seed=42)

In [None]:
# Train the decision-tree classifier on the training split.
# NOTE(review): the name `spam_prediction` looks like a leftover from a
# ham/spam example; here it holds the fitted sentiment model.
spam_prediction=dtc.fit(training)

In [None]:
# Run the fitted model on the held-out split; the output adds the
# rawPrediction, probability and prediction columns.
test_result=spam_prediction.transform(testing)

In [None]:
# Show the first rows of the predictions next to the true labels.
test_result.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(38478,[0,1,2,26,...|[39.0,0.0,0.0,9.0...|[0.8125,0.0,0.0,0...|       0.0|
|  0.0|(38478,[0,1,3,4,5...|[929.0,980.0,503....|[0.28228501975083...|       1.0|
|  0.0|(38478,[0,1,3,4,1...|[10.0,0.0,0.0,0.0...|[1.0,0.0,0.0,0.0,...|       0.0|
|  0.0|(38478,[0,1,3,4,1...|[929.0,980.0,503....|[0.28228501975083...|       1.0|
|  0.0|(38478,[0,1,3,4,1...|[929.0,980.0,503....|[0.28228501975083...|       1.0|
|  0.0|(38478,[0,1,3,4,2...|[929.0,980.0,503....|[0.28228501975083...|       1.0|
|  0.0|(38478,[0,1,5,13,...|[46.0,8.0,5.0,36....|[0.45544554455445...|       0.0|
|  0.0|(38478,[0,1,5,15,...|[929.0,980.0,503....|[0.28228501975083...|       1.0|
|  0.0|(38478,[0,1,5,54,...|[0.0,1.0,0.0,7.0,...|[0.0,0.125,0.0,0....|       3.0|
|  0.0|(38478,[0

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Score the held-out predictions by plain accuracy. The evaluator's default
# metric is F1, so the metric is named explicitly to match the `accuracy`
# variable and the message printed in the next cell.
accuracy_evaluation = MulticlassClassificationEvaluator(metricName="accuracy")
# The predictions DataFrame is `test_result`; no `test_results` exists.
accuracy = accuracy_evaluation.evaluate(test_result)

In [None]:
# Report the evaluator's score for the test split.
print("Accuracy of the model is::", accuracy)

Accuracy of the model is:: 0.352999822339585
