<a href="https://colab.research.google.com/github/kartika-nair/ML-spark-streaming/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# INSTALLING SPARK

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

!tar xf spark-3.1.2-bin-hadoop3.2.tgz

!pip install -q findspark
!pip install pyspark



In [3]:
# SETTING SPARK ENV PATH

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

In [4]:
# FINDING SPARK IN SYSTEM

import findspark
findspark.init()

findspark.find()

'/content/spark-3.1.2-bin-hadoop3.2'

In [5]:
# STARTING SPARK SESSION

from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [6]:
spark

In [7]:
import pyspark

train_file = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/train.csv', header=True)
train_file.printSchema

<bound method DataFrame.printSchema of DataFrame[Sentiment: string, Tweet: string]>

In [8]:
num_rows = train_file.count()

train_file = train_file.dropDuplicates().dropna()

num_rows_noNull = train_file.count()

print(num_rows, num_rows_noNull)

1519999 1504818


In [15]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

tokenizer = Tokenizer(inputCol="Tweet", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')

idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
label_stringIdx = StringIndexer(inputCol = "Sentiment", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

(train_set, val_set) = train_file.randomSplit([0.98, 0.02], seed = 2000)

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.8536005802591876

In [41]:
# VECTOR ASSEMBLER

inputCols = [row[0] for row in train_set.select('Tweet').collect()]

from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols = inputCols, outputCol = "features").setHandleInvalid("skip")

In [42]:
# STANDARD SCALER

from pyspark.ml.feature import StandardScaler
stdScaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)

In [44]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(maxIter=10, regParam=0.1, featuresCol="scaledFeatures", labelCol='Sentiment')

from pyspark.ml import Pipeline
pipeline_lsvc = Pipeline(stages=[vecAssembler, stdScaler, lsvc])
pipelineModel_lsvc = pipeline_lsvc.fit(train_set)

IllegalArgumentException: ignored

In [None]:
predDF_lsvc = pipelineModel_lsvc.transform(test_set)
lr_accuracy = evaluator.evaluate(predDF_lsvc)

print(lr_accuracy)

In [46]:
train_set[train_set.Sentiment == 4].head(10)

[Row(Sentiment='4', Tweet=' - Iowa No. 2 in happy! Yea!'),
 Row(Sentiment='4', Tweet=" @kuttyedathi&gt; My 2 yr old boy is sleeping and the music plays ' if u r happy &amp; u know it clap ur hands ' and he is clapping!"),
 Row(Sentiment='4', Tweet=' Hows everybody.?'),
 Row(Sentiment='4', Tweet=' SMILING FACES.'),
 Row(Sentiment='4', Tweet=' Welcome to http://twitter.com/katuuu'),
 Row(Sentiment='4', Tweet=' had a good night, love my best buds in the world!!! adamcheeeserosie'),
 Row(Sentiment='4', Tweet=" hihooo(:  but,, i think i'm gonna be on my granda's house :3 haha  til late so... i'll be here few minutes :O"),
 Row(Sentiment='4', Tweet=' monday monday'),
 Row(Sentiment='4', Tweet=' smiling.everyone should try it.'),
 Row(Sentiment='4', Tweet='#Follow @MaryKateOlsen9 and @ashleyolsen7 love them so much u guys are my idols ')]

In [47]:
train_set[train_set.Sentiment == 0].head(10)

[Row(Sentiment='0', Tweet='      I must think about positive..'),
 Row(Sentiment='0', Tweet="  ''Love, save the empty''"),
 Row(Sentiment='0', Tweet='  hi nia im bored'),
 Row(Sentiment='0', Tweet=" #asylm J2 panel is over. Guess it's back to normal life."),
 Row(Sentiment='0', Tweet=" Alone in my room...again.. I'm bored.. "),
 Row(Sentiment='0', Tweet=" I'D RATHER BE IN THE  BAHAMAS!"),
 Row(Sentiment='0', Tweet=" I'll get home like 5pm today it will be a long day no hangover just my body is drain out n my legs hurt ;-( I'm dehrydrated"),
 Row(Sentiment='0', Tweet=' Mammoth cave here I come '),
 Row(Sentiment='0', Tweet=" My moodswings, nobody's online, meh.")]