In [1]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q findspark
!pip install -q pyspark

In [3]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [4]:
# Reuse the active SparkContext when one already exists so this cell can be
# re-run in a live kernel; a second bare SparkContext() would raise instead.
sc = SparkContext.getOrCreate()

In [5]:
# Wrap the SparkContext in a SQLContext; `sql.read` is used below to load the JSON reviews.
# NOTE(review): SQLContext looks like a legacy entry point in recent PySpark
# releases — consider SparkSession; confirm against the installed version.
sql = SQLContext(sc)

In [6]:
# Path of the reviews JSON file on the mounted Drive.
reviews_path = "/content/drive/MyDrive/PersonalProjects/reviews_Cell_Phones_and_Accessories_5.json"
# Load the file into a Spark DataFrame.
reviews = sql.read.json(reviews_path)

We have loaded the dataset. Now, we need to remove the unwanted columns so that we only have those columns that are needed to perform sentiment analysis. 

In [7]:
# Columns that play no part in the analysis below.
unused_columns = ('asin', 'helpful', 'summary', 'unixReviewTime', 'reviewTime')
# Drop them all in one call, then preview the first 25 remaining rows.
reviews = reviews.drop(*unused_columns)
reviews.show(25)

+-------+--------------------+--------------+--------------------+
|overall|          reviewText|    reviewerID|        reviewerName|
+-------+--------------------+--------------+--------------------+
|    4.0|They look good an...|A30TL5EWN6DFXT|           christina|
|    5.0|These stickers wo...| ASY55RVNIL0UD|            emily l.|
|    5.0|These are awesome...|A2TMXE2AFO7ONB|               Erica|
|    4.0|Item arrived in g...| AWJ0WZQYMYFQ4|                  JM|
|    5.0|awesome! stays on...| ATX7CZYFXI1KW|    patrice m rogoza|
|    3.0|These make using ...| APX47D16JOP7H|                 RLH|
|    5.0|Came just as desc...|A1JVVYYO7G56DS|         Tyler Evans|
|    1.0|it worked for the...| A6FGO4TBZ3QFZ|    Abdullah Albyati|
|    5.0|Good case, solid ...|A2JWEDW5FSVB0F|                Adam|
|    5.0|This is a fantast...| A8AJS1DW7L3JJ|     Agata Majchrzak|
|    5.0|this case fits pe...|A2YO4SCWAWNYBI|       Alex Maslakov|
|    5.0|This is the first...|A3AFELPYTZH90T|           Baja A

Let's re-order the columns so that it's easier to perform the sentiment analysis.

In [8]:
# Reviewer identity first, then the review text, with the rating (the label) last.
column_order = ['reviewerID', 'reviewerName', 'reviewText', 'overall']
reviews = reviews.select(*column_order)

In [9]:
# Preview the first 25 rows in the new column order.
reviews.show(n=25)

+--------------+--------------------+--------------------+-------+
|    reviewerID|        reviewerName|          reviewText|overall|
+--------------+--------------------+--------------------+-------+
|A30TL5EWN6DFXT|           christina|They look good an...|    4.0|
| ASY55RVNIL0UD|            emily l.|These stickers wo...|    5.0|
|A2TMXE2AFO7ONB|               Erica|These are awesome...|    5.0|
| AWJ0WZQYMYFQ4|                  JM|Item arrived in g...|    4.0|
| ATX7CZYFXI1KW|    patrice m rogoza|awesome! stays on...|    5.0|
| APX47D16JOP7H|                 RLH|These make using ...|    3.0|
|A1JVVYYO7G56DS|         Tyler Evans|Came just as desc...|    5.0|
| A6FGO4TBZ3QFZ|    Abdullah Albyati|it worked for the...|    1.0|
|A2JWEDW5FSVB0F|                Adam|Good case, solid ...|    5.0|
| A8AJS1DW7L3JJ|     Agata Majchrzak|This is a fantast...|    5.0|
|A2YO4SCWAWNYBI|       Alex Maslakov|this case fits pe...|    5.0|
|A3AFELPYTZH90T|           Baja Alan|This is the first...|    

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import TrainValidationSplit

Split the dataset into train and test datasets.

In [11]:
# 90% of the reviews go to training and 10% are held out for testing;
# the seed is fixed so the split can be repeated.
split_weights = [0.9, 0.1]
train, test = reviews.randomSplit(split_weights, seed=12345)

Let's create a pipeline. The steps for performing the sentiment analysis include first splitting the text into tokens and then hashing these tokens before using the hashes as the input for the logistic regression model. So instead of executing each step separately, we can chain them into a pipeline and fit the whole pipeline on the training data.

In [12]:
# Stage 1: split the review text into tokens. The output column is named
# "words" because it holds tokens, not model output; the fitted model adds its
# own `prediction` column, and a token column called "predictions" is easy to
# confuse with it.
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
# Stage 2: hash the tokens into the "features" vector column.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
# Stage 3: logistic regression that uses the star rating `overall` as its label.
lr = LogisticRegression(maxIter=10, regParam=0.001, labelCol='overall')
# Chain the three stages so they are fitted and applied as a single unit.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [13]:
# Fit every pipeline stage on the training split; the fitted pipeline is kept in `model`.
model = pipeline.fit(train)

In [14]:
# Keep just the true star ratings of the test split.
# NOTE(review): `actual` is not referenced by any later cell shown here —
# confirm whether it is still needed.
actual = test.select('overall')

Apply the pipeline model to transform the testing dataset.

In [15]:
# Run the fitted pipeline over the held-out split; the result keeps the input
# columns and adds rawPrediction, probability and prediction columns.
prediction = model.transform(test)

In [16]:
# Preview the scored test rows.
prediction.show()

+--------------------+------------------+--------------------+-------+--------------------+--------------------+--------------------+--------------------+----------+
|          reviewerID|      reviewerName|          reviewText|overall|         predictions|            features|       rawPrediction|         probability|prediction|
+--------------------+------------------+--------------------+-------+--------------------+--------------------+--------------------+--------------------+----------+
|A01623621DS97QCLQ...|    Paul A. Nation|this item was bro...|    1.0|[this, item, was,...|(262144,[11941,12...|[-8.4089477788623...|[6.40048219047167...|       1.0|
|A01623621DS97QCLQ...|    Paul A. Nation|very nice keeps m...|    5.0|[very, nice, keep...|(262144,[5381,223...|[-8.4089304325109...|[2.76422568650299...|       5.0|
|A027168223K80PQU4...|      brooke rieth|all of them came ...|    5.0|[all, of, them, c...|(262144,[19036,27...|[-8.4092453849753...|[1.06956781429084...|       5.0|
|A03

In [17]:
# Number of scored test rows; used later as the denominator of the error rate.
total = prediction.count()

In [18]:
# Misclassification count, initialised to zero before the test rows are checked.
error = 0

In [19]:
# Count misclassified test rows inside Spark instead of collecting every row
# (including the wide feature vectors) to the driver. Assigning the result,
# rather than incrementing a counter, keeps this cell correct when it is re-run.
# A row is an error when the predicted rating is more than 0.2 away from the
# true rating; both appear to be discrete labels, so the tolerance only
# absorbs float representation noise.
mismatch = (
    (prediction['prediction'] < prediction['overall'] - 0.2)
    | (prediction['prediction'] > prediction['overall'] + 0.2)
)
error = prediction.filter(mismatch).count()


In [20]:
# Turn the misclassification count into a rate without overwriting `error`:
# the original in-place `error /= total` divided again on every re-run of this
# cell, silently inflating the reported accuracy.
error_rate = error / total
# Accuracy is the share of test rows that were not counted as errors.
accuracy = 1 - error_rate

In [22]:
# Display the fraction of test rows whose predicted rating matched the true rating.
accuracy

0.9592374278647981
