# Setup

### General Imports

In [1]:
import numpy as np
import pandas as pd
import gc
import time
import warnings
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib.gridspec as gridspec

# General settings
start_time = time.time()
color = sns.color_palette()
warnings.filterwarnings("ignore")

%matplotlib inline

### Spark Imports

In [10]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

In [6]:
spark = SparkSession.builder \
  .appName('Yelp! Sentiment Analysis')\
  .getOrCreate()

# Sentiment Analysis

In [7]:
df_review = spark.read.json("gs://ca4022_yelp_data/data_raw/yelp_academic_dataset_review.json")

In [8]:
df_review.limit(10).toPandas()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,-MhfebM0QIsKt87iDN-FNw,0,2015-04-15 05:21:16,0,xQY8N_XvtGbearJ5X4QryQ,2.0,"As someone who has worked with many museums, I...",5,OwjRMXRC0KyPrIlcjaXeFQ
1,lbrU8StCq3yDfr-QMnGrmQ,0,2013-12-07 03:16:52,1,UmFMZ8PyXZTY2QcwzsfQYA,1.0,I am actually horrified this place is still in...,1,nIJD_7ZXHq-FX8byPMOkMQ
2,HQl28KMwrEKHqhFrrDqVNQ,0,2015-12-05 03:18:11,0,LG2ZaYiOgpr2DK_90pYjNw,5.0,I love Deagan's. I do. I really do. The atmosp...,1,V34qejxNsCbcgD8C0HVk-Q
3,5JxlZaqCnk1MnbgRirs40Q,0,2011-05-27 05:30:52,0,i6g_oA9Yf9Y31qt0wibXpw,1.0,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",0,ofKDkJKXSKZXu5xJNGiiBQ
4,IS4cv902ykd8wj1TR0N3-A,0,2017-01-14 21:56:57,0,6TdNDKywdbjoTkizeMce8A,4.0,"Oh happy day, finally have a Canes near my cas...",0,UgMW8bLE0QMJDCkQ1Ax5Mg
5,nlxHRv1zXGT0c0K51q3jDg,0,2013-05-07 07:25:25,0,L2O_INwlrRuoX05KSjc4eg,5.0,This is definitely my favorite fast food sub s...,2,5vD2kmE25YBrbayKhykNxQ
6,Pthe4qk5xh4n-ef-9bvMSg,0,2015-11-05 23:11:05,0,ZayJ1zWyWgY9S_TRLT_y9Q,5.0,"Really good place with simple decor, amazing f...",1,aq_ZxGHiri48TUXJlpRkCQ
7,FNCJpSn0tL9iqoY3JC73qw,0,2017-07-18 18:31:54,0,lpFIJYpsvDxyph-kPzZ6aA,5.0,"Awesome office and staff, very professional an...",0,dsd-KNYKMpx6ma_sRWCSkQ
8,e_BiI4ej1CW1F0EyVLr-FQ,0,2015-02-16 06:48:47,0,JA-xnyHytKiOIHl_ztnK9Q,5.0,Most delicious authentic Italian I've had in t...,0,P6apihD4ASf1vpPxHODxAQ
9,Ws8V970-mQt2X9CwCuT5zw,1,2009-10-13 04:16:41,0,z4BCgTkfNtCu4XY5Lp97ww,4.0,I have been here twice. Very nice and laid bac...,3,jOERvhmK6_lo_XGUBPws_w


In [24]:
df_review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In order to carry out sentiment analysis on reviews, we need to frame the problem. In this instance, we are assuming that the sentiment of the review is reflected by the star rating given, which should be the case naturally.

Therefore, for an unseen review, we want to be able to predict what the sentiment of the review is and thus, the star rating. To simplify the problem, we assume:
* A review will have a **positive** sentiment if a star rating of [4.0, 4.5, 5.0] is present.
* A review will have a **neutral** sentiment if a star rating of [2,5, 3.0, 3.5] is present.
* A review will have a **negative** sentiment if a star rating of [1.0, 1.5, 2.0] is present.

These sentiments will be denoted as follows:
* **positive** $\Leftrightarrow$ 1
* **neutral** $\Leftrightarrow$ 0
* **negative** $\Leftrightarrow$ -1

In order to prepare the data for this experiment, we simply need the review itself and then a sentiment score based on the star rating present. Essentially, the question which we are seeking to answer is: *is the star rating associated with a review an accurate representation of the sentiment of the review itself?*

In [14]:
df_sent = df_review.select(["text", "stars"])

In [18]:
df_sent = df_sent.na.drop()

In [20]:
df_sent.limit(10).toPandas()

Unnamed: 0,text,stars
0,"As someone who has worked with many museums, I...",2.0
1,I am actually horrified this place is still in...,1.0
2,I love Deagan's. I do. I really do. The atmosp...,5.0
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",1.0
4,"Oh happy day, finally have a Canes near my cas...",4.0
5,This is definitely my favorite fast food sub s...,5.0
6,"Really good place with simple decor, amazing f...",5.0
7,"Awesome office and staff, very professional an...",5.0
8,Most delicious authentic Italian I've had in t...,5.0
9,I have been here twice. Very nice and laid bac...,4.0


Now, we need a function to convert the star ratings to sentiment scores.

In [27]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def stars_to_sent(rating):
    if rating > 3.5:
        return 1
    elif rating < 2.5:
        return -1
    else:
        return 0

udfStarsToSent = udf(stars_to_sent, IntegerType())

In [28]:
df_sent = df_sent.withColumn("sentiment", udfStarsToSent("stars"))

In [29]:
df_sent.limit(10).toPandas()

Unnamed: 0,text,stars,sentiment
0,"As someone who has worked with many museums, I...",2.0,-1
1,I am actually horrified this place is still in...,1.0,-1
2,I love Deagan's. I do. I really do. The atmosp...,5.0,1
3,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",1.0,-1
4,"Oh happy day, finally have a Canes near my cas...",4.0,1
5,This is definitely my favorite fast food sub s...,5.0,1
6,"Really good place with simple decor, amazing f...",5.0,1
7,"Awesome office and staff, very professional an...",5.0,1
8,Most delicious authentic Italian I've had in t...,5.0,1
9,I have been here twice. Very nice and laid bac...,4.0,1


Due to the masssive size of the dataset (over 8 million reviews), and the long training time this would result in for the sentiment analysis task, we decided to take a random sample of 20% from the original dataset. The data is then split into the train, validation and test sets. Due to the fact that the dataset is still so large, we only require 1% of the data for validation and a further 1% for testing.

In [33]:
# Random 20% sample
df_samp = df_sent.sample(True, 0.2, seed=0)

In [34]:
(train_set, val_set, test_set) = df_samp.randomSplit([0.98, 0.01, 0.01], seed = 445)

The size of each set is shown below...

In [39]:
print("Train: ", train_set.count())
print("Validation: ", val_set.count())
print("Test: ", test_set.count())

Train:  1571827
Validation:  16189
Test:  15884


### HashingTF for tf-idf Calculation with Logistic Regression

After a bit of research, we identified a number of approaches which can be used to perform sentiment analysis. One of which is utilisng tf-idf with a Logistic Regression model. This involves:
1. Tokenising the review text.
2. Using a Hashing term frequency (tf) function to calculate the frequency of terms.
3. Computing the inverse document frequency (idf) of each term.
4. Mapping the sentiment labels.

Spark has a useful Pipeline functionality which enables us to chain these processing steps together.

In [35]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.limit(10).toPandas()

Unnamed: 0,text,stars,sentiment,words,tf,features,label
0,!!! STAY AWAY!!!!\nTheir name says it all as i...,1.0,-1,"[!!!, stay, away!!!!, their, name, says, it, a...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
1,"""A Night at the Roxbury"" of Japanese restauran...",2.0,-1,"[""a, night, at, the, roxbury"", of, japanese, r...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
2,"""All of THAT ...and a bag of chips""!!\nThis pl...",5.0,1,"[""all, of, that, ...and, a, bag, of, chips""!!,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
3,"""Authority"" keeps guard here, so this must be ...",1.0,-1,"[""authority"", keeps, guard, here,, so, this, m...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
4,"""Each chicken, feed some people""- Their slogan...",5.0,1,"[""each, chicken,, feed, some, people""-, their,...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
5,"""Ew! Alan, did you just eat sofa pizza?""\n-Stu...",5.0,1,"[""ew!, alan,, did, you, just, eat, sofa, pizza...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
6,"""First timers!"" Multiple servers yelled as the...",5.0,1,"[""first, timers!"", multiple, servers, yelled, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
7,"""Fun atmosphere, good cocktails, and yummy smo...",4.0,1,"[""fun, atmosphere,, good, cocktails,, and, yum...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
8,"""Hey! Hey! Hey!\n""I don't like walking around ...",3.0,0,"[""hey!, hey!, hey!, ""i, don't, like, walking, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2.0
9,"""Holy cow, this man just gave me 'f**k me hair...",5.0,1,"[""holy, cow,, this, man, just, gave, me, 'f**k...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


Some sample processed reviews are shown above. Note however that these are not representative of how the tf and idf values should look due to how the DataFrame is sorted (i.e. exclamation marks and quotations come first alphabetically).

The next step is to train our Logistic Regression model and see how it fares in predicting the sentiment of unseen reviews.

In [36]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=50)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.9070288452310928

By default, Spark returns the area under ROC curve. A simpler evaluation metric is accuracy which is calculated below...

In [37]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
accuracy

0.8624374575328927

So, based on the unseen reviews in the test set, the classifier predicts the sentiment of these reviews with an accuracy of 86%, which is very good! This gives us an early indication that the stars ratings correlate well with the sentiment of the review.

### CountVectorizer for tf-idf Calculation with Logistic Regression

tf-idf can also be calculated in another manner using the CountVectorizer, available in the SparkML package. The CountVectorizer differs from the HashingTF approach in how it filters the features. HashingTF employs a dimensionality reduction technique which allows the possibility of collisons. Conversely, the CountVectorizer approach discards infrequent tokens. 

Below, we evaluate the performance of Logistic Regression using the CountVectorizer approach.

In [None]:
%%time
from pyspark.ml.feature import CountVectorizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "sentiment", outputCol = "label")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(val_set.count())
roc_auc = evaluator.evaluate(predictions)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.8646
ROC-AUC: 0.9085
CPU times: user 761 ms, sys: 305 ms, total: 1.07 s
Wall time: 6min 56s


From the above results, we can see that the CountVectorizer approach has marginally improved the performance of the Logistic Regression model.

### N-gram Implementation

Another way of embedding the text from the reviews is through the use of n-grams. The N-gram implementation is slightly more difficult through Spark than with scikit-learn. Originally, approximately 16,000 features for each of unigram, bigram and trigram were extracted (A VectorAssembler is used in the pipeline to combine these features). Then, these features were reduced using Chi-Squared feature selection. However, the training time for this approach was far too long.

As a result, we decided to extract 5,460 features each from unigram, bigram and trigram firstly, and then we would have 16,000 features without having to perform the cumbersome Chi-Squared feature selection. Using this approach, training time dropped drastically but similar accuracy was achieved, making it a preferable approach.

In [46]:
from pyspark.ml.feature import NGram, VectorAssembler

def build_ngrams_wocs(inputCol=["text","sentiment"], n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=5460,inputCol="{0}_grams".format(i),
            outputCol="{0}_tf".format(i))
        for i in range(1, n + 1)
    ]
    idf = [IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1)]

    assembler = [VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]
    label_stringIdx = [StringIndexer(inputCol = "sentiment", outputCol = "label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf+ assembler + label_stringIdx+lr)

In [None]:
%%time
trigramwocs_pipelineFit = build_ngrams_wocs().fit(train_set)
predictions_wocs = trigramwocs_pipelineFit.transform(val_set)
accuracy_wocs = predictions_wocs.filter(predictions_wocs.label == predictions_wocs.prediction).count() / float(val_set.count())
roc_auc_wocs = evaluator.evaluate(predictions_wocs)

# print accuracy, roc_auc
print("Accuracy Score: {0:.4f}".format(accuracy_wocs))
print("ROC-AUC: {0:.4f}".format(roc_auc_wocs))

Accuracy Score: 0.8784
ROC-AUC: 0.9265
CPU times: user 1.07 s, sys: 497 ms, total: 1.56 s
Wall time: 38min 20s


So, again, slightly better accuracy than the previous algorithms.

### Evaluation

The final step is to run the algorithm with the highest accuracy on the test set to get a final accuracy result.

In [51]:
test_predictions = trigramwocs_pipelineFit.transform(test_set)
test_accuracy = test_predictions.filter(test_predictions.label == test_predictions.prediction).count() / float(test_set.count())
test_roc_auc = evaluator.evaluate(test_predictions)

# print accuracy, roc_auc
print("Accuracy Score: {0:.4f}".format(test_accuracy))
print("ROC-AUC: {0:.4f}".format(test_roc_auc))

Accuracy Score: 0.8806
ROC-AUC: 0.9291


The final accuracy result on the test set was 88.06% which is very good!