In [50]:
import findspark
findspark.init('directory_to_spark_installation')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

data = spark.read.csv('Restaurant_Reviews.tsv',
                      inferSchema = True, 
                      header = True, # no header defined in the data
                     sep = '\t' # since the document is tab-separated
                     )
data.show()

+--------------------+-----+
|              Review|Liked|
+--------------------+-----+
|Wow... Loved this...|    1|
|  Crust is not good.|    0|
|Not tasty and the...|    0|
|Stopped by during...|    1|
|The selection on ...|    1|
|Now I am getting ...|    0|
|Honeslty it didn'...|    0|
|The potatoes were...|    0|
|The fries were gr...|    1|
|      A great touch.|    1|
|Service was very ...|    1|
|  Would not go back.|    0|
|The cashier had n...|    0|
|I tried the Cape ...|    1|
|I was disgusted b...|    0|
|I was shocked bec...|    0|
| Highly recommended.|    1|
|Waitress was a li...|    0|
|This place is not...|    0|
|did not like at all.|    0|
+--------------------+-----+
only showing top 20 rows



### Data Preprocessing

In [51]:
from pyspark.sql.functions import length
data = data.withColumn('Length', length(data['Review']))
data.show()

+--------------------+-----+------+
|              Review|Liked|Length|
+--------------------+-----+------+
|Wow... Loved this...|    1|    24|
|  Crust is not good.|    0|    18|
|Not tasty and the...|    0|    41|
|Stopped by during...|    1|    87|
|The selection on ...|    1|    59|
|Now I am getting ...|    0|    46|
|Honeslty it didn'...|    0|    37|
|The potatoes were...|    0|   111|
|The fries were gr...|    1|    25|
|      A great touch.|    1|    14|
|Service was very ...|    1|    24|
|  Would not go back.|    0|    18|
|The cashier had n...|    0|    99|
|I tried the Cape ...|    1|    60|
|I was disgusted b...|    0|    62|
|I was shocked bec...|    0|    50|
| Highly recommended.|    1|    19|
|Waitress was a li...|    0|    38|
|This place is not...|    0|    51|
|did not like at all.|    0|    20|
+--------------------+-----+------+
only showing top 20 rows



In [5]:
data.groupBy('Liked').mean().show()

+-----+----------+-----------+
|Liked|avg(Liked)|avg(length)|
+-----+----------+-----------+
|    1|       1.0|      55.88|
|    0|       0.0|      60.75|
+-----+----------+-----------+



*** This shows that simply the length of the review does not tell whether a customer like the restaurant or not ***

*** Moving Forward ***

In [55]:
# Removing punctuations and converting to lowercase
from pyspark.sql.functions import regexp_replace, lower

clean_data = data.select('Liked', lower(regexp_replace('Review', '[^a-zA-Z]', ' ')).alias('Clean_Review'))
clean_data.show(truncate = False)

+-----+---------------------------------------------------------------------------------------------------------------+
|Liked|Clean_Review                                                                                                   |
+-----+---------------------------------------------------------------------------------------------------------------+
|1    |wow    loved this place                                                                                        |
|0    |crust is not good                                                                                              |
|0    |not tasty and the texture was just nasty                                                                       |
|1    |stopped by during the late may bank holiday off rick steve recommendation and loved it                         |
|1    |the selection on the menu was great and so were the prices                                                     |
|0    |now i am getting angry and i want

### Feature Transformation

In [58]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF

tokenizer = Tokenizer(inputCol = 'Clean_Review', outputCol = 'tokenized_review')
stop_remove = StopWordsRemover(inputCol = 'tokenized_review', outputCol = 'stop_review')
count_vec = CountVectorizer(inputCol = 'stop_review', outputCol = 'vectorized_review')
idf = IDF(inputCol = 'vectorized_review', outputCol = 'idf_review')

In [59]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['idf_review'], outputCol = 'features')

In [65]:
# Let's create a pipeline with all the objects defined above
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[tokenizer, stop_remove, count_vec, idf, assembler])
final_data = pipeline.fit(clean_data).transform(clean_data)
final_data = final_data.select(['Liked', 'features'])
final_data = final_data.withColumnRenamed('Liked', 'label')
final_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|(1891,[0,2,91,369...|
|    0|(1891,[3,511],[2....|
|    0|(1891,[81,375,447...|
|    1|(1891,[91,220,338...|
|    1|(1891,[5,49,97,98...|
|    0|(1891,[59,133,176...|
|    0|(1891,[58,62,1726...|
|    0|(1891,[7,9,41,44,...|
|    1|(1891,[5,103],[2....|
|    1|(1891,[5,600],[2....|
|    1|(1891,[4,1058],[2...|
|    0|(1891,[6,8,12],[2...|
|    0|(1891,[13,42,74,1...|
|    1|(1891,[0,40,163,5...|
|    0|(1891,[24,130,497...|
|    0|(1891,[1015,1033,...|
|    1|(1891,[295,355],[...|
|    0|(1891,[4,73,86,11...|
|    0|(1891,[0,2,9,22,8...|
|    0|(1891,[7],[3.1020...|
+-----+--------------------+
only showing top 20 rows



### Training and Evaluation

In [66]:
# Naive bayes model is very common to use with NLP
from pyspark.ml.classification import NaiveBayes

# Splitting the data into train set and test set
train_data, test_data = final_data.randomSplit([0.8, 0.2])

# Fitting the classifier model
nb = NaiveBayes()
nbModel = nb.fit(train_data)

# Making prediction using text data and trained model
result = nbModel.transform(test_data)
result.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|(1891,[0,1,2,4,56...|[-210.84794317662...|[0.99999999999989...|       0.0|
|    0|(1891,[0,1,4,73,8...|[-341.30044283886...|[1.0,2.0656000869...|       0.0|
|    0|(1891,[0,1,7,18,2...|[-505.22567187105...|[1.0,3.8578479560...|       0.0|
|    0|(1891,[0,1,10,45,...|[-218.88000634057...|[1.54057614300251...|       1.0|
|    0|(1891,[0,1,31,57,...|[-154.63041719070...|[0.99981339687064...|       0.0|
|    0|(1891,[0,1,33,35,...|[-568.62079875290...|[0.99996648951684...|       0.0|
|    0|(1891,[0,1,33,60,...|[-299.85607601610...|[1.0,2.7042994614...|       0.0|
|    0|(1891,[0,1,44,282...|[-224.67092040834...|[0.99999999672894...|       0.0|
|    0|(1891,[0,2,6,75,1...|[-195.75889691146...|[0.98874269190624...|       0.0|
|    0|(1891,[0,

In [67]:
# let's evaluate othe performance of our model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

eval1 = MulticlassClassificationEvaluator()
accuracy = eval1.evaluate(result)
print("Accuracy of model at predicting spam was: {}".format(accuracy))

Accuracy of model at predicting spam was: 0.7046917595293892
