### Requirement: Build a reviewer filter. Use the various NLP tools and a new classifier, Naive Bayes to predict if one review text is like (overall >=4), don't like (overall <=2), neutral (2<overall<4).

In [1]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, when, col, explode, array, lit

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('nlp_musical').getOrCreate()

In [5]:
# Load data
data = spark.read.json("../../Data/Musical_Instruments_5.json")

In [6]:
data.show(5)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [7]:
data = data.withColumn('class', when(data.overall >=4, 'like').when(data.overall <=2, 'not_like').otherwise("neutral"))

In [8]:
data = data.select('reviewText', 'overall', 'class')

### Clean and prepare the data

In [9]:
data = data.withColumn('length', length(data['reviewText']))

In [10]:
data.show(5)

+--------------------+-------+-----+------+
|          reviewText|overall|class|length|
+--------------------+-------+-----+------+
|Not much to write...|    5.0| like|   268|
|The product does ...|    5.0| like|   544|
|The primary job o...|    5.0| like|   436|
|Nice windscreen p...|    5.0| like|   206|
|This pop filter i...|    5.0| like|   159|
+--------------------+-------+-----+------+
only showing top 5 rows



In [11]:
# Pretty Clear Difference
data.groupby('class').count().show()

+--------+-----+
|   class|count|
+--------+-----+
|not_like|  467|
| neutral|  772|
|    like| 9022|
+--------+-----+



### Feature Transformation

In [12]:
class_to_num = StringIndexer(inputCol='class', outputCol='label')
tokenizer = Tokenizer(inputCol='reviewText', outputCol='token_text')
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [13]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

### The model

In [14]:
nb = NaiveBayes()

### Pipeline

In [15]:
data_prep_pipe = Pipeline(stages=[class_to_num, tokenizer, stopremove, count_vec, idf, clean_up])

In [16]:
clearer = data_prep_pipe.fit(data)

In [17]:
clean_data = clearer.transform(data)

### Traing and Evaluation

In [18]:
clean_data = clean_data.select(['label', 'features'])

In [19]:
clean_data.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[3,12,14,3...|
|  0.0|(51949,[2,3,12,16...|
|  0.0|(51949,[11,19,44,...|
|  0.0|(51949,[18,37,57,...|
|  0.0|(51949,[2,122,132...|
|  0.0|(51949,[0,5,15,21...|
|  0.0|(51949,[5,16,29,1...|
|  1.0|(51949,[1,3,4,8,1...|
|  0.0|(51949,[0,3,12,33...|
|  0.0|(51949,[1,6,15,52...|
+-----+--------------------+
only showing top 10 rows



In [20]:
(training, testing) = clean_data.randomSplit([0.7, 0.3])

In [21]:
training.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6303|
|  1.0|  557|
|  2.0|  318|
+-----+-----+



In [22]:
testing.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 2719|
|  1.0|  215|
|  2.0|  149|
+-----+-----+



In [23]:
predictor = nb.fit(training)

In [24]:
data.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- class: string (nullable = false)
 |-- length: integer (nullable = true)



In [25]:
test_results = predictor.transform(testing)

In [26]:
test_results.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(51949,[0],[1.025...|[-6.4391482826490...|[0.88759228226343...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-11931.040260643...|[1.0,1.0538841179...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-7697.9785301232...|[1.0,1.3564401283...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-37925.988998147...|[1.77897342503172...|       1.0|
|  0.0|(51949,[0,1,2,3,4...|[-12955.590556716...|[2.61968636701946...|       2.0|
|  0.0|(51949,[0,1,2,3,4...|[-19081.867903993...|[8.181824552061E-...|       2.0|
|  0.0|(51949,[0,1,2,3,4...|[-10820.847583772...|[1.0,1.2526855708...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-3676.5065587054...|[1.0,1.1192537795...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-7672.7682272594...|[1.68965789743225...|       1.0|
|  0.0|(51949,[0

In [27]:
test_results.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   64|
|  1.0|       1.0|   64|
|  0.0|       1.0|  546|
|  1.0|       0.0|  134|
|  2.0|       2.0|   38|
|  2.0|       1.0|   47|
|  1.0|       2.0|   17|
|  0.0|       0.0| 1962|
|  0.0|       2.0|  211|
+-----+----------+-----+



In [28]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting was: {}".format(acc))

Accuracy of model at predicting was: 0.7283937514311138


#### Not very good result, try LogisticRegression or RandomForestClassifier

### Logistic Regression

In [29]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [30]:
predictor_lg = lg.fit(training)

In [31]:
test_result_lg = predictor_lg.transform(testing)

In [32]:
test_result_lg.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  146|
|  1.0|       1.0|    3|
|  0.0|       1.0|    2|
|  1.0|       0.0|  210|
|  2.0|       2.0|    2|
|  2.0|       1.0|    1|
|  1.0|       2.0|    2|
|  0.0|       0.0| 2716|
|  0.0|       2.0|    1|
+-----+----------+-----+



In [33]:
acc_lg = acc_eval.evaluate(test_result_lg)
print("Accuracy of model at predicting was: {}".format(acc_lg))

Accuracy of model at predicting was: 0.830408353503968


#### Higher but not better result

### Random Forest

In [34]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=500, maxDepth=5, maxBins=64)

In [35]:
predictor_rf = rf.fit(training)

In [36]:
test_result_rf = predictor_rf.transform(testing)

In [37]:
test_result_rf.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  149|
|  1.0|       0.0|  215|
|  0.0|       0.0| 2719|
+-----+----------+-----+



In [38]:
test_result_rf.groupBy("prediction").count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 3083|
+----------+-----+



In [39]:
acc_rf = acc_eval.evaluate(test_result_rf)
print("Accuracy of model at predicting was: {}".format(acc_rf))

Accuracy of model at predicting was: 0.8266033511770132


#### Higher accuracy but too bad result

# Need to resample

In [40]:
like_df = training.filter(col("label")==0)
neutral_df = training.filter(col("label")==1)
notlike_df = training.filter(col("label")==2)

ratio_1 = int(like_df.count()/neutral_df.count())
ratio_2 = int(like_df.count()/notlike_df.count())

print("ratio like/neutral {}".format(ratio_1))
print("ratio like/not like {}".format(ratio_2))

ratio like/neutral 11
ratio like/not like 19


In [41]:
# resample neutral
a1 = range(ratio_1)
# duplicate the minority rows
oversampled_neutral_df = neutral_df.withColumn("dummy", explode(array([lit(x) for x in a1]))).drop('dummy')

In [42]:
# combine both oversampled minority rows and previous majority rows
combined_df = like_df.unionAll(oversampled_neutral_df)
combined_df.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[0],[1.025...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
+-----+--------------------+
only showing top 10 rows



In [43]:
combined_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6303|
|  1.0| 6127|
+-----+-----+



In [44]:
# resample not like
a2 = range(ratio_2)
# duplicate the minority rows
oversampled_notlike_df = notlike_df.withColumn("dummy", explode(array([lit(x) for x in a2]))).drop('dummy')

In [45]:
# combine both oversampled minority rows and previous majority rows
combined_df = combined_df.unionAll(oversampled_notlike_df)
combined_df.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[0],[1.025...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
+-----+--------------------+
only showing top 10 rows



In [46]:
combined_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6303|
|  1.0| 6127|
|  2.0| 6042|
+-----+-----+



### Naive Bayer

In [47]:
predictor_nb2 = nb.fit(combined_df)
test_result_nb2 = predictor_nb2.transform(testing)
test_result_nb2.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  116|
|  1.0|       1.0|   31|
|  0.0|       1.0|  161|
|  1.0|       0.0|  180|
|  2.0|       2.0|   19|
|  2.0|       1.0|   14|
|  1.0|       2.0|    4|
|  0.0|       0.0| 2510|
|  0.0|       2.0|   48|
+-----+----------+-----+



In [48]:
acc_nb2 = acc_eval.evaluate(test_result_nb2)
print("Accuracy of model at predicting was: {}".format(acc_nb2))

Accuracy of model at predicting was: 0.8199400225671071


### Logistic Regression

In [49]:
predictor_lg2 = lg.fit(combined_df)
test_result_lg2 = predictor_lg2.transform(testing)
test_result_lg2.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  119|
|  1.0|       1.0|   20|
|  0.0|       1.0|   75|
|  1.0|       0.0|  190|
|  2.0|       2.0|   13|
|  2.0|       1.0|   17|
|  1.0|       2.0|    5|
|  0.0|       0.0| 2629|
|  0.0|       2.0|   15|
+-----+----------+-----+



In [50]:
acc_lg2 = acc_eval.evaluate(test_result_lg2)
print("Accuracy of model at predicting was: {}".format(acc_lg2))

Accuracy of model at predicting was: 0.8351633706419184


### Random Forest

In [51]:
predictor_rf2 = lg.fit(combined_df)
test_result_rf2 = predictor_rf2.transform(testing)
test_result_rf2.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  119|
|  1.0|       1.0|   20|
|  0.0|       1.0|   75|
|  1.0|       0.0|  190|
|  2.0|       2.0|   13|
|  2.0|       1.0|   17|
|  1.0|       2.0|    5|
|  0.0|       0.0| 2629|
|  0.0|       2.0|   15|
+-----+----------+-----+



In [52]:
acc_rf2 = acc_eval.evaluate(test_result_rf2)
print("Accuracy of model at predicting was: {}".format(acc_rf2))

Accuracy of model at predicting was: 0.8351633706419184
