In [1]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from pyspark.ml.linalg import Vectors
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [3]:
# Start the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName('Project').getOrCreate()

# Load the pipe-delimited reviews table. The first row supplies the column
# names and the column types are inferred from the data.
dataset = spark.read.csv("reviews.tbl", inferSchema=True, header=True, sep='|')

# Expose the table to Spark SQL. createOrReplaceTempView keeps this cell
# re-runnable: plain createTempView raises if "product_reviews" is already
# registered in the session.
dataset.createOrReplaceTempView("product_reviews")

In [4]:
# Labelling query: ratings 1-3 become label '0', ratings 4-5 become label '1'.
# Only reviews whose id modulo 5 is 1, 2 or 3 are selected.
# The adjacent literals concatenate into one single-line SQL statement.
q = (
    "SELECT CASE pr_rating"
    " WHEN 1 THEN '0' WHEN 2 THEN '0' WHEN 3 THEN '0'"
    " WHEN 4 THEN '1' WHEN 5 THEN '1'"
    " END AS pr_r_rating, pr_content"
    " FROM product_reviews"
    " WHERE pmod(pr_review_id, 5) IN (1,2,3)"
)

In [5]:
# Run the labelling query, then rename its two output columns
# (pr_r_rating, pr_content) to the names the following cells use.
labelled_reviews = spark.sql(q)
df = labelled_reviews.toDF("label", "sentence")

In [6]:
# Turn each review's text ("sentence") into an array of string tokens ("words").
tokenizer = Tokenizer(
    outputCol="words",
    inputCol="sentence",
)
wordsData = tokenizer.transform(dataset=df)

In [7]:
# Hashing term-frequency stage: maps the token arrays in "words" to sparse
# count vectors in "rawFeatures".
# 262144 is HashingTF's default width, written out because it is the vector
# size that appears in the printed feature columns further down.
hashingTF = HashingTF(
    numFeatures=262144,
    inputCol="words",
    outputCol="rawFeatures",
)

In [8]:
# Add the "rawFeatures" term-frequency column to the tokenized reviews.
featurizedData = hashingTF.transform(dataset=wordsData)

In [9]:
# IDF estimator: reads the raw term counts in "rawFeatures" and writes the
# re-weighted vectors to "features".
idf = IDF(
    outputCol="features",
    inputCol="rawFeatures",
)

In [10]:
# Learn the IDF weights from every featurized row.
# NOTE(review): this fit runs before the train/test split further down, so
# rows that later land in the test set also contribute to the weights.
idfModel = idf.fit(dataset=featurizedData)

In [11]:
# Apply the fitted IDF model, which adds the "features" column.
rescaledData = idfModel.transform(dataset=featurizedData)

In [13]:
# Preview each label next to its review text. The arguments spell out show()'s
# defaults: the first 20 rows, with long strings truncated.
rescaledData.select("label", "sentence").show(n=20, truncate=True)

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|    0|attainments are; ...|
|    0|fluffily ironic e...|
|    0|dolphins about ov...|
|    0|bullshit must sub...|
|    0|silent decline or...|
|    0|bold platelets ex...|
|    0|special sustainab...|
|    1|quick winner afte...|
|    0|frays doze whitho...|
|    0|dolphins in place...|
|    0|bravely permanent...|
|    1|ruthlessly risk-f...|
|    0|best-performing d...|
|    0|orbits need to in...|
|    0|idle patience cou...|
|    0|busy deny tithes ...|
|    0|quick brave notor...|
|    0|sheaves will have...|
|    0|daringly fluffy f...|
|    0|carefully express...|
+-----+--------------------+
only showing top 20 rows



In [14]:
# Print the column names and types of the featurized DataFrame, to confirm
# which columns the pipeline stages added and what type "label" has.
rescaledData.printSchema()

root
 |-- label: string (nullable = true)
 |-- sentence: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Keep only what the classifier needs: the label converted from string to
# double, and the "features" vector.
# This rebinds `df`, which until now held the (label, sentence) rows.
numeric_label = rescaledData["label"].cast("double")
df = rescaledData.select(numeric_label, rescaledData["features"])

In [16]:
# Preview the model input (numeric label plus sparse feature vector).
# The arguments are show()'s defaults, written out explicitly.
df.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262144,[12925,50...|
|  0.0|(262144,[61231,68...|
|  0.0|(262144,[10300,18...|
|  0.0|(262144,[11209,70...|
|  0.0|(262144,[20779,27...|
|  0.0|(262144,[5987,612...|
|  0.0|(262144,[25416,25...|
|  1.0|(262144,[12925,22...|
|  0.0|(262144,[18375,21...|
|  0.0|(262144,[9639,183...|
|  0.0|(262144,[18391,28...|
|  1.0|(262144,[25416,11...|
|  0.0|(262144,[6504,104...|
|  0.0|(262144,[20497,20...|
|  0.0|(262144,[40140,10...|
|  0.0|(262144,[329,1292...|
|  0.0|(262144,[9639,223...|
|  0.0|(262144,[329,2813...|
|  0.0|(262144,[21683,37...|
|  0.0|(262144,[329,1540...|
+-----+--------------------+
only showing top 20 rows



In [17]:
# Split the rows roughly 60% training / 40% test.
# A fixed seed makes the split, and therefore the reported accuracy,
# repeatable across runs. Without it every run samples differently.
# NOTE(review): repeatability also assumes the input rows and their
# partitioning are the same on each run; confirm for this data source.
training, test = df.randomSplit([0.6, 0.4], seed=42)

In [18]:
# Multinomial Naive Bayes classifier with smoothing set to 1.0.
nb = NaiveBayes(
    modelType='multinomial',
    smoothing=1.0,
)

In [19]:
# Train the classifier on the training portion of the split.
model = nb.fit(dataset=training)

In [20]:
# Score the held-out rows. The result gains the rawPrediction, probability
# and prediction columns.
predictions = model.transform(dataset=test)

In [21]:
# Preview the scored test rows. The arguments are show()'s defaults,
# written out explicitly.
predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(262144,[11,64,86...|[-900.21723400820...|[1.0,9.1136267437...|       0.0|
|  0.0|(262144,[11,64,86...|[-598.88356854243...|[1.0,7.3664213713...|       0.0|
|  0.0|(262144,[11,64,12...|[-221.22851839312...|[0.99999999999934...|       0.0|
|  0.0|(262144,[11,170,2...|[-339.21047824820...|[1.0,5.2211867711...|       0.0|
|  0.0|(262144,[11,170,2...|[-185.73623387179...|[0.99999999999999...|       0.0|
|  0.0|(262144,[11,329,2...|[-880.43014395284...|[1.0,1.7794249793...|       0.0|
|  0.0|(262144,[11,329,4...|[-774.12725915557...|[1.0,4.1711072721...|       0.0|
|  0.0|(262144,[11,329,7...|[-857.65037560890...|[1.0,3.1787527783...|       0.0|
|  0.0|(262144,[11,329,9...|[-432.39164269258...|[1.0,3.5122217036...|       0.0|
|  0.0|(262144,[

In [22]:
# Evaluator that compares the "prediction" column against "label" and reports
# plain accuracy.
# NOTE(review): the previews above show mostly 0.0 labels; if the classes are
# imbalanced, accuracy alone may be flattering. Consider also checking f1.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

In [23]:
# Compute the accuracy of the scored test rows.
accuracy = evaluator.evaluate(dataset=predictions)

In [24]:
# Report the metric. sep="" joins the prefix and the number with no extra space.
print("Test set accuracy = ", accuracy, sep="")


Test set accuracy = 0.8714461034976477
