<a href="https://colab.research.google.com/github/laurahallaman1/final_project/blob/master/Code/2predictReviewer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
 # Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [82]:

from pyspark import SparkFiles
url ="https://raw.githubusercontent.com/laurahallaman1/final_project/master/Book3.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("Book3.csv"), sep=",", header=True)

# Show DataFrame
df.show(30)

+--------------------+--------------------+--------------------+--------------------+-----------+----------+------+--------------------+--------------------+---------------+--------+
|                  id|                name|               asins|   primaryCategories|doRecommend|numHelpful|rating|                text|               title|       username|reviewer|
+--------------------+--------------------+--------------------+--------------------+-----------+----------+------+--------------------+--------------------+---------------+--------+
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|     Health & Beauty|       null|      null|     4|Seems to work jus...|                Good|     Bycherubs1|  Person|
|AVqVGZNvQMlgsOJE6eUY|"All-New Kindle E...|          B00ZV9PXP2|         Electronics|      FALSE|         0|     3|This was advertis...|Paper white is th...|       Kindler1|  Person|
|AVqVGWLKnnc1JgDc3jF1|Fire Kids Edition...|          B018Y23MNM|         Electronics|

In [83]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['text']))
data_df.show()

+--------------------+--------------------+--------------------+--------------------+-----------+----------+------+--------------------+--------------------+---------------+--------+------+
|                  id|                name|               asins|   primaryCategories|doRecommend|numHelpful|rating|                text|               title|       username|reviewer|length|
+--------------------+--------------------+--------------------+--------------------+-----------+----------+------+--------------------+--------------------+---------------+--------+------+
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|     Health & Beauty|       null|      null|     4|Seems to work jus...|                Good|     Bycherubs1|  Person|    85|
|AVqVGZNvQMlgsOJE6eUY|"All-New Kindle E...|          B00ZV9PXP2|         Electronics|      FALSE|         0|     3|This was advertis...|Paper white is th...|       Kindler1|  Person|   316|
|AVqVGWLKnnc1JgDc3jF1|Fire Kids Edition...|       

In [0]:
 from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
reviewer_to_num = StringIndexer(inputCol='reviewer',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [0]:
 from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [0]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[reviewer_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [0]:
 # Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [88]:
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[16332,20...|
|  1.0|(262145,[5795,163...|
|  0.0|(262145,[16332,84...|
|  2.0|(262145,[6258,706...|
|  0.0|(262145,[35263,37...|
|  1.0|(262145,[3924,963...|
|  1.0|(262145,[12888,14...|
|  0.0|(262145,[4869,963...|
|  1.0|(262145,[9639,244...|
|  1.0|(262145,[2306,963...|
|  1.0|(262145,[7367,249...|
|  1.0|(262145,[18691,37...|
|  0.0|(262145,[68056,91...|
|  1.0|(262145,[9639,163...|
|  1.0|(262145,[83936,11...|
|  0.0|(262145,[9639,163...|
|  1.0|(262145,[14,254,4...|
|  0.0|(262145,[9639,473...|
|  1.0|(262145,[7062,963...|
|  0.0|(262145,[16426,25...|
+-----+--------------------+
only showing top 20 rows



In [0]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [90]:
test_results = predictor.transform(testing)
test_results.show(30)

+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+---------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                  id|                name|               asins|primaryCategories|doRecommend|numHelpful|rating|                text|               title|          username| reviewer|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+---------+------+-----+--------------------+--------------------+--------------------+--------------------+----------

In [91]:
 # Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.474998


In [0]:
new_test=test_results.drop('id','asins','doRecommend','numHelpful','username','length','label','token_text','hash_token','idf_token','features','rawPrediction','probability','prediction')

In [96]:
new_test.show(20)

+--------------------+-----------------+------+--------------------+--------------------+---------+--------------------+
|                name|primaryCategories|rating|                text|               title| reviewer|         stop_tokens|
+--------------------+-----------------+------+--------------------+--------------------+---------+--------------------+
|AmazonBasics 11.6...|      Electronics|     5|Recently, I spent...|Perfect. 342 peop...|  userBot|[recently,, spent...|
|AmazonBasics AA P...|  Health & Beauty|     1|Burnt up a copper...|They WILL burn up...|   Person|[burnt, copper, s...|
|AmazonBasics AA P...|  Health & Beauty|     1|I bought these se...|        Short life!!|  userBot|[bought, several,...|
|AmazonBasics AA P...|  Health & Beauty|     1|I find these batt...|... these batteri...|   Person|[find, batteries,...|
|AmazonBasics AA P...|  Health & Beauty|     1|I had high hopes ...|I must have recei...|  userBot|[high, hopes, sur...|
|AmazonBasics AA P...|  Health &

In [0]:
pandas_df = new_test.toPandas()

In [0]:
pandas_df.to_json("Pandas_test.JSON")