<a href="https://colab.research.google.com/github/laurahallaman1/final_project/blob/nickfiles/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Create the SparkSession used by every later cell (or reuse one that is
# already running), registered under the application name "StopWords".
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("StopWords")
    .getOrCreate()
)

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [0]:
# Download the combined Amazon reviews CSV from the project's GitHub
# repository and load it into a Spark DataFrame.
from pyspark import SparkFiles

url ="https://raw.githubusercontent.com/laurahallaman1/final_project/master/Combined_Amazon_Consumer_Reviews.csv"

# addFile() fetches the URL; SparkFiles.get() resolves the downloaded copy
# by its file name.
spark.sparkContext.addFile(url)
df = spark.read.csv(
    SparkFiles.get("Combined_Amazon_Consumer_Reviews.csv"),
    header=True,
    sep=",",
)

# Preview the loaded rows
df.show()




+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+
|                  id|                name|               asins|primaryCategories|doRecommend|numHelpful|rating|                text|               title|          username|
+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     3|I order 3 of them...|... 3 of them and...|        Byger yang|
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     4|Bulk is always th...|... always the le...|              ByMG|
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     5|Well they are not.

In [0]:
# Tokenizer that reads the raw review in "text" and writes the resulting
# list of tokens to a new "textreview" column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="textreview",
)
tokenizer

Tokenizer_3d3e6b012311

In [0]:
# Apply the tokenizer: `reviewed` is `df` plus the "textreview" token column.
reviewed = tokenizer.transform(dataset=df)

# Preview the tokenized rows
reviewed.show()

+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+--------------------+
|                  id|                name|               asins|primaryCategories|doRecommend|numHelpful|rating|                text|               title|          username|          textreview|
+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+--------------------+
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     3|I order 3 of them...|... 3 of them and...|        Byger yang|[i, order, 3, of,...|
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     4|Bulk is always th...|... always the le...|              ByMG|[bulk, is, always...|
|AVpgNzjwLJeJML43Kpxn|Ama

In [0]:
# Stop-word filter that reads the token lists in "textreview" and writes
# the filtered lists to a new "new_review" column.
remover = StopWordsRemover(
    inputCol="textreview",
    outputCol="new_review",
)
remover

StopWordsRemover_38127a480dac

In [0]:
# Apply the stop-word filter: `frame` is `reviewed` plus the "new_review"
# column of tokens with stop words removed.
frame = remover.transform(dataset=reviewed)

# Preview the filtered rows
frame.show()

+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+--------------------+--------------------+
|                  id|                name|               asins|primaryCategories|doRecommend|numHelpful|rating|                text|               title|          username|          textreview|          new_review|
+--------------------+--------------------+--------------------+-----------------+-----------+----------+------+--------------------+--------------------+------------------+--------------------+--------------------+
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     3|I order 3 of them...|... 3 of them and...|        Byger yang|[i, order, 3, of,...|[order, 3, one, i...|
|AVpgNzjwLJeJML43Kpxn|AmazonBasics AAA ...|B00QWO9P0O,B00LH3...|  Health & Beauty|       null|      null|     4|Bulk is always th...|...

In [0]:
# Second stop-word filter, reading the already-filtered "new_review" column
# and writing to "filtered". The column name must match exactly: the previous
# value " new_review" (leading space) names a column that does not exist.
# NOTE: this rebinds `remover`. Re-run the cell that builds the
# "textreview" -> "new_review" remover before re-running
# `frame = remover.transform(reviewed)`.
remover = StopWordsRemover(inputCol="new_review", outputCol="filtered")

In [0]:
# Show each rating next to its stop-word-filtered tokens, untruncated.
frame.select(["rating", "new_review"]).show(truncate=False)


+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|rating|new_review                                                                                                                                                                                                                                                                                                                        |
+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|3  

In [0]:
# Collect the full Spark DataFrame to the driver as a pandas DataFrame and
# write it to StopWords.csv in the working directory.
frame.toPandas().to_csv(path_or_buf='StopWords.csv')