In [1]:
import sparknlp
spark = sparknlp.start()

In [2]:
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from py4j.java_gateway import java_import
java_import(spark._sc._jvm, "org.apache.spark.sql.api.python.*")
from pyspark.sql.types import *
from pyspark.ml import Pipeline

In [4]:
schema = StructType([
    StructField("id", StringType(), True),
    StructField("dateAdded", StringType(), True),
    StructField("dateUpdated", StringType(), True),
    StructField("name", StringType(), True),
    StructField("asins", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("primaryCategories", StringType(), True),
    StructField("imageURLs", StringType(), True),
    StructField("keys", StringType(), True),
    StructField("manufacturer", StringType(), True),
    StructField("manufacturerNumber", StringType(), True),
    StructField("reviews.date", StringType(), True),
    StructField("reviews.dateAdded", StringType(), True),
    StructField("reviews.dateSeen", StringType(), True),
    StructField("reviews.doRecommend", StringType(), True),
    StructField("reviews.id", StringType(), True),
    StructField("reviews.numHelpful", StringType(), True),
    StructField("reviews.rating", IntegerType(), True),
    StructField("reviews.sourceURLs", StringType(), True),
    StructField("reviews.text", StringType(), True),
    StructField("reviews.title", StringType(), True),
    StructField("reviews.username", StringType(), True),
    StructField("sourceURLs", StringType(), True)])

In [5]:
raw_data = spark.read.format("csv")\
    .option("header", "true")\
    .option("delimiter", ",")\
    .schema(schema)\
    .csv("/Users/joanne/Documents/School/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv")

for name in raw_data.schema.names:
      raw_data = raw_data.withColumnRenamed(name, name.replace('.', '_'))

In [6]:
raw_data.cache()

DataFrame[id: string, dateAdded: string, dateUpdated: string, name: string, asins: string, brand: string, categories: string, primaryCategories: string, imageURLs: string, keys: string, manufacturer: string, manufacturerNumber: string, reviews_date: string, reviews_dateAdded: string, reviews_dateSeen: string, reviews_doRecommend: string, reviews_id: string, reviews_numHelpful: string, reviews_rating: int, reviews_sourceURLs: string, reviews_text: string, reviews_title: string, reviews_username: string, sourceURLs: string]

In [7]:
raw_data.printSchema()

root
 |-- id: string (nullable = true)
 |-- dateAdded: string (nullable = true)
 |-- dateUpdated: string (nullable = true)
 |-- name: string (nullable = true)
 |-- asins: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- primaryCategories: string (nullable = true)
 |-- imageURLs: string (nullable = true)
 |-- keys: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- manufacturerNumber: string (nullable = true)
 |-- reviews_date: string (nullable = true)
 |-- reviews_dateAdded: string (nullable = true)
 |-- reviews_dateSeen: string (nullable = true)
 |-- reviews_doRecommend: string (nullable = true)
 |-- reviews_id: string (nullable = true)
 |-- reviews_numHelpful: string (nullable = true)
 |-- reviews_rating: integer (nullable = true)
 |-- reviews_sourceURLs: string (nullable = true)
 |-- reviews_text: string (nullable = true)
 |-- reviews_title: string (nullable = true)
 |-- reviews_username: string (null

In [8]:
raw_data.show(5)

+--------------------+--------------------+--------------------+--------------------+----------+------+--------------------+-----------------+--------------------+--------------------+------------+------------------+--------------------+-----------------+--------------------+-------------------+----------+------------------+--------------+--------------------+--------------------+--------------------+----------------+--------------------+
|                  id|           dateAdded|         dateUpdated|                name|     asins| brand|          categories|primaryCategories|           imageURLs|                keys|manufacturer|manufacturerNumber|        reviews_date|reviews_dateAdded|    reviews_dateSeen|reviews_doRecommend|reviews_id|reviews_numHelpful|reviews_rating|  reviews_sourceURLs|        reviews_text|       reviews_title|reviews_username|          sourceURLs|
+--------------------+--------------------+--------------------+--------------------+----------+------+-----------

In [9]:
raw_data.select('name').groupby('name').count().orderBy('count', ascending=False).show(raw_data.count(), True)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|"Amazon Echo Show...|  845|
|"All-New Fire HD ...|  797|
|Amazon - Echo Plu...|  590|
|Fire Kids Edition...|  561|
|"Brand New Amazon...|  467|
|Fire Tablet 7 Dis...|  371|
|Amazon Tap - Alex...|  225|
|Fire Kids Edition...|  217|
|Kindle E-reader -...|  159|
|Fire HD 10 Tablet...|  106|
|"Fire Tablet with...|  101|
|"Amazon Kindle E-...|   96|
|"Amazon - Kindle ...|   82|
|All-New Fire HD 8...|   70|
|"All-New Fire HD ...|   58|
|"Fire HD 8 Tablet...|   53|
|All-New Fire HD 8...|   51|
|"All-New Fire HD ...|   40|
|"Kindle Oasis E-r...|   39|
|Kindle Oasis E-re...|   24|
|Amazon 9W PowerFa...|   22|
|Amazon - Kindle V...|   22|
|Amazon Fire TV wi...|    4|
+--------------------+-----+



In [10]:
raw_data.select('reviews_rating').groupBy('reviews_rating').count().orderBy('reviews_rating', ascending=True).show()

+--------------+-----+
|reviews_rating|count|
+--------------+-----+
|             1|   63|
|             2|   54|
|             3|  197|
|             4| 1208|
|             5| 3478|
+--------------+-----+



In [11]:
raw_data.select('reviews_rating').describe().show()

+-------+------------------+
|summary|    reviews_rating|
+-------+------------------+
|  count|              5000|
|   mean|            4.5968|
| stddev|0.7318038448747551|
|    min|                 1|
|    max|                 5|
+-------+------------------+



In [12]:
reviews = raw_data.select('name','reviews_text','reviews_rating')
reviews_df = spark.createDataFrame(reviews.toPandas().reset_index())
reviews_df.show()

+-----+--------------------+--------------------+--------------+
|index|                name|        reviews_text|reviews_rating|
+-----+--------------------+--------------------+--------------+
|    0|"Amazon Kindle E-...|I thought it woul...|             3|
|    1|"Amazon Kindle E-...|This kindle is li...|             5|
|    2|"Amazon Kindle E-...|Didnt know how mu...|             4|
|    3|"Amazon Kindle E-...|I am 100 happy wi...|             5|
|    4|"Amazon Kindle E-...|Solid entry level...|             5|
|    5|"Amazon Kindle E-...|This make an exce...|             5|
|    6|"Amazon Kindle E-...|I ordered this fo...|             5|
|    7|"Amazon Kindle E-...|I bought my Kindl...|             4|
|    8|"Amazon Kindle E-...|amazon kindle is ...|             5|
|    9|"Amazon Kindle E-...|It's beyond my ex...|             5|
|   10|"Amazon Kindle E-...|If you really wan...|             5|
|   11|"Amazon Kindle E-...|Love my kindle ma...|             5|
|   12|"Amazon Kindle E-.

In [47]:
df = raw_data.select('reviews_text')
df.cache()

DataFrame[reviews_text: string]

In [48]:
reviews_df.cache()

DataFrame[index: bigint, name: string, reviews_text: string, reviews_rating: bigint]

In [15]:
document = DocumentAssembler() \
            .setInputCol("reviews_text") \
            .setOutputCol("document")

In [40]:
document.transform(df)

DataFrame[reviews_text: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [17]:
sentence = SentenceDetector() \
            .setInputCols(["document"]) \
            .setOutputCol("sentence")

In [52]:
sentence.setExplodeSentences(True)

SentenceDetector_7b36498736e7

In [19]:
tokenizer = Tokenizer() \
            .setInputCols(["sentence"]) \
            .setOutputCol("token")

In [20]:
checker = NorvigSweetingModel.pretrained() \
            .setInputCols(["token"]) \
            .setOutputCol("checked")

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


In [21]:
ner = NerDLModel.pretrained() \
        .setInputCols(['sentence','checked']) \
        .setOutputCol('ner')

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [22]:
converter = NerConverter() \
            .setInputCols(['sentence', 'checked', 'ner']) \
            .setOutputCol('chunk')

In [49]:
pipeline = Pipeline() \
    .setStages([
        document, 
        sentence, 
        tokenizer, 
        checker, 
        ner, 
        converter])

In [53]:
model = pipeline.fit(df).transform(df)

Py4JJavaError: An error occurred while calling o220.getParam.
: java.util.NoSuchElementException: Param detectLists does not exist.
	at org.apache.spark.ml.param.Params$$anonfun$getParam$2.apply(params.scala:729)
	at org.apache.spark.ml.param.Params$$anonfun$getParam$2.apply(params.scala:729)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.ml.param.Params$class.getParam(params.scala:728)
	at org.apache.spark.ml.PipelineStage.getParam(Pipeline.scala:42)
	at sun.reflect.GeneratedMethodAccessor77.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
