In [715]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.sql.functions import isnan, when, count, col

In [716]:
DATA_PATH = "/home/ds/notebooks/reviews_Cell_Phones_and_Accessories_5.json.gz"
APP_NAME = "Amazon Reviews Random Forest"
SPARK_URL = "local[*]"


In [717]:
spark = SparkSession.builder.appName(APP_NAME).master(SPARK_URL).getOrCreate()
df = spark.read.options(inferschema = "true").json(DATA_PATH)

In [718]:
df.head(5)

[Row(asin='120401325X', helpful=[0, 0], overall=4.0, reviewText="They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again", reviewTime='05 21, 2014', reviewerID='A30TL5EWN6DFXT', reviewerName='christina', summary='Looks Good', unixReviewTime=1400630400),
 Row(asin='120401325X', helpful=[0, 0], overall=5.0, reviewText='These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)', reviewTime='01 14, 2014', reviewerID='ASY55RVNIL0UD', reviewerName='emily l.', summary='Really great product.', unixReviewTime=1389657600),
 Row(asin='120401325X', helpful=[0, 0], overall=5.0, reviewText='These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!', reviewTime

In [719]:
print(f"Dataset shape is {df.count():d} rows by {len(df.columns):d} columns.")

Dataset shape is 194439 rows by 9 columns.


In [720]:
df = df.drop("helpful")

In [721]:
print(f"Dataset shape is {df.count():d} rows by {len(df.columns):d} columns.")

Dataset shape is 194439 rows by 8 columns.


In [722]:
null_counts = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) 
                         for c in df.columns]).toPandas().to_dict(orient='records')

print(f"There are {sum(null_counts[0].values()):d} null values in the dataset.")

There are 3519 null values in the dataset.


In [723]:
from pyspark.sql import DataFrameNaFunctions as na


In [724]:
df = df.na.drop()

In [725]:
print(f"Dataset shape is {df.count():d} rows by {len(df.columns):d} columns.")

Dataset shape is 190920 rows by 8 columns.


In [726]:
null_counts = df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) 
                         for c in df.columns]).toPandas().to_dict(orient='records')

print(f"There are {sum(null_counts[0].values()):d} null values in the dataset.")

There are 0 null values in the dataset.


In [727]:
df.columns

['asin',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime']

In [728]:
print(f"Dataset shape is {df.count():d} rows by {len(df.columns):d} columns.")

Dataset shape is 190920 rows by 8 columns.


In [729]:
df.head(5)

[Row(asin='120401325X', overall=4.0, reviewText="They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again", reviewTime='05 21, 2014', reviewerID='A30TL5EWN6DFXT', reviewerName='christina', summary='Looks Good', unixReviewTime=1400630400),
 Row(asin='120401325X', overall=5.0, reviewText='These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)', reviewTime='01 14, 2014', reviewerID='ASY55RVNIL0UD', reviewerName='emily l.', summary='Really great product.', unixReviewTime=1389657600),
 Row(asin='120401325X', overall=5.0, reviewText='These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!', reviewTime='06 26, 2014', reviewerID='A2TMXE2AFO7ONB', rev

In [730]:
amazing_data = []

def amazing_count(text):
    if "amazing" in text:
        return 1
    else:
        return 0

In [731]:
from pyspark.sql.functions import udf

In [732]:
amazing_udf = udf(amazing_count)

In [733]:
df = df.withColumn("amazing", amazing_udf(df["reviewText"]))



In [734]:
df.columns

['asin',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime',
 'amazing']

In [735]:
df.head(5)

[Row(asin='120401325X', overall=4.0, reviewText="They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again", reviewTime='05 21, 2014', reviewerID='A30TL5EWN6DFXT', reviewerName='christina', summary='Looks Good', unixReviewTime=1400630400, amazing='0'),
 Row(asin='120401325X', overall=5.0, reviewText='These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)', reviewTime='01 14, 2014', reviewerID='ASY55RVNIL0UD', reviewerName='emily l.', summary='Really great product.', unixReviewTime=1389657600, amazing='0'),
 Row(asin='120401325X', overall=5.0, reviewText='These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!', reviewTime='06 26, 2014', review

In [736]:
great_data = []

def great_count(text):
    if "great" in text:
        return 1
    else:
        return 0

In [737]:
great_udf = udf(great_count)

In [738]:
df = df.withColumn("great", great_udf(df["reviewText"]))



In [739]:
df.head(5)

[Row(asin='120401325X', overall=4.0, reviewText="They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again", reviewTime='05 21, 2014', reviewerID='A30TL5EWN6DFXT', reviewerName='christina', summary='Looks Good', unixReviewTime=1400630400, amazing='0', great='0'),
 Row(asin='120401325X', overall=5.0, reviewText='These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)', reviewTime='01 14, 2014', reviewerID='ASY55RVNIL0UD', reviewerName='emily l.', summary='Really great product.', unixReviewTime=1389657600, amazing='0', great='1'),
 Row(asin='120401325X', overall=5.0, reviewText='These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!', reviewTime

In [740]:
def awful_count(text):
    if "awful" in text:
        return 1
    else:
        return 0

In [741]:
awful_udf = udf(awful_count)

In [742]:
df = df.withColumn("awful", awful_udf(df["reviewText"]))



In [743]:

def perfect_count(text):
    if "perfect" in text:
        return 1
    else:
        return 0

In [744]:
perfect_udf = udf(perfect_count)

In [745]:
df = df.withColumn("perfect", perfect_udf(df["reviewText"]))



In [746]:

def worst_count(text):
    if "worst" in text:
        return 1
    else:
        return 0

In [747]:
worst_udf = udf(worst_count)

In [748]:
df = df.withColumn("worst", worst_udf(df["reviewText"]))



In [749]:

def best_count(text):
    if "best" in text:
        return 1
    else:
        return 0

In [750]:
best_udf = udf(best_count)

In [751]:
df = df.withColumn("best", best_udf(df["reviewText"]))



In [752]:

def worst_count(text):
    if "worst" in text:
        return 1
    else:
        return 0

In [753]:
worst_udf = udf(worst_count)

In [754]:
df = df.withColumn("worst", worst_udf(df["reviewText"]))



In [755]:

def awesome_count(text):
    if "awesome" in text:
        return 1
    else:
        return 0

In [756]:
awesome_udf = udf(awesome_count)

In [757]:
df = df.withColumn("awesome", awesome_udf(df["reviewText"]))



In [758]:
df.columns

['asin',
 'overall',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime',
 'amazing',
 'great',
 'awful',
 'perfect',
 'worst',
 'best',
 'awesome']

In [759]:

def waste_count(text):
    if "waste" in text:
        return 1
    else:
        return 0

In [760]:
waste_udf = udf(waste_count)

In [761]:
df = df.withColumn("waste", waste_udf(df["reviewText"]))



In [762]:

def cheap_count(text):
    if "cheap" in text:
        return 1
    else:
        return 0

In [763]:
cheap_udf = udf(cheap_count)

In [764]:
df = df.withColumn("cheap", cheap_udf(df["reviewText"]))



In [765]:

def only_count(text):
    if "only" in text:
        return 1
    else:
        return 0

In [766]:
only_udf = udf(only_count)

In [767]:
df = df.withColumn("only", only_udf(df["reviewText"]))



In [768]:

def poor_count(text):
    if "poor" in text:
        return 1
    else:
        return 0

In [769]:
poor_udf = udf(poor_count)

In [770]:
df = df.withColumn("poor", poor_udf(df["reviewText"]))



In [771]:

def refund_count(text):
    if "refund" in text:
        return 1
    else:
        return 0

In [772]:
refund_udf = udf(refund_count)

In [773]:
df = df.withColumn("refund", refund_udf(df["reviewText"]))



In [774]:

def disappointed_count(text):
    if "disappointed" in text:
        return 1
    else:
        return 0

In [775]:
disappointed_udf = udf(disappointed_count)

In [776]:
df = df.withColumn("disappointed", disappointed_udf(df["reviewText"]))



In [777]:

def died_count(text):
    if "died" in text:
        return 1
    else:
        return 0

In [778]:
died_udf = udf(died_count)

In [779]:
df = df.withColumn("died", died_udf(df["reviewText"]))



In [780]:

def defective_count(text):
    if "defective" in text:
        return 1
    else:
        return 0

In [781]:
defective_udf = udf(defective_count)

In [782]:
df = df.withColumn("defective", defective_udf(df["reviewText"]))



In [783]:

def stopped_count(text):
    if "stopped" in text:
        return 1
    else:
        return 0

In [784]:
stopped_udf = udf(stopped_count)

In [785]:
df = df.withColumn("stopped", stopped_udf(df["reviewText"]))




def fail_count(text):
    if "fail" in text:
        return 1
    else:
        return 0

fail_udf = udf(fail_count)

df = df.withColumn("fail", fail_udf(df["reviewText"]))




def return_count(text):
    if "return" in text:
        return 1
    else:
        return 0

return_udf = udf(return_count)

df = df.withColumn("return", return_udf(df["reviewText"]))




def tried_count(text):
    if "tried" in text:
        return 1
    else:
        return 0

tried_udf = udf(tried_count)

df = df.withColumn("tried", tried_udf(df["reviewText"]))




def replace_count(text):
    if "replace" in text:
        return 1
    else:
        return 0

replace_udf = udf(replace_count)

df = df.withColumn("replace", replace_udf(df["reviewText"]))




def static_count(text):
    if "static" in text:
        return 1
    else:
        return 0

static_udf = udf(static_count)

df = df.withColumn("static", static_udf(df["reviewText"]))




def problem_count(text):
    if "static" in text:
        return 1
    else:
        return 0

problem_udf = udf(problem_count)

df = df.withColumn("problem", problem_udf(df["reviewText"]))




def muffle_count(text):
    if "muffle" in text:
        return 1
    else:
        return 0

muffle_udf = udf(muffle_count)

df = df.withColumn("muffle", muffle_udf(df["reviewText"]))




def break_count(text):
    if "break" in text:
        return 1
    else:
        return 0

break_udf = udf(break_count)

df = df.withColumn("break", break_udf(df["reviewText"]))




def useless_count(text):
    if "useless" in text:
        return 1
    else:
        return 0

useless_udf = udf(useless_count)

df = df.withColumn("useless", useless_udf(df["reviewText"]))

"static", "replace", "tried", "return", "fail", "useless", "break", "problem", "muffle"


In [786]:
df.groupBy('amazing').count().show()

+-------+------+
|amazing| count|
+-------+------+
|      0|187329|
|      1|  3591|
+-------+------+



In [787]:
df.groupBy('worst').count().show()

+-----+------+
|worst| count|
+-----+------+
|    0|190073|
|    1|   847|
+-----+------+



In [788]:
df.groupBy("best").count().show()

+----+------+
|best| count|
+----+------+
|   0|179119|
|   1| 11801|
+----+------+



In [789]:
df = df.withColumn("sentiment", when(col("overall") >= 3, 1).otherwise(0))



In [790]:
df = df.drop("overall")

In [791]:
df.dtypes

[('asin', 'string'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('reviewerID', 'string'),
 ('reviewerName', 'string'),
 ('summary', 'string'),
 ('unixReviewTime', 'bigint'),
 ('amazing', 'string'),
 ('great', 'string'),
 ('awful', 'string'),
 ('perfect', 'string'),
 ('worst', 'string'),
 ('best', 'string'),
 ('awesome', 'string'),
 ('waste', 'string'),
 ('cheap', 'string'),
 ('only', 'string'),
 ('poor', 'string'),
 ('refund', 'string'),
 ('disappointed', 'string'),
 ('died', 'string'),
 ('defective', 'string'),
 ('stopped', 'string'),
 ('sentiment', 'int')]

In [792]:
from pyspark.sql.types import IntegerType
df = df.withColumn("amazing", df["amazing"].cast(IntegerType()))
df = df.withColumn("great", df["great"].cast(IntegerType()))
df = df.withColumn("awful", df["awful"].cast(IntegerType()))
df = df.withColumn("perfect", df["perfect"].cast(IntegerType()))
df = df.withColumn("worst", df["worst"].cast(IntegerType()))
df = df.withColumn("awesome", df["awesome"].cast(IntegerType()))
df = df.withColumn("best", df["best"].cast(IntegerType()))
df = df.withColumn("waste", df["waste"].cast(IntegerType()))
df = df.withColumn("cheap", df["cheap"].cast(IntegerType()))
df = df.withColumn("only", df["only"].cast(IntegerType()))
df = df.withColumn("poor", df["poor"].cast(IntegerType()))
df = df.withColumn("refund", df["refund"].cast(IntegerType()))
df = df.withColumn("disappointed", df["disappointed"].cast(IntegerType()))
df = df.withColumn("died", df["died"].cast(IntegerType()))
df = df.withColumn("defective", df["defective"].cast(IntegerType()))
df = df.withColumn("stopped", df["stopped"].cast(IntegerType()))
#df = df.withColumn("static", df["static"].cast(IntegerType()))
#df = df.withColumn("replace", df["replace"].cast(IntegerType()))
#df = df.withColumn("tried", df["tried"].cast(IntegerType()))
#df = df.withColumn("return", df["return"].cast(IntegerType()))
#df = df.withColumn("fail", df["fail"].cast(IntegerType()))
#df = df.withColumn("useless", df["useless"].cast(IntegerType()))
#df = df.withColumn("break", df["break"].cast(IntegerType()))
#df = df.withColumn("problem", df["problem"].cast(IntegerType()))
#df = df.withColumn("muffle", df["muffle"].cast(IntegerType()))

In [793]:
df.dtypes

[('asin', 'string'),
 ('reviewText', 'string'),
 ('reviewTime', 'string'),
 ('reviewerID', 'string'),
 ('reviewerName', 'string'),
 ('summary', 'string'),
 ('unixReviewTime', 'bigint'),
 ('amazing', 'int'),
 ('great', 'int'),
 ('awful', 'int'),
 ('perfect', 'int'),
 ('worst', 'int'),
 ('best', 'int'),
 ('awesome', 'int'),
 ('waste', 'int'),
 ('cheap', 'int'),
 ('only', 'int'),
 ('poor', 'int'),
 ('refund', 'int'),
 ('disappointed', 'int'),
 ('died', 'int'),
 ('defective', 'int'),
 ('stopped', 'int'),
 ('sentiment', 'int')]

In [794]:
df = VectorAssembler(inputCols=["amazing", "great", "awful", "perfect", "worst", "awesome", "best", "waste", "cheap", "only", "poor", "refund", "disappointed", "died", "defective", "stopped"], outputCol="features").transform(df)

In [795]:
df.columns

['asin',
 'reviewText',
 'reviewTime',
 'reviewerID',
 'reviewerName',
 'summary',
 'unixReviewTime',
 'amazing',
 'great',
 'awful',
 'perfect',
 'worst',
 'best',
 'awesome',
 'waste',
 'cheap',
 'only',
 'poor',
 'refund',
 'disappointed',
 'died',
 'defective',
 'stopped',
 'sentiment',
 'features']

In [796]:
df.select("reviewText", "sentiment").show(10)

+--------------------+---------+
|          reviewText|sentiment|
+--------------------+---------+
|They look good an...|        1|
|These stickers wo...|        1|
|These are awesome...|        1|
|Item arrived in g...|        1|
|awesome! stays on...|        1|
|These make using ...|        1|
|Came just as desc...|        1|
|it worked for the...|        0|
|Good case, solid ...|        1|
|This is a fantast...|        1|
+--------------------+---------+
only showing top 10 rows



In [797]:
df.groupBy("sentiment").count().show()

+---------+------+
|sentiment| count|
+---------+------+
|        1|166953|
|        0| 23967|
+---------+------+



In [798]:
23967/(166953+23967)

0.12553425518541797

In [819]:
TRAINING_DATA_RATIO = .8
RF_NUM_TREES = 15
RF_MAX_DEPTH = 10
RF_NUM_BINS = 32

In [820]:
labelIndexer = StringIndexer(inputCol="sentiment", outputCol="indexedLabel").fit(df)

featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df)

(trainingData, testData) = df.randomSplit([TRAINING_DATA_RATIO, 1 - TRAINING_DATA_RATIO])

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=RF_NUM_TREES)

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

In [821]:
model = pipeline.fit(trainingData)

In [822]:
predictions = model.transform(testData)

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)

print(f"Test Error = {(1.0 - accuracy):g}")
print(f"Accuracy = {accuracy:g}")

In [803]:
df.filter(df.sentiment == 0).collect()



[Row(asin='3998899561', reviewText='it worked for the first week then it only charge my phone to 20%. it is a waste of money.', reviewTime='11 21, 2013', reviewerID='A6FGO4TBZ3QFZ', reviewerName='Abdullah Albyati', summary='not a good Idea', unixReviewTime=1384992000, amazing=0, great=0, awful=0, perfect=0, worst=0, best=0, awesome=0, waste=1, cheap=0, only=1, poor=0, refund=0, disappointed=0, died=0, defective=0, stopped=0, sentiment=0, features=SparseVector(16, {7: 1.0, 9: 1.0})),
 Row(asin='6073894996', reviewText='It worked great for the first couple of weeks then it just stopped completely.. so basically a small waste of money.', reviewTime='05 29, 2013', reviewerID='A2INSXDTE08WSJ', reviewerName='Barbie', summary='Horrible', unixReviewTime=1369785600, amazing=0, great=1, awful=0, perfect=0, worst=0, best=0, awesome=0, waste=1, cheap=0, only=0, poor=0, refund=0, disappointed=0, died=0, defective=0, stopped=1, sentiment=0, features=SparseVector(16, {1: 1.0, 7: 1.0, 15: 1.0})),
 Row