In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

# create a SparkSession
spark = SparkSession.builder.appName("ReadJSON").config("spark.executor.memory", "500mb").config("spark.driver.memory", "1g").getOrCreate()
# .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
#"reviewerID": "A8WEXFRWX1ZHH", 
# "asin": "0209688726", 
# "style": {"Color:": " AC"}, 
# "reviewerName": "Goldengate",
# Define the schema
schema = StructType([
    StructField("overall", FloatType(), True),
    StructField("verified", BooleanType(), True),
    StructField("reviewTime", StringType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("asin", StringType(), True),
    StructField("style", StructType([StructField("Color:", StringType(), True)]), True),
    StructField("reviewerName", StringType(), True),
    StructField("reviewText", StringType(), True),
    StructField("unixReviewTime", IntegerType(), True)
    
])

In [3]:
df = spark.read.schema(schema).json(r"C:\Users\Emma\Downloads\school\Big_Data\project\amazon_review_data\AMAZON_FASHION_5.json")

In [4]:
df.show()

+-------+--------+-----------+--------------+----------+--------------------+------------------+--------------------+--------------+
|overall|verified| reviewTime|    reviewerID|      asin|               style|      reviewerName|          reviewText|unixReviewTime|
+-------+--------+-----------+--------------+----------+--------------------+------------------+--------------------+--------------+
|    5.0|    true| 09 4, 2015| ALJ66O1Y6SLHA|B000K2PJ4K|      { Blue/Orange}|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true| 09 4, 2015| ALJ66O1Y6SLHA|B000K2PJ4K|{ Black (37467610...|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true| 09 4, 2015| ALJ66O1Y6SLHA|B000K2PJ4K|   { Blue/Gray Logo}|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true| 09 4, 2015| ALJ66O1Y6SLHA|B000K2PJ4K|{ Blue (37867638-...|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true| 09 4, 2015| ALJ66O1Y6SLHA|B000K2PJ4K|        { Blu

In [5]:
from pyspark.sql.functions import col, struct
df_modified = df.withColumn("style", struct(col("style.Color:").alias("Color")))

In [6]:
from pyspark.sql.functions import to_date
df_with_date = df.withColumn("reviewTime", to_date(df.reviewTime, "MM d, yyyy"))

In [7]:
df_with_date.show()

+-------+--------+----------+--------------+----------+--------------------+------------------+--------------------+--------------+
|overall|verified|reviewTime|    reviewerID|      asin|               style|      reviewerName|          reviewText|unixReviewTime|
+-------+--------+----------+--------------+----------+--------------------+------------------+--------------------+--------------+
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|      { Blue/Orange}|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|{ Black (37467610...|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|   { Blue/Gray Logo}|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|{ Blue (37867638-...|          Tonya B.|Great product and...|    1441324800|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|        { Blue/Pink}|

In [8]:

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType, ArrayType
# targetUDF = F.udf(lambda x: 1 if x >= 4.0 else (0 if x == 3.0 else -1), IntegerType())
targetUDF = F.udf(lambda x: 1 if x >= 4.0 else 0, IntegerType())
import re
import nltk
from nltk.corpus import stopwords

In [9]:
def preProcess(text):
    # Should return a list of tokens
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)    
    text = text.lower()
    tokens = word_tokenize(text)    
    return tokens

In [10]:
df_sentiment = df_with_date.withColumn("sentiment", targetUDF(df_with_date["overall"]))

In [11]:
df_sentiment.show()

+-------+--------+----------+--------------+----------+--------------------+------------------+--------------------+--------------+---------+
|overall|verified|reviewTime|    reviewerID|      asin|               style|      reviewerName|          reviewText|unixReviewTime|sentiment|
+-------+--------+----------+--------------+----------+--------------------+------------------+--------------------+--------------+---------+
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|      { Blue/Orange}|          Tonya B.|Great product and...|    1441324800|        1|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|{ Black (37467610...|          Tonya B.|Great product and...|    1441324800|        1|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|   { Blue/Gray Logo}|          Tonya B.|Great product and...|    1441324800|        1|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|{ Blue (37867638-...|          Tonya B.|Great product and...|    1441324800|        1|
|    5

In [12]:

from pyspark.ml.feature import Tokenizer

# use PySparks build in tokenizer to tokenize tweets
tokenizer = Tokenizer(inputCol  = "reviewText",
                      outputCol = "token")
df4 = tokenizer.transform(df_sentiment.filter(df.reviewText.isNotNull()))

In [13]:
df4.show()

+-------+--------+----------+--------------+----------+--------------------+------------------+--------------------+--------------+---------+--------------------+
|overall|verified|reviewTime|    reviewerID|      asin|               style|      reviewerName|          reviewText|unixReviewTime|sentiment|               token|
+-------+--------+----------+--------------+----------+--------------------+------------------+--------------------+--------------+---------+--------------------+
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|      { Blue/Orange}|          Tonya B.|Great product and...|    1441324800|        1|[great, product, ...|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|{ Black (37467610...|          Tonya B.|Great product and...|    1441324800|        1|[great, product, ...|
|    5.0|    true|2015-09-04| ALJ66O1Y6SLHA|B000K2PJ4K|   { Blue/Gray Logo}|          Tonya B.|Great product and...|    1441324800|        1|[great, product, ...|
|    5.0|    true|2015

In [14]:
import re

def removeRegex(tokens: list) -> list:
    """
    Removes hashtags, call outs and web addresses from tokens.
    """
    expr    = '(@[A-Za-z0-a9_]+)|(#[A-Za-z0-9_]+)|'+\
              '(https?://[^\s<>"]+|www\.[^\s<>"]+)'
        
    regex   = re.compile(expr)

    cleaned = [t for t in tokens if not(regex.search(t)) if len(t) > 0]

    return list(filter(None, cleaned))

In [15]:
removeWEBUDF = F.udf(removeRegex, ArrayType(StringType()))

In [16]:
def normalize(tokens : list) -> list:
    """
    Removes non-english characters and returns lower case versions of words.
    """
    subbed   = [re.sub("[^a-zA-Z]+", "", s).lower() for s in tokens]
    
    filtered = filter(None, subbed)
    
    return list(filtered)


normalizeUDF = F.udf(normalize, ArrayType(StringType()))

In [17]:
# remove hashtags, call outs and web addresses
df4 = df4.withColumn("tokens_re", removeWEBUDF(df4["token"]))

# remove non english characters
df4 = df4.withColumn("tokens_clean", normalizeUDF(df4["tokens_re"]))

# rename columns
df5 = df4.drop("token","tokens_re")
df5 = df5.withColumnRenamed("tokens_clean", "tokens")\

# remove reviews where the tokens array is empty, i.e. where it was just
# a hashtag, callout, numbers, web adress etc.
df6 = df5.where(F.size(F.col("tokens")) > 0)

In [18]:

df6.limit(2).toPandas()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,unixReviewTime,sentiment,tokens
0,5.0,True,2015-09-04,ALJ66O1Y6SLHA,B000K2PJ4K,"( Blue/Orange,)",Tonya B.,Great product and price!,1441324800,1,"[great, product, and, price]"
1,5.0,True,2015-09-04,ALJ66O1Y6SLHA,B000K2PJ4K,"( Black (37467610) / Red/White,)",Tonya B.,Great product and price!,1441324800,1,"[great, product, and, price]"


In [19]:
# db_name = "poc"
# collection_name = "reviews"
# df_sentiment.write.format("mongo").option("uri", "mongodb://localhost:27017/poc.reviews").mode("overwrite").save()





# import pymongo



# conn = pymongo.MongoClient('mongodb://localhost:27017')
# db = conn.poc

In [20]:
# reviews = db.reviews
# query = {"asin": "0209688726"}
# # for res in results:
# #     print("Document = {}\n".format(res))

In [21]:
# count_sentiment = {"$group": 
#                      {"_id" : {"sentiment":"$sentiment"},  # note use a $ on the field
#                       "ct"  : {"$sum":1}
#                      }
#                   }
# results = reviews.aggregate([count_sentiment], allowDiskUse=True)

# for res in results:
#     print(res)

In [22]:
df7 = df6.select("reviewText","sentiment")\
        .withColumnRenamed("sentiment", "label")

In [23]:
df7.show(2)

+--------------------+-----+
|          reviewText|label|
+--------------------+-----+
|Great product and...|    1|
|Great product and...|    1|
+--------------------+-----+
only showing top 2 rows



In [24]:
train, test = df7.randomSplit([0.80, 0.20], 1234)

In [25]:
train.cache()

DataFrame[reviewText: string, label: int]

In [26]:
train.groupby("label")\
     .count()\
     .show()

+-----+-----+
|label|count|
+-----+-----+
|    1| 2118|
|    0|  449|
+-----+-----+



In [27]:
test.cache()

DataFrame[reviewText: string, label: int]

In [28]:
test.groupby("label")\
    .count()\
    .show()

+-----+-----+
|label|count|
+-----+-----+
|    1|  495|
|    0|   98|
+-----+-----+



In [29]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [30]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# get the name of the metric used
evaluator.getMetricName()

'areaUnderROC'

In [31]:
# create tokens from reviews
tk = Tokenizer(inputCol= "reviewText", outputCol = "tokens")

# create term frequencies for each of the tokens
tf1 = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=1e5)

# create tf-idf for each of the tokens
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)

# create basic logistic regression model
lr = LogisticRegression(maxIter=20)

# create entire pipeline
basic_pipeline = Pipeline(stages=[tk, tf1, idf, lr])

In [32]:
train.show()

+--------------------+-----+
|          reviewText|label|
+--------------------+-----+
|A little more cus...|    1|
|A nice lightweigh...|    1|
|A nice lightweigh...|    1|
|A nice lightweigh...|    1|
|A nice lightweigh...|    1|
|A nice lightweigh...|    1|
|A nice lightweigh...|    1|
|A nice lightweigh...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|A-MA-ZING!  I nee...|    1|
|Absolutely love t...|    1|
|Absolutely love t...|    1|
|Absolutely love t...|    1|
|Absolutely love t...|    1|
+--------------------+-----+
only showing top 20 rows



In [33]:
model1         = basic_pipeline.fit(train)

In [34]:
# predict on test set
predictions1   = model1.transform(test)

# get the performance on the test set
score1         = evaluator.evaluate(predictions1)

print("AUC SCORE: {}".format(score1))

AUC SCORE: 0.9999793856936714


In [35]:
predictedAndLabels = predictions1.select(["prediction","label"])\
                                 .rdd.map(lambda r : (float(r[0]), float(r[1])))

from pyspark.mllib.evaluation import MulticlassMetrics

metrics = MulticlassMetrics(predictedAndLabels)

print("Test Set Accuracy: {}".format(metrics.accuracy))



Test Set Accuracy: 0.9966273187183811


In [36]:
from pyspark.ml.feature import StopWordsRemover

In [37]:
sw  = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tf2 = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1e5)

In [38]:
sw_pipleline  = Pipeline(stages=[tk, sw, tf2, idf, lr])

model2        = sw_pipleline.fit(train)
predictions2  = model2.transform(test)
score2        = evaluator.evaluate(predictions2)

print("AUC SCORE: {}".format(score2))

AUC SCORE: 0.9999587713873429


In [39]:

from nltk.stem.porter import PorterStemmer

In [40]:
# Testing
stemmer = PorterStemmer()

tokens  = "my feelings having studied all day".split(" ")
print("raw tokens: {}".format(tokens))

raw tokens: ['my', 'feelings', 'having', 'studied', 'all', 'day']


In [41]:
tokens_stemmed = [stemmer.stem(token) for token in tokens]
print("clean tokens: {}".format(tokens_stemmed))

clean tokens: ['my', 'feel', 'have', 'studi', 'all', 'day']


In [42]:
from pyspark import keyword_only
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param


class PorterStemming(Transformer, HasInputCol, HasOutputCol):
    """
    PosterStemming class using the NLTK Porter Stemmer
    
    This comes from https://stackoverflow.com/questions/32331848/create-a-custom-transformer-in-pyspark-ml
    Adapted to work with the Porter Stemmer from NLTK.
    """
    
    @keyword_only
    def __init__(self, 
                 inputCol  : str = None, 
                 outputCol : str = None, 
                 min_size  : int = None):
        """
        Constructor takes in the input column name, output column name,
        plus the minimum legnth of a token (min_size)
        """
        # call Transformer classes constructor since were extending it.
        super(Transformer, self).__init__()

        # set Parameter objects minimum token size
        self.min_size = Param(self, "min_size", "")
        self._setDefault(min_size=0)

        # set the input keywork arguments
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

        # initialize Stemmer object
        self.stemmer  = PorterStemmer()

        
    @keyword_only
    def setParams(self, 
                  inputCol  : str = None, 
                  outputCol : str = None, 
                  min_size  : int = None
      ) -> None:
        """
        Function to set the keyword arguemnts
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    

    def _stem_func(self, words  : list) -> list:
        """
        Stemmer function call that performs stemming on a
        list of tokens in words and returns a list of tokens
        that have meet the minimum length requiremnt.
        """
        # We need a way to get min_size and cannot access it 
        # with self.min_size
        min_size       = self.getMinSize()

        # stem that actual tokens by applying 
        # self.stemmer.stem function to each token in 
        # the words list
        stemmed_words  = map(self.stemmer.stem, words)

        # now create the new list of tokens from
        # stemmed_words by filtering out those
        # that are not of legnth > min_size
        filtered_words = filter(lambda x: len(x) > min_size, stemmed_words)

        return list(filtered_words)
    
    def _transform(self, df: DataFrame) -> DataFrame:
        """
        Transform function is the method that is called in the 
        MLPipleline.  We have to override this function for our own use
        and have it call the _stem_func.

        Notice how it takes in a type DataFrame and returns type Dataframe
        """
        # Get the names of the input and output columns to use
        out_col       = self.getOutputCol()
        in_col        = self.getInputCol()

        # create the stemming function UDF by wrapping the stemmer 
        # method function
        stem_func_udf = F.udf(self._stem_func, ArrayType(StringType()))
        
        # now apply that UDF to the column in the dataframe to return
        # a new column that has the same list of words after being stemmed
        df2           = df.withColumn(out_col, stem_func_udf(df[in_col]))

        return df2
  
  
    def setMinSize(self,value):
        """
        This method sets the minimum size value
        for the _paramMap dictionary.
        """
        self._paramMap[self.min_size] = value
        return self

    def getMinSize(self) -> int:
        """
        This method uses the parent classes (Transformer)
        .getOrDefault method to get the minimum
        size of a token.
        """
        return self.getOrDefault(self.min_size)


In [43]:
stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")


In [44]:
stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train)

In [45]:
train_stem = stem_pipeline.transform(train)\
                          .where(F.size(F.col("stemmed")) >= 1)


test_stem  = stem_pipeline.transform(test)\
                          .where(F.size(F.col("stemmed")) >= 1)

# cache them to avoid running stemming 
# each iteration in the grid search
train_stem.cache()
test_stem.cache()

DataFrame[reviewText: string, label: int, tokens: array<string>, stemmed: array<string>]

In [46]:
test_stem.show(5)

+--------------------+-----+--------------------+--------------------+
|          reviewText|label|              tokens|             stemmed|
+--------------------+-----+--------------------+--------------------+
|A little more cus...|    1|[a, little, more,...|[a, littl, more, ...|
|A nice lightweigh...|    1|[a, nice, lightwe...|[a, nice, lightwe...|
|Absolutely love t...|    1|[absolutely, love...|[absolut, love, t...|
|Absolutely love t...|    1|[absolutely, love...|[absolut, love, t...|
|Absolutely love t...|    1|[absolutely, love...|[absolut, love, t...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [47]:
from pyspark.ml.feature import NGram

bigram = NGram(inputCol="tokens", outputCol="bigrams", n=2)

In [48]:
tf5   = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)

bigram_pipeline  = Pipeline(stages= [tk, bigram, tf5, idf, lr])

model5           = bigram_pipeline.fit(train)
predictions5     = model5.transform(test)

score5           = evaluator.evaluate(predictions5)

In [49]:
print("AUC SCORE: {}".format(score5))

AUC SCORE: 0.9989692846835705


In [50]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

def stem_text(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

# Create a UDF
stem_text_udf = udf(stem_text, StringType())

In [51]:
bigram2 = NGram(inputCol="stemmed", outputCol="bigrams", n=2)

tf6     = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)

idf     = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)

lr      = LogisticRegression(maxIter=20)

stem_bigram_pipeline  = Pipeline(stages= [bigram2, tf6, idf, lr])

model6                = stem_bigram_pipeline.fit(train_stem)
predictions6          = model6.transform(test_stem)

score6                = evaluator.evaluate(predictions6)
print("AUC SCORE: {}".format(score6))


AUC SCORE: 0.9989692846835704


In [52]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
bigram2 = NGram(inputCol="stemmed", outputCol="bigrams", n=2)

tf6     = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)

idf     = IDF(inputCol="rawFeatures", outputCol="features")

lr      = LogisticRegression(maxIter=20)

stem_bigram_pipeline  = Pipeline(stages= [bigram2, tf6, idf, lr])

paramGrid = ParamGridBuilder() \
                        .addGrid(idf.minDocFreq, [2, 5]) \
                        .addGrid(lr.regParam, [0.0, 0.1]) \
                        .build()
crossval = CrossValidator(estimator          = stem_bigram_pipeline,
                          estimatorParamMaps = paramGrid,
                          evaluator          = BinaryClassificationEvaluator(),
                          numFolds           = 3)

model    = crossval.fit(train_stem)
predictions   = model.transform(test_stem)
score         = evaluator.evaluate(predictions)
print("AUC SCORE: {}".format(score))

KeyboardInterrupt: 

In [None]:
# Printing bigram model without tuning
print("AUC SCORE: {}".format(score6))

In [None]:
bestModel = model.bestModel
predictedAndLabels = predictions.select(["prediction","label"])\
                                .rdd.map(lambda r : (float(r[0]), float(r[1])))
metrics = MulticlassMetrics(predictedAndLabels)

print("Test Set Accuracy: {}".format(metrics.accuracy))

In [None]:
bestModel.stages

In [None]:
bestModel.stages[2].explainParam('minDocFreq')

In [None]:
bestModel.stages[-1].explainParam('regParam')

In [None]:
summary = bestModel.stages[-1].summary

import matplotlib.pyplot as plt

plt.figure(figsize=(6,6))
plt.plot([0, 1], [0, 1], 'r--')
plt.plot(summary.roc.select('FPR').collect(),
         summary.roc.select('TPR').collect())
plt.xlabel('False Positive Rare')
plt.ylabel('True Positive Rate')
plt.title("ROC Curve")
plt.show()