In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Bigrams") \
    .getOrCreate()

from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
#"reviewerID": "A8WEXFRWX1ZHH",
# "asin": "0209688726",
# "style": {"Color:": " AC"},
# "reviewerName": "Goldengate",
# Define the schema
schema = StructType([
    StructField("overall", FloatType(), True),
    StructField("verified", BooleanType(), True),
    StructField("reviewTime", StringType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("asin", StringType(), True),
    StructField("style", StructType([StructField("Color:", StringType(), True)]), True),
    StructField("reviewerName", StringType(), True),
    StructField("reviewText", StringType(), True),
    StructField("unixReviewTime", IntegerType(), True)

])

# Load JSON file into DataFrame
json_df = spark.read.schema(schema).json("../combined_train_data_chunked_10mb_latest.json")

json_test_df = spark.read.schema(schema).json("../combined_test_data_chunked_10mb_latest.json")

# Sample 10% of the data
train_df, test_df = json_df.randomSplit([0.8, 0.2], seed=42)
json_df_first_half = train_df.sample(withReplacement=False, fraction=0.02, seed=42)
json_df_second_half = test_df.sample(withReplacement=False, fraction=0.02, seed=42)

json_test_df = json_test_df.sample(withReplacement=False, fraction=0.02, seed=42)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/20 12:24:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType, ArrayType
# targetUDF = F.udf(lambda x: 1 if x >= 4.0 else (0 if x == 3.0 else -1), IntegerType())
targetUDF = F.udf(lambda x: 1 if x >= 4.0 else 0, IntegerType())
import re
import nltk
from nltk.corpus import stopwords

In [3]:
reduced_df_1 = json_df_first_half.select("overall", "reviewerID", "asin", "reviewText")
reduced_df_2 = json_df_second_half.select("overall", "reviewerID", "asin", "reviewText")
reduced_test_df = json_test_df.select("overall", "reviewerID", "asin", "reviewText")
unique_df_1 = reduced_df_1.dropDuplicates(["reviewerID", "asin"])
unique_df_2 = reduced_df_2.dropDuplicates(["reviewerID", "asin"])
# print("Number of training entries in the dataframe after removing duplicates: ", unique_df.count())

unique_test_df = reduced_test_df.dropDuplicates(["reviewerID", "asin"])
# print("Number of testing entries in the dataframe after removing duplicates: ", unique_test_df.count())

In [4]:
def preProcess(text):
    # Should return a list of tokens
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens

In [5]:
df_sentiment_1 = unique_df_1.withColumn("sentiment", targetUDF(unique_df_1["overall"]))
df_sentiment_2 = unique_df_2.withColumn("sentiment", targetUDF(unique_df_2["overall"]))
df_test_sentiment = unique_test_df.withColumn("sentiment", targetUDF(unique_test_df["overall"]))

In [6]:
from pyspark.ml.feature import Tokenizer

# use PySparks build in tokenizer to tokenize tweets
tokenizer = Tokenizer(inputCol  = "reviewText",
                      outputCol = "token")
# Remove the rows with missing values and tokenize
df_train_tokenized1 = tokenizer.transform(df_sentiment_1.filter(unique_df_1.reviewText.isNotNull()))
df_train_tokenized2 = tokenizer.transform(df_sentiment_2.filter(unique_df_2.reviewText.isNotNull()))
df_test_tokenized = tokenizer.transform(df_test_sentiment.filter(unique_test_df.reviewText.isNotNull()))

In [7]:
import re

def removeRegex(tokens: list) -> list:
    """
    Removes hashtags, call outs and web addresses from tokens.
    """
    # Use a raw string for regular expressions to avoid escape sequence warnings
    expr = r'(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)|'+\
           r'(https?://[^\s<>"]+|www\.[^\s<>"]+)'
    regex = re.compile(expr)
    cleaned = [t for t in tokens if not regex.search(t) and len(t) > 0]

    return cleaned

In [8]:
removeWEBUDF = F.udf(removeRegex, ArrayType(StringType()))

In [9]:
def normalize(tokens : list) -> list:
    """
    Removes non-english characters and returns lower case versions of words.
    """
    subbed   = [re.sub("[^a-zA-Z]+", "", s).lower() for s in tokens]

    filtered = filter(None, subbed)

    return list(filtered)


normalizeUDF = F.udf(normalize, ArrayType(StringType()))

In [10]:
# remove hashtags, call outs and web addresses
df4_train1 = df_train_tokenized1.withColumn("tokens_re", removeWEBUDF(df_train_tokenized1["token"]))
df4_train2 = df_train_tokenized2.withColumn("tokens_re", removeWEBUDF(df_train_tokenized2["token"]))
df4_test = df_test_tokenized.withColumn("tokens_re", removeWEBUDF(df_test_tokenized["token"]))
# remove non english characters
df4_train1 = df4_train1.withColumn("tokens_clean", normalizeUDF(df4_train1["tokens_re"]))
df4_train2 = df4_train2.withColumn("tokens_clean", normalizeUDF(df4_train2["tokens_re"]))
df4_test = df4_test.withColumn("tokens_clean", normalizeUDF(df4_test["tokens_re"]))

# rename columns
df5_train1 = df4_train1.drop("token","tokens_re")
df5_train2 = df4_train2.drop("token","tokens_re")
df5_test = df4_test.drop("token","tokens_re")
df5_train1 = df5_train1.withColumnRenamed("tokens_clean", "tokens")
df5_train2 = df5_train2.withColumnRenamed("tokens_clean", "tokens")
df5_test = df5_test.withColumnRenamed("tokens_clean", "tokens")

# remove reviews where the tokens array is empty, i.e. where it was just
# a hashtag, callout, numbers, web adress etc.
df6_train1 = df5_train1.where(F.size(F.col("tokens")) > 0)
df6_train2 = df5_train2.where(F.size(F.col("tokens")) > 0)
df6_test = df5_test.where(F.size(F.col("tokens")) > 0)

In [11]:
df_train_for_model1 = df6_train1.select("reviewText","sentiment")\
        .withColumnRenamed("sentiment", "label")
df_train_for_model2 = df6_train2.select("reviewText","sentiment")\
        .withColumnRenamed("sentiment", "label")
df_test_for_model = df6_test.select("reviewText","sentiment").withColumnRenamed("sentiment", "label")

In [12]:
from pyspark.sql.functions import rand

# Assuming 'df' is your DataFrame
shuffled_train_df1 = df_train_for_model1.orderBy(rand())
shuffled_train_df2 = df_train_for_model2.orderBy(rand())
shuffled_test_df = df_test_for_model.orderBy(rand())

# Show the shuffled DataFrame
# shuffled_train_df.show(10)
# shuffled_test_df.show(10)

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import NGram, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
bigram = NGram(inputCol="tokens", outputCol="bigrams", n=2)
tf5   = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)
# create tokens from reviews
tk = Tokenizer(inputCol= "reviewText", outputCol = "tokens")
# create tf-idf for each of the tokens
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)
# create basic logistic regression model
lr = LogisticRegression(maxIter=20)

bigram_pipeline  = Pipeline(stages= [tk, bigram, tf5, idf, lr])

In [14]:
import pickle

def make_pipeline():
    return bigram_pipeline
# train model with first dataset
make_pipeline().fit(shuffled_train_df1)


with open('bigram_model.pkl', 'wb') as infile:
    pickle.dump(make_pipeline, infile)

with open('bigram_model.pkl', 'rb') as outfile:
    loaded_pipeline_func = pickle.load(outfile)

model1 = loaded_pipeline_func().fit(shuffled_train_df2)

# test dataset
predictions = model1.transform(shuffled_test_df)
score = evaluator.evaluate(predictions)
print("AUC SCORE: {}".format(score))

24/04/20 12:29:12 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:23 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/04/20 12:29:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/04/20 12:29:23 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:35 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 12:29:36 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
24/04/20 

AUC SCORE: 0.7843378188731388
