In [1]:
from pyspark.sql import SparkSession
# NOTE(review): allocated_memory is computed here but is not referenced anywhere
# else in the visible notebook -- presumably a leftover from sizing the memory
# settings below; confirm before removing.
allocated_memory = 18 * 0.75 

# Create (or reuse, via getOrCreate) a SparkSession named "ReadJSON" that runs
# on the local machine with master "local[*]". The chain sets executor/driver
# memory, a longer network timeout and heartbeat interval, the G1 garbage
# collector for both JVM option strings, and spark.memory.fraction = 0.8.
# NOTE(review): with a local master the "spark.executor.memory" setting
# presumably has no practical effect -- confirm against the Spark docs.
spark = SparkSession.builder.appName("ReadJSON")\
.config("spark.executor.memory", "6g") \
.master("local[*]")  \
.config("spark.driver.memory", "4g") \
.config("spark.network.timeout", "800s")\
.config("spark.executor.heartbeatInterval", "120s")\
.config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")\
.config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")\
.config("spark.memory.fraction", "0.8") \
.getOrCreate()


24/04/22 12:42:18 WARN Utils: Your hostname, albertastein-2.local resolves to a loopback address: 127.0.0.1; using 10.2.12.43 instead (on interface en0)
24/04/22 12:42:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/22 12:42:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
print('Default Parallelism :', spark.sparkContext.defaultParallelism)

Default Parallelism : 12


In [3]:
# Report the configured maximum bytes per file-scan partition, both as the raw
# byte count and converted to megabytes. The config value may carry a "b"
# suffix, which is stripped before the numeric conversion.
partition_size = spark.conf.get("spark.sql.files.maxPartitionBytes").replace("b","")
partition_size_mb = int(partition_size) / 1024 / 1024
print(f"Partition Size: {partition_size} in bytes and {partition_size_mb} in MB")

Partition Size: 134217728 in bytes and 128.0 in MB


In [4]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
#"reviewerID": "A8WEXFRWX1ZHH", 
# "asin": "0209688726", 
# "style": {"Color:": " AC"}, 
# "reviewerName": "Goldengate",
# Explicit schema for the review JSON records (the commented lines above show
# fragments of one sample record). Every field is declared nullable (the third
# StructField argument is True), and "style" is a nested struct that keeps only
# a single "Color:" string field.
schema = StructType([
    StructField("overall", FloatType(), True),
    StructField("verified", BooleanType(), True),
    StructField("reviewTime", StringType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("asin", StringType(), True),
    StructField("style", StructType([StructField("Color:", StringType(), True)]), True),
    StructField("reviewerName", StringType(), True),
    StructField("reviewText", StringType(), True),
    StructField("unixReviewTime", IntegerType(), True)
    
])


# Load the train and test JSON files using the explicit schema above instead of
# letting Spark infer one.
json_df = spark.read.schema(schema).json("combined_train_data_chunked_10mb_latest.json")
json_test_df = spark.read.schema(schema).json("combined_test_data_chunked_10mb_latest.json")
# Preview the first 5 rows of each DataFrame.
json_df.show(5)
json_test_df.show(5)

# 

                                                                                

+-------+--------+-----------+--------------+----------+------+--------------+--------------------+--------------+
|overall|verified| reviewTime|    reviewerID|      asin| style|  reviewerName|          reviewText|unixReviewTime|
+-------+--------+-----------+--------------+----------+------+--------------+--------------------+--------------+
|    5.0|    true| 04 5, 2016|A1274GG1EB2JLJ|0486427706|{NULL}|   barbara ann|The pictures are ...|    1459814400|
|    5.0|    true|02 13, 2016|A30X5EGBYAZQQK|0486427706|{NULL}|      Samantha|I absolutely love...|    1455321600|
|    5.0|    true|12 10, 2015|A3U6UNXLAUY6ZV|0486427706|{NULL}|   CP in Texas|          I love it!|    1449705600|
|    5.0|    true|10 26, 2015|A1SAJF5SNM6WJS|0486427706|{NULL}|   LOIS LABIER|MY HUSBAND LOVED ...|    1445817600|
|    4.0|    true|09 15, 2015| AHJWO3SI0S0OR|0486427706|{NULL}|Saundra Hatley|                cool|    1442275200|
+-------+--------+-----------+--------------+----------+------+--------------+--

In [5]:
# Print the number of rows in the raw train and test DataFrames before any
# preprocessing. The two messages are labelled separately so the counts can be
# told apart in the output.
print("Number of training entries in the dataframe before pre processing: ", json_df.count())
print("Number of testing entries in the dataframe before pre processing: ", json_test_df.count())

                                                                                

Number of entries in the dataframe before pre processing:  11321389




Number of entries in the dataframe before pre processing:  2832499


                                                                                

In [6]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType, ArrayType
# Alternative three-class labelling, kept for reference:
# targetUDF = F.udf(lambda x: 1 if x >= 4.0 else (0 if x == 3.0 else -1), IntegerType())

# Binary sentiment label: 1 when the rating is >= 4.0, otherwise 0.
# The schema declares "overall" as nullable, and comparing None with a float
# raises TypeError in Python 3, so a missing rating yields a null label
# instead of failing the whole Spark job.
targetUDF = F.udf(
    lambda x: None if x is None else (1 if x >= 4.0 else 0),
    IntegerType(),
)
import re
import nltk
from nltk.corpus import stopwords

In [7]:
# Project both datasets onto the same four columns: the rating, the
# (reviewerID, asin) pair and the review text.
review_columns = ["overall", "reviewerID", "asin", "reviewText"]
reduced_df = json_df.select(*review_columns)
reduced_test_df = json_test_df.select(*review_columns)



In [8]:
reduced_df.show(5)
reduced_test_df.show(5)

+-------+--------------+----------+--------------------+
|overall|    reviewerID|      asin|          reviewText|
+-------+--------------+----------+--------------------+
|    5.0|A1274GG1EB2JLJ|0486427706|The pictures are ...|
|    5.0|A30X5EGBYAZQQK|0486427706|I absolutely love...|
|    5.0|A3U6UNXLAUY6ZV|0486427706|          I love it!|
|    5.0|A1SAJF5SNM6WJS|0486427706|MY HUSBAND LOVED ...|
|    4.0| AHJWO3SI0S0OR|0486427706|                cool|
+-------+--------------+----------+--------------------+
only showing top 5 rows

+-------+--------------+----------+--------------------+
|overall|    reviewerID|      asin|          reviewText|
+-------+--------------+----------+--------------------+
|    5.0|A2LSCFZM2FBZK7|0486427706|The stained glass...|
|    5.0|A3IXP5VS847GE5|0486427706|My 11 y.o. loved ...|
|    5.0|A2HK5AVQW6AUQ5|0486427706|             love it|
|    5.0|A18MVTKTTE8OS8|0486448789|Sometimes you nee...|
|    5.0|A2C2TLRMMMLAJV|0486448789|These little book...|
+-----

In [9]:
# Keep a single row per (reviewerID, asin) pair in each dataset and report how
# many rows remain afterwards.
dedup_keys = ["reviewerID", "asin"]

unique_df = reduced_df.dropDuplicates(subset=dedup_keys)
train_rows_after_dedup = unique_df.count()
print("Number of training entries in the dataframe after removing duplicates: ", train_rows_after_dedup)

unique_test_df = reduced_test_df.dropDuplicates(subset=dedup_keys)
test_rows_after_dedup = unique_test_df.count()
print("Number of testing entries in the dataframe after removing duplicates: ", test_rows_after_dedup)

                                                                                

Number of training entries in the dataframe after removing duplicates:  10936206




Number of testing entries in the dataframe after removing duplicates:  2806599


                                                                                

In [10]:
def preProcess(text):
    """
    Split a raw review string into a list of lower-cased tokens.

    Punctuation that is glued to a word character (e.g. "good," or "(note")
    is first separated from it by a space, the text is lower-cased, and the
    result is tokenized with NLTK's word_tokenize.

    Parameters
    ----------
    text : str or None
        Raw review text.

    Returns
    -------
    list
        The tokens; an empty list when `text` is None.
    """
    # A missing review has no tokens; re.sub would raise TypeError on None.
    if text is None:
        return []

    # word_tokenize is not imported anywhere at notebook level, so import it
    # here to keep this function usable on a fresh kernel.
    from nltk.tokenize import word_tokenize

    # Insert a space between a word character and trailing punctuation ...
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # ... and between leading punctuation and the word character after it.
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens

In [11]:
# Derive the "sentiment" label column from the "overall" rating by applying
# targetUDF to each de-duplicated dataset.
overall_rating = F.col("overall")
df_sentiment = unique_df.withColumn("sentiment", targetUDF(overall_rating))
df_test_sentiment = unique_test_df.withColumn("sentiment", targetUDF(overall_rating))

In [12]:
df_sentiment.show(5)
df_test_sentiment.show(5)

                                                                                

+-------+--------------------+----------+--------------------+---------+
|overall|          reviewerID|      asin|          reviewText|sentiment|
+-------+--------------------+----------+--------------------+---------+
|    5.0|A0015332H21AK8WZ0ZCS|B005G030TC|These collars are...|        1|
|    4.0| A0020356UF96ZV361ST|B00ZI5OVFM|It is a love stor...|        1|
|    2.0| A0020356UF96ZV361ST|B015X7KEDM|This book is not ...|        0|
|    5.0|A0024936S1WI02OHH9DP|B016AG5DR2|Looks great fits ...|        1|
|    5.0|A0034986DWR7WEDQN0GV|B001VJZO2S|           excellent|        1|
+-------+--------------------+----------+--------------------+---------+
only showing top 5 rows





+-------+-------------------+----------+--------------------+---------+
|overall|         reviewerID|      asin|          reviewText|sentiment|
+-------+-------------------+----------+--------------------+---------+
|    3.0|A0020356UF96ZV361ST|B00FDXFFW2|I guess you can s...|        0|
|    4.0|A0020356UF96ZV361ST|B00H6VZ0SS|This girl has bee...|        1|
|    4.0|A0020356UF96ZV361ST|B00XQOGWV8|Mario is a sorry ...|        1|
|    4.0|A0020356UF96ZV361ST|B014HD23EQ|This father had h...|        1|
|    4.0|A0020356UF96ZV361ST|B018RSH2FW|This guy is and h...|        1|
+-------+-------------------+----------+--------------------+---------+
only showing top 5 rows



                                                                                

In [13]:
from pyspark.ml.feature import Tokenizer

# Use PySpark's built-in Tokenizer to split each review's text into tokens.
tokenizer = Tokenizer(inputCol="reviewText", outputCol="token")

# Drop rows without review text, then tokenize. The filter is written against
# the column name of the DataFrame being filtered, rather than borrowing Column
# objects from the parent DataFrames (unique_df / unique_test_df), so it does
# not depend on how these frames were derived.
has_review_text = F.col("reviewText").isNotNull()
df_train_tokenized = tokenizer.transform(df_sentiment.filter(has_review_text))
df_test_tokenized = tokenizer.transform(df_test_sentiment.filter(has_review_text))

In [14]:

df_train_tokenized.show(5)
df_test_tokenized.show(5)

                                                                                

+-------+--------------------+----------+--------------------+---------+--------------------+
|overall|          reviewerID|      asin|          reviewText|sentiment|               token|
+-------+--------------------+----------+--------------------+---------+--------------------+
|    5.0|A0015332H21AK8WZ0ZCS|B005G030TC|These collars are...|        1|[these, collars, ...|
|    4.0| A0020356UF96ZV361ST|B00ZI5OVFM|It is a love stor...|        1|[it, is, a, love,...|
|    2.0| A0020356UF96ZV361ST|B015X7KEDM|This book is not ...|        0|[this, book, is, ...|
|    5.0|A0024936S1WI02OHH9DP|B016AG5DR2|Looks great fits ...|        1|[looks, great, fi...|
|    5.0|A0034986DWR7WEDQN0GV|B001VJZO2S|           excellent|        1|         [excellent]|
+-------+--------------------+----------+--------------------+---------+--------------------+
only showing top 5 rows





+-------+-------------------+----------+--------------------+---------+--------------------+
|overall|         reviewerID|      asin|          reviewText|sentiment|               token|
+-------+-------------------+----------+--------------------+---------+--------------------+
|    3.0|A0020356UF96ZV361ST|B00FDXFFW2|I guess you can s...|        0|[i, guess, you, c...|
|    4.0|A0020356UF96ZV361ST|B00H6VZ0SS|This girl has bee...|        1|[this, girl, has,...|
|    4.0|A0020356UF96ZV361ST|B00XQOGWV8|Mario is a sorry ...|        1|[mario, is, a, so...|
|    4.0|A0020356UF96ZV361ST|B014HD23EQ|This father had h...|        1|[this, father, ha...|
|    4.0|A0020356UF96ZV361ST|B018RSH2FW|This guy is and h...|        1|[this, guy, is, a...|
+-------+-------------------+----------+--------------------+---------+--------------------+
only showing top 5 rows



                                                                                

In [15]:
import re

# Matches call outs (@name), hashtags (#tag) and web addresses (http(s)://...
# or www....). Compiled once here instead of on every call, because the
# function runs once per row when wrapped in a UDF.
_NOISE_TOKEN_RE = re.compile(
    r'(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)|'
    r'(https?://[^\s<>"]+|www\.[^\s<>"]+)'
)


def removeRegex(tokens: list) -> list:
    """
    Removes hashtags, call outs and web addresses from tokens.

    A token is dropped when the pattern matches anywhere inside it
    (regex search, not a full match), or when it is empty or None.

    Parameters
    ----------
    tokens : list
        Tokens of a single review; may be None.

    Returns
    -------
    list
        The remaining tokens in their original order; an empty list when
        `tokens` is None.
    """
    if tokens is None:
        return []

    # `t and ...` skips both empty strings and None entries before searching.
    return [t for t in tokens if t and not _NOISE_TOKEN_RE.search(t)]


In [16]:
removeWEBUDF = F.udf(removeRegex, ArrayType(StringType()))

In [17]:
def normalize(tokens : list) -> list:
    """
    Strip every character outside a-z / A-Z from each token, lower-case what
    is left, and drop tokens that end up empty.
    """
    kept = []
    for token in tokens:
        letters_only = re.sub("[^a-zA-Z]+", "", token).lower()
        # Tokens made up entirely of digits/punctuation collapse to "" and
        # are discarded.
        if letters_only:
            kept.append(letters_only)

    return kept


normalizeUDF = F.udf(normalize, ArrayType(StringType()))

In [18]:
# Step 1: drop hashtag / call-out / web-address tokens ("tokens_re"), then
# strip non-letter characters from what is left ("tokens_clean").
df4_train = (
    df_train_tokenized
    .withColumn("tokens_re", removeWEBUDF(F.col("token")))
    .withColumn("tokens_clean", normalizeUDF(F.col("tokens_re")))
)
df4_test = (
    df_test_tokenized
    .withColumn("tokens_re", removeWEBUDF(F.col("token")))
    .withColumn("tokens_clean", normalizeUDF(F.col("tokens_re")))
)

# Step 2: discard the intermediate token columns and expose the cleaned
# tokens under the name "tokens".
df5_train = df4_train.drop("token", "tokens_re").withColumnRenamed("tokens_clean", "tokens")
df5_test = df4_test.drop("token", "tokens_re").withColumnRenamed("tokens_clean", "tokens")

# Step 3: keep only reviews that still have at least one token, i.e. drop
# those that consisted solely of hashtags, call outs, numbers, web
# addresses etc.
df6_train = df5_train.filter(F.size(F.col("tokens")) > 0)
df6_test = df5_test.filter(F.size(F.col("tokens")) > 0)

In [19]:
# Model input: the raw review text plus the sentiment column exposed under
# the name "label".
df_train_for_model = df6_train.select("reviewText", F.col("sentiment").alias("label"))
df_test_for_model = df6_test.select("reviewText", F.col("sentiment").alias("label"))

In [20]:
from pyspark.sql.functions import rand

# Shuffle both datasets by sorting on a random key.
# A fixed seed is passed to rand(): these DataFrames are lazy, so an unseeded
# random key can be re-drawn whenever the plan is re-evaluated, which would
# make later row-number based slices of the shuffled data non-reproducible.
SHUFFLE_SEED = 42
shuffled_train_df = df_train_for_model.orderBy(rand(seed=SHUFFLE_SEED))
shuffled_test_df = df_test_for_model.orderBy(rand(seed=SHUFFLE_SEED))

# Show the first 10 rows of each shuffled DataFrame
shuffled_train_df.show(10)
shuffled_test_df.show(10)

[Stage 36:>                                                       (0 + 12) / 50]

In [None]:
# # Printing label distribution for training and testing data
# print("Training data label distribution")
# shuffled_train_df.groupBy("label").count().show()

# print("Testing data label distribution")
# shuffled_test_df.groupBy("label").count().show()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Evaluator for the binary classifier. The label column and metric are spelled
# out with their library default values so the reader does not have to look
# them up.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

# get the name of the metric used
evaluator.getMetricName()

In [None]:
# import pickle
# with open("model1.pkl", "wb") as f:
#     pickle.dump(model1, f)

In [None]:
# create tokens from reviews
tk = Tokenizer(inputCol="reviewText", outputCol="tokens")

# create term frequencies for each of the tokens
# (numFeatures is an integer parameter, so pass an int rather than the float 1e5)
tf1 = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100_000)

# create tf-idf for each of the tokens
# (minDocFreq is an integer parameter as well, so 2 rather than 2.0)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)

# create basic logistic regression model
lr = LogisticRegression(maxIter=20)

In [None]:
from pyspark.ml.feature import StopWordsRemover
# Stop-word removal stage: reads "tokens" and writes "filtered".
sw  = StopWordsRemover(inputCol="tokens", outputCol="filtered")
# Term-frequency stage over the filtered tokens; numFeatures is an integer
# parameter, so pass an int rather than the float 1e5.
tf2 = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=100_000)

In [None]:
# shuffled_train_df_partition = shuffled_train_df.repartition(5)
# shuffled_test_df_partition = shuffled_test_df.repartition(5)

In [None]:
# print(shuffled_train_df_partition.rdd.getNumPartitions())
# print(shuffled_test_df_partition.rdd.getNumPartitions())

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
from pyspark import keyword_only
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param


class PorterStemming(Transformer, HasInputCol, HasOutputCol):
    """
    Pipeline stage that stems a column of token lists with the NLTK
    Porter stemmer.

    Reads the array-of-strings column named by `inputCol`, stems every token,
    keeps only the stems that are strictly longer than `min_size` characters,
    and writes the result to the column named by `outputCol`.

    This comes from https://stackoverflow.com/questions/32331848/create-a-custom-transformer-in-pyspark-ml
    Adapted to work with the Porter Stemmer from NLTK.
    """
    
    @keyword_only
    def __init__(self, 
                 inputCol  : str = None, 
                 outputCol : str = None, 
                 min_size  : int = None):
        """
        Constructor takes in the input column name, output column name,
        plus the minimum length of a token (min_size). When min_size is not
        passed, the default of 0 is used.
        """
        # Initialise the full parent chain. super(Transformer, self) would
        # start the lookup *after* Transformer in the MRO and skip it.
        super().__init__()

        # Declare the min_size parameter and give it a default of 0.
        self.min_size = Param(self, "min_size", "")
        self._setDefault(min_size=0)

        # @keyword_only stores only the keyword arguments the caller actually
        # passed in self._input_kwargs; forward exactly those to setParams.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

        # initialize Stemmer object
        self.stemmer  = PorterStemmer()

        
    @keyword_only
    def setParams(self, 
                  inputCol  : str = None, 
                  outputCol : str = None, 
                  min_size  : int = None
      ) -> "PorterStemming":
        """
        Set the keyword arguments that were explicitly passed and return
        whatever self._set returns (the transformer itself), so calls can be
        chained.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    

    def _stem_func(self, words  : list) -> list:
        """
        Stem every token in `words` and return the list of stems whose
        length is strictly greater than the minimum size.

        Returns None when `words` is None, so a null input array stays null
        instead of raising inside the UDF.
        """
        if words is None:
            return None

        # self.min_size is the Param object, not its value; the value has to
        # be resolved through the getter.
        min_size       = self.getMinSize()

        # Apply self.stemmer.stem to each token and keep only the stems that
        # are longer than min_size.
        return [stem for stem in map(self.stemmer.stem, words)
                if len(stem) > min_size]
    
    def _transform(self, df: DataFrame) -> DataFrame:
        """
        Transform function is the method that is called in the
        ML Pipeline. It is overridden here to apply _stem_func to the input
        column.

        Takes in a DataFrame and returns a new DataFrame with the output
        column added.
        """
        # Get the names of the input and output columns to use
        out_col       = self.getOutputCol()
        in_col        = self.getInputCol()

        # create the stemming function UDF by wrapping the stemmer 
        # method function
        stem_func_udf = F.udf(self._stem_func, ArrayType(StringType()))
        
        # now apply that UDF to the column in the dataframe to return
        # a new column that has the same list of words after being stemmed
        return df.withColumn(out_col, stem_func_udf(df[in_col]))
  
  
    def setMinSize(self,value):
        """
        Set the minimum token size and return self.

        Goes through self._set (as setParams does) instead of writing to
        the _paramMap dictionary directly.
        """
        self._set(min_size=value)
        return self

    def getMinSize(self) -> int:
        """
        Return the minimum token size: the explicitly set value if there is
        one, otherwise the default, via the inherited getOrDefault method.
        """
        return self.getOrDefault(self.min_size)


In [None]:

# train_shuffle_small = spark.sparkContext.parallelize(shuffled_train_df.take(50000)).toDF()
# test_shuffle_small = spark.sparkContext.parallelize(shuffled_test_df.take(50000)).toDF()

#print(type(test_shuffle_small))

In [None]:
# import pyspark.sql.functions as F
# from pyspark.sql.window import Window
# from pyspark.sql.functions import monotonically_increasing_id, row_number
# # w = Window().orderBy(F.lit('A'))
# # train_shuffle_small = train_shuffle_small.withColumn('row_num', F.floor(F.row_number().over(w) / 10) )
# # test_shuffle_small = test_shuffle_small.withColumn('row_num', F.floor(F.row_number().over(w) / 10) )

# # train_shuffle_small.show(2)
# train_shuffle_small = train_shuffle_small.withColumn('original_order', monotonically_increasing_id())
# train_shuffle_small = train_shuffle_small.withColumn('row_num', row_number().over(Window.orderBy('original_order')))
# train_shuffle_small = train_shuffle_small.drop('original_order')

# train_shuffle_small.show(2)
# test_shuffle_small = test_shuffle_small.withColumn('original_order', monotonically_increasing_id())
# test_shuffle_small = test_shuffle_small.withColumn('row_num', row_number().over(Window.orderBy('original_order')))
# test_shuffle_small = test_shuffle_small.drop('original_order')
# test_shuffle_small.show(2)

In [None]:
# from pyspark.sql.functions import col
# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 1) & (col('row_num') <=10000))
# #test_shuffle_small = test_shuffle_small.filter(test_shuffle_small.row_num(1, 10000))
# train_shuffle_small.show(2)

In [None]:
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)


# # test_stem  = stem_pipeline.transform(shuffled_test_df_partition)\
# #                           .where(F.size(F.col("stemmed")) >= 1)
# test_stem  = stem_pipeline.transform(test_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# # cache them to avoid running stemming 
# # each iteration in the grid search
# train_stem.cache()
# test_stem.cache()




In [None]:
# train_small = train_stem.take(10)
# rdd = spark.sparkContext.parallelize(train_small)
# train_small_df = rdd.toDF().show(2)
# train_small_df.typedf()
# print(type(train_small_df))

In [None]:
# test_small = test_stem.take(10)
# rdd2 = spark.sparkContext.parallelize(test_small)
# test_small_df = rdd2.toDF().show(2)
# print(type(test_small_df))

In [None]:
# from pyspark.ml.feature import NGram
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# bigram2 = NGram(inputCol="stemmed", outputCol="bigrams", n=2)

# tf6     = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)

# idf     = IDF(inputCol="rawFeatures", outputCol="features")

# lr      = LogisticRegression(maxIter=10)

# stem_bigram_pipeline  = Pipeline(stages= [bigram2, tf6, idf, lr])

# paramGrid = ParamGridBuilder() \
#                         .addGrid(idf.minDocFreq, [2, 5]) \
#                         .addGrid(lr.regParam, [0.0, 0.1]) \
#                         .build()
# crossval = CrossValidator(estimator          = stem_bigram_pipeline,
#                           estimatorParamMaps = paramGrid,
#                           evaluator          = BinaryClassificationEvaluator(),
#                           numFolds           = 2,
#                           parallelism= 2
#                           )


# model    = crossval.fit(train_stem)
# predictions   = model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))

In [None]:

# print("AUC SCORE for cross validation: {}".format(score))




In [None]:
# model.save("cross_val_model_final")


In [None]:
# from pyspark.ml.tuning import CrossValidatorModel

# model_path = "cross_val_model_final"
# model = CrossValidatorModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 10001) & (col('row_num') <=20000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.bestModel.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))


In [None]:
# model.save("cross_val_model_final2")

In [None]:
# model.transform(test_stem).show(5)

# predictions   = model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE for cross validation: {}".format(score))

In [None]:
# from pyspark.mllib.evaluation import MulticlassMetrics
# bestModel = model.bestModel
# predictedAndLabels = predictions.select(["prediction","label"])\
#                                 .rdd.map(lambda r : (float(r[0]), float(r[1])))
# metrics = MulticlassMetrics(predictedAndLabels)

# print("Test Set Accuracy: {}".format(metrics.accuracy))

In [None]:
# model_path = "cross_val_model_final2"
# model = CrossValidatorModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 20001) & (col('row_num') <=50000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.bestModel.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))


In [None]:
# Window is only imported inside commented-out cells elsewhere in the notebook,
# so import it here; monotonically_increasing_id / row_number are reached
# through the already-imported `F` alias for the same reason.
from pyspark.sql.window import Window


def _with_row_num(df):
    """
    Return `df` with a 1-based, consecutive "row_num" column.

    A temporary "original_order" id column is added, row numbers are assigned
    in that order, and the temporary column is dropped again.
    """
    ordered = df.withColumn("original_order", F.monotonically_increasing_id())
    numbered = ordered.withColumn(
        "row_num", F.row_number().over(Window.orderBy("original_order"))
    )
    return numbered.drop("original_order")


shuffled_train_df = _with_row_num(shuffled_train_df)
shuffled_train_df.show(2)

shuffled_test_df = _with_row_num(shuffled_test_df)
shuffled_test_df.show(2)

In [None]:
# Number of shuffled training rows (row_num 1..N, inclusive) used for this run.
TRAIN_SUBSET_ROWS = 50000

stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")

# `col` is only imported inside a commented-out cell, so F.col is used here to
# keep this cell runnable on a fresh kernel. between() is inclusive on both ends.
train_shuffle_small = shuffled_train_df.filter(
    F.col("row_num").between(1, TRAIN_SUBSET_ROWS)
)

# Tokenize + stem pipeline, fitted on the training subset.
stem_pipeline = Pipeline(stages=[tk, stem2]).fit(train_shuffle_small)

# Apply it to the training subset and to the full shuffled test set, keeping
# only rows that still have at least one stemmed token.
train_stem = stem_pipeline.transform(train_shuffle_small)\
                          .where(F.size(F.col("stemmed")) >= 1)

test_stem  = stem_pipeline.transform(shuffled_test_df)\
                          .where(F.size(F.col("stemmed")) >= 1)

# cache them to avoid running stemming 
# each iteration in the grid search
train_stem.cache()
test_stem.cache()

In [None]:
from pyspark.ml.feature import NGram
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Seed for the cross-validation fold assignment, so reruns split the data the
# same way.
CV_SEED = 42

# Stages: bigrams of the stemmed tokens -> hashed term frequencies -> TF-IDF
# -> logistic regression.
bigram2 = NGram(inputCol="stemmed", outputCol="bigrams", n=2)

# numFeatures is an integer parameter, so pass an int rather than the float 2e5
tf6     = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=200_000)

idf     = IDF(inputCol="rawFeatures", outputCol="features")

lr      = LogisticRegression(maxIter=10)

stem_bigram_pipeline  = Pipeline(stages= [bigram2, tf6, idf, lr])

# 2 x 2 grid over the IDF document-frequency cut-off and the regularisation
# strength.
paramGrid = ParamGridBuilder() \
                        .addGrid(idf.minDocFreq, [2, 5]) \
                        .addGrid(lr.regParam, [0.0, 0.1]) \
                        .build()

# Model selection uses the same `evaluator` object that scores the test set
# below, so the selection metric and the reported metric cannot drift apart.
crossval = CrossValidator(estimator          = stem_bigram_pipeline,
                          estimatorParamMaps = paramGrid,
                          evaluator          = evaluator,
                          numFolds           = 2,
                          parallelism        = 2,
                          seed               = CV_SEED
                          )


model    = crossval.fit(train_stem)
predictions   = model.transform(test_stem)
score         = evaluator.evaluate(predictions)
print("AUC SCORE: {}".format(score))

In [None]:
# Persist the fitted cross-validation model. overwrite() lets this cell be
# re-run; a plain save() raises when the target path already exists.
model.write().overwrite().save("cross_val_model_first50K")