In [2]:
from pyspark.sql import SparkSession
allocated_memory = 18 * 0.75 

# create a SparkSession
spark = SparkSession.builder.appName("ReadJSON")\
.config("spark.executor.memory", "6g") \
.master("local[*]")  \
.config("spark.driver.memory", "4g") \
.config("spark.network.timeout", "800s")\
.config("spark.executor.heartbeatInterval", "200s")\
.config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")\
.config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")\
.config("spark.memory.fraction", "0.8") \
.getOrCreate()


24/04/23 12:45:44 WARN Utils: Your hostname, albertastein-2.local resolves to a loopback address: 127.0.0.1; using 10.2.12.43 instead (on interface en0)
24/04/23 12:45:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/23 12:45:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
print('Default Parallelism :', spark.sparkContext.defaultParallelism)

Default Parallelism : 12


In [4]:
# Check the default partition size
partition_size = spark.conf.get("spark.sql.files.maxPartitionBytes").replace("b","")
print(f"Partition Size: {partition_size} in bytes and {int(partition_size) / 1024 / 1024} in MB")

Partition Size: 134217728 in bytes and 128.0 in MB


In [5]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
#"reviewerID": "A8WEXFRWX1ZHH", 
# "asin": "0209688726", 
# "style": {"Color:": " AC"}, 
# "reviewerName": "Goldengate",
# Define the schema
schema = StructType([
    StructField("overall", FloatType(), True),
    StructField("verified", BooleanType(), True),
    StructField("reviewTime", StringType(), True),
    StructField("reviewerID", StringType(), True),
    StructField("asin", StringType(), True),
    StructField("style", StructType([StructField("Color:", StringType(), True)]), True),
    StructField("reviewerName", StringType(), True),
    StructField("reviewText", StringType(), True),
    StructField("unixReviewTime", IntegerType(), True)
    
])


json_df = spark.read.schema(schema).json("combined_train_data_chunked_10mb_latest.json")
json_test_df = spark.read.schema(schema).json("combined_test_data_chunked_10mb_latest.json")
json_df.show(5)
json_test_df.show(5)

# 

                                                                                

+-------+--------+-----------+--------------+----------+------+--------------+--------------------+--------------+
|overall|verified| reviewTime|    reviewerID|      asin| style|  reviewerName|          reviewText|unixReviewTime|
+-------+--------+-----------+--------------+----------+------+--------------+--------------------+--------------+
|    5.0|    true| 04 5, 2016|A1274GG1EB2JLJ|0486427706|{NULL}|   barbara ann|The pictures are ...|    1459814400|
|    5.0|    true|02 13, 2016|A30X5EGBYAZQQK|0486427706|{NULL}|      Samantha|I absolutely love...|    1455321600|
|    5.0|    true|12 10, 2015|A3U6UNXLAUY6ZV|0486427706|{NULL}|   CP in Texas|          I love it!|    1449705600|
|    5.0|    true|10 26, 2015|A1SAJF5SNM6WJS|0486427706|{NULL}|   LOIS LABIER|MY HUSBAND LOVED ...|    1445817600|
|    4.0|    true|09 15, 2015| AHJWO3SI0S0OR|0486427706|{NULL}|Saundra Hatley|                cool|    1442275200|
+-------+--------+-----------+--------------+----------+------+--------------+--

In [6]:
# Print the number of entries in the dataframe
print("Number of entries in the dataframe before pre processing: ", json_df.count())
print("Number of entries in the dataframe before pre processing: ", json_test_df.count())

                                                                                

Number of entries in the dataframe before pre processing:  11321389


[Stage 5:>                                                        (0 + 13) / 13]

Number of entries in the dataframe before pre processing:  2832499


                                                                                

In [7]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType, ArrayType
# targetUDF = F.udf(lambda x: 1 if x >= 4.0 else (0 if x == 3.0 else -1), IntegerType())
targetUDF = F.udf(lambda x: 1 if x >= 4.0 else 0, IntegerType())
import re
import nltk
from nltk.corpus import stopwords

In [8]:
reduced_df = json_df.select("overall", "reviewerID", "asin", "reviewText")
reduced_test_df = json_test_df.select("overall", "reviewerID", "asin", "reviewText")



In [9]:
reduced_df.show(5)
reduced_test_df.show(5)

+-------+--------------+----------+--------------------+
|overall|    reviewerID|      asin|          reviewText|
+-------+--------------+----------+--------------------+
|    5.0|A1274GG1EB2JLJ|0486427706|The pictures are ...|
|    5.0|A30X5EGBYAZQQK|0486427706|I absolutely love...|
|    5.0|A3U6UNXLAUY6ZV|0486427706|          I love it!|
|    5.0|A1SAJF5SNM6WJS|0486427706|MY HUSBAND LOVED ...|
|    4.0| AHJWO3SI0S0OR|0486427706|                cool|
+-------+--------------+----------+--------------------+
only showing top 5 rows

+-------+--------------+----------+--------------------+
|overall|    reviewerID|      asin|          reviewText|
+-------+--------------+----------+--------------------+
|    5.0|A2LSCFZM2FBZK7|0486427706|The stained glass...|
|    5.0|A3IXP5VS847GE5|0486427706|My 11 y.o. loved ...|
|    5.0|A2HK5AVQW6AUQ5|0486427706|             love it|
|    5.0|A18MVTKTTE8OS8|0486448789|Sometimes you nee...|
|    5.0|A2C2TLRMMMLAJV|0486448789|These little book...|
+-----

In [10]:
unique_df = reduced_df.dropDuplicates(["reviewerID", "asin"])
print("Number of training entries in the dataframe after removing duplicates: ", unique_df.count())

unique_test_df = reduced_test_df.dropDuplicates(["reviewerID", "asin"])
print("Number of testing entries in the dataframe after removing duplicates: ", unique_test_df.count())

                                                                                

Number of training entries in the dataframe after removing duplicates:  10936206




Number of testing entries in the dataframe after removing duplicates:  2806599


                                                                                

In [12]:
def preProcess(text):
    # Should return a list of tokens
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)    
    text = text.lower()
    tokens = word_tokenize(text)    
    return tokens

In [11]:
df_sentiment = unique_df.withColumn("sentiment", targetUDF(unique_df["overall"]))
df_test_sentiment = unique_test_df.withColumn("sentiment", targetUDF(unique_test_df["overall"]))

In [13]:
df_sentiment.show(5)
df_test_sentiment.show(5)

                                                                                

+-------+--------------------+----------+--------------------+---------+
|overall|          reviewerID|      asin|          reviewText|sentiment|
+-------+--------------------+----------+--------------------+---------+
|    5.0|A0015332H21AK8WZ0ZCS|B005G030TC|These collars are...|        1|
|    4.0| A0020356UF96ZV361ST|B00ZI5OVFM|It is a love stor...|        1|
|    2.0| A0020356UF96ZV361ST|B015X7KEDM|This book is not ...|        0|
|    5.0|A0024936S1WI02OHH9DP|B016AG5DR2|Looks great fits ...|        1|
|    5.0|A0034986DWR7WEDQN0GV|B001VJZO2S|           excellent|        1|
+-------+--------------------+----------+--------------------+---------+
only showing top 5 rows





+-------+-------------------+----------+--------------------+---------+
|overall|         reviewerID|      asin|          reviewText|sentiment|
+-------+-------------------+----------+--------------------+---------+
|    3.0|A0020356UF96ZV361ST|B00FDXFFW2|I guess you can s...|        0|
|    4.0|A0020356UF96ZV361ST|B00H6VZ0SS|This girl has bee...|        1|
|    4.0|A0020356UF96ZV361ST|B00XQOGWV8|Mario is a sorry ...|        1|
|    4.0|A0020356UF96ZV361ST|B014HD23EQ|This father had h...|        1|
|    4.0|A0020356UF96ZV361ST|B018RSH2FW|This guy is and h...|        1|
+-------+-------------------+----------+--------------------+---------+
only showing top 5 rows



                                                                                

In [14]:
from pyspark.ml.feature import Tokenizer

# use PySparks build in tokenizer to tokenize tweets
tokenizer = Tokenizer(inputCol  = "reviewText",
                      outputCol = "token")
# Remove the rows with missing values and tokenize
df_train_tokenized = tokenizer.transform(df_sentiment.filter(unique_df.reviewText.isNotNull()))
df_test_tokenized = tokenizer.transform(df_test_sentiment.filter(unique_test_df.reviewText.isNotNull()))

In [15]:

df_train_tokenized.show(5)
df_test_tokenized.show(5)

                                                                                

+-------+--------------------+----------+--------------------+---------+--------------------+
|overall|          reviewerID|      asin|          reviewText|sentiment|               token|
+-------+--------------------+----------+--------------------+---------+--------------------+
|    5.0|A0015332H21AK8WZ0ZCS|B005G030TC|These collars are...|        1|[these, collars, ...|
|    4.0| A0020356UF96ZV361ST|B00ZI5OVFM|It is a love stor...|        1|[it, is, a, love,...|
|    2.0| A0020356UF96ZV361ST|B015X7KEDM|This book is not ...|        0|[this, book, is, ...|
|    5.0|A0024936S1WI02OHH9DP|B016AG5DR2|Looks great fits ...|        1|[looks, great, fi...|
|    5.0|A0034986DWR7WEDQN0GV|B001VJZO2S|           excellent|        1|         [excellent]|
+-------+--------------------+----------+--------------------+---------+--------------------+
only showing top 5 rows





+-------+-------------------+----------+--------------------+---------+--------------------+
|overall|         reviewerID|      asin|          reviewText|sentiment|               token|
+-------+-------------------+----------+--------------------+---------+--------------------+
|    3.0|A0020356UF96ZV361ST|B00FDXFFW2|I guess you can s...|        0|[i, guess, you, c...|
|    4.0|A0020356UF96ZV361ST|B00H6VZ0SS|This girl has bee...|        1|[this, girl, has,...|
|    4.0|A0020356UF96ZV361ST|B00XQOGWV8|Mario is a sorry ...|        1|[mario, is, a, so...|
|    4.0|A0020356UF96ZV361ST|B014HD23EQ|This father had h...|        1|[this, father, ha...|
|    4.0|A0020356UF96ZV361ST|B018RSH2FW|This guy is and h...|        1|[this, guy, is, a...|
+-------+-------------------+----------+--------------------+---------+--------------------+
only showing top 5 rows



                                                                                

In [16]:
import re

def removeRegex(tokens: list) -> list:
    """
    Removes hashtags, call outs and web addresses from tokens.
    """
    # Use a raw string for regular expressions to avoid escape sequence warnings
    expr = r'(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)|'+\
           r'(https?://[^\s<>"]+|www\.[^\s<>"]+)'
    regex = re.compile(expr)
    cleaned = [t for t in tokens if not regex.search(t) and len(t) > 0]

    return cleaned


In [17]:
removeWEBUDF = F.udf(removeRegex, ArrayType(StringType()))

In [18]:
def normalize(tokens : list) -> list:
    """
    Removes non-english characters and returns lower case versions of words.
    """
    subbed   = [re.sub("[^a-zA-Z]+", "", s).lower() for s in tokens]
    
    filtered = filter(None, subbed)
    
    return list(filtered)


normalizeUDF = F.udf(normalize, ArrayType(StringType()))

In [19]:
# remove hashtags, call outs and web addresses
df4_train = df_train_tokenized.withColumn("tokens_re", removeWEBUDF(df_train_tokenized["token"]))
df4_test = df_test_tokenized.withColumn("tokens_re", removeWEBUDF(df_test_tokenized["token"]))
# remove non english characters
df4_train = df4_train.withColumn("tokens_clean", normalizeUDF(df4_train["tokens_re"]))
df4_test = df4_test.withColumn("tokens_clean", normalizeUDF(df4_test["tokens_re"]))

# rename columns
df5_train = df4_train.drop("token","tokens_re")
df5_test = df4_test.drop("token","tokens_re")
df5_train = df5_train.withColumnRenamed("tokens_clean", "tokens")
df5_test = df5_test.withColumnRenamed("tokens_clean", "tokens")

# remove reviews where the tokens array is empty, i.e. where it was just
# a hashtag, callout, numbers, web adress etc.
df6_train = df5_train.where(F.size(F.col("tokens")) > 0)
df6_test = df5_test.where(F.size(F.col("tokens")) > 0)

In [20]:
df_train_for_model = df6_train.select("reviewText","sentiment")\
        .withColumnRenamed("sentiment", "label")
df_test_for_model = df6_test.select("reviewText","sentiment").withColumnRenamed("sentiment", "label")

In [21]:
from pyspark.sql.functions import rand

# Assuming 'df' is your DataFrame
shuffled_train_df = df_train_for_model.orderBy(rand())
shuffled_test_df = df_test_for_model.orderBy(rand())

# Show the shuffled DataFrame
shuffled_train_df.show(10)
shuffled_test_df.show(10)

                                                                                

+--------------------+-----+
|          reviewText|label|
+--------------------+-----+
|***A gifted copy ...|    0|
|I loved this book...|    1|
|My wife and I lov...|    1|
|          Great Deal|    1|
|        Works great.|    1|
|Got the job done ...|    1|
|      Happy grandson|    1|
|Nobody in recent ...|    1|
|Well made, qualit...|    1|
|This was the begi...|    1|
+--------------------+-----+
only showing top 10 rows





+--------------------+-----+
|          reviewText|label|
+--------------------+-----+
|I absolutely love...|    1|
|Wow! I wish I cou...|    1|
|Replaced a Wink\n...|    1|
|i wanted to run w...|    1|
|wonderful accent ...|    1|
|It came in a very...|    1|
|Really sturdy! A ...|    1|
|Perfect . happy w...|    1|
|I've only used th...|    1|
|I have loved Mint...|    1|
+--------------------+-----+
only showing top 10 rows



                                                                                

In [None]:
# # Printing label distribution for training and testing data
# print("Training data label distribution")
# shuffled_train_df.groupBy("label").count().show()

# print("Testing data label distribution")
# shuffled_test_df.groupBy("label").count().show()

In [22]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# get the name of the metric used
evaluator.getMetricName()

'areaUnderROC'

In [None]:
# import pickle
# with open("model1.pkl", "wb") as f:
#     pickle.dump(model1, f)

In [23]:
# create tokens from reviews
tk = Tokenizer(inputCol= "reviewText", outputCol = "tokens")

# create term frequencies for each of the tokens
tf1 = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=1e5)

# create tf-idf for each of the tokens
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2.0)

# create basic logistic regression model
lr = LogisticRegression(maxIter=20)

In [24]:
from pyspark.ml.feature import StopWordsRemover
sw  = StopWordsRemover(inputCol="tokens", outputCol="filtered")
tf2 = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1e5)

In [None]:
# shuffled_train_df_partition = shuffled_train_df.repartition(5)
# shuffled_test_df_partition = shuffled_test_df.repartition(5)

In [None]:
# print(shuffled_train_df_partition.rdd.getNumPartitions())
# print(shuffled_test_df_partition.rdd.getNumPartitions())

In [25]:
from nltk.stem.porter import PorterStemmer

In [26]:
from pyspark import keyword_only
import numpy as np
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param


class PorterStemming(Transformer, HasInputCol, HasOutputCol):
    """
    PosterStemming class using the NLTK Porter Stemmer
    
    This comes from https://stackoverflow.com/questions/32331848/create-a-custom-transformer-in-pyspark-ml
    Adapted to work with the Porter Stemmer from NLTK.
    """
    
    @keyword_only
    def __init__(self, 
                 inputCol  : str = None, 
                 outputCol : str = None, 
                 min_size  : int = None):
        """
        Constructor takes in the input column name, output column name,
        plus the minimum legnth of a token (min_size)
        """
        # call Transformer classes constructor since were extending it.
        super(Transformer, self).__init__()

        # set Parameter objects minimum token size
        self.min_size = Param(self, "min_size", "")
        self._setDefault(min_size=0)

        # set the input keywork arguments
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

        # initialize Stemmer object
        self.stemmer  = PorterStemmer()

        
    @keyword_only
    def setParams(self, 
                  inputCol  : str = None, 
                  outputCol : str = None, 
                  min_size  : int = None
      ) -> None:
        """
        Function to set the keyword arguemnts
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    

    def _stem_func(self, words  : list) -> list:
        """
        Stemmer function call that performs stemming on a
        list of tokens in words and returns a list of tokens
        that have meet the minimum length requiremnt.
        """
        # We need a way to get min_size and cannot access it 
        # with self.min_size
        min_size       = self.getMinSize()

        # stem that actual tokens by applying 
        # self.stemmer.stem function to each token in 
        # the words list
        stemmed_words  = map(self.stemmer.stem, words)

        # now create the new list of tokens from
        # stemmed_words by filtering out those
        # that are not of legnth > min_size
        filtered_words = filter(lambda x: len(x) > min_size, stemmed_words)

        return list(filtered_words)
    
    def _transform(self, df: DataFrame) -> DataFrame:
        """
        Transform function is the method that is called in the 
        MLPipleline.  We have to override this function for our own use
        and have it call the _stem_func.

        Notice how it takes in a type DataFrame and returns type Dataframe
        """
        # Get the names of the input and output columns to use
        out_col       = self.getOutputCol()
        in_col        = self.getInputCol()

        # create the stemming function UDF by wrapping the stemmer 
        # method function
        stem_func_udf = F.udf(self._stem_func, ArrayType(StringType()))
        
        # now apply that UDF to the column in the dataframe to return
        # a new column that has the same list of words after being stemmed
        df2           = df.withColumn(out_col, stem_func_udf(df[in_col]))

        return df2
  
  
    def setMinSize(self,value):
        """
        This method sets the minimum size value
        for the _paramMap dictionary.
        """
        self._paramMap[self.min_size] = value
        return self

    def getMinSize(self) -> int:
        """
        This method uses the parent classes (Transformer)
        .getOrDefault method to get the minimum
        size of a token.
        """
        return self.getOrDefault(self.min_size)


In [None]:

# train_shuffle_small = spark.sparkContext.parallelize(shuffled_train_df.take(50000)).toDF()
# test_shuffle_small = spark.sparkContext.parallelize(shuffled_test_df.take(50000)).toDF()

#print(type(test_shuffle_small))

In [None]:
# import pyspark.sql.functions as F
# from pyspark.sql.window import Window
# from pyspark.sql.functions import monotonically_increasing_id, row_number
# # w = Window().orderBy(F.lit('A'))
# # train_shuffle_small = train_shuffle_small.withColumn('row_num', F.floor(F.row_number().over(w) / 10) )
# # test_shuffle_small = test_shuffle_small.withColumn('row_num', F.floor(F.row_number().over(w) / 10) )

# # train_shuffle_small.show(2)
# train_shuffle_small = train_shuffle_small.withColumn('original_order', monotonically_increasing_id())
# train_shuffle_small = train_shuffle_small.withColumn('row_num', row_number().over(Window.orderBy('original_order')))
# train_shuffle_small = train_shuffle_small.drop('original_order')

# train_shuffle_small.show(2)
# test_shuffle_small = test_shuffle_small.withColumn('original_order', monotonically_increasing_id())
# test_shuffle_small = test_shuffle_small.withColumn('row_num', row_number().over(Window.orderBy('original_order')))
# test_shuffle_small = test_shuffle_small.drop('original_order')
# test_shuffle_small.show(2)

In [None]:
# from pyspark.sql.functions import col
# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 1) & (col('row_num') <=10000))
# #test_shuffle_small = test_shuffle_small.filter(test_shuffle_small.row_num(1, 10000))
# train_shuffle_small.show(2)

In [None]:
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)


# # test_stem  = stem_pipeline.transform(shuffled_test_df_partition)\
# #                           .where(F.size(F.col("stemmed")) >= 1)
# test_stem  = stem_pipeline.transform(test_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# # cache them to avoid running stemming 
# # each iteration in the grid search
# train_stem.cache()
# test_stem.cache()




In [None]:
# train_small = train_stem.take(10)
# rdd = spark.sparkContext.parallelize(train_small)
# train_small_df = rdd.toDF().show(2)
# train_small_df.typedf()
# print(type(train_small_df))

In [None]:
# test_small = test_stem.take(10)
# rdd2 = spark.sparkContext.parallelize(test_small)
# test_small_df = rdd2.toDF().show(2)
# print(type(test_small_df))

In [None]:
# from pyspark.ml.feature import NGram
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# bigram2 = NGram(inputCol="stemmed", outputCol="bigrams", n=2)

# tf6     = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)

# idf     = IDF(inputCol="rawFeatures", outputCol="features")

# lr      = LogisticRegression(maxIter=10)

# stem_bigram_pipeline  = Pipeline(stages= [bigram2, tf6, idf, lr])

# paramGrid = ParamGridBuilder() \
#                         .addGrid(idf.minDocFreq, [2, 5]) \
#                         .addGrid(lr.regParam, [0.0, 0.1]) \
#                         .build()
# crossval = CrossValidator(estimator          = stem_bigram_pipeline,
#                           estimatorParamMaps = paramGrid,
#                           evaluator          = BinaryClassificationEvaluator(),
#                           numFolds           = 2,
#                           parallelism= 2
#                           )


# model    = crossval.fit(train_stem)
# predictions   = model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))

In [None]:

# print("AUC SCORE for cross validation: {}".format(score))




In [None]:
# model.save("cross_val_model_final")


In [None]:
# from pyspark.ml.tuning import CrossValidatorModel

# model_path = "cross_val_model_final"
# model = CrossValidatorModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 10001) & (col('row_num') <=20000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.bestModel.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))


In [None]:
# model.save("cross_val_model_final2")

In [None]:
# model.transform(test_stem).show(5)

# predictions   = model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE for cross validation: {}".format(score))

In [None]:
# from pyspark.mllib.evaluation import MulticlassMetrics
# bestModel = model.bestModel
# predictedAndLabels = predictions.select(["prediction","label"])\
#                                 .rdd.map(lambda r : (float(r[0]), float(r[1])))
# metrics = MulticlassMetrics(predictedAndLabels)

# print("Test Set Accuracy: {}".format(metrics.accuracy))

In [None]:
# model_path = "cross_val_model_final2"
# model = CrossValidatorModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 20001) & (col('row_num') <=50000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.bestModel.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))


In [27]:
from pyspark.sql.functions import monotonically_increasing_id, row_number
import pyspark.sql.functions as F
from pyspark.sql.window import Window
shuffled_train_df = shuffled_train_df.withColumn('original_order', monotonically_increasing_id())
shuffled_train_df = shuffled_train_df.withColumn('row_num', row_number().over(Window.orderBy('original_order')))
shuffled_train_df = shuffled_train_df.drop('original_order')

shuffled_train_df.show(2)
shuffled_test_df = shuffled_test_df.withColumn('original_order', monotonically_increasing_id())
shuffled_test_df = shuffled_test_df.withColumn('row_num', row_number().over(Window.orderBy('original_order')))
shuffled_test_df = shuffled_test_df.drop('original_order')
shuffled_test_df.show(2)

shuffled_train_df.cache()
shuffled_test_df.cache()

24/04/23 13:02:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:02:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:02:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:02:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:02:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:06:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 1

+--------------------+-----+-------+
|          reviewText|label|row_num|
+--------------------+-----+-------+
|***A gifted copy ...|    0|      1|
|I loved this book...|    1|      2|
+--------------------+-----+-------+
only showing top 2 rows



24/04/23 13:06:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:06:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:06:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:06:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:06:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:07:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 1

+--------------------+-----+-------+
|          reviewText|label|row_num|
+--------------------+-----+-------+
|I absolutely love...|    1|      1|
|Wow! I wish I cou...|    1|      2|
+--------------------+-----+-------+
only showing top 2 rows



24/04/23 13:07:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:07:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:07:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:07:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:07:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:07:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


DataFrame[reviewText: string, label: int, row_num: int]

In [28]:
from pyspark.sql.functions import col
stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
train_shuffle_small = shuffled_train_df.filter( (col('row_num') >= 1) & (col('row_num') <=50000))
#stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

#train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
                          #.where(F.size(F.col("stemmed")) >= 1)
train_stem = stem_pipeline.transform(train_shuffle_small)\
                          .where(F.size(F.col("stemmed")) >= 1)


# test_stem  = stem_pipeline.transform(shuffled_test_df_partition)\
#                           .where(F.size(F.col("stemmed")) >= 1)
test_stem  = stem_pipeline.transform(shuffled_test_df)\
                          .where(F.size(F.col("stemmed")) >= 1)

# cache them to avoid running stemming 
# each iteration in the grid search
train_stem.cache()
test_stem.cache()

24/04/23 13:08:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:08:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:08:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:08:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


DataFrame[reviewText: string, label: int, row_num: int, tokens: array<string>, stemmed: array<string>]

In [None]:
from pyspark.ml.feature import NGram
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
bigram2 = NGram(inputCol="stemmed", outputCol="bigrams", n=2)

tf6     = HashingTF(inputCol="bigrams", outputCol="rawFeatures", numFeatures=2e5)

idf     = IDF(inputCol="rawFeatures", outputCol="features")

lr      = LogisticRegression(maxIter=10)

stem_bigram_pipeline  = Pipeline(stages= [bigram2, tf6, idf, lr])

paramGrid = ParamGridBuilder() \
                        .addGrid(idf.minDocFreq, [2, 5]) \
                        .addGrid(lr.regParam, [0.0, 0.1]) \
                        .build()
crossval = CrossValidator(estimator          = stem_bigram_pipeline,
                          estimatorParamMaps = paramGrid,
                          evaluator          = BinaryClassificationEvaluator(),
                          numFolds           = 2,
                          parallelism= 2
                          )


model    = crossval.fit(train_stem)
predictions   = model.transform(test_stem)
score         = evaluator.evaluate(predictions)
print("AUC SCORE: {}".format(score))

In [None]:
model.save("cross_val_model_firstRun")

In [None]:
# from pyspark.ml.tuning import CrossValidatorModel
# model_path = "cross_val_model_first50K"

# model = CrossValidatorModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 50001) & (col('row_num') <=100000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.bestModel.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))

In [None]:
# updated_model.save("cross_val_model_100K")

In [None]:
# from pyspark.ml.tuning import CrossValidatorModel
# from pyspark.ml.pipeline import PipelineModel
# model_path = "cross_val_model_100K"

# model = PipelineModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 100001) & (col('row_num') <=150000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))

In [None]:
# updated_model.save("cross_val_model_200K")

In [None]:
# from pyspark.ml.tuning import CrossValidatorModel
# from pyspark.ml.pipeline import PipelineModel
# model_path = "cross_val_model_200K"

# model = PipelineModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 150001) & (col('row_num') <=200000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))

In [None]:
# updated_model.save("cross_val_model_300K")

In [None]:
# from pyspark.ml.tuning import CrossValidatorModel
# from pyspark.ml.pipeline import PipelineModel
# model_path = "cross_val_model_300K"

# model = PipelineModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 200000) & (col('row_num') <=270000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))
# updated_model.save("cross_val_model_400K")

In [None]:
# from pyspark.ml.tuning import CrossValidatorModel
# from pyspark.ml.pipeline import PipelineModel
# model_path = "cross_val_model_400K"

# model = PipelineModel.load(model_path)

# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 270001) & (col('row_num') <=340000))
# stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
# #stem_pipeline = Pipeline(stages= [tk, stem2]).fit(shuffled_train_df_partition)
# stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)

# #train_stem = stem_pipeline.transform(shuffled_train_df_partition)\
#                           #.where(F.size(F.col("stemmed")) >= 1)
# train_stem = stem_pipeline.transform(train_shuffle_small)\
#                           .where(F.size(F.col("stemmed")) >= 1)

# updated_model = Pipeline(stages = model.stages).fit(train_stem)
# predictions = updated_model.transform(test_stem)
# score         = evaluator.evaluate(predictions)
# print("AUC SCORE: {}".format(score))
# updated_model.save("cross_val_model_5Runs")

In [None]:
# train_shuffle_small = train_shuffle_small.filter( (col('row_num') >= 340001) & (col('row_num') <=400000))
rows = shuffled_train_df.count()
print(rows)

In [None]:
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.pipeline import PipelineModel
i = 50001
# 10927431
file_name_first = "cross_val_model_firstRun"
file_name_string = ""
file_name = 1

for i in range(50001, 3480000, 70000):
    # Add your indented block of code here
    if i == 50001:
        model_path = "cross_val_model_firstRun"
        model = CrossValidatorModel.load(model_path)
        stages_steps = model.bestModel.stages
    else:
        model_path = file_name_string
        model = PipelineModel.load(model_path)
        stages_steps = model.stages
    train_shuffle_small = shuffled_train_df.filter( (col('row_num') >= i) & (col('row_num') <=i+70000))
    stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
    stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)
    updated_model = Pipeline(stages = stages_steps).fit(train_stem)
    predictions = updated_model.transform(test_stem)
    score         = evaluator.evaluate(predictions)
    print("AUC SCORE in this run : {}".format(score))
    file_name_string = "cross_val_model_" + str(file_name + 1) + "_Runs"
    updated_model.save(file_name_string)
    print("Model saved with name : ", file_name_string)
    file_name += 1
    i += 70000
    
    
    


In [None]:
from pyspark.ml.pipeline import PipelineModel
model = PipelineModel.load("/Users/kravisankaran/Desktop/big-data-final-project/cross_val_model_49_Runs")

In [29]:
# from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.pipeline import PipelineModel
i = 3480001
# 10927431

file_name_string = ""
file_name = 49
for i in range(3480001, 7000000, 90000):
    # Add your indented block of code here
    if i == 3480001:
        print("Run number : ", file_name)
        model_path = "/Users/kravisankaran/Desktop/big-data-final-project/cross_val_model_49_Runs"
    else:
        print("Run number : ", file_name)
        model_path = file_name_string
    model = PipelineModel.load(model_path)
    stages_steps = model.stages
    train_shuffle_small = shuffled_train_df.filter( (col('row_num') >= i) & (col('row_num') <=i+90000))
    stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
    stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)
    updated_model = Pipeline(stages = stages_steps).fit(train_stem)
    predictions = updated_model.transform(test_stem)
    score         = evaluator.evaluate(predictions)
    print("AUC SCORE in this run : {}".format(score))
    file_name_string = "cross_val_model_" + str(file_name + 1) + "_Runs"
    updated_model.save(file_name_string)
    print("Model saved with name : ", file_name_string)
    file_name += 1
    i += 90000
    
    
    


Run number :  49


24/04/23 13:09:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:09:06 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:10:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:10:05 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:10:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 13:10:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/04/23 1

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:30:20 WARN TaskSetManager: Stage 110 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:30:21 WARN TaskSetManager: Stage 114 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_50_Runs
Run number :  50


24/04/23 13:30:23 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:30:28 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:35:04 WARN TaskSetManager: Stage 153 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:35:04 WARN TaskSetManager: Stage 157 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_51_Runs
Run number :  51


24/04/23 13:35:06 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:35:10 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:39:48 WARN TaskSetManager: Stage 196 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:39:48 WARN TaskSetManager: Stage 200 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_52_Runs
Run number :  52


24/04/23 13:39:50 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:39:54 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:44:23 WARN TaskSetManager: Stage 239 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:44:24 WARN TaskSetManager: Stage 243 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_53_Runs
Run number :  53


24/04/23 13:44:26 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:44:30 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:49:02 WARN TaskSetManager: Stage 282 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:49:03 WARN TaskSetManager: Stage 286 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_54_Runs
Run number :  54


24/04/23 13:49:05 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:49:09 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:53:40 WARN TaskSetManager: Stage 325 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:53:41 WARN TaskSetManager: Stage 329 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_55_Runs
Run number :  55


24/04/23 13:53:43 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:53:47 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 13:58:18 WARN TaskSetManager: Stage 368 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 13:58:19 WARN TaskSetManager: Stage 372 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_56_Runs
Run number :  56


24/04/23 13:58:21 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 13:58:24 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:02:52 WARN TaskSetManager: Stage 411 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:02:52 WARN TaskSetManager: Stage 415 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_57_Runs
Run number :  57


24/04/23 14:02:54 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:02:58 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:07:27 WARN TaskSetManager: Stage 454 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:07:27 WARN TaskSetManager: Stage 458 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_58_Runs
Run number :  58


24/04/23 14:07:29 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:07:33 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:12:03 WARN TaskSetManager: Stage 497 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:12:03 WARN TaskSetManager: Stage 501 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_59_Runs
Run number :  59


24/04/23 14:12:05 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:12:09 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:16:40 WARN TaskSetManager: Stage 540 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:16:41 WARN TaskSetManager: Stage 544 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_60_Runs
Run number :  60


24/04/23 14:16:43 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:16:47 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:21:19 WARN TaskSetManager: Stage 583 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:21:19 WARN TaskSetManager: Stage 587 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_61_Runs
Run number :  61


24/04/23 14:21:21 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:21:24 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:25:58 WARN TaskSetManager: Stage 626 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:25:58 WARN TaskSetManager: Stage 630 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_62_Runs
Run number :  62


24/04/23 14:26:00 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:26:04 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:30:41 WARN TaskSetManager: Stage 669 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:30:41 WARN TaskSetManager: Stage 673 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_63_Runs
Run number :  63


24/04/23 14:30:43 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:30:47 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:35:23 WARN TaskSetManager: Stage 712 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:35:24 WARN TaskSetManager: Stage 716 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_64_Runs
Run number :  64


24/04/23 14:35:26 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:35:29 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:40:04 WARN TaskSetManager: Stage 755 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:40:05 WARN TaskSetManager: Stage 759 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_65_Runs
Run number :  65


24/04/23 14:40:07 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:40:10 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:44:47 WARN TaskSetManager: Stage 798 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:44:47 WARN TaskSetManager: Stage 802 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_66_Runs
Run number :  66


24/04/23 14:44:49 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:44:53 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:49:24 WARN TaskSetManager: Stage 841 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:49:25 WARN TaskSetManager: Stage 845 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_67_Runs
Run number :  67


24/04/23 14:49:27 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:49:30 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:54:02 WARN TaskSetManager: Stage 884 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:54:02 WARN TaskSetManager: Stage 888 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_68_Runs
Run number :  68


24/04/23 14:54:04 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:54:08 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
                                                                                

AUC SCORE in this run : 0.8242746099293129


24/04/23 14:58:38 WARN TaskSetManager: Stage 927 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 14:58:39 WARN TaskSetManager: Stage 931 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_69_Runs
Run number :  69


24/04/23 14:58:40 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 14:58:45 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:13:27 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 828025 ms exceeds timeout 800000 ms
24/04/23 15:13:27 WARN SparkContext: Killing executors is not supported by current scheduler.
24/04/23 15:13:41 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.Block

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:14:09 WARN TaskSetManager: Stage 970 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:14:10 WARN TaskSetManager: Stage 974 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_70_Runs
Run number :  70


24/04/23 15:14:11 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:14:15 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:17:01 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:18:53 WARN TaskSetManager: Stage 1013 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:18:54 WARN TaskSetManager: Stage 1017 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_71_Runs
Run number :  71


24/04/23 15:18:55 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:18:59 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:20:21 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:23:43 WARN TaskSetManager: Stage 1056 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:23:44 WARN TaskSetManager: Stage 1060 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_72_Runs
Run number :  72


24/04/23 15:23:46 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:23:50 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:27:01 WARN Executor: Issue communicating with driver in heartbeater]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at org.apache.spark.executor.Exe

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:28:31 WARN TaskSetManager: Stage 1099 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:28:32 WARN TaskSetManager: Stage 1103 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_73_Runs
Run number :  73


24/04/23 15:28:34 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:28:38 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:30:21 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:33:15 WARN TaskSetManager: Stage 1142 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:33:16 WARN TaskSetManager: Stage 1146 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_74_Runs
Run number :  74


24/04/23 15:33:17 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:33:21 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:33:41 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMa

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:38:07 WARN TaskSetManager: Stage 1185 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:38:07 WARN TaskSetManager: Stage 1189 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_75_Runs
Run number :  75


24/04/23 15:38:09 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:38:13 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:40:21 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:42:54 WARN TaskSetManager: Stage 1228 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:42:54 WARN TaskSetManager: Stage 1232 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_76_Runs
Run number :  76


24/04/23 15:42:56 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:43:00 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:43:41 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMa

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:47:40 WARN TaskSetManager: Stage 1271 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:47:41 WARN TaskSetManager: Stage 1275 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_77_Runs
Run number :  77


24/04/23 15:47:43 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:47:47 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:50:21 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:52:27 WARN TaskSetManager: Stage 1314 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:52:28 WARN TaskSetManager: Stage 1318 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_78_Runs
Run number :  78


24/04/23 15:52:30 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:52:33 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 15:53:41 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 15:57:15 WARN TaskSetManager: Stage 1357 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 15:57:15 WARN TaskSetManager: Stage 1361 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_79_Runs
Run number :  79


24/04/23 15:57:17 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 15:57:20 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:10:33 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:12:20 WARN TaskSetManager: Stage 1400 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:12:20 WARN TaskSetManager: Stage 1404 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_80_Runs
Run number :  80


24/04/23 16:12:22 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:12:25 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:13:53 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:17:08 WARN TaskSetManager: Stage 1443 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:17:08 WARN TaskSetManager: Stage 1447 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_81_Runs
Run number :  81


24/04/23 16:17:10 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:17:13 WARN Executor: Issue communicating with driver in heartbeater]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:295)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$m

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:21:55 WARN TaskSetManager: Stage 1486 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:21:56 WARN TaskSetManager: Stage 1490 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_82_Runs
Run number :  82


24/04/23 16:21:57 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:22:02 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:23:53 WARN Executor: Issue communicating with driver in heartbeater]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at org.apache.spark.executor.Exe

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:26:43 WARN TaskSetManager: Stage 1529 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:26:44 WARN TaskSetManager: Stage 1533 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_83_Runs
Run number :  83


24/04/23 16:26:45 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:26:49 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:27:13 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMa

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:31:28 WARN TaskSetManager: Stage 1572 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:31:29 WARN TaskSetManager: Stage 1576 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_84_Runs
Run number :  84


24/04/23 16:31:31 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:31:35 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:33:53 ERROR Inbox: Ignoring error                       (0 + 1) / 1]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndp

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:36:14 WARN TaskSetManager: Stage 1615 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:36:15 WARN TaskSetManager: Stage 1619 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_85_Runs
Run number :  85


24/04/23 16:36:16 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:36:20 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:37:13 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMa

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:40:52 WARN TaskSetManager: Stage 1658 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:40:52 WARN TaskSetManager: Stage 1662 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_86_Runs
Run number :  86


24/04/23 16:40:54 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:40:59 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:43:53 WARN Executor: Issue communicating with driver in heartbeater]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at org.apache.spark.executor.Exe

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:45:38 WARN TaskSetManager: Stage 1701 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:45:39 WARN TaskSetManager: Stage 1705 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_87_Runs
Run number :  87


24/04/23 16:45:40 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:45:44 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:47:13 WARN Executor: Issue communicating with driver in heartbeater]
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at org.apache.spark.executor.Exe

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:50:24 WARN TaskSetManager: Stage 1744 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.
24/04/23 16:50:25 WARN TaskSetManager: Stage 1748 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_88_Runs
Run number :  88


24/04/23 16:50:27 WARN DAGScheduler: Broadcasting large task binary with size 4.7 MiB
24/04/23 16:50:31 WARN MemoryStore: Not enough space to cache rdd_263_0 in memory! (computed 2.5 GiB so far)
24/04/23 16:50:33 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMa

AUC SCORE in this run : 0.8242746099293129


24/04/23 16:55:13 WARN TaskSetManager: Stage 1787 contains a task of very large size (3196 KiB). The maximum recommended task size is 1000 KiB.


Model saved with name :  cross_val_model_89_Runs


24/04/23 16:55:13 WARN TaskSetManager: Stage 1791 contains a task of very large size (1602 KiB). The maximum recommended task size is 1000 KiB.


In [None]:
# from pyspark.ml.tuning import CrossValidatorModel
# from pyspark.ml.pipeline import PipelineModel
# i = 3480001
# # 10927431

# file_name_string = ""
# file_name = 49

# for i in range(3480001, 7000000, 90000):
#     # Add your indented block of code here
#     if i == 3480001:
#         model_path = "cross_val_model_49_Runs"
#     else:
#         model_path = file_name_string
#     model = PipelineModel.load(model_path)
#     stages_steps = model.stages
#     train_shuffle_small = shuffled_train_df.filter( (col('row_num') >= i) & (col('row_num') <=i+90000))
#     stem2 = PorterStemming(inputCol="tokens", outputCol="stemmed")
#     stem_pipeline = Pipeline(stages= [tk, stem2]).fit(train_shuffle_small)
#     updated_model = Pipeline(stages = stages_steps).fit(train_stem)
#     predictions = updated_model.transform(test_stem)
#     score         = evaluator.evaluate(predictions)
#     print("AUC SCORE in this run : {}".format(score))
#     file_name_string = "cross_val_model_" + str(file_name + 1) + "_Runs"
#     updated_model.save(file_name_string)
#     print("Model saved with name : ", file_name_string)
#     file_name += 1
#     i += 90000