In [3]:
import os
import threading
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, LinearSVC, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, Tokenizer, StopWordsRemover, CountVectorizer, IDF, PCA, HashingTF


from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, sum, when
from pyspark.sql.types import StringType

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Drive a Spark StreamingContext from a background thread.

    `run` starts the context and then blocks on `awaitTermination`, so the
    blocking call happens on this thread rather than on the caller's.
    """

    def __init__(self, ssc):
        """Remember the StreamingContext `ssc` this thread will drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then wait until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice, then stop streaming gracefully while leaving the SparkContext running."""
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopGraceFully=True, stopSparkContext=False)

In [4]:
# Project root directory.
# Set the AA_STREAMING_PROJECT_DIR environment variable to run this notebook on
# another machine without editing the cell; when it is unset (or empty) the
# original hard-coded location below is used.
_DEFAULT_PROJECT_DIR = "/Users/hydraze/Library/CloudStorage/GoogleDrive-tohziyu2@gmail.com/My Drive/Studies/KU Leuven/Courses/Classes/Y1S2/Advanced Analytics in Business/Project/3/AdvancedAnalytics_Streaming-Text-Analytics/"

# os.path.join(path, "") guarantees a trailing separator: the model path is built
# by plain string concatenation (file_path + "models/best_model").
file_path = os.path.join(os.environ.get("AA_STREAMING_PROJECT_DIR") or _DEFAULT_PROJECT_DIR, "")
os.chdir(file_path)


In [5]:
# Load the previously saved pipeline model through the PipelineModel API.
# The location is file_path with the relative model directory appended.
model_subdir = "models/best_model"
mPath = file_path + model_subdir
best_model = PipelineModel.load(mPath)

In [6]:
# Format of the `posted_at` strings parsed by the feature UDFs below.
_POSTED_AT_FORMAT = '%Y-%m-%d %H:%M:%S'


def _format_posted_at(posted_at, out_format):
    """Parse a `posted_at` string and re-format it with `out_format`; None passes through as None."""
    if posted_at is None:
        # Propagate nulls instead of letting strptime(None, ...) raise TypeError,
        # which would fail the whole micro-batch.
        return None
    return datetime.strptime(posted_at, _POSTED_AT_FORMAT).strftime(out_format)


def process(time, rdd):
    """Score one micro-batch of the stream with the loaded `best_model`.

    Registered as the `foreachRDD` callback. The batch is read as JSON into a
    DataFrame, the feature columns `isShowHN`, `time_of_day` and `day_of_week`
    are derived, nulls in the text columns are replaced by empty strings, and
    the model's predictions are printed.

    Args:
        time: Batch time; only used for the printed banner.
        rdd: RDD holding the JSON records of this batch.

    Returns:
        None. An empty batch is skipped without printing anything.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert to data frame
    df = spark.read.json(rdd)

    # Data cleaning (keep in sync with the other Jupyter notebook)

    # Type of post: 1 when the title contains "Show HN", otherwise 0
    df = df.withColumn('isShowHN', when(df.title.contains("Show HN"), 1).otherwise(0))

    # Hour of day as a zero-padded string ('00'..'23'); null stays null
    extract_time_of_day_udf = udf(lambda x: _format_posted_at(x, '%H'))
    df = df.withColumn('time_of_day', extract_time_of_day_udf(df.posted_at))

    # Day of week as a string ('0'..'6', Sunday is '0'); null stays null
    weekDay = udf(lambda x: _format_posted_at(x, '%w'))
    df = df.withColumn('day_of_week', weekDay(df.posted_at))

    # Fill null values in the text columns with empty strings
    df = df.na.fill({"title": "", "source_title": "", "source_text": ""})

    # Predict with the loaded pipeline model and print the result for this batch
    df_result = best_model.transform(df)
    df_result.select('aid', 'comments', 'frontpage', 'prediction').show()

In [7]:
# Build the stream: one batch every `batch_interval_seconds`, read as text from
# a socket source, with every batch handed to `process`.
batch_interval_seconds = 10
stream_host, stream_port = "seppe.net", 7778

ssc = StreamingContext(sc, batch_interval_seconds)
lines = ssc.socketTextStream(stream_host, stream_port)
lines.foreachRDD(process)

# Start the context on a helper thread so this cell returns immediately.
ssc_t = StreamingThread(ssc)
ssc_t.start()

24/05/16 23:21:37 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:37 WARN BlockManager: Block input-0-1715894497200 replicated to only 0 peer(s) instead of 1 peers
24/05/16 23:21:39 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:39 WARN BlockManager: Block input-0-1715894499400 replicated to only 0 peer(s) instead of 1 peers
                                                                                



24/05/16 23:21:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:41 WARN BlockManager: Block input-0-1715894501400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40376539|       0|    false|       1.0|
|40376550|       0|    false|       1.0|
+--------+--------+---------+----------+



24/05/16 23:21:43 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:43 WARN BlockManager: Block input-0-1715894503400 replicated to only 0 peer(s) instead of 1 peers
24/05/16 23:21:47 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:47 WARN BlockManager: Block input-0-1715894507400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40376553|       0|    false|       1.0|
|40376554|       0|    false|       1.0|
|40376563|       0|    false|       1.0|
+--------+--------+---------+----------+



24/05/16 23:21:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:50 WARN BlockManager: Block input-0-1715894510600 replicated to only 0 peer(s) instead of 1 peers
24/05/16 23:21:52 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:52 WARN BlockManager: Block input-0-1715894512400 replicated to only 0 peer(s) instead of 1 peers
24/05/16 23:21:57 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:57 WARN BlockManager: Block input-0-1715894517400 replicated to only 0 peer(s) instead of 1 peers
24/05/16 23:21:59 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:21:59 WARN BlockManager: Block input-0-1715894519600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40376573|       0|    false|       1.0|
|40376578|       0|    false|       1.0|
|40376581|       0|    false|       1.0|
|40376583|       0|    false|       1.0|
+--------+--------+---------+----------+



24/05/16 23:22:01 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/16 23:22:01 WARN BlockManager: Block input-0-1715894521400 replicated to only 0 peer(s) instead of 1 peers
24/05/16 23:22:05 ERROR ReceiverTracker: Receiver has been stopped. Try to restart it.
org.apache.spark.SparkException: Job 66 cancelled as part of cancellation of all jobs
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:2731)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$doCancelAllJobs$2(DAGScheduler.scala:1114)
	at scala.runtime.java8.JFunction1$mcVI$sp.apply(JFunction1$mcVI$sp.java:23)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.doCancelAllJobs(DAGScheduler.scala:1113)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3022)
	at org.apache

In [9]:
# Stop the streaming context through the helper thread. StreamingThread.stop
# passes stopSparkContext=False, so the SparkContext itself is left running.
ssc_t.stop()

----- Stopping... this may take a few seconds -----


24/05/09 18:12:05 WARN StreamingContext: StreamingContext has not been started yet
24/05/09 18:12:07 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/09 18:12:07 WARN BlockManager: Block input-0-1715271127000 replicated to only 0 peer(s) instead of 1 peers
24/05/09 18:12:10 ERROR JobScheduler: Error running job streaming job 1715271130000 ms.0
org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
  File "/Users/hydraze/Downloads/spark/spark-3.5.1-bin-hadoop3/python/pyspark/streaming/util.py", line 71, in call
    r = self.func(t, *rdds)
        ^^^^^^^^^^^^^^^^^^^
  File "/var/folders/4h/j84nnmnn0nj01mk0ndxyk7n40000gn/T/ipykernel_54142/3537873331.py", line 10, in process
    df = spark.createDataFrame(rdd)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/hydraze/Downloads/spark/spark-3.5.1-bin-hadoop3/python/pyspark/sql/session.py", line 1443, in createDataFrame
    return self._create_dataframe(
      