In [1]:
import os
import threading
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression, LinearSVC, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder, Tokenizer, StopWordsRemover, CountVectorizer, IDF, PCA, HashingTF


from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, sum, when
from pyspark.sql.types import StringType

from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Background thread that drives a Spark ``StreamingContext``.

    Running the context on its own thread keeps the notebook cell that
    launches the stream from blocking.
    """

    def __init__(self, ssc):
        """Remember the streaming context ``ssc`` that this thread will run."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the streaming context, then wait until it terminates."""
        streaming_context = self.ssc
        streaming_context.start()
        streaming_context.awaitTermination()

    def stop(self):
        """Announce the shutdown and stop the streaming context.

        The stop is requested as graceful and leaves the underlying
        SparkContext running (``stopSparkContext=False``).
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopGraceFully=True, stopSparkContext=False)

In [2]:
# Project root directory. The default is the original author's
# machine-specific location; set the HN_PROJECT_DIR environment variable to
# run the notebook elsewhere without editing this cell.
file_path = os.environ.get(
    "HN_PROJECT_DIR",
    "/Users/hydraze/Library/CloudStorage/GoogleDrive-tohziyu2@gmail.com/My Drive/Studies/KU Leuven/Courses/Classes/Y1S2/Advanced Analytics in Business/Project/3/AdvancedAnalytics_Streaming-Text-Analytics/",
)
# Guarantee a trailing separator: the model path is later built by
# concatenating "models/best_model" onto file_path.
file_path = os.path.join(file_path, "")
os.chdir(file_path)


In [3]:
# Load the fitted pipeline saved with Spark ML persistence (a directory on
# disk, not a pickle). os.path.join keeps the path valid whether or not
# file_path ends with a separator.
mPath = os.path.join(file_path, "models", "best_model")
best_model = PipelineModel.load(mPath)

In [4]:
def process(time, rdd):
    """Score one micro-batch of streamed posts with the loaded pipeline model.

    Registered through ``foreachRDD``; receives the batch ``time`` and an
    ``rdd`` of JSON records. Empty batches are skipped without output.
    Otherwise the batch is parsed into a DataFrame, the derived feature
    columns are added, and the predictions are printed.

    Relies on the notebook globals ``spark`` and ``best_model``.
    """
    # Nothing arrived during this interval
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    def strftime_udf(fmt):
        """Build a UDF that parses 'YYYY-mm-dd HH:MM:SS' text and re-formats it with ``fmt``."""
        return udf(lambda raw: datetime.strptime(raw, '%Y-%m-%d %H:%M:%S').strftime(fmt))

    # Parse the JSON records into a DataFrame
    posts_raw = spark.read.json(rdd)

    # Feature engineering (keep in sync with the other Jupyter notebook)

    # Post type flag: 1 when the title contains "Show HN", else 0
    show_hn_flag = when(posts_raw.title.contains("Show HN"), 1).otherwise(0)
    posts_flagged = posts_raw.withColumn('isShowHN', show_hn_flag)

    # Hour of day as a zero-padded string ('00'-'23')
    hour_of_post = strftime_udf('%H')
    posts_hourly = posts_flagged.withColumn('time_of_day', hour_of_post(posts_flagged.posted_at))

    # Day of week as a string ('0'-'6', Sunday being '0')
    weekday_of_post = strftime_udf('%w')
    posts_dated = posts_hourly.withColumn('day_of_week', weekday_of_post(posts_hourly.posted_at))

    # Replace missing text fields with empty strings
    text_defaults = {"title": "", "source_title": "", "source_text": ""}
    posts_ready = posts_dated.na.fill(text_defaults)

    # Run the loaded pipeline and display the predictions beside the labels
    scored = best_model.transform(posts_ready)
    scored.select('aid', 'comments', 'frontpage', 'prediction').show()

In [5]:
# Set up the stream: 10-second micro-batches read as text lines from the
# socket source, each batch handed to process(). The context is run on a
# StreamingThread so that this cell returns immediately.
ssc = StreamingContext(sparkContext=sc, batchDuration=10)
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)
ssc_t = StreamingThread(ssc)
ssc_t.start()

24/05/17 15:40:22 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:22 WARN BlockManager: Block input-0-1715953221800 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:26 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:26 WARN BlockManager: Block input-0-1715953225800 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:29 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:29 WARN BlockManager: Block input-0-1715953229000 replicated to only 0 peer(s) instead of 1 peers
                                                                                



                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40385457|       0|    false|       1.0|
|40385469|       2|     true|       1.0|
|40385496|       0|    false|       0.0|
+--------+--------+---------+----------+



24/05/17 15:40:33 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:33 WARN BlockManager: Block input-0-1715953232800 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:35 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:35 WARN BlockManager: Block input-0-1715953234800 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:36 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:36 WARN BlockManager: Block input-0-1715953236400 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:39 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:39 WARN BlockManager: Block input-0-1715953239400 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40385536|       0|     true|       1.0|
|40385544|       0|    false|       0.0|
|40385547|       0|    false|       0.0|
|40385549|       0|     true|       0.0|
+--------+--------+---------+----------+



24/05/17 15:40:44 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:44 WARN BlockManager: Block input-0-1715953244400 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:49 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:49 WARN BlockManager: Block input-0-1715953249400 replicated to only 0 peer(s) instead of 1 peers
                                                                                



24/05/17 15:40:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:50 WARN BlockManager: Block input-0-1715953250400 replicated to only 0 peer(s) instead of 1 peers


+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40385562|       0|    false|       0.0|
|40385579|       0|     true|       0.0|
+--------+--------+---------+----------+



24/05/17 15:40:52 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:52 WARN BlockManager: Block input-0-1715953252600 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:55 WARN BlockManager: Block input-0-1715953255600 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:56 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:56 WARN BlockManager: Block input-0-1715953256600 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:40:58 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:40:58 WARN BlockManager: Block input-0-1715953258600 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40385589|       0|    false|       0.0|
|40385599|       0|    false|       0.0|
|40385611|       0|     true|       0.0|
|40385637|       0|    false|       0.0|
|40385645|       0|    false|       0.0|
+--------+--------+---------+----------+



24/05/17 15:41:03 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:41:03 WARN BlockManager: Block input-0-1715953263600 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:41:04 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:41:04 WARN BlockManager: Block input-0-1715953264600 replicated to only 0 peer(s) instead of 1 peers
24/05/17 15:41:09 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:41:09 WARN BlockManager: Block input-0-1715953268800 replicated to only 0 peer(s) instead of 1 peers
                                                                                

+--------+--------+---------+----------+
|     aid|comments|frontpage|prediction|
+--------+--------+---------+----------+
|40385693|       3|     true|       0.0|
|40385710|       2|     true|       1.0|
|40385721|       0|    false|       0.0|
+--------+--------+---------+----------+



24/05/17 15:41:11 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
24/05/17 15:41:11 WARN BlockManager: Block input-0-1715953271600 replicated to only 0 peer(s) instead of 1 peers


In [None]:
# Ask the streaming helper thread started earlier to shut the stream down
ssc_t.stop()

----- Stopping... this may take a few seconds -----


24/05/17 15:41:16 WARN SocketReceiver: Error receiving data
java.net.SocketException: Socket closed
	at java.base/java.net.SocketInputStream.socketRead0(Native Method)
	at java.base/java.net.SocketInputStream.socketRead(SocketInputStream.java:115)
	at java.base/java.net.SocketInputStream.read(SocketInputStream.java:168)
	at java.base/java.net.SocketInputStream.read(SocketInputStream.java:140)
	at java.base/sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
	at java.base/sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
	at java.base/sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
	at java.base/java.io.InputStreamReader.read(InputStreamReader.java:181)
	at java.base/java.io.BufferedReader.fill(BufferedReader.java:161)
	at java.base/java.io.BufferedReader.readLine(BufferedReader.java:326)
	at java.base/java.io.BufferedReader.readLine(BufferedReader.java:392)
	at org.apache.spark.streaming.dstream.SocketReceiver$$anon$2.getNext(SocketInputDStream.scala:121)
	at org.a