In [1]:
import threading

# Helper thread that prevents the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper that drives a streaming context in the background.

    ``run`` starts the wrapped context and then waits for it to terminate,
    so that wait happens on this thread instead of the caller's. ``stop``
    asks the wrapped context to shut down.
    """

    def __init__(self, ssc):
        """Remember the streaming context ``ssc`` that this thread will drive."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the streaming context, then block until it has terminated."""
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Announce the shutdown and stop the streaming context gracefully.

        The underlying SparkContext is left running (``stopSparkContext=False``).
        """
        notice = '----- Stopping... this may take a few seconds -----'
        print(notice)
        # `stopGraceFully` (capital F) is the keyword spelling used by PySpark.
        self.ssc.stop(stopGraceFully=True, stopSparkContext=False)

In [2]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so
# it is presumably injected by the PySpark kernel/launcher — TODO confirm.
sc

In [3]:
# Display the SparkSession. `spark` is not created anywhere in this notebook,
# so it is presumably injected by the PySpark kernel/launcher — TODO confirm.
spark

In [4]:
import os

import numpy as np
import pandas as pd
import pyspark
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pyspark.ml import Pipeline # pipeline to transform data
from pyspark.sql import SparkSession # to initiate spark
from pyspark.ml.linalg import Vectors # to allow us to work with VectorAssembler
from pyspark.ml.feature import VectorAssembler # to combine our feature columns to pass to LogisticRegression model
from pyspark.sql.types import FloatType
from pyspark.ml.feature import RegexTokenizer # tokenizer
from pyspark.ml.feature import HashingTF, IDF, IDFModel # vectorizer
from pyspark.ml.feature import StopWordsRemover # to remove stop words
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel # ml model
from pyspark.ml.evaluation import BinaryClassificationEvaluator # to evaluate the model

from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

from pyspark.ml.pipeline import PipelineModel

In [5]:
## YOUR MODEL PATH HERE ##
# Location of the saved PipelineModel that `process` loads on first use.
# Set the SPARK_MODEL_PATH environment variable to point at your own copy
# without editing the notebook; when it is unset, the original
# machine-specific path below is used unchanged.
model_path = os.environ.get(
    "SPARK_MODEL_PATH",
    "C:\\Users\\Dillon\\Desktop\\spark\\notebooks\\models",
)

In [6]:
# Module-level cache for the pipeline model; filled in by `process` the first
# time it receives a non-empty batch.
models_loaded = False
my_model = None


def process(time, rdd):
    """Score one streamed batch with the saved pipeline and print the result.

    Registered as the per-batch callback of the stream. Empty batches are
    skipped silently. Otherwise the batch is parsed with ``spark.read.json``,
    the pipeline model is loaded from ``model_path`` if that has not happened
    yet, and a selection of input and prediction columns is shown.

    Args:
        time: Batch timestamp; only used in the printed banner.
        rdd: The batch's RDD, handed to ``spark.read.json``.

    Returns:
        None. Output is printed as a side effect.
    """
    global models_loaded, my_model

    if rdd.isEmpty():
        return

    banner = "========= %s =========" % str(time)
    print(banner)

    # Turn the raw batch into a DataFrame.
    batch_df = spark.read.json(rdd)

    # Load the model once and reuse it for every later batch.
    if not models_loaded:
        my_model = PipelineModel.load(model_path)
        models_loaded = True

    # Apply the pipeline and display the columns of interest.
    scored_df = my_model.transform(batch_df)
    shown_columns = ["title", "votes", "comments", "probability", "prediction", "frontpage"]
    scored_df.select(shown_columns).show()

In [7]:
# Build the streaming context on top of the existing SparkContext `sc`.
# The second positional argument (10) is the batch duration in seconds,
# per the PySpark StreamingContext API.
ssc = StreamingContext(sc, 10)



In [8]:
# Open a text stream from the TCP socket at seppe.net:7778 and register
# `process` as the callback invoked for each RDD (batch) the stream produces.
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [9]:
# Run the streaming context on a StreamingThread so that ssc.start() and
# ssc.awaitTermination() execute off the notebook's main thread.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|Impact of Design ...|    1|       0|[0.96898402799738...|       0.0|    false|
|Palestinian state...|    1|       0|[0.96514873172688...|       0.0|    false|
+--------------------+-----+--------+--------------------+----------+---------+

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|China's latest an...|    1|       0|[0.96041286055183...|       0.0|    false|
|How Big Is Taylor...|    1|       0|[0.96377734611366...|       0.0|    false|
|Reflections on my...|    2|       0|[0.91697599964288...|       0.0|    false|
|New code-focused ...|    4|       0|[0

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|Show HN: Can GPT ...|    1|       0|[0.98023619718023...|       0.0|    false|
|Oxidizing OCaml w...|    2|       0|[0.91846809175416...|       0.0|    false|
+--------------------+-----+--------+--------------------+----------+---------+

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|(1) Confessions o...|    2|       0|[0.90859683229771...|       0.0|    false|
|Can Scarlett Joha...|    1|       0|[0.97945156358759...|       0.0|    false|
|OpenAI, WSJ Owner...|   26|       4|[2.22247916958501...|       1.0|     true|
+--------------------+-----+--------+--

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|Getting Wiser fro...|    1|       0|[0.97431307897922...|       0.0|    false|
|AutoBlink: Monito...|    2|       0|[0.85361326432267...|       0.0|    false|
|Bypassed app down...|    1|       0|[0.95976334163265...|       0.0|    false|
+--------------------+-----+--------+--------------------+----------+---------+

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|Evaluating a Cont...|    3|       0|[0.92655599541029...|       0.0|     true|
|The Essentials of...|    1|       0|[0.94375267199331...|       0.0|    false|
|The Klingon Progr...|    1|       0|[0

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|Last Chance to Ge...|    2|       0|[0.95728307579888...|       0.0|    false|
|QDirStat – Linux ...|    1|       0|[0.96204090595746...|       0.0|    false|
|Google, Pizza, an...|    1|       0|[0.98021679438693...|       0.0|    false|
+--------------------+-----+--------+--------------------+----------+---------+

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|Refining Discord'...|    1|       0|[0.97279481264891...|       0.0|    false|
|Biden Keeps Passi...|    1|       0|[0.98524973226848...|       0.0|    false|
|Microsoft Bing Se...|    6|       0|[0

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+--------------------+----------+---------+
|High BMI linked t...|    2|       0|[0.93859069688369...|       0.0|    false|
+--------------------+-----+--------+--------------------+----------+---------+

+------------------+-----+--------+--------------------+----------+---------+
|             title|votes|comments|         probability|prediction|frontpage|
+------------------+-----+--------+--------------------+----------+---------+
|DuckDuckGo Is Down|    1|       0|[0.98648929052124...|       0.0|    false|
+------------------+-----+--------+--------------------+----------+---------+

+--------------------+-----+--------+--------------------+----------+---------+
|               title|votes|comments|         probability|prediction|frontpage|
+--------------------+-----+--------+-----------

In [16]:
# Stop the stream: StreamingThread.stop() prints a notice and stops the
# streaming context gracefully while leaving the SparkContext running.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
