# Hands-on Exercise: Streaming Prediction with Spark ML and Spark Streaming

In this notebook we load a saved pipeline that combines a set of pre-processing stages with a classification model that predicts a patient's probability of suffering a heart attack. The predictions are made on streaming data simulated from the heart.csv file.

In [None]:
# findspark.init() has to run before the pyspark imports in the following
# cells; it is what makes the local Spark installation importable here.
import findspark
findspark.init()

In [None]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import *

In [None]:
from pyspark.sql import SparkSession
## Start a Spark session

In [None]:
## Load the CSV file Exercises\data\heart.csv into a DataFrame named heart and display it


In [None]:
# Explicit schema for the heart CSV data (presumably the one to pass to
# spark.readStream, since the streaming files are written without a header).
# Every column is a nullable long except `oldpeak`, which is a nullable double.
# NOTE(review): the printSchema() output shown further down lists names such
# as trtbps / thalachh / exng / slp / caa / thall -- confirm which set of
# column names the saved pipeline actually expects.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("age", LongType()),
        ("sex", LongType()),
        ("cp", LongType()),
        ("trestbps", LongType()),
        ("chol", LongType()),
        ("fbs", LongType()),
        ("restecg", LongType()),
        ("thalach", LongType()),
        ("exang", LongType()),
        ("oldpeak", DoubleType()),
        ("slope", LongType()),
        ("ca", LongType()),
        ("thal", LongType()),
        ("target", LongType()),
    )
])

In [None]:
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType,StructField,LongType, StringType,DoubleType,TimestampType


# Rename the target column to "label" and print the resulting schema.
# NOTE(review): withColumnRenamed silently does nothing when the source
# column is missing -- confirm that `heart` really has a column named "target".
df = heart.withColumnRenamed("target","label")
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trtbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalachh: integer (nullable = true)
 |-- exng: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slp: integer (nullable = true)
 |-- caa: integer (nullable = true)
 |-- thall: integer (nullable = true)
 |-- label: integer (nullable = true)



In [None]:
# Split the labelled data: ~30% goes to testDF (later written out as the
# simulated stream) and ~70% to trainDF. The seed is fixed so that the same
# input yields the same split on every Restart & Run All.
testDF, trainDF = df.randomSplit([0.3, 0.7], seed=42)

### Pipeline loading

In [None]:
from pyspark.ml import PipelineModel


# Load the previously saved pipeline (pre-processing stages + classifier).
# A raw string is used because "\p" is not a valid escape sequence: the plain
# literal only works by accident and triggers a warning on recent Python
# versions. The resulting path value is unchanged.
pModel = PipelineModel.load(r"\pipelines")

PipelineModel_f8d4b32a5360

In [None]:
## Check that the above pipeline works correctly. To do so, make a prediction on
## trainDF data and show the prediction


In [None]:
# Spread the test rows over 10 partitions so the write below produces several
# part files that the streaming exercise can pick up file by file.
testData = testDF.repartition(10)

# Write header-less CSV part files. mode("overwrite") makes the cell
# re-runnable: with the default save mode the write raises an error as soon
# as the target directory already exists (i.e. on every run after the first).
# NOTE(review): the streaming instructions below refer to
# "data/heart_streaming" -- confirm this is the directory the stream reads.
(
    testData.write
    .format("CSV")
    .option("header", False)
    .mode("overwrite")
    .save("/heart_streaming/")
)

## Creating Streaming Predictions

In [None]:
## Use the CSV files stored in data/heart_streaming to simulate a streaming data source.
## To do so, use spark.readStream.
## In the configuration, set one file to be read per trigger (execution).
## Rename the column "target" to "label"
## Call this process with the name sourceStream


In [None]:
## Use the "pModel" pipeline to make predictions using the streaming data from "sourceStream"
## From the prediction select the variables: label, probability, prediction. 
## Call this process with the name "prediction1"

In [None]:
# Display prediction1; as the output below shows, this renders only the
# DataFrame's column summary (label, probability, prediction), not its rows.
display(prediction1)

DataFrame[label: bigint, probability: vector, prediction: double]

#### Displaying the predictions in console

In [None]:
## Write the predictions with prediction1.writeStream.
## In the options, set "format" to "console",
## set .trigger to (once=True),
## and wait for the process to complete with .awaitTermination()


#### Keeping the predictions in Memory

In [None]:
## Write the predictions with prediction1.writeStream.
## Configuration: the results should be stored in memory,
## .outputMode should be "append",
## and the query name ("queryName") should be "prediction4"


In [None]:
# Query the in-memory table "prediction4" twice and show the first 10 rows
# each time. The result is bound to its own name rather than `df`, because
# `df` is the labelled heart DataFrame that the train/test split cell reads;
# rebinding it here would make that earlier cell split the prediction table
# if it were re-run.
# NOTE(review): the two queries run back to back, so both iterations normally
# show the same rows -- confirm whether a pause between polls was intended.
for poll in range(2):
    memory_predictions = spark.sql("SELECT * FROM prediction4")
    memory_predictions.show(10)
memory_predictions

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.04086978924170...|       1.0|
|    0|[0.98184892212735...|       0.0|
|    1|[0.00474279761632...|       1.0|
|    1|[0.35775366097494...|       1.0|
|    1|[0.05755909903937...|       1.0|
|    0|[0.95305536703752...|       0.0|
|    0|[0.94079962605713...|       0.0|
|    0|[0.13017480179914...|       1.0|
|    0|[0.99807916786174...|       0.0|
|    1|[0.15541832735450...|       1.0|
+-----+--------------------+----------+
only showing top 10 rows

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.04086978924170...|       1.0|
|    0|[0.98184892212735...|       0.0|
|    1|[0.00474279761632...|       1.0|
|    1|[0.35775366097494...|       1.0|
|    1|[0.05755909903937...|       1.0|
|    0|[0.95305536703752...|       0.0|
|    0|[0.94079962605713...|       0.0|
|    0|[0.1301

DataFrame[label: bigint, probability: vector, prediction: double]

In [None]:
## Validate that the streaming process is active and then show the status