# IlseArredondo.323019078.Ejercicio_Predicción en Streaming con Spark ML y Spark Streaming

En este notebook vamos a cargar un pipeline que tiene un conjunto de fases de pre-procesamiento y un modelo de clasificacion predecir la probabilidad de un paciente de sufrir un ataque al corazón. La predicción se realizará sobre datos en streaming optenidos a partir del csv de heart.csv

In [3]:
!pip install pyspark
!pip install findspark
!pip install koalas
!pip install plotly
!pip install nbformat
!pip install databricks

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=88ac7d9d371be76e7549a61c8bd1f0bd0cf6eb6f220151e1656cf332a5101ce8
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
Collecting koalas
  Downloading koalas-0.32.0-py3-none-any.whl (593 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import findspark
findspark.init()

In [5]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import *

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('UCI Heart disease').getOrCreate()

In [12]:
from google.colab import files
import pandas as pd
uploaded = files.upload()

import io
df2 = pd.read_csv(io.BytesIO(uploaded['heart.csv']))

Saving heart.csv to heart.csv


In [14]:
## Carga y visualiza el csv de Ejercicios\data\heart.csv con el nombre de heart
heart = spark.read.csv('./heart.csv',
                       inferSchema = True,
                       header = True)

In [13]:
schema = StructType( \
                     [StructField("age", LongType(),True), \
                      StructField("sex", LongType(), True), \
                      StructField("cp", LongType(), True), \
                      StructField('trestbps', LongType(), True), \
                      StructField("chol", LongType(), True), \
                      StructField("fbs", LongType(), True), \
                      StructField("restecg", LongType(), True), \
                      StructField("thalach", LongType(), True),\
                      StructField("exang", LongType(), True), \
                      StructField("oldpeak", DoubleType(), True), \
                      StructField("slope", LongType(),True), \
                      StructField("ca", LongType(), True), \
                      StructField("thal", LongType(), True), \
                      StructField("target", LongType(), True), \
                        ])

In [15]:
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType,StructField,LongType, StringType,DoubleType,TimestampType

df = heart.withColumnRenamed("target","label")
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- label: integer (nullable = true)



In [16]:
testDF, trainDF = df.randomSplit([0.3, 0.7])

## Carga del Pipeline

In [17]:
# Create the logistic regression model
lr = LogisticRegression(maxIter=10, regParam= 0.01)

In [20]:
from pyspark.ml import PipelineModel

# Create the logistic regression model
lr = LogisticRegression(maxIter=10, regParam= 0.01)

# We create a one hot encoder.
ohe = OneHotEncoder(inputCols = ['sex', 'cp', 'fbs', 'restecg', 'slope',
                                 'exang', 'ca', 'thal'],
                    outputCols=['sex_ohe', 'cp_ohe', 'fbs_ohe',
                                'restecg_ohe', 'slp_ohe', 'exng_ohe',
                                'caa_ohe', 'thall_ohe'])

# Input list for scaling
inputs = ['age','trestbps','chol','thalach','oldpeak']

# We scale our inputs
assembler1 = VectorAssembler(inputCols=inputs, outputCol="features_scaled1")
scaler = MinMaxScaler(inputCol="features_scaled1", outputCol="features_scaled")

# We create a second assembler for the encoded columns.
assembler2 = VectorAssembler(inputCols=['sex_ohe', 'cp_ohe',
                                        'fbs_ohe', 'restecg_ohe',
                                        'slp_ohe', 'exng_ohe', 'caa_ohe',
                                        'thall_ohe','features_scaled'],
                             outputCol="features")
# Create stages list
myStages = [assembler1, scaler, ohe, assembler2,lr]
# Set up the pipeline
pipeline = Pipeline(stages= myStages)
# We fit the model using the training data.
pModel = pipeline.fit(trainDF)

In [21]:
## Comprueba que el pipeline anterior funciona correctamente. Para ello realiza una prediccion sobre el conjunto de
## datos de trainDF y muestra la prediccion
# We transform the data.
trainingPred = pModel.transform(trainDF)

# # We select the actual label, probability and predictions
trainingPred.select('label','probability','prediction').show()

+-----+--------------------+----------+
|label|         probability|prediction|
+-----+--------------------+----------+
|    1|[0.06600538313039...|       1.0|
|    1|[0.07316247001796...|       1.0|
|    0|[0.91713077651849...|       0.0|
|    0|[0.76673568590232...|       0.0|
|    1|[0.03516086209690...|       1.0|
|    1|[0.00557640273219...|       1.0|
|    1|[0.06443589101584...|       1.0|
|    1|[0.04437533265611...|       1.0|
|    1|[0.00388373733811...|       1.0|
|    1|[0.02531489565237...|       1.0|
|    0|[0.74862356883628...|       0.0|
|    0|[0.94566371169472...|       0.0|
|    0|[0.39333228673980...|       1.0|
|    1|[0.06974989787790...|       1.0|
|    0|[0.39417606529206...|       1.0|
|    1|[0.03268116134969...|       1.0|
|    1|[0.02846366102979...|       1.0|
|    1|[0.26410280712989...|       1.0|
|    1|[0.01207953987227...|       1.0|
|    1|[0.15789917297314...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



In [22]:
testData = testDF.repartition(10)
type(testData)
testData.toPandas().to_csv('./heart.csv')

In [23]:
## Utiliza los csv guardados en data/heart_streaming para simular un proceso de datos en streaming.
## Para ello, utiliza la funcion de spark.readStream
## En la configuración pon: que se importe un archivo por ejecucion
## que se renombre la variable de "output"a "label"
## Llama a este proceso con el nombre sourceStream
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

schema = StructType([
    StructField("index", IntegerType(), True),
    StructField("age", IntegerType(), True),
    StructField("sex", IntegerType(), True),
    StructField("cp", IntegerType(), True),
    StructField("trestbps", IntegerType(), True),
    StructField("chol", IntegerType(), True),
    StructField("fbs", IntegerType(), True),
    StructField("restecg", IntegerType(), True),
    StructField("thalach", IntegerType(), True),
    StructField("exang", IntegerType(), True),
    StructField("oldpeak", DoubleType(), True),
    StructField("slope", IntegerType(), True),
    StructField("ca", IntegerType(), True),
    StructField("thal", IntegerType(), True),
    StructField("target", IntegerType(), True)  # Renombra la columna "output" a "label"
])

sourceStream = (
    spark.readStream
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .option("maxFilesPerTrigger", 1)  # Importa un archivo por ejecución
    .load("./heart.csv")
    .withColumnRenamed("target", "label")  # Renombra la columna "output" a "label"
)

In [24]:
## Utiliza el pipeline "pModel" para realizar predicciones utilizando los datos en streaming de "sourceStream"
## De la predicción selecciona las variables label, probability, prediction.
## Llama a este proceso con el nombre de "prediction1"


sourceStream.isStreaming

True

In [25]:
prediction1 = pModel.transform(sourceStream)

prediction1.printSchema()

root
 |-- index: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- features_scaled1: vector (nullable = true)
 |-- features_scaled: vector (nullable = true)
 |-- sex_ohe: vector (nullable = true)
 |-- cp_ohe: vector (nullable = true)
 |-- fbs_ohe: vector (nullable = true)
 |-- restecg_ohe: vector (nullable = true)
 |-- slp_ohe: vector (nullable = true)
 |-- exng_ohe: vector (nullable = true)
 |-- caa_ohe: vector (nullable = true)
 |-- thall_ohe: vector (nullable = true)
 |-- features: vec

In [29]:
display(prediction1)

DataFrame[index: int, age: int, sex: int, cp: int, trestbps: int, chol: int, fbs: int, restecg: int, thalach: int, exang: int, oldpeak: double, slope: int, ca: int, thal: int, label: int, features_scaled1: vector, features_scaled: vector, sex_ohe: vector, cp_ohe: vector, fbs_ohe: vector, restecg_ohe: vector, slp_ohe: vector, exng_ohe: vector, caa_ohe: vector, thall_ohe: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

Mostrando las predicciones en consola

In [30]:
## Obten las predicciones sobre los datos en streaming, para ello utiliza prediction1.writeStream. En las opciones de
## configuracion pon: "format" igual a "console"
## en .trigger igual (once=True),
## y permite que el proceso espere hasta que se complete con .awaitTermination()


query = (
    prediction1
    .writeStream
    .outputMode("append")
    .format("console")
    .trigger(once = True)
    .queryName("prediction1")
    .start()
)

query.awaitTermination()

StreamingQueryException: ignored

In [31]:
## Obten las predicciones sobre los datos en streaming, para ello utiliza prediction1.writeStream.
## En las opciones de configuracion pon: que los resultados se guarden en memoria,
## que el .outputMode sea "append"
## que el nombre de la query "queryName" sea "prediction4"

query = (
    prediction1
    .writeStream
    .outputMode("append")
    .format("memory")  # You can change the output format as needed
    .trigger(once = True)
    .queryName("prediction4")
    .start()
)

In [32]:
for x in range(2):
    df = spark.sql(
        "SELECT * FROM prediction4")
    df.show(10)
df

+-----+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+-----+----------------+---------------+-------+------+-------+-----------+-------+--------+-------+---------+--------+-------------+-----------+----------+
|index|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|label|features_scaled1|features_scaled|sex_ohe|cp_ohe|fbs_ohe|restecg_ohe|slp_ohe|exng_ohe|caa_ohe|thall_ohe|features|rawPrediction|probability|prediction|
+-----+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+-----+----------------+---------------+-------+------+-------+-----------+-------+--------+-------+---------+--------+-------------+-----------+----------+
+-----+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+-----+----------------+---------------+-------+------+-------+-----------+-------+--------+-------+---------+--------+-------------+-----------+----------+

+-----+---+---+---+--------+----+---+-------+--

DataFrame[index: int, age: int, sex: int, cp: int, trestbps: int, chol: int, fbs: int, restecg: int, thalach: int, exang: int, oldpeak: double, slope: int, ca: int, thal: int, label: int, features_scaled1: vector, features_scaled: vector, sex_ohe: vector, cp_ohe: vector, fbs_ohe: vector, restecg_ohe: vector, slp_ohe: vector, exng_ohe: vector, caa_ohe: vector, thall_ohe: vector, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [33]:
## Valida que el proceso de streaming está activo y después muestra el estado
sourceStream.isStreaming

True