In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructType
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as f
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml.pipeline import PipelineModel

In [2]:
# Import only the type classes used in this cell instead of a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import DoubleType, StructField, StructType

spark = SparkSession.builder.appName("Mlseverity").getOrCreate()

# Columns of the incoming accident CSV files, in schema order.
# NOTE(review): the order presumably mirrors the column layout of the CSV
# files dropped into the input directory -- confirm before reordering.
ACCIDENT_COLUMNS = [
    "TMC",
    "Severity",
    "Start_Lat",
    "Start_Lng",
    "Distance(mi)",
    "Temperature(F)",
    "Wind_Chill(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Duration",
    "Side",
    "City",
    "County",
    "State",
    "Wind_Direction",
    "Weather_Condition",
    "Amenity",
    "Bump",
    "Crossing",
    "Give_Way",
    "Junction",
    "No_Exit",
    "Railway",
    "Roundabout",
    "Station",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Turning_Loop",
    "Civil_Twilight",
]

# Every column is declared as a nullable double.
accidentsSchema = StructType(
    [StructField(column, DoubleType(), True) for column in ACCIDENT_COLUMNS]
)

# Streaming source: CSV files with a header row, read from the "tmp" directory.
# The separator option is spelled "sep"; the previous key "delimeter" was a
# typo, so the explicit separator setting was never actually applied.
dfCSV = (
    spark.readStream
    .option("sep", ",")
    .option("header", True)
    .schema(accidentsSchema)
    .csv("tmp")
)

In [3]:
# Input columns for the model: every schema column except the target "Severity".
features = [
    "TMC",
    "Start_Lat",
    "Start_Lng",
    "Distance(mi)",
    "Temperature(F)",
    "Wind_Chill(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
    "Wind_Speed(mph)",
    "Precipitation(in)",
    "Duration",
    "Side",
    "City",
    "County",
    "State",
    "Wind_Direction",
    "Weather_Condition",
    "Amenity",
    "Bump",
    "Crossing",
    "Give_Way",
    "Junction",
    "No_Exit",
    "Railway",
    "Roundabout",
    "Station",
    "Stop",
    "Traffic_Calming",
    "Traffic_Signal",
    "Turning_Loop",
    "Civil_Twilight",
]

# Pack the input columns into a single vector column named "features".
# NOTE(review): the source columns are declared nullable; confirm the incoming
# data contains no nulls, since the assembler is used with its default settings.
vectorAssembler = VectorAssembler(inputCols=features, outputCol="features")

# Keep only the assembled vector and the target, exposing the target as "label".
v_df = (
    vectorAssembler.transform(dfCSV)
    .select(["features", "Severity"])
    .withColumnRenamed("Severity", "label")
)

In [4]:
# Location of the previously saved logistic-regression model (relative path).
MODEL_PATH = "LogisticRegression"

# Load the saved model so it can score the streaming feature vectors.
persistedModel = LogisticRegressionModel.load(MODEL_PATH)

In [5]:
# Apply the loaded model to the assembled streaming frame. The result is
# queried below through its `prediction` column.
predictions = persistedModel.transform(v_df)

In [None]:
# Expose the scored stream to Spark SQL under the view name "df".
predictions.createOrReplaceTempView("df")

# Running count of rows per predicted severity class.
# The output column alias was previously misspelled as "Severiry".
prueba = spark.sql(
    "SELECT prediction AS Severity, count(*) AS N FROM df GROUP BY prediction"
)

# "complete" output mode re-emits the whole aggregated table to the console
# on every trigger.
query = prueba.writeStream.outputMode("complete").format("console").start()

try:
    # Block this cell while the streaming query is running.
    query.awaitTermination()
finally:
    # Always stop the query when the wait ends (normal termination, an error,
    # or a kernel interrupt) so it does not keep running in the background.
    query.stop()