In [0]:
#Explore Test Data and Run Saved Logistics Regression Model to predict outbreak of flu

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.streaming.StreamingQueryException
import org.apache.spark.sql.types.StructType

val tweetSchema = new StructType()
                .add("tweetId", "string")
                .add("tweetText", "string")
                .add("location", "string")
                .add("timestamp", "string");

val spark = SparkSession
		.builder()
		.appName("StreamHandler")
		.config("spark.master", "local")
		.getOrCreate();
            
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{unix_timestamp, to_date}

val realTweets = "/home/opt/data/test/"

val tweetStream = spark.readStream
		       .schema(tweetSchema)
		       .option("maxFilesPerTrigger", 1)
		       .json(realTweets)
		       .select($"location", to_date(unix_timestamp($"timestamp", "EEE MMM dd HH:mm:ss Z yyyy").cast("timestamp")).as("timestamp"))
 
 val streamingCountsDF = 
  tweetStream
    .groupBy($"location", $"timestamp" , window($"timestamp", "1 hour"))
    .count()
    
streamingCountsDF.isStreaming

spark.conf.set("spark.sql.shuffle.partitions", "1") 

import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.types._

val query =
  streamingCountsDF
    .writeStream
    .format("memory")     
    .trigger(ProcessingTime("10 seconds"))
    .queryName("testTable2")     
    .outputMode("complete") 
    .start()

%sql
select location, weekofyear(window.end) as Week, AVG(count) 
as weekly_avg from testTable2 where count > 0 group by location,  weekofyear(window.end) 
having count(*) > 0 order by weekly_avg DESC


%sql
select location, date_format(window.end, "dd-MM-YYYY") as time, AVG(count) 
as count from testTable2 where count > 0 group by location,  date_format(window.end, "dd-MM-YYYY") 
having count(*) > 0 order by count DESC


%sql
select weekofyear(window.end) as Week from testTable2 where count > 0 group by  weekofyear(timestamp)

// create the features for test data and run the prediction
import org.apache.spark.sql.functions.udf
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.classification.BinaryLogisticRegressionSummary
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.ml.{Pipeline, PipelineModel}

////
val locationRankDF = sql("select row_number() over (order by location) as locRank, location as locName from testTable2  GROUP BY location")
locationRankDF.createOrReplaceTempView("locationRankMap")
println("Location Rank")

//
def calcLabel: (Double => Double) = (arg: Double) => {if (arg > 2.5) 1.0 else 0.0 }

// TESTING DATA SET

val testDf1 = sql(s"""
SELECT locRank, avg(count) as weeklyAvg, weekofyear(window.end) as week
  FROM testTable2
  JOIN locationRankMap ON location = locName
  GROUP BY weekofyear(window.end), locRank
  HAVING weeklyAvg > 0
  order by weeklyAvg DESC
  """)
  
//df1.show

val testDf2 = testDf1.select($"locRank".cast("Double"), $"weeklyAvg".cast("Double"), $"week".cast("Double"))
val flulabel = udf(calcLabel)

val testDf3 = testDf2.withColumn("class", flulabel(testDf2("weeklyAvg")))
val assembler = new VectorAssembler()
  .setInputCols(Array("locRank", "weeklyAvg","week"))
  .setOutputCol("features")

val testDf4 = assembler.transform(testDf3)

val labelIndexer = new StringIndexer().setInputCol("class").setOutputCol("label")
val testDf5 = labelIndexer.fit(testDf4).transform(testDf4)

val splitSeed = 5043
val Array(holdoutData2, testData) = testDf5.randomSplit(Array(0.0, 1.0), splitSeed)

testData.show

//////////////////////////////// ///////////////////////////////////// ////////////////////////////////////////

val model = PipelineModel.load("/home/opt/models/lr-model4")

//println(s"Coefficients: ${model.coefficients} Intercept: ${model.intercept}")

val predictions = model.transform(testData)

print("$$$$$$ predictions size >>> "+predictions.count())

predictions.show

val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("rawPrediction").setMetricName("areaUnderROC")
val accuracy = evaluator.evaluate(predictions)

val lp = predictions.select( "label", "prediction")
val counttotal = predictions.count()
val correct = lp.filter($"label" === $"prediction").count()
val wrong = lp.filter(not($"label" === $"prediction")).count()
val truep = lp.filter($"prediction" === 0.0).filter($"label" === $"prediction").count()
val falseN = lp.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count()
val falseP = lp.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count()
val ratioWrong=wrong.toDouble/counttotal.toDouble
val ratioCorrect=correct.toDouble/counttotal.toDouble

val  predictionAndLabels =predictions.select("rawPrediction", "label").rdd.map(x => (x(0).asInstanceOf[DenseVector](1), x(1).asInstanceOf[Double]))
val metrics = new BinaryClassificationMetrics(predictionAndLabels)
println("area under the precision-recall curve: " + metrics.areaUnderPR)
println("area under the receiver operating characteristic (ROC) curve : " + metrics.areaUnderROC)


//accuracy: Double = 0.941260162601624
//lp: org.apache.spark.sql.DataFrame = [label: double, prediction: double]
//counttotal: Long = 229
//correct: Long = 220
//wrong: Long = 9
//truep: Long = 203
//falseN: Long = 7
//falseP: Long = 2
//ratioWrong: Double = 0.039301310043668124
//ratioCorrect: Double = 0.9606986899563319

//area under the precision-recall curve: 0.8665970252411103
//area under the (ROC) curve : 0.941260162601624