Skip to content

Commit

Permalink
streaming example
Browse files Browse the repository at this point in the history
  • Loading branch information
youngbink committed Mar 21, 2018
1 parent 66ffc50 commit 595c057
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 0 deletions.
6 changes: 6 additions & 0 deletions pom.xml
Expand Up @@ -254,6 +254,12 @@
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

</project>
@@ -0,0 +1,27 @@
/**
* Bespin: reference implementations of "big data" algorithms
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming

import org.apache.spark.util.ManualClock

object ManualClockWrapper {
  /**
   * Advances the ManualClock that drives the given StreamingContext.
   *
   * This wrapper is declared in the org.apache.spark.streaming package so it
   * can reach `ssc.scheduler`, which is private[streaming] in Spark.
   *
   * @param ssc       streaming context whose clock is advanced
   * @param timeToAdd milliseconds to advance the clock by
   * @param sleepTime milliseconds to sleep afterwards (default 0), giving the
   *                  scheduler a chance to pick up the newly-due batch
   */
  def advanceManualClock(ssc: StreamingContext, timeToAdd: Long, sleepTime: Long = 0L): Unit = {
    // Cast is safe only when spark.streaming.clock is configured to ManualClock.
    val manualClock = ssc.scheduler.clock.asInstanceOf[ManualClock]
    manualClock.advance(timeToAdd)
    Thread.sleep(sleepTime)
  }
}
108 changes: 108 additions & 0 deletions src/main/scala/io/bespin/scala/spark/streaming/StreamingExample.scala
@@ -0,0 +1,108 @@
/**
* Bespin: reference implementations of "big data" algorithms
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.bespin.scala.spark.streaming

import java.io.File

import org.apache.log4j._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{ManualClockWrapper, Minutes, StreamingContext}
import org.apache.spark.util.LongAccumulator
import org.rogach.scallop._

import scala.collection.mutable

/**
 * Command-line options for StreamingExample: input, checkpoint, and output
 * paths, all required.
 *
 * @param args raw command-line arguments
 */
class StreamingExampleConf(args: Seq[String]) extends ScallopConf(args) {
  val input = opt[String](descr = "input path", required = true)
  val checkpoint = opt[String](descr = "checkpoint path", required = true)
  val output = opt[String](descr = "output path", required = true)
  // Assign mainOptions AFTER the opts above: vals in a Scala class body are
  // initialized in declaration order, so referencing them before they are
  // defined would populate this Seq with nulls and break the help listing.
  mainOptions = Seq(input, checkpoint, output)
  verify()
}

object StreamingExample {
  // Logger named after this object's class.
  val log = Logger.getLogger(getClass().getName())

  /**
   * Entry point: replays pre-partitioned input files as a simulated stream,
   * counts all records in non-overlapping 60-minute windows, and saves the
   * per-window counts to the output path.
   *
   * Stream time is driven by Spark's ManualClock (see the
   * spark.streaming.clock config below), so the whole run proceeds as fast as
   * the batches can be processed rather than in wall-clock time.
   *
   * @param argv command-line arguments; see StreamingExampleConf
   */
  def main(argv: Array[String]): Unit = {
    val args = new StreamingExampleConf(argv)

    log.info("Input: " + args.input())

    val spark = SparkSession
      .builder()
      // Use a manually-advanced clock instead of wall-clock time so the loop
      // below controls exactly when each batch fires.
      .config("spark.streaming.clock", "org.apache.spark.util.ManualClock")
      .appName("StreamingExample")
      .getOrCreate()

    // Counts windowed RDDs that have finished processing; consulted below to
    // decide when it is safe to stop the streaming context.
    val numCompletedRDDs = spark.sparkContext.longAccumulator("number of completed RDDs")

    val batchDuration = Minutes(1)
    val ssc = new StreamingContext(spark.sparkContext, batchDuration)

    // One RDD per input "part-" file, in filename order; each is enqueued as
    // one batch of the mock stream.
    val rdds = buildMockStream(ssc.sparkContext, args.input())
    val inputData: mutable.Queue[RDD[String]] = mutable.Queue()
    val stream = ssc.queueStream(inputData)

    // Count every record under a single key ("all") over a 60-minute window
    // sliding every 60 minutes, i.e. tumbling hourly counts. The inverse
    // function (x - y) lets Spark update windows incrementally, which
    // requires checkpointing (enabled below). Note the split result is
    // discarded by the following map; only record counts matter here.
    val wc = stream.map(_.split(","))
      .map(tuple => ("all", 1))
      .reduceByKeyAndWindow(
        (x: Int, y: Int) => x + y, (x: Int, y: Int) => x - y, Minutes(60), Minutes(60))
      .persist()

    wc.saveAsTextFiles(args.output())

    // Track progress: the accumulator advances once per emitted window RDD.
    wc.foreachRDD(rdd => {
      numCompletedRDDs.add(1L)
    })
    ssc.checkpoint(args.checkpoint())
    ssc.start()

    // Enqueue each batch and advance the manual clock by one batch interval
    // so the scheduler fires the corresponding batch.
    for (rdd <- rdds) {
      inputData += rdd
      ManualClockWrapper.advanceManualClock(ssc, batchDuration.milliseconds)
      // Sleep a bit to make sure the batch is processed.
      while (inputData.length > 100) {
        Thread.sleep(50L)
      }
    }

    // Block until 24 windows have completed — presumably a full day of hourly
    // windows; confirm against the input data — then stop the context.
    waitForAccumulator(numCompletedRDDs, 24) { () =>
      ssc.stop()
    }
  }

  /**
   * Polls the accumulator (50 ms interval) until it reaches target, then runs
   * the cleanup callback.
   *
   * @param accum       accumulator to watch
   * @param target      value at which to stop waiting
   * @param cleanUpFunc callback invoked once the target is reached
   */
  def waitForAccumulator(accum: LongAccumulator, target: Long)(cleanUpFunc: () => Unit): Unit = {
    while (accum.value < target) {
      Thread.sleep(50L)
    }
    cleanUpFunc()
  }

  /**
   * Loads each "part-" file under directoryName as its own RDD, in sorted
   * filename order, so the caller can replay them as a stream of batches.
   *
   * @param sc            Spark context used to read the files
   * @param directoryName directory containing part-* files
   * @return one RDD per part file, ordered by filename
   * @throws IllegalArgumentException if directoryName is not an existing directory
   */
  def buildMockStream(sc: SparkContext, directoryName: String): Array[RDD[String]] = {
    val d = new File(directoryName)
    if (d.exists() && d.isDirectory) {
      d.listFiles
        .filter(file => file.isFile && file.getName.startsWith("part-"))
        .map(file => d.getAbsolutePath + "/" + file.getName).sorted
        .map(path => sc.textFile(path))
    } else {
      throw new IllegalArgumentException(s"$directoryName is not a valid directory containing part files!")
    }
  }
}

0 comments on commit 595c057

Please sign in to comment.