Skip to content

Commit

Permalink
streaming example
Browse files Browse the repository at this point in the history
  • Loading branch information
youngbink committed Mar 21, 2018
1 parent 66ffc50 commit 595c057
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 0 deletions.
6 changes: 6 additions & 0 deletions pom.xml
Expand Up @@ -254,6 +254,12 @@
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>

</project>
@@ -0,0 +1,27 @@
/**
* Bespin: reference implementations of "big data" algorithms
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.streaming

import org.apache.spark.util.ManualClock

object ManualClockWrapper {
  /**
   * Advances the ManualClock that drives the given StreamingContext.
   *
   * This wrapper is declared in the org.apache.spark.streaming package so it
   * can reach `ssc.scheduler`, which is private[streaming] in Spark.
   *
   * @param ssc       streaming context whose clock is advanced
   * @param timeToAdd milliseconds to advance the clock by
   * @param sleepTime milliseconds to sleep afterwards (default 0), giving the
   *                  scheduler a chance to pick up the newly-due batch
   */
  def advanceManualClock(ssc: StreamingContext, timeToAdd: Long, sleepTime: Long = 0L): Unit = {
    // Cast is safe only when spark.streaming.clock is configured to ManualClock.
    val manualClock = ssc.scheduler.clock.asInstanceOf[ManualClock]
    manualClock.advance(timeToAdd)
    Thread.sleep(sleepTime)
  }
}
108 changes: 108 additions & 0 deletions src/main/scala/io/bespin/scala/spark/streaming/StreamingExample.scala
@@ -0,0 +1,108 @@
/**
* Bespin: reference implementations of "big data" algorithms
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.bespin.scala.spark.streaming

import java.io.File

import org.apache.log4j._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{ManualClockWrapper, Minutes, StreamingContext}
import org.apache.spark.util.LongAccumulator
import org.rogach.scallop._

import scala.collection.mutable

/**
 * Command-line options for StreamingExample: input, checkpoint, and output
 * paths, all required.
 *
 * @param args raw command-line arguments
 */
class StreamingExampleConf(args: Seq[String]) extends ScallopConf(args) {
  val input = opt[String](descr = "input path", required = true)
  val checkpoint = opt[String](descr = "checkpoint path", required = true)
  val output = opt[String](descr = "output path", required = true)
  // Assign mainOptions AFTER the opts above: vals in a Scala class body are
  // initialized in declaration order, so referencing them before they are
  // defined would populate this Seq with nulls and break the help listing.
  mainOptions = Seq(input, checkpoint, output)
  verify()
}

object StreamingExample {
  // Logger named after this object's class.
  val log = Logger.getLogger(getClass().getName())

  /**
   * Entry point: replays pre-partitioned input files as a simulated stream,
   * counts all records in non-overlapping 60-minute windows, and saves the
   * per-window counts to the output path.
   *
   * Stream time is driven by Spark's ManualClock (see the
   * spark.streaming.clock config below), so the whole run proceeds as fast as
   * the batches can be processed rather than in wall-clock time.
   *
   * @param argv command-line arguments; see StreamingExampleConf
   */
  def main(argv: Array[String]): Unit = {
    val args = new StreamingExampleConf(argv)

    log.info("Input: " + args.input())

    val spark = SparkSession
      .builder()
      // Use a manually-advanced clock instead of wall-clock time so the loop
      // below controls exactly when each batch fires.
      .config("spark.streaming.clock", "org.apache.spark.util.ManualClock")
      .appName("StreamingExample")
      .getOrCreate()

    // Counts windowed RDDs that have finished processing; consulted below to
    // decide when it is safe to stop the streaming context.
    val numCompletedRDDs = spark.sparkContext.longAccumulator("number of completed RDDs")

    val batchDuration = Minutes(1)
    val ssc = new StreamingContext(spark.sparkContext, batchDuration)

    // One RDD per input "part-" file, in filename order; each is enqueued as
    // one batch of the mock stream.
    val rdds = buildMockStream(ssc.sparkContext, args.input())
    val inputData: mutable.Queue[RDD[String]] = mutable.Queue()
    val stream = ssc.queueStream(inputData)

    // Count every record under a single key ("all") over a 60-minute window
    // sliding every 60 minutes, i.e. tumbling hourly counts. The inverse
    // function (x - y) lets Spark update windows incrementally, which
    // requires checkpointing (enabled below). Note the split result is
    // discarded by the following map; only record counts matter here.
    val wc = stream.map(_.split(","))
      .map(tuple => ("all", 1))
      .reduceByKeyAndWindow(
        (x: Int, y: Int) => x + y, (x: Int, y: Int) => x - y, Minutes(60), Minutes(60))
      .persist()

    wc.saveAsTextFiles(args.output())

    // Track progress: the accumulator advances once per emitted window RDD.
    wc.foreachRDD(rdd => {
      numCompletedRDDs.add(1L)
    })
    ssc.checkpoint(args.checkpoint())
    ssc.start()

    // Enqueue each batch and advance the manual clock by one batch interval
    // so the scheduler fires the corresponding batch.
    for (rdd <- rdds) {
      inputData += rdd
      ManualClockWrapper.advanceManualClock(ssc, batchDuration.milliseconds)
      // Sleep a bit to make sure the batch is processed.
      while (inputData.length > 100) {
        Thread.sleep(50L)
      }
    }

    // Block until 24 windows have completed — presumably a full day of hourly
    // windows; confirm against the input data — then stop the context.
    waitForAccumulator(numCompletedRDDs, 24) { () =>
      ssc.stop()
    }
  }

  /**
   * Polls the accumulator (50 ms interval) until it reaches target, then runs
   * the cleanup callback.
   *
   * @param accum       accumulator to watch
   * @param target      value at which to stop waiting
   * @param cleanUpFunc callback invoked once the target is reached
   */
  def waitForAccumulator(accum: LongAccumulator, target: Long)(cleanUpFunc: () => Unit): Unit = {
    while (accum.value < target) {
      Thread.sleep(50L)
    }
    cleanUpFunc()
  }

  /**
   * Loads each "part-" file under directoryName as its own RDD, in sorted
   * filename order, so the caller can replay them as a stream of batches.
   *
   * @param sc            Spark context used to read the files
   * @param directoryName directory containing part-* files
   * @return one RDD per part file, ordered by filename
   * @throws IllegalArgumentException if directoryName is not an existing directory
   */
  def buildMockStream(sc: SparkContext, directoryName: String): Array[RDD[String]] = {
    val d = new File(directoryName)
    if (d.exists() && d.isDirectory) {
      d.listFiles
        .filter(file => file.isFile && file.getName.startsWith("part-"))
        .map(file => d.getAbsolutePath + "/" + file.getName).sorted
        .map(path => sc.textFile(path))
    } else {
      throw new IllegalArgumentException(s"$directoryName is not a valid directory containing part files!")
    }
  }
}

0 comments on commit 595c057

Please sign in to comment.