Commit 6a23202 (1 parent: 9e6cc44)
Showing 30 changed files with 986 additions and 259 deletions.
@@ -1 +1 @@
-addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.7.2")
+addSbtPlugin("org.hammerlab" % "sbt-parent" % "2.0.0-SNAPSHOT")
@@ -0,0 +1,20 @@
package org.apache.spark.hadoop

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.executor.InputMetrics

object Util {
  def getFSBytesReadOnThreadCallback: Option[() => Long] =
    SparkHadoopUtil.get.getFSBytesReadOnThreadCallback()

  def setBytesRead(bytesRead: Long)(implicit inputMetrics: InputMetrics): Unit =
    inputMetrics.setBytesRead(bytesRead)

  def incRecordsRead(amount: Long = 1)(implicit inputMetrics: InputMetrics): Unit =
    inputMetrics.incRecordsRead(amount)

  def incBytesRead(amount: Long = 1)(implicit inputMetrics: InputMetrics): Unit =
    inputMetrics.incBytesRead(amount)

  val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS
}
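A minimal usage sketch (hypothetical, not part of this commit): with an InputMetrics in implicit scope, the helpers above update Spark's per-task input metrics. The function name recordProgress and the byte counts are invented for illustration.

import org.apache.spark.executor.InputMetrics
import org.apache.spark.hadoop.Util

def recordProgress(implicit metrics: InputMetrics): Unit = {
  Util.incRecordsRead()    // one more record consumed
  Util.incBytesRead(4096)  // add 4KB to the bytes-read counter
  // when filesystem statistics are available, refresh the absolute count
  Util.getFSBytesReadOnThreadCallback.foreach(read => Util.setBytesRead(read()))
}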
@@ -0,0 +1,20 @@
package org.hammerlab

import scala.collection.generic.CanBuildFrom
import scala.collection.mutable

package object collection {

  /**
   * [[CanBuildFrom]] instance for constructing [[Array]]s, not provided in standard library.
   */
  implicit def canBuildArray[From] =
    new CanBuildFrom[From, String, Array[String]] {
      override def apply(from: From): mutable.Builder[String, Array[String]] =
        mutable.ArrayBuilder.make[String]

      override def apply(): mutable.Builder[String, Array[String]] =
        mutable.ArrayBuilder.make[String]
    }

}
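A hedged illustration (not in this diff) of what the instance enables: `breakOut`-style building of an Array[String] from a non-Array source collection. Note the element type is fixed to String here.

import scala.collection.breakOut
import org.hammerlab.collection._

// build an Array[String] directly, without an intermediate List[String]
val strings: Array[String] = List(1, 2, 3).map(_.toString)(breakOut)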
@@ -0,0 +1,27 @@
package org.hammerlab.hadoop

import org.apache.hadoop.mapreduce.lib.input

/**
 * Case-class sugar over Hadoop [[input.FileSplit]]
 */
case class FileSplit(path: Path,
                     start: Long,
                     length: Long,
                     locations: Array[String])
  extends input.FileSplit {
  def end = start + length
}

object FileSplit {
  def apply(split: input.FileSplit): FileSplit =
    FileSplit(
      split.getPath,
      split.getStart,
      split.getLength,
      split.getLocations
    )

  implicit def conv(split: input.FileSplit): FileSplit =
    apply(split)
}
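A hypothetical sketch (names invented) showing the implicit conversion and the derived end offset:

import org.apache.hadoop.mapreduce.lib.input
import org.hammerlab.hadoop.FileSplit

// offset of the last byte covered by a Hadoop split
def lastByte(split: input.FileSplit): Long = {
  val wrapped: FileSplit = split  // via the implicit conv above
  wrapped.end - 1                 // end = start + length
}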
@@ -0,0 +1,77 @@
package org.hammerlab.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce
import org.apache.hadoop.mapreduce.lib.input
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat.{ SPLIT_MAXSIZE, setInputPaths }
import org.apache.hadoop.mapreduce.{ InputSplit, Job, TaskAttemptContext }

import scala.collection.JavaConverters._

case class MaxSplitSize(size: Long)

object MaxSplitSize {
  implicit def makeMaxSplitSize(size: Long): MaxSplitSize = MaxSplitSize(size)
  implicit def unmakeMaxSplitSize(size: MaxSplitSize): Long = size.size

  val DEFAULT_MAX_SPLIT_SIZE = 32 * 1024 * 1024L

  def apply(size: Option[Long] = None)(implicit conf: Configuration): MaxSplitSize =
    MaxSplitSize(
      size.getOrElse(
        conf.getLong(
          SPLIT_MAXSIZE,
          DEFAULT_MAX_SPLIT_SIZE
        )
      )
    )
}

object FileSplits {

  trait Config {
    def maxSplitSize: MaxSplitSize
  }

  object Config {
    def apply(maxSplitSize: Long): Config = ConfigImpl(maxSplitSize)
    def apply(maxSplitSize: Option[Long] = None)(implicit conf: Configuration): Config =
      ConfigImpl(
        MaxSplitSize(
          maxSplitSize
        )
      )

    implicit def default(implicit conf: Configuration) = apply()
  }

  private case class ConfigImpl(maxSplitSize: MaxSplitSize)
    extends Config

  def apply(path: Path,
            conf: Configuration)(
      implicit config: Config
  ): Seq[FileSplit] = {

    val job = Job.getInstance(conf, s"$path:file-splits")

    val jobConf = job.getConfiguration

    jobConf.setLong(SPLIT_MAXSIZE, config.maxSplitSize)

    setInputPaths(job, path)

    val fif =
      new input.FileInputFormat[Any, Any] {
        // Hadoop API requires us to have a stub here, though it is not used
        override def createRecordReader(split: InputSplit,
                                        context: TaskAttemptContext): mapreduce.RecordReader[Any, Any] =
          ???
      }

    fif
      .getSplits(job)
      .asScala
      .map(_.asInstanceOf[input.FileSplit]: FileSplit)
  }
}
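A hypothetical usage sketch, with an invented file path, computing splits capped at 16MB:

import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.hammerlab.hadoop.{ FileSplit, FileSplits, Path }

implicit val conf = new Configuration
implicit val config = FileSplits.Config(16 * 1024 * 1024L)  // maxSplitSize override

val splits: Seq[FileSplit] =
  FileSplits(Path(new URI("file:///tmp/data.txt")), conf)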
@@ -0,0 +1,14 @@
package org.hammerlab.hadoop

import java.net.URI

import org.apache.hadoop.fs

case class Path(uri: URI) {
  override def toString: String = uri.toString
}

object Path {
  implicit def fromHadoopPath(path: fs.Path): Path = Path(path.toUri)
  implicit def toHadoopPath(path: Path): fs.Path = new fs.Path(path.uri)
}
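A quick round-trip sketch (hypothetical path) via the two implicit conversions:

import org.apache.hadoop.fs
import org.hammerlab.hadoop.Path

val hadoopPath = new fs.Path("hdfs:///data/reads.bam")  // invented path
val path: Path = hadoopPath  // fromHadoopPath
val back: fs.Path = path     // toHadoopPath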
@@ -0,0 +1,10 @@
package org.hammerlab.hadoop

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.serializer.KryoRegistrator

object Registrar extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[SerializableConfiguration])
  }
}
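A minimal sketch (not in the commit) applying the registrations to a Kryo instance directly:

import com.esotericsoftware.kryo.Kryo
import org.hammerlab.hadoop.Registrar

val kryo = new Kryo
Registrar.registerClasses(kryo)  // SerializableConfiguration is now registered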
src/main/scala/org/hammerlab/hadoop/SerializableConfiguration.scala (32 additions, 0 deletions)
@@ -0,0 +1,32 @@
package org.hammerlab.hadoop

import java.io.{ ObjectInputStream, ObjectOutputStream }

import org.apache.hadoop.conf.Configuration
import org.apache.spark.broadcast.Broadcast

class SerializableConfiguration(@transient var value: Configuration)
  extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new Configuration(false)
    value.readFields(in)
  }
}

object SerializableConfiguration {
  implicit def unwrapSerializableConfiguration(conf: SerializableConfiguration): Configuration = conf.value
  implicit def unwrapSerializableConfigurationBroadcast(confBroadcast: Broadcast[SerializableConfiguration]): Configuration = confBroadcast.value.value

  def apply(conf: Configuration): SerializableConfiguration =
    new SerializableConfiguration(conf)

  implicit class ConfWrapper(val conf: Configuration) extends AnyVal {
    def serializable: SerializableConfiguration =
      SerializableConfiguration(conf)
  }
}
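A hypothetical sketch of the intended pattern: wrap a Configuration, broadcast it, and rely on the implicit unwrapping inside tasks. shipConf and the dummy map are invented.

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkContext
import org.hammerlab.hadoop.SerializableConfiguration._

def shipConf(sc: SparkContext, conf: Configuration) = {
  val confBroadcast = sc.broadcast(conf.serializable)  // ConfWrapper syntax
  sc.parallelize(1 to 4).map { i =>
    val c: Configuration = confBroadcast  // Broadcast unwraps implicitly
    i
  }
}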
...UnsplittableSequenceFileInputFormat.scala → ...UnsplittableSequenceFileInputFormat.scala (4 additions, 4 deletions)
src/main/scala/org/hammerlab/magic/rdd/keyed/FilterKeysRDD.scala (19 additions, 0 deletions)
@@ -0,0 +1,19 @@
package org.hammerlab.magic.rdd.keyed

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

case class FilterKeysRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) {
  def filterKeys(setBroadcast: Broadcast[Set[K]]): RDD[(K, V)] =
    rdd
      .filter {
        case (k, _) ⇒
          setBroadcast.value(k)
      }
}

object FilterKeysRDD {
  implicit def makeFilterKeysRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): FilterKeysRDD[K, V] = FilterKeysRDD(rdd)
}
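A usage sketch (hypothetical data) keeping only pairs whose keys appear in a broadcast set:

import org.apache.spark.SparkContext
import org.hammerlab.magic.rdd.keyed.FilterKeysRDD._

def keep(sc: SparkContext) = {
  val pairs = sc.parallelize(Seq("a" -> 1, "b" -> 2, "c" -> 3))
  val wanted = sc.broadcast(Set("a", "c"))
  pairs.filterKeys(wanted)  // drops ("b", 2)
}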
src/main/scala/org/hammerlab/magic/rdd/partitions/AppendEmptyPartitionRDD.scala (38 additions, 0 deletions)
@@ -0,0 +1,38 @@
package org.hammerlab.magic.rdd.partitions

import org.apache.spark.{ OneToOneDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

case class AppendEmptyPartitionRDD[T: ClassTag](rdd: RDD[T])
  extends RDD[T](
    rdd.sparkContext,
    Seq(
      new OneToOneDependency(rdd)
    )
  ) {

  /** [[RDD.partitions]] is transient, so we denormalize the number of partitions here */
  val num = rdd.getNumPartitions

  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    if (split.index < num)
      rdd.compute(split, context)
    else
      Iterator()

  override def getPartitions: Array[Partition] =
    rdd.partitions :+
      new Partition {
        override def index: Int = num
      }
}

case class AppendEmptyPartition[T: ClassTag](@transient rdd: RDD[T]) {
  def appendEmptyPartition: AppendEmptyPartitionRDD[T] = AppendEmptyPartitionRDD(rdd)
}

object AppendEmptyPartitionRDD {
  implicit def makeAppendEmptyPartitionRDD[T: ClassTag](rdd: RDD[T]): AppendEmptyPartition[T] = AppendEmptyPartition(rdd)
}
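A usage sketch (hypothetical) padding an RDD with one trailing empty partition:

import org.apache.spark.SparkContext
import org.hammerlab.magic.rdd.partitions.AppendEmptyPartitionRDD._

def pad(sc: SparkContext) = {
  val rdd = sc.parallelize(1 to 100, numSlices = 4)
  val padded = rdd.appendEmptyPartition
  assert(padded.getNumPartitions == 5)  // original 4, plus one empty
}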