From 7db125286e7300fef0a59403ef48afcedd51fd3f Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 10 Apr 2017 02:43:21 +0000 Subject: [PATCH] improve LociSet Java serde --- build.sbt | 2 +- project/plugins.sbt | 2 +- .../hammerlab/genomics/loci/map/Builder.scala | 4 +- .../hammerlab/genomics/loci/set/Contig.scala | 72 ++++++++----------- .../genomics/loci/set/ContigSerializer.scala | 20 +++--- .../hammerlab/genomics/loci/set/LociSet.scala | 41 ++++++----- .../loci/set/SerializableContig.scala | 54 ++++++++++++++ .../genomics/loci/set/SerializerSuite.scala | 4 +- .../genomics/loci/set/test/LociSetUtil.scala | 2 +- 9 files changed, 121 insertions(+), 80 deletions(-) create mode 100644 src/main/scala/org/hammerlab/genomics/loci/set/SerializableContig.scala diff --git a/build.sbt b/build.sbt index 3ba3ec0..f724e70 100644 --- a/build.sbt +++ b/build.sbt @@ -1,7 +1,7 @@ organization := "org.hammerlab.genomics" name := "loci" -version := "1.5.7" +version := "1.5.8-SNAPSHOT" addSparkDeps diff --git a/project/plugins.sbt b/project/plugins.sbt index 14c0989..cf38390 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1 +1 @@ -addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.7.5") +addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.7.7-SNAPSHOT") diff --git a/src/main/scala/org/hammerlab/genomics/loci/map/Builder.scala b/src/main/scala/org/hammerlab/genomics/loci/map/Builder.scala index 7f9b528..b23a27e 100644 --- a/src/main/scala/org/hammerlab/genomics/loci/map/Builder.scala +++ b/src/main/scala/org/hammerlab/genomics/loci/map/Builder.scala @@ -24,8 +24,8 @@ private[loci] class Builder[T] { /** Set the value for all loci in the given LociSet to the specified value in the LociMap under construction. */ def put(loci: LociSet, value: T): Builder[T] = { for { - contig <- loci.contigs - Interval(start, end) <- contig.ranges + contig ← loci.contigs + Interval(start, end) ← contig.ranges } { put(contig.name, start, end, value) } diff --git a/src/main/scala/org/hammerlab/genomics/loci/set/Contig.scala b/src/main/scala/org/hammerlab/genomics/loci/set/Contig.scala index fda23f4..f1312ac 100644 --- a/src/main/scala/org/hammerlab/genomics/loci/set/Contig.scala +++ b/src/main/scala/org/hammerlab/genomics/loci/set/Contig.scala @@ -1,60 +1,45 @@ package org.hammerlab.genomics.loci.set -import java.io.{ ObjectInputStream, ObjectOutputStream } - +import com.google.common.collect.Range.closedOpen import com.google.common.collect.{ RangeSet, TreeRangeSet, Range ⇒ JRange } import org.hammerlab.genomics.loci.iterator.LociIterator -import org.hammerlab.genomics.reference.Interval.orderByStart +import org.hammerlab.genomics.reference.ContigName.Factory import org.hammerlab.genomics.reference.{ ContigName, Interval, Locus, NumLoci, Region } import org.hammerlab.strings.TruncatedToString -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer /** * A set of loci on a contig, stored/manipulated as loci ranges. */ -case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) extends TruncatedToString { +case class Contig(name: ContigName, + rangeSet: RangeSet[Locus])(implicit factory: Factory) + extends TruncatedToString { import Contig.lociRange - private def readObject(in: ObjectInputStream): Unit = { - name = in.readUTF() - val num = in.readInt() - rangeSet = TreeRangeSet.create[Locus]() - for { - i <- 0 until num - } { - val start = Locus(in.readLong()) - val end = Locus(in.readLong()) - val range = lociRange(start, end) - rangeSet.add(range) - } - } - - private def writeObject(out: ObjectOutputStream): Unit = { - out.writeUTF(name.name) - out.writeInt(ranges.length) - for { - Interval(start, end) <- ranges - } { - out.writeLong(start.locus) - out.writeLong(end.locus) - } - } + /** + * [[RangeSet]] is not [[Serializable]], and we delegate work-around serialization logic to [[SerializableContig]] to + * avoid the requirement for a 0-arg constructor here. + */ + protected def writeReplace: Object = + SerializableContig(this) /** Is the given locus contained in this set? */ def contains(locus: Locus): Boolean = rangeSet.contains(locus) + @transient private lazy val jranges = rangeSet.asRanges() + @transient lazy val numRanges = jranges.size() + /** This set as a regular scala array of ranges. */ - lazy val ranges: Vector[Interval] = - rangeSet - .asRanges() + def ranges: Iterator[Interval] = + jranges + .iterator() + .asScala .map(range => Interval(range.lowerEndpoint(), range.upperEndpoint())) - .toVector - .sorted(orderByStart) - def regions: Iterator[Region] = ranges.iterator.map(range ⇒ Region(name, range)) + def regions: Iterator[Region] = ranges.map(range ⇒ Region(name, range)) /** Is this contig empty? */ def isEmpty: Boolean = rangeSet.isEmpty @@ -62,10 +47,10 @@ case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) e def nonEmpty: Boolean = !isEmpty /** Iterator through loci on this contig, sorted. */ - def iterator = new LociIterator(ranges.iterator.buffered) + def iterator = new LociIterator(ranges.buffered) /** Number of loci on this contig. */ - def count: NumLoci = ranges.map(_.length).sum + @transient lazy val count: NumLoci = ranges.map(_.length).sum /** Returns whether a given genomic region overlaps with any loci on this contig. */ def intersects(start: Locus, end: Locus): Boolean = !rangeSet.subRangeSet(lociRange(start, end)).isEmpty @@ -82,7 +67,7 @@ case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) e var remaining = numToTake var doneTaking = false for { - range <- ranges + range ← ranges } { if (doneTaking) { secondRanges.append(range) @@ -108,17 +93,18 @@ case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) e object Contig { // Empty-contig constructor, for convenience. - def apply(name: ContigName): Contig = Contig(name, TreeRangeSet.create[Locus]()) + def apply(name: ContigName)(implicit f: Factory): Contig = Contig(name, TreeRangeSet.create[Locus]()) // Constructors that make a Contig from its name and some ranges. - def apply(tuple: (ContigName, Iterable[Interval])): Contig = Contig(tuple._1, tuple._2) - def apply(name: ContigName, ranges: Iterable[Interval]): Contig = + def apply(tuple: (ContigName, Iterable[Interval]))(implicit f: Factory): Contig = Contig(tuple._1, tuple._2) + def apply(name: ContigName, ranges: Iterable[Interval])(implicit f: Factory): Contig = apply(name, ranges.iterator) + def apply(name: ContigName, ranges: Iterator[Interval])(implicit f: Factory): Contig = Contig( name, { val rangeSet = TreeRangeSet.create[Locus]() for { - Interval(start, end) <- ranges + Interval(start, end) ← ranges } { rangeSet.add(lociRange(start, end)) } @@ -126,5 +112,5 @@ object Contig { } ) - def lociRange(start: Locus, end: Locus): JRange[Locus] = JRange.closedOpen[Locus](start, end) + def lociRange(start: Locus, end: Locus): JRange[Locus] = closedOpen[Locus](start, end) } diff --git a/src/main/scala/org/hammerlab/genomics/loci/set/ContigSerializer.scala b/src/main/scala/org/hammerlab/genomics/loci/set/ContigSerializer.scala index 2f5c230..ad1309e 100644 --- a/src/main/scala/org/hammerlab/genomics/loci/set/ContigSerializer.scala +++ b/src/main/scala/org/hammerlab/genomics/loci/set/ContigSerializer.scala @@ -11,7 +11,7 @@ class ContigSerializer extends KryoSerializer[Contig] { def write(kryo: Kryo, output: Output, obj: Contig) = { kryo.writeObject(output, obj.name) - output.writeInt(obj.ranges.length) + output.writeInt(obj.numRanges) for { Interval(start, end) ← obj.ranges } { @@ -22,16 +22,18 @@ class ContigSerializer extends KryoSerializer[Contig] { def read(kryo: Kryo, input: Input, klass: Class[Contig]): Contig = { val name = kryo.readObject(input, classOf[ContigName]) - val length = input.readInt() + val numRanges = input.readInt() val treeRangeSet = TreeRangeSet.create[Locus]() - val ranges = (0 until length).foreach { _ ⇒ - treeRangeSet.add( - closedOpen( - Locus(input.readLong()), - Locus(input.readLong()) + val ranges = + (0 until numRanges).foreach { _ ⇒ + treeRangeSet.add( + closedOpen( + Locus(input.readLong()), + Locus(input.readLong()) + ) ) - ) - } + } + Contig(name, treeRangeSet) } } diff --git a/src/main/scala/org/hammerlab/genomics/loci/set/LociSet.scala b/src/main/scala/org/hammerlab/genomics/loci/set/LociSet.scala index 33a9c99..79f36f6 100644 --- a/src/main/scala/org/hammerlab/genomics/loci/set/LociSet.scala +++ b/src/main/scala/org/hammerlab/genomics/loci/set/LociSet.scala @@ -1,8 +1,9 @@ package org.hammerlab.genomics.loci.set -import htsjdk.samtools.util.{Interval => HTSJDKInterval} -import org.hammerlab.genomics.loci.parsing.{All, LociRange, LociRanges, ParsedLoci} -import org.hammerlab.genomics.reference.{ContigLengths, ContigName, Interval, Locus, NumLoci, Region} +import htsjdk.samtools.util.{ Interval ⇒ HTSJDKInterval } +import org.hammerlab.genomics.loci.parsing.{ All, LociRange, LociRanges, ParsedLoci } +import org.hammerlab.genomics.reference.ContigName.Factory +import org.hammerlab.genomics.reference.{ ContigLengths, ContigName, Interval, Locus, NumLoci, Region } import org.hammerlab.strings.TruncatedToString import org.scalautils.ConversionCheckedTripleEquals._ @@ -20,7 +21,8 @@ import scala.collection.immutable.TreeMap * * @param map A map from contig-name to Contig, which is a set or genomic intervals as described above. */ -case class LociSet(private val map: SortedMap[ContigName, Contig]) extends TruncatedToString { +case class LociSet(private val map: SortedMap[ContigName, Contig]) + extends TruncatedToString { /** The contigs included in this LociSet with a nonempty set of loci. */ @transient lazy val contigs = map.values.toArray @@ -97,7 +99,7 @@ case class LociSet(private val map: SortedMap[ContigName, Contig]) extends Trunc .ranges // We add 1 to the start to move to 1-based coordinates // Since the `Interval` end is inclusive, we are adding and subtracting 1, no-op - .map(interval => + .map(interval ⇒ new HTSJDKInterval( contig.name, interval.start.locus.toInt + 1, @@ -114,9 +116,6 @@ object LociSet { def all(contigLengths: ContigLengths) = LociSet(All, contigLengths) - // NOTE(ryan): only used in tests; TODO: move to test-specific helper. -// def apply(lociStr: String): LociSet = ParsedLoci(lociStr).result - /** * These constructors build a LociSet directly from Contigs. * @@ -128,48 +127,48 @@ object LociSet { TreeMap( contigs .filterNot(_.isEmpty) - .map(contig => contig.name -> contig) + .map(contig ⇒ contig.name → contig) .toSeq: _* ) ) - def apply(regions: Iterable[Region]): LociSet = + def apply(regions: Iterable[Region])(implicit f: Factory): LociSet = LociSet.fromContigs( (for { - Region(contigName, start, end) <- regions + Region(contigName, start, end) ← regions if start != end range = Interval(start, end) } yield - contigName -> range + contigName → range ) .groupBy(_._1) .mapValues(_.map(_._2)) .map(Contig(_)) ) - def apply(loci: ParsedLoci, contigLengths: ContigLengths): LociSet = + def apply(loci: ParsedLoci, contigLengths: ContigLengths)(implicit f: Factory): LociSet = LociSet( loci match { - case All => + case All ⇒ for { - (contig, length) <- contigLengths + (contig, length) ← contigLengths } yield Region(contig, Locus(0), Locus(length)) - case LociRanges(ranges) => + case LociRanges(ranges) ⇒ for { - LociRange(contigName, start, endOpt) <- ranges + LociRange(contigName, start, endOpt) ← ranges contigLengthOpt = contigLengths.get(contigName) } yield (endOpt, contigLengthOpt) match { - case (Some(end), Some(contigLength)) if end > Locus(contigLength) => + case (Some(end), Some(contigLength)) if end > Locus(contigLength) ⇒ throw new IllegalArgumentException( s"Invalid range $start-${endOpt.get} for contig '$contigName' which has length $contigLength" ) - case (Some(end), _) => + case (Some(end), _) ⇒ Region(contigName, start, end) - case (_, Some(contigLength)) => + case (_, Some(contigLength)) ⇒ Region(contigName, start, Locus(contigLength)) - case _ => + case _ ⇒ throw new IllegalArgumentException( s"No such contig: $contigName. Valid contigs: ${contigLengths.keys.mkString(", ")}" ) diff --git a/src/main/scala/org/hammerlab/genomics/loci/set/SerializableContig.scala b/src/main/scala/org/hammerlab/genomics/loci/set/SerializableContig.scala new file mode 100644 index 0000000..f7c0c9c --- /dev/null +++ b/src/main/scala/org/hammerlab/genomics/loci/set/SerializableContig.scala @@ -0,0 +1,54 @@ +package org.hammerlab.genomics.loci.set + +import java.io.{ ObjectInputStream, ObjectOutputStream } + +import org.hammerlab.genomics.reference.ContigName.Factory +import org.hammerlab.genomics.reference.{ ContigName, Interval, Locus } + +private class SerializableContig + extends Serializable { + var contigName: ContigName = _ + var numRanges: Int = 0 + var ranges: Iterator[Interval] = _ + implicit var factory: Factory = _ + + private def writeObject(out: ObjectOutputStream): Unit = { + out.writeObject(factory) + out.writeUTF(contigName.name) + out.writeInt(numRanges) + for { + Interval(start, end) ← ranges + } { + out.writeLong(start.locus) + out.writeLong(end.locus) + } + } + + private def readObject(in: ObjectInputStream): Unit = { + factory = in.readObject().asInstanceOf[Factory] + contigName = in.readUTF() + numRanges = in.readInt() + ranges = + (0 until numRanges) + .map { _ ⇒ + val start = Locus(in.readLong()) + val end = Locus(in.readLong()) + Interval(start, end) + } + .iterator + } + + protected def readResolve: Object = + Contig(contigName, ranges) +} + +private object SerializableContig { + def apply(contig: Contig)(implicit factory: Factory): SerializableContig = { + val sc = new SerializableContig + sc.contigName = contig.name + sc.numRanges = contig.numRanges + sc.ranges = contig.ranges + sc.factory = factory + sc + } +} diff --git a/src/test/scala/org/hammerlab/genomics/loci/set/SerializerSuite.scala b/src/test/scala/org/hammerlab/genomics/loci/set/SerializerSuite.scala index e40ff79..842917f 100644 --- a/src/test/scala/org/hammerlab/genomics/loci/set/SerializerSuite.scala +++ b/src/test/scala/org/hammerlab/genomics/loci/set/SerializerSuite.scala @@ -5,9 +5,9 @@ import java.io.{ ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, import org.apache.spark.broadcast.Broadcast import org.hammerlab.genomics.loci.set.test.LociSetUtil import org.hammerlab.genomics.reference.ContigName.Factory -import org.hammerlab.genomics.reference.{ Locus, PermissiveRegistrar } -import org.hammerlab.genomics.reference.test.{ ClearContigNames, LenientContigNameConversions } import org.hammerlab.genomics.reference.test.LociConversions._ +import org.hammerlab.genomics.reference.test.{ ClearContigNames, LenientContigNameConversions } +import org.hammerlab.genomics.reference.{ Locus, PermissiveRegistrar } import org.hammerlab.spark.test.suite.{ KryoSparkSuite, SparkSerialization } import scala.collection.mutable diff --git a/src/test/scala/org/hammerlab/genomics/loci/set/test/LociSetUtil.scala b/src/test/scala/org/hammerlab/genomics/loci/set/test/LociSetUtil.scala index 06e3cfd..5339997 100644 --- a/src/test/scala/org/hammerlab/genomics/loci/set/test/LociSetUtil.scala +++ b/src/test/scala/org/hammerlab/genomics/loci/set/test/LociSetUtil.scala @@ -13,7 +13,7 @@ import org.scalatest.Suite trait LociSetUtil { self: Suite ⇒ implicit def parsedLociFromString(lociStr: String)(implicit factory: Factory): ParsedLoci = ParsedLoci(lociStr) - implicit def lociSetFromParsedLoci(parsedLoci: ParsedLoci): LociSet = LociSet(parsedLoci, ContigLengths.empty) + implicit def lociSetFromParsedLoci(parsedLoci: ParsedLoci)(implicit factory: Factory): LociSet = LociSet(parsedLoci, ContigLengths.empty) implicit def lociSetFromString(lociStr: String)(implicit factory: Factory): LociSet = ParsedLoci(lociStr) def lociSet(lociStr: String)(implicit factory: Factory): LociSet = lociStr def lociSet(parsedLoci: ParsedLoci): LociSet = parsedLoci