Skip to content

Commit

Permalink
improve LociSet Java serde
Browse files Browse the repository at this point in the history
  • Loading branch information
ryan-williams committed Apr 10, 2017
1 parent a0b1a37 commit 7db1252
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 80 deletions.
2 changes: 1 addition & 1 deletion build.sbt
@@ -1,7 +1,7 @@

organization := "org.hammerlab.genomics"
name := "loci"
version := "1.5.7"
version := "1.5.8-SNAPSHOT"

addSparkDeps

Expand Down
2 changes: 1 addition & 1 deletion project/plugins.sbt
@@ -1 +1 @@
addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.7.5")
addSbtPlugin("org.hammerlab" % "sbt-parent" % "1.7.7-SNAPSHOT")
4 changes: 2 additions & 2 deletions src/main/scala/org/hammerlab/genomics/loci/map/Builder.scala
Expand Up @@ -24,8 +24,8 @@ private[loci] class Builder[T] {
/** Set the value for all loci in the given LociSet to the specified value in the LociMap under construction. */
def put(loci: LociSet, value: T): Builder[T] = {
for {
contig <- loci.contigs
Interval(start, end) <- contig.ranges
contig loci.contigs
Interval(start, end) contig.ranges
} {
put(contig.name, start, end, value)
}
Expand Down
72 changes: 29 additions & 43 deletions src/main/scala/org/hammerlab/genomics/loci/set/Contig.scala
@@ -1,71 +1,56 @@
package org.hammerlab.genomics.loci.set

import java.io.{ ObjectInputStream, ObjectOutputStream }

import com.google.common.collect.Range.closedOpen
import com.google.common.collect.{ RangeSet, TreeRangeSet, Range JRange }
import org.hammerlab.genomics.loci.iterator.LociIterator
import org.hammerlab.genomics.reference.Interval.orderByStart
import org.hammerlab.genomics.reference.ContigName.Factory
import org.hammerlab.genomics.reference.{ ContigName, Interval, Locus, NumLoci, Region }
import org.hammerlab.strings.TruncatedToString

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

/**
* A set of loci on a contig, stored/manipulated as loci ranges.
*/
case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) extends TruncatedToString {
case class Contig(name: ContigName,
rangeSet: RangeSet[Locus])(implicit factory: Factory)
extends TruncatedToString {

import Contig.lociRange

private def readObject(in: ObjectInputStream): Unit = {
name = in.readUTF()
val num = in.readInt()
rangeSet = TreeRangeSet.create[Locus]()
for {
i <- 0 until num
} {
val start = Locus(in.readLong())
val end = Locus(in.readLong())
val range = lociRange(start, end)
rangeSet.add(range)
}
}

private def writeObject(out: ObjectOutputStream): Unit = {
out.writeUTF(name.name)
out.writeInt(ranges.length)
for {
Interval(start, end) <- ranges
} {
out.writeLong(start.locus)
out.writeLong(end.locus)
}
}
/**
* [[RangeSet]] is not [[Serializable]], and we delegate work-around serialization logic to [[SerializableContig]] to
* avoid the requirement for a 0-arg constructor here.
*/
protected def writeReplace: Object =
SerializableContig(this)

/** Is the given locus contained in this set? */
def contains(locus: Locus): Boolean = rangeSet.contains(locus)

@transient private lazy val jranges = rangeSet.asRanges()
@transient lazy val numRanges = jranges.size()

/** This set as a regular scala array of ranges. */
lazy val ranges: Vector[Interval] =
rangeSet
.asRanges()
def ranges: Iterator[Interval] =
jranges
.iterator()
.asScala
.map(range => Interval(range.lowerEndpoint(), range.upperEndpoint()))
.toVector
.sorted(orderByStart)

def regions: Iterator[Region] = ranges.iterator.map(range Region(name, range))
def regions: Iterator[Region] = ranges.map(range Region(name, range))

/** Is this contig empty? */
def isEmpty: Boolean = rangeSet.isEmpty

def nonEmpty: Boolean = !isEmpty

/** Iterator through loci on this contig, sorted. */
def iterator = new LociIterator(ranges.iterator.buffered)
def iterator = new LociIterator(ranges.buffered)

/** Number of loci on this contig. */
def count: NumLoci = ranges.map(_.length).sum
@transient lazy val count: NumLoci = ranges.map(_.length).sum

/** Returns whether a given genomic region overlaps with any loci on this contig. */
def intersects(start: Locus, end: Locus): Boolean = !rangeSet.subRangeSet(lociRange(start, end)).isEmpty
Expand All @@ -82,7 +67,7 @@ case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) e
var remaining = numToTake
var doneTaking = false
for {
range <- ranges
range ranges
} {
if (doneTaking) {
secondRanges.append(range)
Expand All @@ -108,23 +93,24 @@ case class Contig(var name: ContigName, private var rangeSet: RangeSet[Locus]) e

object Contig {
// Empty-contig constructor, for convenience.
def apply(name: ContigName): Contig = Contig(name, TreeRangeSet.create[Locus]())
def apply(name: ContigName)(implicit f: Factory): Contig = Contig(name, TreeRangeSet.create[Locus]())

// Constructors that make a Contig from its name and some ranges.
def apply(tuple: (ContigName, Iterable[Interval])): Contig = Contig(tuple._1, tuple._2)
def apply(name: ContigName, ranges: Iterable[Interval]): Contig =
def apply(tuple: (ContigName, Iterable[Interval]))(implicit f: Factory): Contig = Contig(tuple._1, tuple._2)
def apply(name: ContigName, ranges: Iterable[Interval])(implicit f: Factory): Contig = apply(name, ranges.iterator)
def apply(name: ContigName, ranges: Iterator[Interval])(implicit f: Factory): Contig =
Contig(
name,
{
val rangeSet = TreeRangeSet.create[Locus]()
for {
Interval(start, end) <- ranges
Interval(start, end) ranges
} {
rangeSet.add(lociRange(start, end))
}
rangeSet
}
)

def lociRange(start: Locus, end: Locus): JRange[Locus] = JRange.closedOpen[Locus](start, end)
def lociRange(start: Locus, end: Locus): JRange[Locus] = closedOpen[Locus](start, end)
}
Expand Up @@ -11,7 +11,7 @@ class ContigSerializer extends KryoSerializer[Contig] {

def write(kryo: Kryo, output: Output, obj: Contig) = {
kryo.writeObject(output, obj.name)
output.writeInt(obj.ranges.length)
output.writeInt(obj.numRanges)
for {
Interval(start, end) obj.ranges
} {
Expand All @@ -22,16 +22,18 @@ class ContigSerializer extends KryoSerializer[Contig] {

def read(kryo: Kryo, input: Input, klass: Class[Contig]): Contig = {
val name = kryo.readObject(input, classOf[ContigName])
val length = input.readInt()
val numRanges = input.readInt()
val treeRangeSet = TreeRangeSet.create[Locus]()
val ranges = (0 until length).foreach { _
treeRangeSet.add(
closedOpen(
Locus(input.readLong()),
Locus(input.readLong())
val ranges =
(0 until numRanges).foreach { _
treeRangeSet.add(
closedOpen(
Locus(input.readLong()),
Locus(input.readLong())
)
)
)
}
}

Contig(name, treeRangeSet)
}
}
41 changes: 20 additions & 21 deletions src/main/scala/org/hammerlab/genomics/loci/set/LociSet.scala
@@ -1,8 +1,9 @@
package org.hammerlab.genomics.loci.set

import htsjdk.samtools.util.{Interval => HTSJDKInterval}
import org.hammerlab.genomics.loci.parsing.{All, LociRange, LociRanges, ParsedLoci}
import org.hammerlab.genomics.reference.{ContigLengths, ContigName, Interval, Locus, NumLoci, Region}
import htsjdk.samtools.util.{ Interval HTSJDKInterval }
import org.hammerlab.genomics.loci.parsing.{ All, LociRange, LociRanges, ParsedLoci }
import org.hammerlab.genomics.reference.ContigName.Factory
import org.hammerlab.genomics.reference.{ ContigLengths, ContigName, Interval, Locus, NumLoci, Region }
import org.hammerlab.strings.TruncatedToString
import org.scalautils.ConversionCheckedTripleEquals._

Expand All @@ -20,7 +21,8 @@ import scala.collection.immutable.TreeMap
*
* @param map A map from contig-name to Contig, which is a set or genomic intervals as described above.
*/
case class LociSet(private val map: SortedMap[ContigName, Contig]) extends TruncatedToString {
case class LociSet(private val map: SortedMap[ContigName, Contig])
extends TruncatedToString {

/** The contigs included in this LociSet with a nonempty set of loci. */
@transient lazy val contigs = map.values.toArray
Expand Down Expand Up @@ -97,7 +99,7 @@ case class LociSet(private val map: SortedMap[ContigName, Contig]) extends Trunc
.ranges
// We add 1 to the start to move to 1-based coordinates
// Since the `Interval` end is inclusive, we are adding and subtracting 1, no-op
.map(interval =>
.map(interval
new HTSJDKInterval(
contig.name,
interval.start.locus.toInt + 1,
Expand All @@ -114,9 +116,6 @@ object LociSet {

def all(contigLengths: ContigLengths) = LociSet(All, contigLengths)

// NOTE(ryan): only used in tests; TODO: move to test-specific helper.
// def apply(lociStr: String): LociSet = ParsedLoci(lociStr).result

/**
* These constructors build a LociSet directly from Contigs.
*
Expand All @@ -128,48 +127,48 @@ object LociSet {
TreeMap(
contigs
.filterNot(_.isEmpty)
.map(contig => contig.name -> contig)
.map(contig contig.name contig)
.toSeq: _*
)
)

def apply(regions: Iterable[Region]): LociSet =
def apply(regions: Iterable[Region])(implicit f: Factory): LociSet =
LociSet.fromContigs(
(for {
Region(contigName, start, end) <- regions
Region(contigName, start, end) regions
if start != end
range = Interval(start, end)
} yield
contigName -> range
contigName range
)
.groupBy(_._1)
.mapValues(_.map(_._2))
.map(Contig(_))
)

def apply(loci: ParsedLoci, contigLengths: ContigLengths): LociSet =
def apply(loci: ParsedLoci, contigLengths: ContigLengths)(implicit f: Factory): LociSet =
LociSet(
loci match {
case All =>
case All
for {
(contig, length) <- contigLengths
(contig, length) contigLengths
} yield
Region(contig, Locus(0), Locus(length))
case LociRanges(ranges) =>
case LociRanges(ranges)
for {
LociRange(contigName, start, endOpt) <- ranges
LociRange(contigName, start, endOpt) ranges
contigLengthOpt = contigLengths.get(contigName)
} yield
(endOpt, contigLengthOpt) match {
case (Some(end), Some(contigLength)) if end > Locus(contigLength) =>
case (Some(end), Some(contigLength)) if end > Locus(contigLength)
throw new IllegalArgumentException(
s"Invalid range $start-${endOpt.get} for contig '$contigName' which has length $contigLength"
)
case (Some(end), _) =>
case (Some(end), _)
Region(contigName, start, end)
case (_, Some(contigLength)) =>
case (_, Some(contigLength))
Region(contigName, start, Locus(contigLength))
case _ =>
case _
throw new IllegalArgumentException(
s"No such contig: $contigName. Valid contigs: ${contigLengths.keys.mkString(", ")}"
)
Expand Down
@@ -0,0 +1,54 @@
package org.hammerlab.genomics.loci.set

import java.io.{ ObjectInputStream, ObjectOutputStream }

import org.hammerlab.genomics.reference.ContigName.Factory
import org.hammerlab.genomics.reference.{ ContigName, Interval, Locus }

private class SerializableContig
extends Serializable {
var contigName: ContigName = _
var numRanges: Int = 0
var ranges: Iterator[Interval] = _
implicit var factory: Factory = _

private def writeObject(out: ObjectOutputStream): Unit = {
out.writeObject(factory)
out.writeUTF(contigName.name)
out.writeInt(numRanges)
for {
Interval(start, end) ranges
} {
out.writeLong(start.locus)
out.writeLong(end.locus)
}
}

private def readObject(in: ObjectInputStream): Unit = {
factory = in.readObject().asInstanceOf[Factory]
contigName = in.readUTF()
numRanges = in.readInt()
ranges =
(0 until numRanges)
.map { _
val start = Locus(in.readLong())
val end = Locus(in.readLong())
Interval(start, end)
}
.iterator
}

protected def readResolve: Object =
Contig(contigName, ranges)
}

private object SerializableContig {
def apply(contig: Contig)(implicit factory: Factory): SerializableContig = {
val sc = new SerializableContig
sc.contigName = contig.name
sc.numRanges = contig.numRanges
sc.ranges = contig.ranges
sc.factory = factory
sc
}
}
Expand Up @@ -5,9 +5,9 @@ import java.io.{ ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream,
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.genomics.loci.set.test.LociSetUtil
import org.hammerlab.genomics.reference.ContigName.Factory
import org.hammerlab.genomics.reference.{ Locus, PermissiveRegistrar }
import org.hammerlab.genomics.reference.test.{ ClearContigNames, LenientContigNameConversions }
import org.hammerlab.genomics.reference.test.LociConversions._
import org.hammerlab.genomics.reference.test.{ ClearContigNames, LenientContigNameConversions }
import org.hammerlab.genomics.reference.{ Locus, PermissiveRegistrar }
import org.hammerlab.spark.test.suite.{ KryoSparkSuite, SparkSerialization }

import scala.collection.mutable
Expand Down
Expand Up @@ -13,7 +13,7 @@ import org.scalatest.Suite
trait LociSetUtil {
self: Suite
implicit def parsedLociFromString(lociStr: String)(implicit factory: Factory): ParsedLoci = ParsedLoci(lociStr)
implicit def lociSetFromParsedLoci(parsedLoci: ParsedLoci): LociSet = LociSet(parsedLoci, ContigLengths.empty)
implicit def lociSetFromParsedLoci(parsedLoci: ParsedLoci)(implicit factory: Factory): LociSet = LociSet(parsedLoci, ContigLengths.empty)
implicit def lociSetFromString(lociStr: String)(implicit factory: Factory): LociSet = ParsedLoci(lociStr)
def lociSet(lociStr: String)(implicit factory: Factory): LociSet = lociStr
def lociSet(parsedLoci: ParsedLoci): LociSet = parsedLoci
Expand Down

0 comments on commit 7db1252

Please sign in to comment.