-
Notifications
You must be signed in to change notification settings - Fork 1
/
LociSet.scala
196 lines (172 loc) · 6.54 KB
/
LociSet.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
package org.hammerlab.genomics.loci.set
import com.esotericsoftware.kryo.io.{ Input, Output }
import com.esotericsoftware.kryo.{ Kryo, Serializer }
import htsjdk.samtools.util.{ Interval ⇒ HTSJDKInterval }
import org.hammerlab.genomics.loci.parsing.{ All, LociRange, LociRanges, ParsedLoci }
import org.hammerlab.genomics.reference.ContigName.Factory
import org.hammerlab.genomics.reference.{ ContigLengths, ContigName, Interval, Locus, NumLoci, Region }
import org.hammerlab.strings.TruncatedToString
import org.scalautils.ConversionCheckedTripleEquals._
import scala.collection.SortedMap
import scala.collection.immutable.TreeMap
/**
* An immutable collection of genomic regions on any number of contigs.
*
* Used, for example, to keep track of what loci to call variants at.
*
* Since contiguous genomic intervals are a common case, this is implemented with sets of (start, end) intervals.
*
* All intervals are half open: inclusive on start, exclusive on end.
*
* @param map A map from contig-name to Contig, which is a set or genomic intervals as described above.
*/
case class LociSet(private val map: SortedMap[ContigName, Contig])
extends TruncatedToString {
/** The contigs included in this LociSet with a nonempty set of loci. */
@transient lazy val contigs = map.values.toArray
/** The number of loci in this LociSet. */
@transient lazy val count: NumLoci = contigs.map(_.count).sum
def isEmpty = map.isEmpty
def nonEmpty = map.nonEmpty
/** Given a contig name, returns a [[Contig]] giving the loci on that contig. */
def apply(contigName: ContigName): Contig = map.getOrElse(contigName, Contig(contigName))
/** Build a truncate-able toString() out of underlying contig pieces. */
def stringPieces: Iterator[String] = contigs.iterator.flatMap(_.stringPieces)
def intersects(region: Region): Boolean =
apply(region.contigName).intersects(region.start, region.end)
/**
* Split the LociSet into two sets, where the first one has `numToTake` loci, and the second one has the
* remaining loci.
*
* @param numToTake number of elements to take. Must be <= number of elements in the map.
*/
def take(numToTake: NumLoci): (LociSet, LociSet) = {
assume(numToTake <= count, s"Can't take $numToTake loci from a set of size $count.")
// Optimize for taking none or all:
if (numToTake == NumLoci(0)) {
(LociSet(), this)
} else if (numToTake == count) {
(this, LociSet())
} else {
val first = new Builder
val second = new Builder
var remaining = numToTake
var doneTaking = false
for {
contig <- contigs
} {
if (doneTaking) {
second.add(contig)
} else if (contig.count < remaining) {
first.add(contig)
remaining -= contig.count
} else {
val (takePartialContig, remainingPartialContig) = contig.take(remaining)
first.add(takePartialContig)
second.add(remainingPartialContig)
doneTaking = true
}
}
val (firstSet, secondSet) = (first.result, second.result)
assert(firstSet.count === numToTake)
assert(firstSet.count + secondSet.count === count)
(firstSet, secondSet)
}
}
/**
* Build a collection of HTSJDK Intervals which are closed [start, end], 1-based intervals
*/
def toHtsJDKIntervals: List[HTSJDKInterval] =
map
.keys
.flatMap(
contig ⇒
apply(contig)
.ranges
// We add 1 to the start to move to 1-based coordinates
// Since the `Interval` end is inclusive, we are adding and subtracting 1, no-op
.map(interval ⇒
new HTSJDKInterval(
contig.name,
interval.start.locus.toInt + 1,
interval.end.locus.toInt
)
)
)
.toList
}
object LociSet {
/** An empty LociSet. */
def apply(): LociSet = LociSet(TreeMap.empty[ContigName, Contig])
def all(contigLengths: ContigLengths) = LociSet(All, contigLengths)
/**
* These constructors build a LociSet directly from Contigs.
*
* They operate on an Iterator so that transformations to the data happen in one pass.
*/
def fromContigs(contigs: Iterable[Contig]): LociSet = fromContigs(contigs.iterator)
def fromContigs(contigs: Iterator[Contig]): LociSet =
LociSet(
TreeMap(
contigs
.filterNot(_.isEmpty)
.map(contig ⇒ contig.name → contig)
.toSeq: _*
)
)
def apply(regions: Iterable[Region])(implicit f: Factory): LociSet =
LociSet.fromContigs(
(for {
Region(contigName, start, end) ← regions
if start != end
range = Interval(start, end)
} yield
contigName → range
)
.groupBy(_._1)
.mapValues(_.map(_._2))
.map(Contig(_))
)
def apply(loci: ParsedLoci, contigLengths: ContigLengths)(implicit f: Factory): LociSet =
LociSet(
loci match {
case All ⇒
for {
(contig, length) ← contigLengths
} yield
Region(contig, Locus(0), Locus(length))
case LociRanges(ranges) ⇒
for {
LociRange(contigName, start, endOpt) ← ranges
contigLengthOpt = contigLengths.get(contigName)
} yield
(endOpt, contigLengthOpt) match {
case (Some(end), Some(contigLength)) if end > Locus(contigLength) ⇒
throw new IllegalArgumentException(
s"Invalid range $start-${endOpt.get} for contig '$contigName' which has length $contigLength"
)
case (Some(end), _) ⇒
Region(contigName, start, end)
case (_, Some(contigLength)) ⇒
Region(contigName, start, Locus(contigLength))
case _ ⇒
throw new IllegalArgumentException(
s"No such contig: $contigName. Valid contigs: ${contigLengths.keys.mkString(", ")}"
)
}
}
)
// We just serialize the underlying contigs, which contain their names which are the string keys of LociSet.map.
implicit val serializer =
new Serializer[LociSet] {
def write(kryo: Kryo, output: Output, obj: LociSet) = {
kryo.writeObject(output, obj.contigs)
}
def read(kryo: Kryo, input: Input, klass: Class[LociSet]): LociSet = {
val contigs = kryo.readObject(input, classOf[Array[Contig]])
LociSet.fromContigs(contigs)
}
}
import org.hammerlab.kryo._
implicit val alsoRegister = AlsoRegister[LociSet](arr[Contig])
}