-
Notifications
You must be signed in to change notification settings - Fork 1
/
ParsedLoci.scala
118 lines (105 loc) · 4.46 KB
/
ParsedLoci.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package org.hammerlab.genomics.loci.parsing
import java.io.File
import htsjdk.variant.vcf.VCFFileReader
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.hammerlab.genomics.loci.VariantContext
import org.hammerlab.genomics.loci.args.LociArgs
import org.hammerlab.genomics.reference.ContigName.Factory
import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.io.Source
/**
 * Representation of genomic-loci ranges that may be used to instantiate a [[org.hammerlab.genomics.loci.set.LociSet]].
 *
 * Constituent ranges can be open-ended, so a [[ParsedLoci]] is typically a short-lived intermediate representation
 * between [[String]] representations of ranges (possibly originating from cmdline-flags or a file) and a
 * [[org.hammerlab.genomics.loci.set.LociSet]] whose open-ended ranges have been "resolved" using contig-length
 * information found in a BAM header.
 *
 * The two implementations are:
 *
 *  - [[All]]: sentinel value representing all loci on all contigs.
 *  - [[LociRanges]]: a sequence of [[LociRange]]s denoting (possibly open-ended) genomic-intervals.
 *
 * Examples:
 *
 *  - chr1,chrY
 *  - chr2:0-100
 *  - chr1:123-124,chr5:456-457
 *  - all
 *
 * The trait is sealed, so [[All]] and [[LociRanges]] are its only implementations. It extends [[Any]] (making it a
 * universal trait) so that it can be mixed into [[LociRanges]], which is declared as a value class (`extends AnyVal`).
 */
sealed trait ParsedLoci extends Any
object ParsedLoci {

  /**
   * Parse a single string of comma-separated loci ranges, e.g. "chr1:123-124,chr5:456-457".
   *
   * Delegates to the [[Iterator]]-based overload with `lociStrs` as the only line.
   */
  def apply(lociStrs: String)(implicit factory: Factory): ParsedLoci = apply(Iterator(lociStrs))

  /**
   * Parse loci ranges from a sequence of lines, each holding comma-separated ranges. All whitespace is stripped from
   * a line before it is split on commas.
   *
   * @param lines lines to parse; they are consumed eagerly, but only up to the first range that parses as
   *              [[AllRange]].
   * @return [[All]] as soon as any range parses as [[AllRange]]; otherwise the [[LociRanges]] collected in input
   *         order.
   */
  def apply(lines: Iterator[String])(implicit factory: Factory): ParsedLoci = {
    val lociRanges = ArrayBuffer[LociRange]()
    for {
      lociStrs <- lines
      lociStr <- lociStrs.replaceAll("\\s", "").split(",")
      lociRange <- ParsedLociRange(lociStr)
    } {
      lociRange match {
        // Deliberate non-local return: one "all" range makes every other range redundant, so stop parsing (and stop
        // consuming `lines`) right away.
        case AllRange => return All
        case lociRange: LociRange =>
          lociRanges += lociRange
      }
    }
    LociRanges(lociRanges)
  }

  /**
   * Parse string representations of loci ranges, either from one string (`args.lociStrOpt`) or from a file of
   * comma-separated ranges (`args.lociFileOpt`), and return a [[ParsedLoci]] encapsulating the result. The latter can
   * then be converted into a [[org.hammerlab.genomics.loci.set.LociSet]] when contig-lengths are available / have
   * been parsed from read-sets.
   *
   * NOTE(review): this overload declares no implicit [[Factory]] of its own, so the one required by the overload it
   * delegates to must be resolvable at this point — confirm where it comes from.
   *
   * @param args               parsed arguments carrying the optional loci string and loci-file path.
   * @param hadoopConfiguration configuration used to resolve the filesystem of a loci file.
   * @return the parsed loci, or [[None]] if neither a loci string nor a loci file was provided.
   */
  def fromArgs(args: LociArgs, hadoopConfiguration: Configuration): Option[ParsedLoci] =
    fromArgs(args.lociStrOpt, args.lociFileOpt, hadoopConfiguration)

  /**
   * Parse loci from an optional loci string or an optional loci-file path.
   *
   * @param lociStrOpt         comma-separated loci ranges; takes precedence over `lociFileOpt` when both are set.
   * @param lociFileOpt        path to a loci file; only consulted when `lociStrOpt` is empty.
   * @param hadoopConfiguration configuration used to resolve the filesystem of a loci file.
   * @return the parsed loci, or [[None]] if both options are empty.
   */
  def fromArgs(lociStrOpt: Option[String],
               lociFileOpt: Option[String],
               hadoopConfiguration: Configuration)(implicit factory: Factory): Option[ParsedLoci] =
    (lociStrOpt, lociFileOpt) match {
      case (Some(lociStr), _) => Some(ParsedLoci(lociStr))
      case (_, Some(lociFile)) => Some(loadFromFile(lociFile, hadoopConfiguration))
      case _ =>
        None
    }

  /**
   * Parse loci from the specified file.
   *
   * @param lociFile path to file containing loci. If it ends in '.vcf' then it is read as a VCF and the variant sites
   *                 are the loci. If it ends in '.loci' or '.txt' then it should be a file containing loci as
   *                 "chrX:5-10,chr12:10-20", etc. Whitespace is ignored.
   * @param hadoopConfiguration configuration used to resolve the filesystem for '.loci' / '.txt' files.
   * @return parsed loci
   * @throws IllegalArgumentException if the file extension is none of '.vcf', '.loci' or '.txt'.
   */
  private def loadFromFile(lociFile: String, hadoopConfiguration: Configuration)(implicit factory: Factory): ParsedLoci =
    if (lociFile.endsWith(".vcf")) {
      LociRanges.fromVCF(lociFile)
    } else if (lociFile.endsWith(".loci") || lociFile.endsWith(".txt")) {
      val path = new Path(lociFile)
      val filesystem = path.getFileSystem(hadoopConfiguration)
      val is = filesystem.open(path)
      try {
        // Parsing consumes the lines eagerly, so the stream can be released as soon as it returns (or throws).
        ParsedLoci(Source.fromInputStream(is).getLines())
      } finally {
        is.close()
      }
    } else
      throw new IllegalArgumentException(
        s"Couldn't guess format for file: $lociFile. Expected file extensions: '.loci' or '.txt' for loci string format; '.vcf' for VCFs."
      )
}
/**
 * Special [[ParsedLoci]] value representing all genomic loci.
 *
 * [[ParsedLoci.apply]] returns this sentinel as soon as one of the parsed ranges is [[AllRange]].
 */
case object All extends ParsedLoci
/**
 * [[ParsedLoci]] implementation holding an explicit collection of [[LociRange]]s.
 *
 * Declared as a value class (`extends AnyVal`) wrapping the underlying collection.
 *
 * @param ranges the wrapped loci ranges.
 */
case class LociRanges(ranges: Iterable[LociRange]) extends AnyVal with ParsedLoci
object LociRanges {

  /** Wrap a single [[LociRange]] in a [[LociRanges]]. */
  def apply(range: LociRange): LociRanges = apply(Iterable(range))

  /**
   * Build a [[LociRanges]] with one [[LociRange]] per record of a VCF file, using the contig name, start and end
   * that the [[VariantContext]] extractor yields for each record.
   *
   * @param lociFile path to a local VCF file (see the note in the body about "file://" schemes).
   * @return the ranges covered by the file's records, in file order.
   */
  def fromVCF(lociFile: String): LociRanges = {
    // VCF-reading currently only works for local files, requires "file://" scheme to not be present.
    // TODO: use hadoop-bam to load VCF from local filesystem or HDFS.
    // The second constructor argument is htsjdk's `requireIndex` flag: no index file is needed to iterate records.
    val reader = new VCFFileReader(new File(lociFile), false)
    try {
      apply(
        reader
          .map {
            case VariantContext(contigName, start, end) =>
              LociRange(contigName, start, end)
          }
          // Materialize every range before the reader is closed below.
          .toList
      )
    } finally {
      reader.close()
    }
  }
}