Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tp info #97

Merged
merged 18 commits into from
Dec 21, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 7 additions & 23 deletions src/main/scala/org/broadinstitute/hail/Utils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ import breeze.linalg.{Vector => BVector, DenseVector => BDenseVector, SparseVect
import org.apache.spark.mllib.linalg.{Vector => SVector, DenseVector => SDenseVector, SparseVector => SSparseVector}
import scala.reflect.ClassTag
import org.broadinstitute.hail.Utils._
import scala.reflect.runtime.currentMirror
import scala.tools.reflect.ToolBox


// FIXME AnyVal in Scala 2.11
class RichVector[T](v: Vector[T]) {
Expand Down Expand Up @@ -209,8 +208,10 @@ class RichRDD[T](val r: RDD[T]) extends AnyVal {

def writeTable(filename: String, header: String = null) {
if (header != null)
writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) {_.write(header)}
hadoopDelete(filename, r.sparkContext.hadoopConfiguration, true)
writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) {
_.write(header)
}
hadoopDelete(filename, r.sparkContext.hadoopConfiguration, recursive = true)
r.saveAsTextFile(filename)
}
}
Expand Down Expand Up @@ -238,11 +239,8 @@ class RichOption[T](val o: Option[T]) extends AnyVal {
}

class RichStringBuilder(val sb: mutable.StringBuilder) extends AnyVal {
def tsvAppend[T](v: Option[T]) {
v match {
case Some(x) => sb.append(x)
case None => sb.append("NA")
}
def tsvAppend(a: Any) {
sb.append(org.broadinstitute.hail.methods.UserExportUtils.toTSVString(a))
}
}

Expand Down Expand Up @@ -464,11 +462,6 @@ object Utils {
}
}

/**
  * Renders a value for a TSV cell.
  *
  * Options are unwrapped recursively: `Some(x)` renders as `x` would, and
  * `None` renders as the literal "NA". Any other value renders via `toString`.
  */
def toTSVString(a: Any): String = a match {
  case Some(inner) => toTSVString(inner)
  case None => "NA"
  case other => other.toString
}

def someIf[T](p: Boolean, x: => T): Option[T] =
if (p)
Some(x)
Expand Down Expand Up @@ -505,14 +498,6 @@ object Utils {
/**
  * Flushes values whose magnitude is below `Double.MIN_NORMAL` to `0.0`.
  *
  * Every other value, including NaN and infinities, is returned unchanged.
  */
def flushDouble(a: Double): Double = {
  val belowNormalRange = math.abs(a) < java.lang.Double.MIN_NORMAL
  if (belowNormalRange) 0.0 else a
}


// Parses, type-checks and evaluates the Scala source text `t` with a runtime
// reflection ToolBox, then casts the result to `T`.
// NOTE(review): this executes whatever code `t` contains; callers should only
// pass trusted input.
// NOTE(review): the cast to `T` is an `asInstanceOf` on a type parameter, so a
// mismatched `T` is presumably only detected when the value is used — confirm.
def eval[T](t: String): T = {
val toolbox = currentMirror.mkToolBox()
val ast = toolbox.parse(t)
// Type-check before evaluating; the typed tree returned here is discarded.
toolbox.typeCheck(ast)
toolbox.eval(ast).asInstanceOf[T]
}

/**
  * Lifts a generator of `T` into a generator of `Option[T]`.
  *
  * `None` is given weight 1 and `Some(...)` is given weight `someFrequency`
  * in the call to `Gen.frequency`.
  *
  * @param g             generator for the wrapped value
  * @param someFrequency weight of the `Some` alternative (default 4)
  */
def genOption[T](g: Gen[T], someFrequency: Int = 4): Gen[Option[T]] = {
  val absent: Gen[Option[T]] = Gen.const[Option[T]](None)
  val present: Gen[Option[T]] = g.map(x => Some(x): Option[T])
  Gen.frequency((1, absent), (someFrequency, present))
}
Expand All @@ -524,5 +509,4 @@ object Utils {
// Generator of strings whose characters are each drawn from `genBase`.
def genDNAString: Gen[String] = Gen.buildableOf[String, Char](genBase)

// Implicit conversion that wraps an Iterator in a RichIterator.
implicit def richIterator[T](it: Iterator[T]): RichIterator[T] = new RichIterator[T](it)

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.broadinstitute.hail.annotations

/**
  * Describes how one annotation is represented in generated Scala source.
  *
  * All three members are strings intended to be spliced into emitted code.
  */
abstract class AnnotationSignature {

  /** Scala type name emitted for the annotation's value. */
  def emitType: String

  /** Name of the conversion member emitted to convert the raw value to `emitType`. */
  def emitConversionIdentifier: String

  /** Additional source text to emit alongside the annotation; may be empty. */
  def emitUtilities: String
}

/**
  * Signature defined by its emitted type and conversion identifier only.
  *
  * @param emitType                 Scala type name to emit
  * @param emitConversionIdentifier name of the conversion member to emit
  */
case class SimpleSignature(
  emitType: String,
  emitConversionIdentifier: String) extends AnnotationSignature {

  /** A simple signature contributes no additional source text. */
  def emitUtilities: String = ""
}
105 changes: 105 additions & 0 deletions src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package org.broadinstitute.hail.annotations

/**
  * Immutable two-level annotation container.
  *
  * @param maps named groups, each mapping a key to a value
  * @param vals top-level values keyed by name
  * @tparam T type of the stored values
  */
case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T]) extends Serializable {

  /** True when a group called `str` exists. */
  def hasMap(str: String): Boolean = maps.contains(str)

  /** True when a top-level value called `str` exists. */
  def containsVal(str: String): Boolean = vals.contains(str)

  /** True when group `parent` exists and contains key `str`. */
  def containsInMap(parent: String, str: String): Boolean =
    maps.get(parent).exists(_.contains(str))

  /** Top-level value called `str`, if present. */
  def getVal(str: String): Option[T] = vals.get(str)

  /** Value stored under key `str` of group `parent`, if both exist. */
  def getInMap(parent: String, str: String): Option[T] =
    for {
      group <- maps.get(parent)
      value <- group.get(str)
    } yield value

  /** Group called `parent`, if present. */
  def getMap(parent: String): Option[Map[String, T]] = maps.get(parent)

  /** Copy with group `name` set to `m`, replacing any existing group of that name. */
  def addMap(name: String, m: Map[String, T]): Annotations[T] =
    copy(maps = maps + (name -> m))

  /** Copy with every group in `newMaps` added; groups with the same name are replaced. */
  def addMaps(newMaps: Map[String, Map[String, T]]): Annotations[T] =
    copy(maps = maps ++ newMaps)

  /** Copy with top-level value `name` set to `mapping`. */
  def addVal(name: String, mapping: T): Annotations[T] =
    copy(vals = vals + (name -> mapping))

  /** Copy with every entry of `newVals` added; entries with the same name are replaced. */
  def addVals(newVals: Map[String, T]): Annotations[T] =
    copy(vals = vals ++ newVals)

  /** Combines both containers; on a name clash the entry from `other` wins. */
  def ++(other: Annotations[T]): Annotations[T] =
    Annotations(maps ++ other.maps, vals ++ other.vals)
}

object Annotations {

/** Annotations with no groups and no top-level values. */
def empty[T](): Annotations[T] =
  new Annotations[T](Map.empty, Map.empty)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think just empty is traditional.


/** Empty container of annotation signatures. */
def emptyOfSignature(): AnnotationSignatures = Annotations.empty[AnnotationSignature]()

/** Empty container of string-valued annotation data. */
def emptyOfData(): AnnotationData = Annotations.empty[String]()

/** Sequence of `nSamples` empty string-valued containers. */
def emptyOfArrayString(nSamples: Int): IndexedSeq[AnnotationData] =
  IndexedSeq.fill(nSamples)(emptyOfData())
}

object AnnotationClassBuilder {

def signatures(sigs: AnnotationSignatures, className: String,
makeToString: Boolean = false): String = {
val internalClasses = sigs.maps.map {
case (subclass, subMap) =>
val attrs = subMap
.map { case (k, sig) =>
s""" val $k: Option[${sig.emitType}] = subMap.get("$k").map(_.${sig.emitConversionIdentifier})"""
}
.mkString("\n")
val methods: String = {
if (makeToString) {
s""" def __fields: Array[String] = Array(
| ${subMap.keys.toArray.sorted.map(s => s"""toTSVString($s)""").mkString(",")}
| )
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do triple-quotes properly nest? Amazing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use them in ${}. Otherwise, they don't nest the way one would like

| override def toString: String = __fields.mkString(";")
| def all: String = __fields.mkString("\t")""".stripMargin
} else ""
}
s"""class __$subclass(subMap: Map[String, String]) extends Serializable {
|$attrs
|$methods
|}""".stripMargin
}
.mkString("\n")

val hiddenClass = {
val classes =
sigs.maps.map { case (subclass, subMap) =>
s""" val $subclass = new __$subclass(annot.maps("$subclass"))"""
}
.mkString("\n")
val vals = sigs.vals.map { case (k, sig) =>
s""" val $k: Option[${sig.emitType}] = annot.getVal("$k").map(_.${sig.emitConversionIdentifier})"""
}
.mkString("\n")
s"""class $className(annot: org.broadinstitute.hail.annotations.AnnotationData)
| extends Serializable {
| ${if (internalClasses.nonEmpty) internalClasses else "// no internal class declarations"}
| ${if (classes.nonEmpty) classes else "// no class instantiations"}
| ${if (vals.nonEmpty) vals else "// no vals"}
|}
|""".stripMargin
}

s"""
|$hiddenClass
""".stripMargin
}

/**
  * Emits a one-line `val` declaration that constructs `className` from `rawName`.
  *
  * @param exposedName name of the emitted val
  * @param className   name of the class to instantiate
  * @param rawName     identifier passed as the single constructor argument
  * @return the declaration followed by a newline
  */
def instantiate(exposedName: String, className: String, rawName: String): String = {
  val constructorCall = s"new $className($rawName)"
  s"val $exposedName = $constructorCall\n"
}

/**
  * Emits a `val` declaration of type `IndexedSeq[classIdentifier]` whose value
  * maps every element of `rawArrayName` through the `classIdentifier` constructor.
  *
  * @param exposedName     name of the emitted val
  * @param classIdentifier name of the class wrapping each element
  * @param rawArrayName    identifier of the collection being mapped
  * @return two lines of source text, terminated by a newline
  */
def instantiateIndexedSeq(exposedName: String, classIdentifier: String, rawArrayName: String): String = {
  val declaration = s"val $exposedName: IndexedSeq[$classIdentifier] ="
  // The leading '|' is the margin marker that stripMargin removes below.
  val initializer = s"| $rawArrayName.map(new $classIdentifier(_))"
  (declaration + "\n" + initializer + "\n").stripMargin
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package org.broadinstitute.hail.annotations

import htsjdk.variant.vcf.{VCFInfoHeaderLine, VCFHeaderLineCount, VCFHeaderLineType}

/**
  * Annotation signature built from a VCF INFO header line.
  *
  * @param vcfType                  string form of the VCF header line type
  * @param emitType                 Scala type name to emit
  * @param number                   VCF count specifier, kept as a string
  * @param emitConversionIdentifier name of the conversion member to emit
  * @param description              description text from the header line
  */
case class VCFSignature(
  vcfType: String,
  emitType: String,
  number: String,
  emitConversionIdentifier: String,
  description: String) extends AnnotationSignature {

  /** VCF signatures contribute no additional source text. */
  def emitUtilities: String = ""
}

object VCFSignature {

// Matches a type string of the form Array[X], capturing the word characters X.
val arrayRegex = """Array\[(\w+)\]""".r
// Matches a type string of the form Set[X], capturing the word characters X.
val setRegex = """Set\[(\w+)\]""".r
// Matches a string made up only of decimal digits, capturing it.
val integerRegex = """(\d+)""".r

def parseConversionIdentifier(str: String): String = {
str match {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like the get prefix. Also, it returns a conversionIdentifier now, right?

case arrayRegex(subType) => s"toArray$subType"
case setRegex(subType) => s"toSet$subType"
case _ => s"to$str"
}
}

/**
  * Builds a [[VCFSignature]] from a VCF INFO header line.
  *
  * The header type selects the element type name, and the count selects
  * between the bare element type and `Array[...]`: counts A, R, G and integer
  * counts greater than 1 become arrays, everything else stays scalar.
  *
  * @param line INFO header line to translate
  * @return signature carrying the VCF type, emitted Scala type, count,
  *         conversion identifier and description
  */
def parse(line: VCFInfoHeaderLine): AnnotationSignature = {
  val headerType = line.getType

  val elementType = headerType match {
    case VCFHeaderLineType.Integer => "Int"
    case VCFHeaderLineType.Float => "Double"
    case VCFHeaderLineType.String => "String"
    case VCFHeaderLineType.Character => "Character"
    case VCFHeaderLineType.Flag => "Boolean"
  }

  val count = line.getCountType match {
    case VCFHeaderLineCount.A => "A"
    case VCFHeaderLineCount.G => "G"
    case VCFHeaderLineCount.R => "R"
    case VCFHeaderLineCount.INTEGER => line.getCount.toString
    case VCFHeaderLineCount.UNBOUNDED => "."
  }

  val emittedType = count match {
    case "A" | "R" | "G" => s"Array[$elementType]"
    case integerRegex(digits) if digits.toInt > 1 => s"Array[$elementType]"
    // Covers integer counts of 0 or 1 and the unbounded "." count.
    case _ => elementType
  }

  VCFSignature(
    vcfType = headerType.toString,
    emitType = emittedType,
    number = count,
    emitConversionIdentifier = parseConversionIdentifier(emittedType),
    description = line.getDescription)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package org.broadinstitute.hail

// Shorthand type aliases for the two Annotations instantiations used in this package.
package object annotations {
// Annotations whose values are AnnotationSignature instances.
type AnnotationSignatures = Annotations[AnnotationSignature]
// Annotations whose values are strings.
type AnnotationData = Annotations[String]
}
105 changes: 105 additions & 0 deletions src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package org.broadinstitute.hail.driver

import org.broadinstitute.hail.Utils._
import org.broadinstitute.hail.methods._
import org.broadinstitute.hail.variant._
import org.broadinstitute.hail.annotations._
import org.kohsuke.args4j.{Option => Args4jOption}

/**
  * Command that writes one line of text per entry produced by evaluating the
  * user-supplied condition over the dataset, plus a separate ".header" file.
  */
object ExportGenotypes extends Command {

  class Options extends BaseOptions {

    @Args4jOption(required = true, name = "-o", aliases = Array("--output"),
      usage = "path of output tsv")
    var output: String = _

    @Args4jOption(required = true, name = "-c", aliases = Array("--condition"),
      usage = "Comma-separated list of fields to be printed to tsv")
    var condition: String = _
  }

  def newOptions = new Options

  def name = "exportgenotypes"

  def description = "Export list of sample-variant information to tsv"

  /**
    * Evaluates the condition for the dataset in `state` and saves the result.
    *
    * Writes `output + ".header"` (the last dot-separated component of each
    * comma-separated field, joined by tabs), deletes any existing `output`,
    * then saves the generated strings as a text file at `output`.
    *
    * @param state   current state; its dataset and Hadoop configuration are read
    * @param options parsed command-line options (`output`, `condition`)
    * @return the input `state`, unchanged
    */
  def run(state: State, options: Options): State = {
    val vds = state.vds
    val cond = options.condition
    val output = options.output

    val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures
    val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures

    val makeString: ((Variant, AnnotationData) =>
      ((Int, Genotype) => String)) = {
      val cf = new ExportGenotypeEvaluator(cond, vds.metadata)
      cf.typeCheck()
      cf.apply
    }

    // Return the inner function directly so that makeString(v, va) is applied
    // once per call of the outer function, not once per (sample, genotype).
    // NOTE(review): presumably the outer function is invoked once per variant
    // by mapValuesWithPartialApplication — confirm against its definition.
    val stringVDS = vds.mapValuesWithPartialApplication(
      (v: Variant, va: AnnotationData) => makeString(v, va))

    // FIXME add additional command parsing functionality. Somewhat hacky
    // NOTE(review): mapColumnNames is not called anywhere in this method; the
    // header below is derived from the raw condition string instead.
    val variantRegex =
      """v\.(\w+)""".r
    val sampleRegex = """s\.(\w+)""".r
    val topLevelSampleAnnoRegex = """sa\.(\w+)""".r
    val topLevelVariantAnnoRegex = """va\.(\w+)""".r
    val samplePrintMapRegex = """sa\.(\w+)\.all""".r
    val variantPrintMapRegex = """va\.(\w+)\.all""".r
    val annoRegex = """\wa\.(.+)""".r
    def mapColumnNames(input: String): String = {
      input match {
        case "v" => "Variant"
        case "s" => "Sample"
        case "va" =>
          fatal("parse error in condition: cannot print 'va', choose a group or value in annotations")
        case "sa" =>
          fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations")
        case variantRegex(x) => x
        case sampleRegex(x) => x
        case topLevelSampleAnnoRegex(x) =>
          if (sas.maps.contains(x)) {
            val keys = sas.maps(x).keys.toArray.sorted
            if (keys.isEmpty) x else s"$x:" + keys.mkString(";")
          }
          else x
        case topLevelVariantAnnoRegex(x) =>
          if (vas.maps.contains(x)) {
            val keys = vas.maps(x).keys.toArray.sorted
            if (keys.isEmpty) x else s"$x:" + keys.mkString(";")
          }
          else x
        case samplePrintMapRegex(x) =>
          val keys = sas.maps(x).keys
          if (keys.isEmpty) x else keys.mkString("\t")
        case variantPrintMapRegex(x) =>
          val keys = vas.maps(x).keys
          if (keys.isEmpty) x else keys.mkString("\t")
        case annoRegex(x) => x
        case _ => input
      }
    }

    writeTextFile(output + ".header", state.hadoopConf) { s =>
      s.write(cond.split(",").map(_.split("\\.").last).mkString("\t"))
      s.write("\n")
    }

    // Remove any previous output first; saveAsTextFile would otherwise
    // presumably refuse to overwrite an existing path — confirm.
    hadoopDelete(output, state.hadoopConf, recursive = true)

    stringVDS.rdd
      .flatMap { case (v, va, strings) => strings }
      .saveAsTextFile(output)

    state
  }
}
Loading