hail-is · cseed · Dec 21, 2015 · Nov 15, 2015 · Dec 9, 2015 · Dec 10, 2015
diff --git a/src/main/scala/org/broadinstitute/hail/Utils.scala b/src/main/scala/org/broadinstitute/hail/Utils.scala
@@ -15,8 +15,7 @@ import breeze.linalg.{Vector => BVector, DenseVector => BDenseVector, SparseVect
 import org.apache.spark.mllib.linalg.{Vector => SVector, DenseVector => SDenseVector, SparseVector => SSparseVector}
 import scala.reflect.ClassTag
 import org.broadinstitute.hail.Utils._
-import scala.reflect.runtime.currentMirror
-import scala.tools.reflect.ToolBox
+
 
 // FIXME AnyVal in Scala 2.11
 class RichVector[T](v: Vector[T]) {
@@ -209,8 +208,10 @@ class RichRDD[T](val r: RDD[T]) extends AnyVal {
 
   def writeTable(filename: String, header: String = null) {
     if (header != null)
-      writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) {_.write(header)}
-    hadoopDelete(filename, r.sparkContext.hadoopConfiguration, true)
+      writeTextFile(filename + ".header", r.sparkContext.hadoopConfiguration) {
+        _.write(header)
+      }
+    hadoopDelete(filename, r.sparkContext.hadoopConfiguration, recursive = true)
     r.saveAsTextFile(filename)
   }
 }
@@ -238,11 +239,8 @@ class RichOption[T](val o: Option[T]) extends AnyVal {
 }
 
 class RichStringBuilder(val sb: mutable.StringBuilder) extends AnyVal {
-  def tsvAppend[T](v: Option[T]) {
-    v match {
-      case Some(x) => sb.append(x)
-      case None => sb.append("NA")
-    }
+  def tsvAppend(a: Any) {
+    sb.append(org.broadinstitute.hail.methods.UserExportUtils.toTSVString(a))
   }
 }
 
@@ -464,11 +462,6 @@ object Utils {
     }
   }
 
-  def toTSVString(a: Any): String = a match {
-    case o: Option[Any] => o.map(toTSVString).getOrElse("NA")
-    case _ => a.toString
-  }
-
   def someIf[T](p: Boolean, x: => T): Option[T] =
     if (p)
       Some(x)
@@ -505,14 +498,6 @@ object Utils {
   def flushDouble(a: Double): Double =
     if (math.abs(a) < java.lang.Double.MIN_NORMAL) 0.0 else a
 
-
-  def eval[T](t: String): T = {
-    val toolbox = currentMirror.mkToolBox()
-    val ast = toolbox.parse(t)
-    toolbox.typeCheck(ast)
-    toolbox.eval(ast).asInstanceOf[T]
-  }
-
   def genOption[T](g: Gen[T], someFrequency: Int = 4): Gen[Option[T]] =
     Gen.frequency((1, Gen.const(None)),
       (someFrequency, g.map(Some(_))))
@@ -524,5 +509,4 @@ object Utils {
   def genDNAString: Gen[String] = Gen.buildableOf[String, Char](genBase)
 
   implicit def richIterator[T](it: Iterator[T]): RichIterator[T] = new RichIterator[T](it)
-
 }
diff --git a/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/AnnotationSignature.scala
@@ -0,0 +1,23 @@
+package org.broadinstitute.hail.annotations
+
+/**
+  * Describes how one annotation is rendered in generated Scala source.
+  *
+  * Every member is a fragment of source text held as a plain string.
+  */
+abstract class AnnotationSignature {
+  /** Scala type name to emit for the annotation's value. */
+  def emitType: String
+
+  /** Name of the conversion member to emit for turning the raw value into `emitType`. */
+  def emitConversionIdentifier: String
+
+  /** Additional source text tied to this signature; may be empty. */
+  def emitUtilities: String
+}
+
+/** Signature made of just a type name and a conversion member name; emits no utilities. */
+case class SimpleSignature(emitType: String, emitConversionIdentifier: String)
+  extends AnnotationSignature {
+  def emitUtilities: String = ""
+}
diff --git a/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala b/src/main/scala/org/broadinstitute/hail/annotations/Annotations.scala
@@ -0,0 +1,162 @@
+package org.broadinstitute.hail.annotations
+
+/**
+  * Immutable two-level annotation container.
+  *
+  * @param maps named groups, each with its own key -> value entries
+  * @param vals top-level key -> value entries that belong to no group
+  * @tparam T type of the stored values
+  */
+case class Annotations[T](maps: Map[String, Map[String, T]], vals: Map[String, T]) extends Serializable {
+
+  /** Whether a group named `str` is present. */
+  def hasMap(str: String): Boolean = maps.contains(str)
+
+  /** Whether a top-level value named `str` is present. */
+  def containsVal(str: String): Boolean = vals.contains(str)
+
+  /** Whether group `parent` is present and has an entry named `str`. */
+  def containsInMap(parent: String, str: String): Boolean =
+    maps.get(parent).exists(_.contains(str))
+
+  /** Top-level value named `str`, if any. */
+  def getVal(str: String): Option[T] = vals.get(str)
+
+  /** Entry `str` of group `parent`; `None` when the group or the entry is missing. */
+  def getInMap(parent: String, str: String): Option[T] =
+    for {
+      group <- maps.get(parent)
+      value <- group.get(str)
+    } yield value
+
+  /** The whole group named `parent`, if any. */
+  def getMap(parent: String): Option[Map[String, T]] = maps.get(parent)
+
+  /** Copy in which group `name` is `m`; an existing group of that name is replaced. */
+  def addMap(name: String, m: Map[String, T]): Annotations[T] =
+    copy(maps = maps + (name -> m))
+
+  /** Copy with all groups of `newMaps` added; on a name clash the group from `newMaps` wins. */
+  def addMaps(newMaps: Map[String, Map[String, T]]): Annotations[T] =
+    copy(maps = maps ++ newMaps)
+
+  /** Copy in which top-level value `name` is `mapping`. */
+  def addVal(name: String, mapping: T): Annotations[T] =
+    copy(vals = vals + (name -> mapping))
+
+  /** Copy with all entries of `newVals` added; on a name clash the entry from `newVals` wins. */
+  def addVals(newVals: Map[String, T]): Annotations[T] =
+    copy(vals = vals ++ newVals)
+
+  /**
+    * Union with `other`, whose groups and values win on a name clash. A clashing group is
+    * replaced as a whole, not merged entry by entry.
+    */
+  def ++(other: Annotations[T]): Annotations[T] =
+    copy(maps = maps ++ other.maps, vals = vals ++ other.vals)
+}
+
+object Annotations {
+
+  /** A container with no groups and no top-level values. */
+  def empty[T](): Annotations[T] =
+    Annotations[T](Map.empty, Map.empty)
+
+  /** An empty container of signatures. */
+  def emptyOfSignature(): AnnotationSignatures = empty[AnnotationSignature]()
+
+  /** An empty container of string data. */
+  def emptyOfData(): AnnotationData = empty[String]()
+
+  /** A sequence of `nSamples` empty string-data containers. */
+  def emptyOfArrayString(nSamples: Int): IndexedSeq[AnnotationData] =
+    IndexedSeq.fill(nSamples)(emptyOfData())
+}
+
+/** Generates Scala source text for classes that expose annotations as typed members. */
+object AnnotationClassBuilder {
+
+  /**
+    * Source of a class `className` whose constructor takes an `AnnotationData`.
+    *
+    * The class has one `Option`-typed val per top-level signature and, for each signature
+    * group, a nested class `__<group>` together with a val `<group>` instantiating it.
+    *
+    * @param sigs         signatures to expose
+    * @param className    name of the generated class
+    * @param makeToString when true, each nested class also gets `__fields`, a `toString`
+    *                     joining the fields with ";" and an `all` joining them with a tab
+    */
+  def signatures(sigs: AnnotationSignatures, className: String,
+    makeToString: Boolean = false): String = {
+    val nestedDecls = sigs.maps
+      .map { case (group, groupSigs) => nestedClassSource(group, groupSigs, makeToString) }
+      .mkString("\n")
+
+    val nestedVals = sigs.maps
+      .map { case (group, _) => s"""  val $group = new __$group(annot.maps("$group"))""" }
+      .mkString("\n")
+
+    val topLevelVals = sigs.vals
+      .map { case (key, sig) =>
+        s"""  val $key: Option[${sig.emitType}] = annot.getVal("$key").map(_.${sig.emitConversionIdentifier})"""
+      }
+      .mkString("\n")
+
+    // an empty section is replaced by a comment line in the generated class body
+    def orComment(section: String, comment: String): String =
+      if (section.nonEmpty) section else comment
+
+    val outerClass =
+      s"""class $className(annot: org.broadinstitute.hail.annotations.AnnotationData)
+         |  extends Serializable {
+         |  ${orComment(nestedDecls, "// no internal class declarations")}
+         |  ${orComment(nestedVals, "// no class instantiations")}
+         |  ${orComment(topLevelVals, "// no vals")}
+         |}
+         |""".stripMargin
+
+    s"""
+       |$outerClass
+    """.stripMargin
+  }
+
+  // Source of the nested class `__<group>`, which wraps one group's raw Map[String, String].
+  private def nestedClassSource(group: String, groupSigs: Map[String, AnnotationSignature],
+    makeToString: Boolean): String = {
+    val members = groupSigs
+      .map { case (key, sig) =>
+        s"""  val $key: Option[${sig.emitType}] = subMap.get("$key").map(_.${sig.emitConversionIdentifier})"""
+      }
+      .mkString("\n")
+
+    val printers =
+      if (makeToString) {
+        // fields are listed in sorted key order
+        val fieldExprs = groupSigs.keys.toArray.sorted.map(key => s"toTSVString($key)").mkString(",")
+        s"""  def __fields: Array[String] = Array(
+           |    $fieldExprs
+           |  )
+           |  override def toString: String = __fields.mkString(";")
+           |  def all: String = __fields.mkString("\t")""".stripMargin
+      } else ""
+
+    s"""class __$group(subMap: Map[String, String]) extends Serializable {
+       |$members
+       |$printers
+       |}""".stripMargin
+  }
+
+  /** Source of a newline-terminated statement `val exposedName = new className(rawName)`. */
+  def instantiate(exposedName: String, className: String, rawName: String): String =
+    s"val $exposedName = new $className($rawName)\n"
+
+  /**
+    * Source of a statement declaring `exposedName` as an `IndexedSeq[classIdentifier]` built by
+    * mapping `rawArrayName` through `new classIdentifier(_)`.
+    */
+  def instantiateIndexedSeq(exposedName: String, classIdentifier: String, rawArrayName: String): String =
+    s"""val $exposedName: IndexedSeq[$classIdentifier] =
+       |  $rawArrayName.map(new $classIdentifier(_))
+     """.stripMargin
+}
diff --git a/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala b/src/main/scala/org/broadinstitute/hail/annotations/VCFSignature.scala
@@ -0,0 +1,70 @@
+package org.broadinstitute.hail.annotations
+
+import htsjdk.variant.vcf.{VCFInfoHeaderLine, VCFHeaderLineCount, VCFHeaderLineType}
+
+/**
+  * Signature of an annotation declared by a VCF INFO header line.
+  *
+  * @param vcfType                  the header line's type, as text
+  * @param emitType                 Scala type name to emit for the value
+  * @param number                   the header line's count: "A", "G", "R", "." or an integer as text
+  * @param emitConversionIdentifier name of the conversion member to emit
+  * @param description              the header line's description
+  */
+case class VCFSignature(vcfType: String, emitType: String, number: String,
+  emitConversionIdentifier: String, description: String)
+  extends AnnotationSignature {
+
+  def emitUtilities: String = ""
+}
+
+object VCFSignature {
+
+  val arrayRegex = """Array\[(\w+)\]""".r
+  val setRegex = """Set\[(\w+)\]""".r
+  val integerRegex = """(\d+)""".r
+
+  /**
+    * Conversion member name for an emitted type name: `Array[X]` gives `toArrayX`,
+    * `Set[X]` gives `toSetX`, and any other name `Y` gives `toY`.
+    */
+  def parseConversionIdentifier(str: String): String = {
+    val suffix = str match {
+      case arrayRegex(element) => "Array" + element
+      case setRegex(element) => "Set" + element
+      case _ => str
+    }
+    "to" + suffix
+  }
+
+  /** Builds the signature described by the INFO header line `line`. */
+  def parse(line: VCFInfoHeaderLine): AnnotationSignature = {
+    val headerType = line.getType
+    val vcfType = headerType.toString
+    val elementType = headerType match {
+      case VCFHeaderLineType.Flag => "Boolean"
+      case VCFHeaderLineType.Character => "Character"
+      case VCFHeaderLineType.String => "String"
+      case VCFHeaderLineType.Float => "Double"
+      case VCFHeaderLineType.Integer => "Int"
+    }
+    val number = line.getCountType match {
+      case VCFHeaderLineCount.INTEGER => line.getCount.toString
+      case VCFHeaderLineCount.UNBOUNDED => "."
+      case VCFHeaderLineCount.A => "A"
+      case VCFHeaderLineCount.G => "G"
+      case VCFHeaderLineCount.R => "R"
+    }
+
+    // counts A, R and G, and integer counts above one, are emitted as arrays
+    val multiValued = number match {
+      case "A" | "R" | "G" => true
+      case integerRegex(digits) => digits.toInt > 1
+      case _ => false
+    }
+    // NOTE(review): an unbounded count (".") ends up scalar here -- confirm that is intended
+    val emitType = if (multiValued) s"Array[$elementType]" else elementType
+
+    VCFSignature(vcfType, emitType, number, parseConversionIdentifier(emitType), line.getDescription)
+  }
+}
diff --git a/src/main/scala/org/broadinstitute/hail/annotations/package.scala b/src/main/scala/org/broadinstitute/hail/annotations/package.scala
@@ -0,0 +1,10 @@
+package org.broadinstitute.hail
+
+/** Short names for the two `Annotations` instantiations declared for this package. */
+package object annotations {
+  /** Annotation values held as strings. */
+  type AnnotationData = Annotations[String]
+
+  /** Annotation signatures, held in the same container type as the data. */
+  type AnnotationSignatures = Annotations[AnnotationSignature]
+}
diff --git a/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala b/src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala
@@ -0,0 +1,121 @@
+package org.broadinstitute.hail.driver
+
+import org.broadinstitute.hail.Utils._
+import org.broadinstitute.hail.methods._
+import org.broadinstitute.hail.variant._
+import org.broadinstitute.hail.annotations._
+import org.kohsuke.args4j.{Option => Args4jOption}
+
+/**
+  * The `exportgenotypes` command. It turns every genotype of the dataset into a string using
+  * the user-supplied condition and saves those strings as text, next to a one-line
+  * `<output>.header` file.
+  */
+object ExportGenotypes extends Command {
+
+  class Options extends BaseOptions {
+
+    @Args4jOption(required = true, name = "-o", aliases = Array("--output"),
+      usage = "path of output tsv")
+    var output: String = _
+
+    @Args4jOption(required = true, name = "-c", aliases = Array("--condition"),
+      usage = "Comma-separated list of fields to be printed to tsv")
+    var condition: String = _
+  }
+
+  def newOptions = new Options
+
+  def name = "exportgenotypes"
+
+  def description = "Export list of sample-variant information to tsv"
+
+  /**
+    * Type-checks the condition, writes `<output>.header`, deletes any existing `output`
+    * and saves the generated strings there.
+    *
+    * @return `state`, unchanged
+    */
+  def run(state: State, options: Options): State = {
+    val vds = state.vds
+    val cond = options.condition
+    val output = options.output
+
+    val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures
+    val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures
+
+    val makeString: ((Variant, AnnotationData) =>
+      ((Int, Genotype) => String)) = {
+      val cf = new ExportGenotypeEvaluator(cond, vds.metadata)
+      cf.typeCheck()
+      cf.apply
+    }
+
+    // Return the function produced by makeString(v, va) itself. Calling makeString(v, va)
+    // inside the (sample, genotype) function would repeat the per-variant step for every
+    // genotype; this assumes makeString has no side effects -- TODO confirm.
+    val stringVDS = vds.mapValuesWithPartialApplication(
+      (v: Variant, va: AnnotationData) => makeString(v, va))
+
+    // FIXME add additional command parsing functionality.  Somewhat hacky
+    // NOTE(review): mapColumnNames is not called by the header-writing code below.
+    val variantRegex = """v\.(\w+)""".r
+    val sampleRegex = """s\.(\w+)""".r
+    val topLevelSampleAnnoRegex = """sa\.(\w+)""".r
+    val topLevelVariantAnnoRegex = """va\.(\w+)""".r
+    val samplePrintMapRegex = """sa\.(\w+)\.all""".r
+    val variantPrintMapRegex = """va\.(\w+)\.all""".r
+    val annoRegex = """\wa\.(.+)""".r
+
+    // Keys of one signature group in sorted order, the same order
+    // AnnotationClassBuilder.signatures uses for a group's `toString` and `all` fields.
+    def sortedKeys(sigs: AnnotationSignatures, group: String): Array[String] =
+      sigs.maps(group).keys.toArray.sorted
+
+    // Label of a group printed as a single column: "group:key1;key2", or the bare name
+    // when the group is unknown or empty.
+    def groupLabel(sigs: AnnotationSignatures, group: String): String =
+      if (sigs.maps.contains(group)) {
+        val keys = sortedKeys(sigs, group)
+        if (keys.isEmpty) group else s"$group:" + keys.mkString(";")
+      } else group
+
+    // Labels of `<group>.all`: one tab-separated label per key.
+    def allLabels(sigs: AnnotationSignatures, group: String): String = {
+      val keys = sortedKeys(sigs, group)
+      if (keys.isEmpty) group else keys.mkString("\t")
+    }
+
+    def mapColumnNames(input: String): String = {
+      input match {
+        case "v" => "Variant"
+        case "s" => "Sample"
+        case "va" =>
+          fatal("parse error in condition: cannot print 'va', choose a group or value in annotations")
+        case "sa" =>
+          fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations")
+        case variantRegex(x) => x
+        case sampleRegex(x) => x
+        case topLevelSampleAnnoRegex(x) => groupLabel(sas, x)
+        case topLevelVariantAnnoRegex(x) => groupLabel(vas, x)
+        case samplePrintMapRegex(x) => allLabels(sas, x)
+        case variantPrintMapRegex(x) => allLabels(vas, x)
+        case annoRegex(x) => x
+        case _ => input
+      }
+    }
+
+    writeTextFile(output + ".header", state.hadoopConf) { s =>
+      s.write(cond.split(",").map(_.split("\\.").last).mkString("\t"))
+      s.write("\n")
+    }
+
+    hadoopDelete(output, state.hadoopConf, recursive = true)
+
+    stringVDS.rdd
+      .flatMap { case (v, va, strings) => strings }
+      .saveAsTextFile(output)
+
+    state
+  }
+}