Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tp exportmore #119

Merged
merged 11 commits into from
Jan 7, 2016
63 changes: 13 additions & 50 deletions src/main/scala/org/broadinstitute/hail/driver/ExportGenotypes.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ object ExportGenotypes extends Command {
var output: String = _

@Args4jOption(required = true, name = "-c", aliases = Array("--condition"),
usage = "Comma-separated list of fields to be printed to tsv")
usage = ".columns file, or comma-separated list of fields/computations to be printed to tsv")
var condition: String = _
}

Expand All @@ -29,15 +29,17 @@ object ExportGenotypes extends Command {
val vds = state.vds
val cond = options.condition
val output = options.output

val vas: AnnotationSignatures = vds.metadata.variantAnnotationSignatures
val sas: AnnotationSignatures = vds.metadata.sampleAnnotationSignatures
val sa = vds.metadata.sampleAnnotations
val ids = vds.sampleIds

val (header, fields) = if (cond.endsWith(".columns"))
ExportTSV.parseColumnsFile(cond, state.hadoopConf)
else
ExportTSV.parseExpression(cond)

val makeString: ((Variant, AnnotationData) =>
((Int, Genotype) => String)) = {
val cf = new ExportGenotypeEvaluator(options.condition, vds.metadata)
val cf = new ExportGenotypeEvaluator(fields, vds.metadata)
cf.typeCheck()
cf.apply
}
Expand All @@ -47,51 +49,12 @@ object ExportGenotypes extends Command {
(s: Int, g: Genotype) =>
makeString(v, va)(s, g))

// FIXME add additional command parsing functionality. Somewhat hacky
val variantRegex =
"""v\.(\w+)""".r
val sampleRegex = """s\.(\w+)""".r
val topLevelSampleAnnoRegex = """sa\.(\w+)""".r
val topLevelVariantAnnoRegex = """va\.(\w+)""".r
val samplePrintMapRegex = """sa\.(\w+)\.all""".r
val variantPrintMapRegex = """va\.(\w+)\.all""".r
val annoRegex = """\wa\.(.+)""".r
/**
 * Derives the header column name for one field of the export condition.
 *
 * Cases are tried in order, and each regex extractor has to match the whole
 * field, so `sa.foo.all` skips the top-level annotation cases and is handled
 * by the `.all` cases further down.
 *
 * @param input a single field taken from the condition string
 * @return the column label to use for that field in the header
 */
def mapColumnNames(input: String): String = input match {
  // Bare identifiers get a fixed, human-readable label.
  case "v" => "Variant"
  case "s" => "Sample"

  // A bare annotation root is rejected through `fatal`.
  case "va" =>
    fatal("parse error in condition: cannot print 'va', choose a group or value in annotations")
  case "sa" =>
    fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations")

  // `v.<name>` / `s.<name>`: keep only the member name.
  case variantRegex(member) => member
  case sampleRegex(member) => member

  // `sa.<name>`: when <name> is a known, non-empty annotation map, label the
  // column as "<name>:" followed by its sorted keys joined with ';'.
  case topLevelSampleAnnoRegex(group) =>
    if (!sas.maps.contains(group)) group
    else {
      val sortedKeys = sas.maps(group).keys.toArray.sorted
      if (sortedKeys.nonEmpty) group + ":" + sortedKeys.mkString(";") else group
    }

  // `va.<name>`: same rule, looked up in the variant annotation signatures.
  case topLevelVariantAnnoRegex(group) =>
    if (!vas.maps.contains(group)) group
    else {
      val sortedKeys = vas.maps(group).keys.toArray.sorted
      if (sortedKeys.nonEmpty) group + ":" + sortedKeys.mkString(";") else group
    }

  // `sa.<name>.all` / `va.<name>.all`: the map's keys joined with tabs, in the
  // map's own iteration order (not sorted). The map is looked up directly,
  // without a `contains` check.
  case samplePrintMapRegex(group) =>
    val mapKeys = sas.maps(group).keys
    if (mapKeys.nonEmpty) mapKeys.mkString("\t") else group
  case variantPrintMapRegex(group) =>
    val mapKeys = vas.maps(group).keys
    if (mapKeys.nonEmpty) mapKeys.mkString("\t") else group

  // Any other annotation path: everything after the `<x>a.` prefix.
  case annoRegex(path) => path

  // Anything unrecognised is used verbatim.
  case other => other
}

writeTextFile(output + ".header", state.hadoopConf) { s =>
s.write(cond.split(",").map(_.split("\\.").last).mkString("\t"))
s.write("\n")
header match {
case Some(str) =>
writeTextFile(output + ".header", state.hadoopConf) { s =>
s.write(str)
s.write("\n")
}
}

hadoopDelete(output, state.hadoopConf, recursive = true)
Expand Down
58 changes: 16 additions & 42 deletions src/main/scala/org/broadinstitute/hail/driver/ExportSamples.scala
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ object ExportSamples extends Command {
var output: String = _

@Args4jOption(required = true, name = "-c", aliases = Array("--condition"),
usage = "Comma-separated list of fields to be printed to tsv")
usage = ".columns file, or comma-separated list of fields/computations to be printed to tsv")
var condition: String = _
}

Expand All @@ -27,55 +27,29 @@ object ExportSamples extends Command {

def run(state: State, options: Options): State = {
val vds = state.vds

val sas = vds.metadata.sampleAnnotationSignatures
val cond = options.condition

val output = options.output

val sas = vds.metadata.sampleAnnotationSignatures
val (header, fields) = if (cond.endsWith(".columns"))
ExportTSV.parseColumnsFile(cond, state.hadoopConf)
else
ExportTSV.parseExpression(cond)

val makeString: (Sample, Annotations[String]) => String = {
try {
val ese = new ExportSamplesEvaluator(cond, sas)
ese.typeCheck()
ese.apply
} catch {
case e: scala.tools.reflect.ToolBoxError =>
/* e.message looks like:
reflective compilation has failed:

';' expected but '.' found. */
fatal("parse error in condition: " + e.message)//.split("\n").last)
}
val ese = new ExportSamplesEvaluator(fields, sas)
ese.typeCheck()
ese.apply
}

// FIXME add additional command parsing functionality
val sampleRegex = """s\.(\w+)""".r
val topLevelAnnoRegex = """sa\.(\w+)""".r
val printMapRegex = """sa\.(\w+)\.all""".r
val annoRegex = """sa\.(.+)""".r
/**
 * Derives the header column name for one field of the sample export condition.
 *
 * Cases are tried in order, and each regex extractor has to match the whole
 * field, so `sa.foo.all` is not captured by `topLevelAnnoRegex` and reaches
 * the `printMapRegex` case.
 *
 * @param input a single field taken from the condition string
 * @return the column label to use for that field in the header
 */
def mapColumnNames(input: String): String = {
  input match {
    // The sample identifier in conditions is `s` (see `sampleRegex`). This
    // case previously matched "v", so a bare `s` fell through to the
    // catch-all and was labelled "s" instead of "Sample".
    case "s" => "Sample"
    // The bare annotation root is rejected through `fatal`.
    case "sa" =>
      fatal("parse error in condition: cannot print 'sa', choose a group or value in annotations")
    // `s.<name>`: keep only the member name.
    case sampleRegex(x) => x
    // `sa.<name>`: when <name> is a known, non-empty annotation map, label the
    // column as "<name>:" followed by its sorted keys joined with ';'.
    case topLevelAnnoRegex(x) =>
      if (sas.maps.contains(x)) {
        val keys = sas.maps(x).keys.toArray.sorted
        if (keys.isEmpty) x else s"$x:" + keys.mkString(";")
      }
      else x
    // `sa.<name>.all`: the map's keys joined with tabs, in the map's own
    // iteration order (not sorted). The map is looked up without a
    // `contains` check.
    case printMapRegex(x) =>
      val keys = sas.maps(x).keys
      if (keys.isEmpty) x else keys.mkString("\t")
    // Any other annotation path: everything after the `sa.` prefix.
    case annoRegex(x) => x
    // Anything unrecognised is used verbatim.
    case _ => input
  }
}

writeTextFile(output + ".header", state.hadoopConf) { s =>
s.write(cond.split(",").map(_.split("\\.").last).mkString(";"))
s.write("\n")
header match {
case Some(str) =>
writeTextFile(output + ".header", state.hadoopConf) { s =>
s.write(str)
s.write("\n")
}
}

hadoopDelete(output, state.hadoopConf, recursive = true)
Expand Down
64 changes: 20 additions & 44 deletions src/main/scala/org/broadinstitute/hail/driver/ExportVariants.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import org.broadinstitute.hail.variant._
import org.broadinstitute.hail.annotations._
import org.kohsuke.args4j.{Option => Args4jOption}

import scala.io.Source

object ExportVariants extends Command {

class Options extends BaseOptions {
Expand All @@ -15,7 +17,7 @@ object ExportVariants extends Command {
var output: String = _

@Args4jOption(required = true, name = "-c", aliases = Array("--condition"),
usage = "Comma-separated list of fields to be printed to tsv")
usage = ".columns file, or comma-separated list of fields/computations to be printed to tsv")
var condition: String = _
}

Expand All @@ -27,55 +29,27 @@ object ExportVariants extends Command {

def run(state: State, options: Options): State = {
val vds = state.vds

val vas = vds.metadata.variantAnnotationSignatures
val cond = options.condition

val output = options.output

val vas = vds.metadata.variantAnnotationSignatures
val makeString: (Variant, Annotations[String]) => String = {
try {
val eve = new ExportVariantsEvaluator(cond, vas)
eve.typeCheck()
eve.apply
} catch {
case e: scala.tools.reflect.ToolBoxError =>
/* e.message looks like:
reflective compilation has failed:

';' expected but '.' found. */
fatal("parse error in condition: " + e.message.split("\n").last)
}
}
val (header, fields) = if (cond.endsWith(".columns"))
ExportTSV.parseColumnsFile(cond, state.hadoopConf)
else
ExportTSV.parseExpression(cond)

// FIXME add additional command parsing functionality
val variantRegex = """v\.(\w+)""".r
val topLevelAnnoRegex = """va\.(\w+)""".r
val printMapRegex = """va\.(\w+)\.all""".r
val annoRegex = """va\.(.+)""".r
def mapColumnNames(input: String): String = {
input match {
case "v" => "Variant"
case "va" =>
fatal("parse error in condition: cannot print 'va', choose a group or value in annotations")
case variantRegex(x) => x
case topLevelAnnoRegex(x) =>
if (vas.maps.contains(x)) {
val keys = vas.maps(x).keys.toArray.sorted
if (keys.isEmpty) x else s"$x:" + keys.mkString("\t")
}
else x
case printMapRegex(x) =>
val keys = vas.maps(x).keys
if (keys.isEmpty) x else keys.mkString("\t")
case annoRegex(x) => x
case _ => input
}
val makeString: (Variant, Annotations[String]) => String = {
val eve = new ExportVariantsEvaluator(fields, vas)
eve.typeCheck()
eve.apply
}

writeTextFile(output + ".header", state.hadoopConf) { s =>
s.write(cond.split(",").map(mapColumnNames).mkString("\t"))
s.write("\n")
header match {
case Some(str) =>
writeTextFile(output + ".header", state.hadoopConf) { s =>
s.write(str)
s.write("\n")
}
}

hadoopDelete(output, state.hadoopConf, recursive = true)
Expand All @@ -86,4 +60,6 @@ object ExportVariants extends Command {

state
}


}
22 changes: 10 additions & 12 deletions src/main/scala/org/broadinstitute/hail/driver/VariantQC.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import org.kohsuke.args4j.{Option => Args4jOption}
import scala.collection.mutable

object VariantQCCombiner {
val header = "callRate\tMAC\tnCalled\t" +
val header = "callRate\tMAC\tMAF\tnCalled\t" +
"nNotCalled\t" +
"nHomRef\t" +
"nHet\t" +
Expand All @@ -25,14 +25,14 @@ object VariantQCCombiner {
"gqMeanHomRef\tgqStDevHomRef\t" +
"gqMeanHet\tgqStDevHet\t" +
"gqMeanHomVar\tgqStDevHomVar\t" +
"MAF\t" +
"nNonRef\t" +
"rHeterozygosity\t" +
"rHetHomVar\t" +
"rExpectedHetFrequency\tpHWE\t"

val signatures = Map("callRate" -> new SimpleSignature("Double", "toDouble"),
"MAC" -> new SimpleSignature("Int", "toInt"),
"MAF" -> new SimpleSignature("Double", "toDouble"),
"nCalled" -> new SimpleSignature("Int", "toInt"),
"nNotCalled" -> new SimpleSignature("Int", "toInt"),
"nHomRef" -> new SimpleSignature("Int", "toInt"),
Expand All @@ -54,7 +54,6 @@ object VariantQCCombiner {
"gqStDevHet" -> new SimpleSignature("Double", "toDouble"),
"gqMeanHomVar" -> new SimpleSignature("Double", "toDouble"),
"gqStDevHomVar" -> new SimpleSignature("Double", "toDouble"),
"MAF" -> new SimpleSignature("Double", "toDouble"),
"nNonRef" -> new SimpleSignature("Int", "toInt"),
"rHeterozygosity" -> new SimpleSignature("Double", "toDouble"),
"rHetHomVar" -> new SimpleSignature("Double", "toDouble"),
Expand Down Expand Up @@ -152,11 +151,16 @@ class VariantQCCombiner extends Serializable {
val nCalled = nHomRef + nHet + nHomVar

val callRate = divOption(nCalled, nCalled + nNotCalled)
val ac = nHet + 2 * nHomVar
val mac = nHet + 2 * nHomVar

sb.tsvAppend(callRate)
sb += '\t'
sb.append(ac)
sb.append(mac)
sb += '\t'
// MAF
val refAlleles = nHomRef * 2 + nHet
val altAlleles = nHomVar * 2 + nHet
sb.tsvAppend(divOption(altAlleles, refAlleles + altAlleles))
sb += '\t'
sb.append(nCalled)
sb += '\t'
Expand Down Expand Up @@ -187,12 +191,6 @@ class VariantQCCombiner extends Serializable {
emitSC(sb, gqHomVarSC)
sb += '\t'

// MAF
val refAlleles = nHomRef * 2 + nHet
val altAlleles = nHomVar * 2 + nHet
sb.tsvAppend(divOption(altAlleles, refAlleles + altAlleles))
sb += '\t'

// nNonRef
sb.append(nHet + nHomVar)
sb += '\t'
Expand Down Expand Up @@ -225,6 +223,7 @@ class VariantQCCombiner extends Serializable {

Map[String, Any]("callRate" -> divOption(nCalled, nCalled + nNotCalled),
"MAC" -> mac,
"MAF" -> maf,
"nCalled" -> nCalled,
"nNotCalled" -> nNotCalled,
"nHomRef" -> nHomRef,
Expand All @@ -246,7 +245,6 @@ class VariantQCCombiner extends Serializable {
"gqStDevHet" -> someIf(gqHetSC.count > 0, gqHetSC.stdev),
"gqMeanHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.mean),
"gqStDevHomVar" -> someIf(gqHomVarSC.count > 0, gqHomVarSC.stdev),
"MAF" -> maf,
"nNonRef" -> (nHet + nHomVar),
"rHeterozygosity" -> divOption(nHet, nHomRef + nHet + nHomVar),
"rHetHomVar" -> divOption(nHet, nHomVar),
Expand Down