Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for importing dosages as call values #5077

Merged
merged 13 commits into from
Jan 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hail/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ plugins {
id 'jacoco'
id 'com.github.johnrengelman.shadow' version '1.2.3'
id "de.undercouch.download" version "3.2.0"
id 'eclipse'
}

import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
Expand Down
6 changes: 6 additions & 0 deletions hail/python/hail/ir/matrix_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import abc
import json

from ..expr.types import tfloat32, tfloat64
from ..typecheck import *
from ..utils import wrap_to_list
from ..utils.java import escape_str
Expand Down Expand Up @@ -58,6 +59,7 @@ def __eq__(self, other):
class MatrixVCFReader(MatrixReader):
@typecheck_method(path=oneof(str, sequenceof(str)),
call_fields=oneof(str, sequenceof(str)),
entry_float_type=enumeration(tfloat32, tfloat64),
header_file=nullable(str),
min_partitions=nullable(int),
reference_genome=nullable(reference_genome_type),
Expand All @@ -70,6 +72,7 @@ class MatrixVCFReader(MatrixReader):
def __init__(self,
path,
call_fields,
entry_float_type,
header_file,
min_partitions,
reference_genome,
Expand All @@ -83,6 +86,7 @@ def __init__(self,
self.header_file = header_file
self.min_partitions = min_partitions
self.call_fields = wrap_to_list(call_fields)
self.entry_float_type = entry_float_type._parsable_string()
self.reference_genome = reference_genome
self.contig_recoding = contig_recoding
self.array_elements_required = array_elements_required
Expand All @@ -95,6 +99,7 @@ def render(self, r):
reader = {'name': 'MatrixVCFReader',
'files': self.path,
'callFields': self.call_fields,
'entryFloatType': self.entry_float_type,
'headerFile': self.header_file,
'minPartitions': self.min_partitions,
'rg': self.reference_genome.name if self.reference_genome else None,
Expand All @@ -110,6 +115,7 @@ def __eq__(self, other):
return isinstance(other, MatrixVCFReader) and \
other.path == self.path and \
other.call_fields == self.call_fields and \
other.entry_float_type == self.entry_float_type and \
other.header_file == self.header_file and \
other.min_partitions == self.min_partitions and \
other.reference_genome == self.reference_genome and \
Expand Down
11 changes: 10 additions & 1 deletion hail/python/hail/methods/impex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,7 @@ def get_vcf_metadata(path):
contig_recoding=nullable(dictof(str, str)),
array_elements_required=bool,
skip_invalid_loci=bool,
entry_float_type=enumeration(tfloat32, tfloat64),
# json
_partitions=nullable(str))
def import_vcf(path,
Expand All @@ -1760,6 +1761,7 @@ def import_vcf(path,
contig_recoding=None,
array_elements_required=True,
skip_invalid_loci=False,
entry_float_type=tfloat64,
_partitions=None) -> MatrixTable:
"""Import VCF file(s) as a :class:`.MatrixTable`.

Expand Down Expand Up @@ -1876,13 +1878,17 @@ def import_vcf(path,
element.
skip_invalid_loci : :obj:`bool`
If ``True``, skip loci that are not consistent with `reference_genome`.
entry_float_type: :class:`.HailType`
Type of floating point entries in matrix table. Must be one of:
:py:data:`.tfloat32` or :py:data:`.tfloat64`. Default:
:py:data:`.tfloat64`.

Returns
-------
:class:`.MatrixTable`
"""

reader = MatrixVCFReader(path, call_fields, header_file, min_partitions,
reader = MatrixVCFReader(path, call_fields, entry_float_type, header_file, min_partitions,
reference_genome, contig_recoding, array_elements_required,
skip_invalid_loci, force_bgz, force, _partitions)
return MatrixTable(MatrixRead(reader, drop_cols=drop_samples))
Expand All @@ -1892,6 +1898,7 @@ def import_vcf(path,
force=bool,
force_bgz=bool,
call_fields=oneof(str, sequenceof(str)),
entry_float_type=enumeration(tfloat32, tfloat64),
reference_genome=nullable(reference_genome_type),
contig_recoding=nullable(dictof(str, str)),
array_elements_required=bool,
Expand All @@ -1901,6 +1908,7 @@ def import_vcfs(path,
force=False,
force_bgz=False,
call_fields=[],
entry_float_type=tfloat64,
reference_genome='default',
contig_recoding=None,
array_elements_required=True,
Expand Down Expand Up @@ -1951,6 +1959,7 @@ def import_vcfs(path,
jmts = _cached_importvcfs.pyApply(
wrap_to_list(path),
wrap_to_list(call_fields),
entry_float_type._parsable_string(),
rg,
contig_recoding,
array_elements_required,
Expand Down
28 changes: 26 additions & 2 deletions hail/python/test/hail/methods/test_impex.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import json
import os
import unittest

import shutil

import hail as hl
from ..helpers import *
from hail.utils import new_temp_file, FatalError, run_command, uri_path
import unittest
import os

setUpModule = startTestHailContext
tearDownModule = stopTestHailContext
Expand Down Expand Up @@ -160,6 +163,27 @@ def test_import_vcf_set_field_missing(self):
mt = hl.import_vcf(resource('test_set_field_missing.vcf'))
mt.aggregate_entries(hl.agg.sum(mt.DP))

def test_import_vcf_dosages_as_doubles_or_floats(self):
mt = hl.import_vcf(resource('small-ds.vcf'))
self.assertEqual(hl.expr.expressions.typed_expressions.Float64Expression, type(mt.entry.DS))
mt32 = hl.import_vcf(resource('small-ds.vcf'), entry_float_type=hl.tfloat32)
self.assertEqual(hl.expr.expressions.typed_expressions.Float32Expression, type(mt32.entry.DS))
mt_result = mt.annotate_entries(DS32=mt32.index_entries(mt.row_key, mt.col_key).DS)
compare = mt_result.annotate_entries(
test=(hl.coalesce(hl.approx_equal(mt_result.DS, mt_result.DS32, nan_same=True), True))
)
self.assertTrue(all(compare.test.collect()))

def test_import_vcf_invalid_float_type(self):
with self.assertRaises(TypeError):
mt = hl.import_vcf(resource('small-ds.vcf'), entry_float_type=hl.tstr)
with self.assertRaises(TypeError):
mt = hl.import_vcf(resource('small-ds.vcf'), entry_float_type=hl.tint)
with self.assertRaises(TypeError):
mt = hl.import_vcf(resource('small-ds.vcf'), entry_float_type=hl.tint32)
with self.assertRaises(TypeError):
mt = hl.import_vcf(resource('small-ds.vcf'), entry_float_type=hl.tint64)

def test_export_vcf(self):
dataset = hl.import_vcf(resource('sample.vcf.bgz'))
vcf_metadata = hl.get_vcf_metadata(resource('sample.vcf.bgz'))
Expand Down
25 changes: 21 additions & 4 deletions hail/python/test/hail/methods/test_statgen.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import os
import unittest
from subprocess import DEVNULL, call as syscall
import numpy as np

import pytest
import numpy as np
import shutil

import hail as hl
import hail.expr.aggregators as agg
Expand Down Expand Up @@ -379,6 +382,20 @@ def test_linear_regression_with_dosage(self):
self.assertAlmostEqual(results[3].p_value, 0.2533675, places=6)
self.assertTrue(np.isnan(results[6].standard_error))

def test_linear_regression_equivalence_between_ds_and_gt(self):
"""Test that linear regressions on data converted from dosage to genotype returns the same results"""
ds_mt = hl.import_vcf(resource('small-ds.vcf'))
gt_mt = hl.import_vcf(resource('small-gt.vcf'))
pheno_t = hl.read_table(resource('small-pheno.t'))
ds_mt = ds_mt.annotate_cols(**pheno_t[ds_mt.s])
gt_mt = gt_mt.annotate_cols(**pheno_t[gt_mt.s])
ds_results_mt = hl.linear_regression_rows(y=ds_mt.phenotype, x=ds_mt.DS, covariates=[1.0])
gt_results_mt = hl.linear_regression_rows(y=gt_mt.phenotype, x=gt_mt.GT.n_alt_alleles(), covariates=[1.0])
ds_results_t = ds_results_mt.select(ds_p_value=ds_results_mt.p_value)
gt_results_t = gt_results_mt.select(gt_p_value=gt_results_mt.p_value)
results_t = ds_results_t.annotate(**gt_results_t[ds_results_t.locus, ds_results_t.alleles])
self.assertTrue(all(hl.approx_equal(results_t.ds_p_value, results_t.gt_p_value, nan_same=True).collect()))

def test_linear_regression_with_import_fam_boolean(self):
covariates = hl.import_table(resource('regressionLinear.cov'),
key='Sample',
Expand Down Expand Up @@ -1348,7 +1365,7 @@ def test_stat(k, n, m, seed):
# test af distribution
def variance(expr):
return hl.bind(lambda mean: hl.mean(hl.map(lambda elt: (elt - mean) ** 2, expr)), hl.mean(expr))
delta_mean = 0.2 # consider alternatives to 0.2
delta_mean = 0.2 # consider alternatives to 0.2
delta_var = 0.1
per_row = hl.bind(lambda mean, var, ancestral:
(ancestral > mean - delta_mean) &
Expand Down Expand Up @@ -1434,7 +1451,7 @@ def test_de_novo(self):
ped = hl.Pedigree.read(resource('denovo.fam'))
r = hl.de_novo(mt, ped, mt.info.ESP)
r = r.select(
prior = r.prior,
prior=r.prior,
kid_id=r.proband.s,
dad_id=r.father.s,
mom_id=r.mother.s,
Expand Down Expand Up @@ -1475,4 +1492,4 @@ def test_regression_field_dependence(self):

hl.logistic_regression_rows('wald', y=mt.c1, x=x_expr, covariates=[1])
hl.poisson_regression_rows('wald', y=mt.c1, x=x_expr, covariates=[1])
hl.linear_regression_rows(y=mt.c1, x=x_expr, covariates=[1])
hl.linear_regression_rows(y=mt.c1, x=x_expr, covariates=[1])
5 changes: 3 additions & 2 deletions hail/python/test/hail/stats/test_linear_mixed_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,14 +326,15 @@ def test_linear_mixed_regression_full_rank(self):
mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
p_path = utils.new_temp_file()

h2_fastlmm = 0.14276125
h2_fastlmm = 0.142761
h2_places = 6
beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]
pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x], z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
model.fit()
self.assertAlmostEqual(model.h_sq, h2_fastlmm)
self.assertAlmostEqual(model.h_sq, h2_fastlmm, places=h2_places)

mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
Expand Down
2 changes: 1 addition & 1 deletion hail/python/test/hail/test_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def test_matrix_ir_parses(self):
ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
matrix_read,
matrix_range,
ir.MatrixRead(ir.MatrixVCFReader(resource('sample.vcf'), ['GT'], None, None, None, None,
ir.MatrixRead(ir.MatrixVCFReader(resource('sample.vcf'), ['GT'], hl.tfloat64, None, None, None, None,
False, True, False, True, None)),
ir.MatrixRead(ir.MatrixBGENReader(resource('example.8bits.bgen'), None, {}, 10, 1, None)),
ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
Expand Down
14 changes: 13 additions & 1 deletion hail/src/main/scala/is/hail/io/vcf/HtsjdkRecordReader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package is.hail.io.vcf

import htsjdk.variant.variantcontext.VariantContext
import is.hail.annotations._
import is.hail.expr.ir.IRParser
import is.hail.expr.types._
import is.hail.expr.types.virtual._
import is.hail.utils._
Expand All @@ -18,10 +19,21 @@ class BufferedLineIterator(bit: BufferedIterator[String]) extends htsjdk.tribble
}
}

class HtsjdkRecordReader(val callFields: Set[String]) extends Serializable {
class HtsjdkRecordReader(
val callFields: Set[String],
val entryFloatTypeName: String = TFloat64()._toPretty) extends Serializable {

import HtsjdkRecordReader._

val entryFloatType: TNumeric = IRParser.parseType(entryFloatTypeName) match {
case t32: TFloat32 => t32
case t64: TFloat64 => t64
case _ => fatal(
s"""invalid floating point type:
| expected ${TFloat32()._toPretty} or ${TFloat64()._toPretty}, got ${entryFloatTypeName}"""
)
}

def readVariantInfo(
vc: VariantContext,
rvb: RegionValueBuilder,
Expand Down
Loading