Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[query] better error message when info array field has missing elements #14105

Merged
merged 9 commits into from Jan 11, 2024
Merged
11 changes: 11 additions & 0 deletions hail/python/hail/docs/data/missing-values-in-array-fields.vcf
@@ -0,0 +1,11 @@
##fileformat=VCFv4.1
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=X,Number=.,Type=Integer,Description="">
##FORMAT=<ID=Y,Number=.,Type=Integer,Description="">
##FORMAT=<ID=Z,Number=.,Type=Integer,Description="">
##INFO=<ID=A,Number=A,Type=Integer,Description="">
##INFO=<ID=B,Number=R,Type=Float,Description="">
##INFO=<ID=C,Number=3,Type=Float,Description="">
##INFO=<ID=D,Number=.,Type=Float,Description="">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1
1 123456 . A C . . A=1,.;B=.,2,.;C=. GT:X:Y:Z 0/0:1,.,1:.
42 changes: 42 additions & 0 deletions hail/python/hail/methods/impex.py
Expand Up @@ -2872,6 +2872,48 @@ def import_vcf(

>>> ds = hl.import_vcf('data/sample.vcf.gz', force_bgz=True)

Import a VCF that has missing values (".") inside INFO or FORMAT array fields by passing array_elements_required=False:

>>> print(open('data/missing-values-in-array-fields.vcf').read())
##fileformat=VCFv4.1
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=X,Number=.,Type=Integer,Description="">
##FORMAT=<ID=Y,Number=.,Type=Integer,Description="">
##FORMAT=<ID=Z,Number=.,Type=Integer,Description="">
##INFO=<ID=A,Number=A,Type=Integer,Description="">
##INFO=<ID=B,Number=R,Type=Float,Description="">
##INFO=<ID=C,Number=3,Type=Float,Description="">
##INFO=<ID=D,Number=.,Type=Float,Description="">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1
1 123456 . A C . . A=1,.;B=.,2,.;C=. GT:X:Y:Z 0/0:1,.,1:.

>>> ds = hl.import_vcf('data/missing-values-in-array-fields.vcf', array_elements_required=False)
>>> ds.show(n_rows=1, n_cols=1, include_row_fields=True)
+---------------+------------+------+-----------+----------+--------------+
| locus | alleles | rsid | qual | filters | info.A |
+---------------+------------+------+-----------+----------+--------------+
| locus<GRCh37> | array<str> | str | float64 | set<str> | array<int32> |
+---------------+------------+------+-----------+----------+--------------+
| 1:123456 | ["A","C"] | NA | -1.00e+01 | NA | [1,NA] |
+---------------+------------+------+-----------+----------+--------------+
<BLANKLINE>
+------------------+----------------+----------------+--------------+
| info.B | info.C | info.D | 'SAMPLE1'.GT |
+------------------+----------------+----------------+--------------+
| array<float64> | array<float64> | array<float64> | call |
+------------------+----------------+----------------+--------------+
| [NA,2.00e+00,NA] | NA | NA | 0/0 |
+------------------+----------------+----------------+--------------+
<BLANKLINE>
+--------------+--------------+--------------+
| 'SAMPLE1'.X | 'SAMPLE1'.Y | 'SAMPLE1'.Z |
+--------------+--------------+--------------+
| array<int32> | array<int32> | array<int32> |
+--------------+--------------+--------------+
| [1,NA,1] | NA | NA |
+--------------+--------------+--------------+


Notes
-----

Expand Down
27 changes: 23 additions & 4 deletions hail/python/test/hail/methods/test_impex.py
Expand Up @@ -212,28 +212,47 @@ def test_import_vcf_can_import_negative_numbers(self):
)
)

def test_import_vcf_missing_info_field_elements(self):
def test_import_vcf_has_good_error_message_when_info_fields_have_missing_elements(self):
mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37')
with pytest.raises(
FatalError,
match=".*Missing value in INFO array. Use 'hl.import_vcf[(][.][.][.], array_elements_required=False[)]'[.].*",
):
mt._force_count_rows()

def test_import_vcf_array_elements_required_is_false_parses_info_fields_with_missing_elements(self):
mt = hl.import_vcf(resource('missingInfoArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
mt = mt.select_rows(FOO=mt.info.FOO, BAR=mt.info.BAR)
mt = mt.select_rows(**mt.info)
expected = hl.Table.parallelize(
[
{'locus': hl.Locus('X', 16050036), 'alleles': ['A', 'C'], 'FOO': [1, None], 'BAR': [2, None, None]},
{
'locus': hl.Locus('X', 16050036),
'alleles': ['A', 'C'],
'FOO': [1, None],
'BAR': [2, None, None],
'JUST_A_DOT': None,
'NOT_EVEN_PRESENT': None,
},
{
'locus': hl.Locus('X', 16061250),
'alleles': ['T', 'A', 'C'],
'FOO': [None, 2, None],
'BAR': [None, 1.0, None],
'JUST_A_DOT': None,
'NOT_EVEN_PRESENT': None,
},
],
hl.tstruct(
locus=hl.tlocus('GRCh37'),
alleles=hl.tarray(hl.tstr),
FOO=hl.tarray(hl.tint),
BAR=hl.tarray(hl.tfloat64),
JUST_A_DOT=hl.tarray(hl.tfloat64),
NOT_EVEN_PRESENT=hl.tarray(hl.tfloat64),
),
key=['locus', 'alleles'],
)
self.assertTrue(mt.rows()._same(expected))
assert mt.rows()._same(expected)

def test_import_vcf_missing_format_field_elements(self):
mt = hl.import_vcf(resource('missingFormatArray.vcf'), reference_genome='GRCh37', array_elements_required=False)
Expand Down
58 changes: 35 additions & 23 deletions hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala
Expand Up @@ -740,7 +740,7 @@ final class VCFLine(
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
s"missing value in FORMAT array. Import with argument 'array_elements_required=False'"
"Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
ab.addMissing()
pos += 1
Expand All @@ -749,11 +749,11 @@ final class VCFLine(
}
}

def parseIntArrayElement() {
def parseArrayIntElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
s"missing value in FORMAT array. Import with argument 'array_elements_required=False'"
"Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abi.addMissing()
pos += 1
Expand All @@ -766,7 +766,7 @@ final class VCFLine(
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
s"missing value in FORMAT array. Import with argument 'array_elements_required=False'"
"Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abf.addMissing()
pos += 1
Expand All @@ -775,11 +775,11 @@ final class VCFLine(
}
}

def parseDoubleArrayElement() {
def parseArrayDoubleElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
s"missing value in FORMAT array. Import with argument 'array_elements_required=False'"
"Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abd.addMissing()
pos += 1
Expand All @@ -788,11 +788,11 @@ final class VCFLine(
}
}

def parseStringArrayElement() {
def parseArrayStringElement() {
if (formatArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
s"missing value in FORMAT array. Import with argument 'array_elements_required=False'"
"Missing value in FORMAT array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abs.addMissing()
pos += 1
Expand All @@ -808,11 +808,11 @@ final class VCFLine(
} else {
assert(abi.length == 0)

parseIntArrayElement()
parseArrayIntElement()

while (!endFormatField()) {
pos += 1 // comma
parseIntArrayElement()
parseArrayIntElement()
}

rvb.startArray(abi.length)
Expand All @@ -837,10 +837,10 @@ final class VCFLine(
} else {
assert(abs.length == 0)

parseStringArrayElement()
parseArrayStringElement()
while (!endFormatField()) {
pos += 1 // comma
parseStringArrayElement()
parseArrayStringElement()
}

rvb.startArray(abs.length)
Expand Down Expand Up @@ -890,10 +890,10 @@ final class VCFLine(
} else {
assert(abd.length == 0)

parseDoubleArrayElement()
parseArrayDoubleElement()
while (!endFormatField()) {
pos += 1 // comma
parseDoubleArrayElement()
parseArrayDoubleElement()
}

rvb.startArray(abd.length)
Expand Down Expand Up @@ -993,24 +993,36 @@ final class VCFLine(

def parseDoubleInInfoArray(): Double = VCFUtils.parseVcfDouble(parseStringInInfoArray())

def parseIntInfoArrayElement() {
def parseInfoArrayIntElement() {
if (infoArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
"Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abi.addMissing()
pos += 1 // dot
} else
abi += parseIntInInfoArray()
}

def parseStringInfoArrayElement() {
def parseInfoArrayStringElement() {
if (infoArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
"Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abs.addMissing()
pos += 1 // dot
} else
abs += parseStringInInfoArray()
}

def parseDoubleInfoArrayElement() {
def parseInfoArrayDoubleElement() {
if (infoArrayElementMissing()) {
if (arrayElementsRequired)
parseError(
"Missing value in INFO array. Use 'hl.import_vcf(..., array_elements_required=False)'."
)
abd.addMissing()
pos += 1
} else {
Expand All @@ -1022,10 +1034,10 @@ final class VCFLine(
if (!infoFieldMissing()) {
rvb.setPresent()
assert(abi.length == 0)
parseIntInfoArrayElement()
parseInfoArrayIntElement()
while (!endInfoField()) {
pos += 1 // comma
parseIntInfoArrayElement()
parseInfoArrayIntElement()
}

rvb.startArray(abi.length)
Expand All @@ -1046,10 +1058,10 @@ final class VCFLine(
if (!infoFieldMissing()) {
rvb.setPresent()
assert(abs.length == 0)
parseStringInfoArrayElement()
parseInfoArrayStringElement()
while (!endInfoField()) {
pos += 1 // comma
parseStringInfoArrayElement()
parseInfoArrayStringElement()
}

rvb.startArray(abs.length)
Expand All @@ -1070,10 +1082,10 @@ final class VCFLine(
if (!infoFieldMissing()) {
rvb.setPresent()
assert(abd.length == 0)
parseDoubleInfoArrayElement()
parseInfoArrayDoubleElement()
while (!endInfoField()) {
pos += 1 // comma
parseDoubleInfoArrayElement()
parseInfoArrayDoubleElement()
}

rvb.startArray(abd.length)
Expand Down
6 changes: 4 additions & 2 deletions hail/src/test/resources/missingInfoArray.vcf
Expand Up @@ -9,6 +9,8 @@
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=FOO,Number=R,Type=Integer,Description="">
##INFO=<ID=BAR,Number=3,Type=Float,Description="">
##INFO=<ID=JUST_A_DOT,Number=3,Type=Float,Description="">
##INFO=<ID=NOT_EVEN_PRESENT,Number=3,Type=Float,Description="">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT C1046::HG02024 C1046::HG02025
X 16050036 . A C 19961.13 . FOO=1,.;BAR=2,.,. GT:GTA:GTZ:AD:DP:GQ:PL 0/0:./.:0/1:10,0:10:44:0,44,180 1:.:0:0,6:7:70:70,0
X 16061250 . T A,C 547794.46 . FOO=.,2,.;BAR=.,1.0,. GT:GTA:GTZ:AD:DP:GQ:PL 2/2:2/1:1/1:0,0,11:11:33:396,402,411,33,33,0 2:.:1:0,0,9:9:24:24,40,0
X 16050036 . A C 19961.13 . FOO=1,.;BAR=2,.,.;JUST_A_DOT=. GT:GTA:GTZ:AD:DP:GQ:PL 0/0:./.:0/1:10,0:10:44:0,44,180 1:.:0:0,6:7:70:70,0
X 16061250 . T A,C 547794.46 . FOO=.,2,.;BAR=.,1.0,.;JUST_A_DOT=. GT:GTA:GTZ:AD:DP:GQ:PL 2/2:2/1:1/1:0,0,11:11:33:396,402,411,33,33,0 2:.:1:0,0,9:9:24:24,40,0