Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hail] Add table and matrix table indices #6266

Merged
merged 5 commits into from
Jul 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions hail/python/hail/ir/matrix_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import abc
import json

import hail as hl

from .utils import make_filter_and_replace
from ..expr.types import tfloat32, tfloat64
from ..genetics.reference_genome import reference_genome_type
Expand All @@ -20,18 +22,48 @@ def __eq__(self, other):


class MatrixNativeReader(MatrixReader):
@typecheck_method(path=str)
def __init__(self, path):
@typecheck_method(path=str,
intervals=nullable(sequenceof(anytype)),
filter_intervals=bool)
def __init__(self, path, intervals, filter_intervals):
if intervals is not None:
t = hl.expr.impute_type(intervals)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this logic should be in read_matrix_table -- we generally assume that parameters on the IR have been validated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also, shouldn't there be a check that the interval type agrees with the key type?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't do that until this is wrapped in a Table/MatrixTable (python classes). We have no access to the key type at this point. I can certainly move much of the logic into read(_matrix)_table.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, realized that after I commented and was looking at some of the code i master. I think it would be preferable to do the type lookup in read_matrix_table and then pass it into the IR node, but that's definitely out of scope of this change.

if not isinstance(t, hl.tarray) and not isinstance(t.element_type, hl.tinterval):
raise TypeError("'intervals' must be an array of tintervals")
pt = t.element_type.point_type
if isinstance(pt, hl.tstruct):
self._interval_type = t
else:
self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))

self.path = path
self.filter_intervals = filter_intervals
if intervals is not None and t != self._interval_type:
self.intervals = [hl.Interval(hl.Struct(__point=i.start),
hl.Struct(__point=i.end),
i.includes_start,
i.includes_end) for i in intervals]
else:
self.intervals = intervals

def render(self, r):
reader = {'name': 'MatrixNativeReader',
'path': self.path}
if self.intervals is not None:
assert self._interval_type is not None
reader['options'] = {
'name': 'NativeReaderOptions',
'intervals': self._interval_type._convert_to_json(self.intervals),
'intervalPointType': self._interval_type.element_type.point_type._parsable_string(),
'filterIntervals': self.filter_intervals,
}
return escape_str(json.dumps(reader))

def __eq__(self, other):
return isinstance(other, MatrixNativeReader) and \
other.path == self.path
other.path == self.path and \
other.intervals == self.intervals and \
other.filter_intervals == self.filter_intervals


class MatrixRangeReader(MatrixReader):
Expand Down
38 changes: 35 additions & 3 deletions hail/python/hail/ir/table_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import abc
import json

import hail as hl

from hail.ir.utils import make_filter_and_replace
from hail.typecheck import *
from hail.utils.misc import escape_str
Expand All @@ -17,18 +19,48 @@ def __eq__(self, other):


class TableNativeReader(TableReader):
@typecheck_method(path=str)
def __init__(self, path):
@typecheck_method(path=str,
intervals=nullable(sequenceof(anytype)),
filter_intervals=bool)
def __init__(self, path, intervals, filter_intervals):
chrisvittal marked this conversation as resolved.
Show resolved Hide resolved
if intervals is not None:
t = hl.expr.impute_type(intervals)
if not isinstance(t, hl.tarray) and not isinstance(t.element_type, hl.tinterval):
raise TypeError("'intervals' must be an array of tintervals")
pt = t.element_type.point_type
if isinstance(pt, hl.tstruct):
self._interval_type = t
else:
self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))

self.path = path
self.filter_intervals = filter_intervals
if intervals is not None and t != self._interval_type:
self.intervals = [hl.Interval(hl.Struct(__point=i.start),
hl.Struct(__point=i.end),
i.includes_start,
i.includes_end) for i in intervals]
else:
self.intervals = intervals

def render(self):
reader = {'name': 'TableNativeReader',
'path': self.path}
if self.intervals is not None:
assert self._interval_type is not None
reader['options'] = {
'name': 'NativeReaderOptions',
'intervals': self._interval_type._convert_to_json(self.intervals),
'intervalPointType': self._interval_type.element_type.point_type._parsable_string(),
'filterIntervals': self.filter_intervals,
}
return escape_str(json.dumps(reader))

def __eq__(self, other):
return isinstance(other, TableNativeReader) and \
other.path == self.path
other.path == self.path and \
other.intervals == self.intervals and \
other.filter_intervals == self.filter_intervals

class TextTableReader(TableReader):
def __init__(self, paths, min_partitions, types, comment,
Expand Down
16 changes: 11 additions & 5 deletions hail/python/hail/methods/impex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1785,9 +1785,12 @@ def import_plink(bed, bim, fam,


@typecheck(path=str,
_intervals=nullable(sequenceof(anytype)),
_filter_intervals=bool,
_drop_cols=bool,
_drop_rows=bool)
def read_matrix_table(path, _drop_cols=False, _drop_rows=False) -> MatrixTable:
def read_matrix_table(path, *, _intervals=None, _filter_intervals=False, _drop_cols=False,
_drop_rows=False) -> MatrixTable:
"""Read in a :class:`.MatrixTable` written with :meth:`.MatrixTable.write`.

Parameters
Expand All @@ -1799,7 +1802,8 @@ def read_matrix_table(path, _drop_cols=False, _drop_rows=False) -> MatrixTable:
-------
:class:`.MatrixTable`
"""
return MatrixTable(MatrixRead(MatrixNativeReader(path), _drop_cols, _drop_rows))
return MatrixTable(MatrixRead(MatrixNativeReader(path, _intervals, _filter_intervals),
_drop_cols, _drop_rows))


@typecheck(path=str)
Expand Down Expand Up @@ -2172,8 +2176,10 @@ def index_bgen(path,
Env.hc()._jhc.indexBgen(wrap_to_list(path), index_file_map, joption(rg), contig_recoding, skip_invalid_loci)


@typecheck(path=str)
def read_table(path) -> Table:
@typecheck(path=str,
_intervals=nullable(sequenceof(anytype)),
_filter_intervals=bool)
def read_table(path, *, _intervals=None, _filter_intervals=False) -> Table:
"""Read in a :class:`.Table` written with :meth:`.Table.write`.

Parameters
Expand All @@ -2185,7 +2191,7 @@ def read_table(path) -> Table:
-------
:class:`.Table`
"""
tr = TableNativeReader(path)
tr = TableNativeReader(path, _intervals, _filter_intervals)
return Table(TableRead(tr, False))


Expand Down
2 changes: 1 addition & 1 deletion hail/python/hail/methods/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def maximal_independent_set(i, j, keep=True, tie_breaker=None, keyed=True) -> Ta

When multiple nodes have the same degree, this algorithm will order the
nodes according to ``tie_breaker`` and remove the *largest* node.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whitespace change, and not a fix to something against default formatting?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My editor does this automatically. I can revert this in particular if we want, but also I feel that it's good style to not have trailing whitespace.

Copy link
Contributor

@tpoterba tpoterba Jun 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I'm sympathetic, but it does make the diff harder to read. A setting that applies the transformation only in lines otherwise modified would be great -- does that exist?


If `keyed` is ``False``, then a node may appear twice in the resulting
table.

Expand Down
41 changes: 22 additions & 19 deletions hail/python/test/hail/matrixtable/test_file_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,26 +42,29 @@ def test_backward_compatability(self):

all_values_table, all_values_matrix_table = create_all_values_datasets()

table_dir = resource('backward_compatability/1.0.0/table')
matrix_table_dir = resource('backward_compatability/1.0.0/matrix_table')
resource_dir = resource('backward_compatability')
versions = os.listdir(resource_dir)

n = 0
i = 0
f = os.path.join(table_dir, '{}.ht'.format(i))
while os.path.exists(f):
ds = hl.read_table(f)
self.assertTrue(ds._same(all_values_table))
i += 1
for v in versions:
table_dir = os.path.join(resource_dir, v, 'table')
i = 0
f = os.path.join(table_dir, '{}.ht'.format(i))
n += 1

i = 0
f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
while os.path.exists(f):
ds = hl.read_matrix_table(f)
self.assertTrue(ds._same(all_values_matrix_table))
i += 1
while os.path.exists(f):
ds = hl.read_table(f)
self.assertTrue(ds._same(all_values_table))
i += 1
f = os.path.join(table_dir, '{}.ht'.format(i))
n += 1

matrix_table_dir = os.path.join(resource_dir, v, 'matrix_table')
i = 0
f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
n += 1

self.assertEqual(n, 8)
while os.path.exists(f):
ds = hl.read_matrix_table(f)
self.assertTrue(ds._same(all_values_matrix_table))
i += 1
f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
n += 1

self.assertEqual(n, 20)
32 changes: 32 additions & 0 deletions hail/python/test/hail/matrixtable/test_matrix_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,38 @@ def test_read_stored_globals(self):
t = hl.read_table(f + '/globals')
self.assertTrue(ds.globals_table()._same(t))

def test_indexed_read(self):
mt = hl.utils.range_matrix_table(2000, 100, 10)
f = new_temp_file(suffix='mt')
mt.write(f)
mt2 = hl.read_matrix_table(f, _intervals=[
hl.Interval(start=150, end=250, includes_start=True, includes_end=False),
hl.Interval(start=250, end=500, includes_start=True, includes_end=False),
])
self.assertEqual(mt2.n_partitions(), 2)
self.assertTrue(mt.filter_rows((mt.row_idx >= 150) & (mt.row_idx < 500))._same(mt2))

mt2 = hl.read_matrix_table(f, _intervals=[
hl.Interval(start=150, end=250, includes_start=True, includes_end=False),
hl.Interval(start=250, end=500, includes_start=True, includes_end=False),
], _filter_intervals=True)
self.assertEqual(mt2.n_partitions(), 3)
self.assertTrue(mt.filter_rows((mt.row_idx >= 150) & (mt.row_idx < 500))._same(mt2))

def test_indexed_read_vcf(self):
vcf = self.get_vds(10)
f = new_temp_file(suffix='mt')
vcf.write(f)
l1, l2, l3, l4 = hl.Locus('20', 10000000), hl.Locus('20', 11000000), hl.Locus('20', 13000000), hl.Locus('20', 14000000)
mt = hl.read_matrix_table(f, _intervals=[
hl.Interval(start=l1, end=l2),
hl.Interval(start=l3, end=l4),
])
self.assertEqual(mt.n_partitions(), 2)
p = (vcf.locus >= l1) & (vcf.locus < l2)
q = (vcf.locus >= l3) & (vcf.locus < l4)
self.assertTrue(vcf.filter_rows(p | q)._same(mt))

def test_codecs_matrix(self):
from hail.utils.java import scala_object
codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
Expand Down
22 changes: 20 additions & 2 deletions hail/python/test/hail/table/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,24 @@ def test_read_back_same_as_exported(self):
t_read_back = hl.import_table(tmp_file, types=dict(t.row.dtype)).key_by('idx')
self.assertTrue(t.select_globals()._same(t_read_back, tolerance=1e-4, absolute=True))

def test_indexed_read(self):
t = hl.utils.range_table(2000, 10)
f = new_temp_file(suffix='ht')
t.write(f)
t2 = hl.read_table(f, _intervals=[
hl.Interval(start=150, end=250, includes_start=True, includes_end=False),
hl.Interval(start=250, end=500, includes_start=True, includes_end=False),
])
self.assertEqual(t2.n_partitions(), 2)
self.assertTrue(t.filter((t.idx >= 150) & (t.idx < 500))._same(t2))

t2 = hl.read_table(f, _intervals=[
hl.Interval(start=150, end=250, includes_start=True, includes_end=False),
hl.Interval(start=250, end=500, includes_start=True, includes_end=False),
], _filter_intervals=True)
self.assertEqual(t2.n_partitions(), 3)
self.assertTrue(t.filter((t.idx >= 150) & (t.idx < 500))._same(t2))

def test_order_by_parsing(self):
hl.utils.range_table(1).annotate(**{'a b c' : 5}).order_by('a b c')._force_count()

Expand Down Expand Up @@ -1003,7 +1021,7 @@ def test_same_different_type(self):

t2 = t1.annotate_globals(x = 7)
self.assertFalse(t1._same(t2))

t3 = t1.annotate(x = 7)
self.assertFalse(t1._same(t3))

Expand All @@ -1019,7 +1037,7 @@ def test_same_different_global(self):
def test_same_different_rows(self):
t1 = (hl.utils.range_table(2)
.annotate(x = 7))

t2 = t1.annotate(x = 8)
self.assertFalse(t1._same(t2))

Expand Down
14 changes: 9 additions & 5 deletions hail/python/test/hail/test_ir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def value_irs(self):
table = ir.TableRange(5, 3)

matrix_read = ir.MatrixRead(ir.MatrixNativeReader(
resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)
resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None, False),
False, False)

block_matrix_read = ir.BlockMatrixRead(ir.BlockMatrixNativeReader('fake_file_path'))

Expand Down Expand Up @@ -139,11 +140,12 @@ class TableIRTests(unittest.TestCase):
def table_irs(self):
b = ir.TrueIR()
table_read = ir.TableRead(
ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')), False)
ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht'), None, False), False)
table_read_row_type = hl.dtype('struct{idx: int32, f32: float32, i64: int64, m: float64, astruct: struct{a: int32, b: float64}, mstruct: struct{x: int32, y: str}, aset: set<str>, mset: set<float64>, d: dict<array<str>, float64>, md: dict<int32, str>, h38: locus<GRCh38>, ml: locus<GRCh37>, i: interval<locus<GRCh37>>, c: call, mc: call, t: tuple(call, str, str), mt: tuple(locus<GRCh37>, bool)}')

matrix_read = ir.MatrixRead(
ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)
ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None, False),
False, False)

range = ir.TableRange(10, 4)
table_irs = [
Expand Down Expand Up @@ -208,9 +210,11 @@ def matrix_irs(self):
collect = ir.MakeStruct([('x', ir.ApplyAggOp('Collect', [], None, [ir.I32(0)]))])

matrix_read = ir.MatrixRead(
ir.MatrixNativeReader(resource('backward_compatability/1.0.0/matrix_table/0.hmt')), False, False)
ir.MatrixNativeReader(
resource('backward_compatability/1.0.0/matrix_table/0.hmt'), None, False),
False, False)
table_read = ir.TableRead(
ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht')), False)
ir.TableNativeReader(resource('backward_compatability/1.0.0/table/0.ht'), None, False), False)

matrix_range = ir.MatrixRead(ir.MatrixRangeReader(1, 1, 10))
matrix_irs = [
Expand Down
Loading