Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feature] Add semi- and anti-joins. #5529

Merged
merged 2 commits into from Mar 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions hail/python/hail/conftest.py
Expand Up @@ -59,6 +59,10 @@ def init(doctest_namespace):

s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
doctest_namespace['s_metadata'] = s_metadata
doctest_namespace['cols_to_keep'] = s_metadata
doctest_namespace['cols_to_remove'] = s_metadata
doctest_namespace['rows_to_keep'] = v_metadata
doctest_namespace['rows_to_remove'] = v_metadata

# Table
table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
Expand Down
157 changes: 157 additions & 0 deletions hail/python/hail/matrixtable.py
Expand Up @@ -1327,6 +1327,163 @@ def check_key(name, keys):

return m

@typecheck_method(other=Table)
def semi_join_rows(self, other: 'Table') -> 'MatrixTable':
    """Filters the matrix table to rows whose key appears in `other`.

    Parameters
    ----------
    other : :class:`.Table`
        Table with compatible key field(s).

    Returns
    -------
    :class:`.MatrixTable`

    Notes
    -----
    The row key type of the matrix table must match the key type of `other`.

    This method does not change the schema of the matrix table; it is a
    method of filtering the matrix table to row keys present in another
    table.

    To discard rows whose key is present in `other`, use
    :meth:`.anti_join_rows`.

    Examples
    --------
    >>> ds_result = ds.semi_join_rows(rows_to_keep)

    It may be expensive to key the matrix table by the right-side key.
    In this case, it is possible to implement a semi-join using a non-key
    field as follows:

    >>> ds_result = ds.filter_rows(hl.is_defined(rows_to_keep.index(ds['locus'], ds['alleles'])))

    See Also
    --------
    :meth:`.anti_join_rows`, :meth:`.filter_rows`, :meth:`.semi_join_cols`
    """
    # Keep a row only when looking up its key in `other` yields a defined value.
    return self.filter_rows(hl.is_defined(other.index(self.row_key)))

@typecheck_method(other=Table)
def anti_join_rows(self, other: 'Table') -> 'MatrixTable':
    """Filters the matrix table to rows whose key does not appear in `other`.

    Parameters
    ----------
    other : :class:`.Table`
        Table with compatible key field(s).

    Returns
    -------
    :class:`.MatrixTable`

    Notes
    -----
    The row key type of the matrix table must match the key type of `other`.

    This method does not change the schema of the matrix table; it is a
    method of filtering the matrix table to row keys not present in another
    table.

    To restrict to rows whose key is present in `other`, use
    :meth:`.semi_join_rows`.

    Examples
    --------
    >>> ds_result = ds.anti_join_rows(rows_to_remove)

    It may be expensive to key the matrix table by the right-side key.
    In this case, it is possible to implement an anti-join using a non-key
    field as follows:

    >>> ds_result = ds.filter_rows(hl.is_missing(rows_to_remove.index(ds['locus'], ds['alleles'])))

    See Also
    --------
    :meth:`.semi_join_rows`, :meth:`.filter_rows`, :meth:`.anti_join_cols`
    """
    # Keep a row only when looking up its key in `other` yields a missing value.
    return self.filter_rows(hl.is_missing(other.index(self.row_key)))


@typecheck_method(other=Table)
def semi_join_cols(self, other: 'Table') -> 'MatrixTable':
    """Filters the matrix table to columns whose key appears in `other`.

    Parameters
    ----------
    other : :class:`.Table`
        Table with compatible key field(s).

    Returns
    -------
    :class:`.MatrixTable`

    Notes
    -----
    The column key type of the matrix table must match the key type of `other`.

    This method does not change the schema of the matrix table; it is a
    method of filtering the matrix table to column keys present in another
    table.

    To discard columns whose key is present in `other`, use
    :meth:`.anti_join_cols`.

    Examples
    --------
    >>> ds_result = ds.semi_join_cols(cols_to_keep)

    It may be inconvenient to key the matrix table by the right-side key.
    In this case, it is possible to implement a semi-join using a non-key
    field as follows:

    >>> ds_result = ds.filter_cols(hl.is_defined(cols_to_keep.index(ds['s'])))

    See Also
    --------
    :meth:`.anti_join_cols`, :meth:`.filter_cols`, :meth:`.semi_join_rows`
    """
    # Keep a column only when looking up its key in `other` yields a defined value.
    return self.filter_cols(hl.is_defined(other.index(self.col_key)))

@typecheck_method(other=Table)
def anti_join_cols(self, other: 'Table') -> 'MatrixTable':
    """Filters the matrix table to columns whose key does not appear in `other`.

    Parameters
    ----------
    other : :class:`.Table`
        Table with compatible key field(s).

    Returns
    -------
    :class:`.MatrixTable`

    Notes
    -----
    The column key type of the matrix table must match the key type of `other`.

    This method does not change the schema of the matrix table; it is a
    method of filtering the matrix table to column keys not present in
    another table.

    To restrict to columns whose key is present in `other`, use
    :meth:`.semi_join_cols`.

    Examples
    --------
    >>> ds_result = ds.anti_join_cols(cols_to_remove)

    It may be inconvenient to key the matrix table by the right-side key.
    In this case, it is possible to implement an anti-join using a non-key
    field as follows:

    >>> ds_result = ds.filter_cols(hl.is_missing(cols_to_remove.index(ds['s'])))

    See Also
    --------
    :meth:`.semi_join_cols`, :meth:`.filter_cols`, :meth:`.anti_join_rows`
    """
    # Keep a column only when looking up its key in `other` yields a missing value.
    return self.filter_cols(hl.is_missing(other.index(self.col_key)))

@typecheck_method(expr=expr_bool, keep=bool)
def filter_rows(self, expr, keep: bool = True) -> 'MatrixTable':
"""Filter rows of the matrix.
Expand Down
77 changes: 77 additions & 0 deletions hail/python/hail/table.py
Expand Up @@ -1988,6 +1988,83 @@ def naive_coalesce(self, max_partitions: int) -> 'Table':
return Table(TableRepartition(
self._tir, max_partitions, RepartitionStrategy.NAIVE_COALESCE))


@typecheck_method(other=table_type)
def semi_join(self, other: 'Table') -> 'Table':
    """Keep only the rows of this table whose key is found in `other`.

    Parameters
    ----------
    other : :class:`.Table`
        Table with compatible key field(s).

    Returns
    -------
    :class:`.Table`

    Notes
    -----
    The key type of this table must match the key type of `other`.

    The schema of the table is left unchanged; the method only filters
    the table down to the keys that are present in another table.

    Use :meth:`.anti_join` to discard the keys present in `other` instead.

    Examples
    --------
    >>> table_result = table1.semi_join(table2)

    Keying the left-side table by the right-side key may be expensive.
    If so, a semi-join can be written against a non-key field like this:

    >>> table_result = table1.filter(hl.is_defined(table2.index(table1['ID'])))

    See Also
    --------
    :meth:`.anti_join`
    """
    # Look up this table's key in `other`; a defined result means the key exists there.
    looked_up = other.index(self.key)
    key_found = hl.is_defined(looked_up)
    return self.filter(key_found)

@typecheck_method(other=table_type)
def anti_join(self, other: 'Table') -> 'Table':
    """Keep only the rows of this table whose key is not found in `other`.

    Parameters
    ----------
    other : :class:`.Table`
        Table with compatible key field(s).

    Returns
    -------
    :class:`.Table`

    Notes
    -----
    The key type of this table must match the key type of `other`.

    The schema of the table is left unchanged; the method only filters
    the table down to the keys that are not present in another table.

    Use :meth:`.semi_join` to restrict to the keys present in `other` instead.

    Examples
    --------
    >>> table_result = table1.anti_join(table2)

    Keying the left-side table by the right-side key may be expensive.
    If so, an anti-join can be written against a non-key field like this:

    >>> table_result = table1.filter(hl.is_missing(table2.index(table1['ID'])))

    See Also
    --------
    :meth:`.semi_join`, :meth:`.filter`
    """
    # Look up this table's key in `other`; a missing result means the key is absent there.
    looked_up = other.index(self.key)
    key_absent = hl.is_missing(looked_up)
    return self.filter(key_absent)

@typecheck_method(right=table_type,
how=enumeration('inner', 'outer', 'left', 'right'),
_mangle=anyfunc)
Expand Down
14 changes: 14 additions & 0 deletions hail/python/test/hail/matrixtable/test_matrix_table.py
Expand Up @@ -332,6 +332,20 @@ def test_weird_names(self):
ds2.explode_rows(ds2['\%!^!@#&#&$%#$%'])
ds2.group_rows_by(ds2.a).aggregate(**{'*``81': agg.count()})

def test_semi_anti_join_rows(self):
    """Row semi-/anti-join of a 10x3 range matrix table against a 3-row range table."""
    matrix = hl.utils.range_matrix_table(10, 3)
    key_table = hl.utils.range_table(3)

    # Semi-join is expected to leave 3 rows; columns are untouched.
    kept_shape = matrix.semi_join_rows(key_table).count()
    assert kept_shape == (3, 3)

    # Anti-join is expected to leave the other 7 rows.
    dropped_shape = matrix.anti_join_rows(key_table).count()
    assert dropped_shape == (7, 3)

def test_semi_anti_join_cols(self):
    """Column semi-/anti-join of a 3x10 range matrix table against a 3-row range table."""
    matrix = hl.utils.range_matrix_table(3, 10)
    key_table = hl.utils.range_table(3)

    # Semi-join is expected to leave 3 columns; rows are untouched.
    kept_shape = matrix.semi_join_cols(key_table).count()
    assert kept_shape == (3, 3)

    # Anti-join is expected to leave the other 7 columns.
    dropped_shape = matrix.anti_join_cols(key_table).count()
    assert dropped_shape == (3, 7)

def test_joins(self):
vds = self.get_vds().select_rows(x1=1, y1=1)
vds2 = vds.select_rows(x2=1, y2=2)
Expand Down
7 changes: 7 additions & 0 deletions hail/python/test/hail/table/test_table.py
Expand Up @@ -316,6 +316,13 @@ def f():

self.assertRaises(NotImplementedError, f)

def test_semi_anti_join(self):
    """Semi-/anti-join of a 10-row range table against its rows with idx < 3."""
    full = hl.utils.range_table(10)
    first_three = full.filter(full.idx < 3)

    # Semi-join is expected to keep the 3 rows present in `first_three`.
    n_kept = full.semi_join(first_three).count()
    assert n_kept == 3

    # Anti-join is expected to keep the remaining 7 rows.
    n_dropped = full.anti_join(first_three).count()
    assert n_dropped == 7

def test_joins(self):
kt = hl.utils.range_table(1).key_by().drop('idx')
kt = kt.annotate(a='foo')
Expand Down