New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Hail][feature] add outer
option to union_cols
#7475
Changes from 5 commits
e005d8b
9e2ade1
2cb653f
b0b9431
7ab56df
a2c3e11
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3570,8 +3570,9 @@ def union_rows(*datasets: 'MatrixTable', _check_cols=True) -> 'MatrixTable': | |
f"Datasets 0 and {wrong_keys+1} have different columns (or possibly different order).") | ||
return MatrixTable(MatrixUnionRows(*[d._mir for d in datasets])) | ||
|
||
@typecheck_method(other=matrix_table_type) | ||
def union_cols(self, other: 'MatrixTable') -> 'MatrixTable': | ||
@typecheck_method(other=matrix_table_type, | ||
row_join_type=enumeration('inner', 'outer', 'left', 'right')) | ||
def union_cols(self, other: 'MatrixTable', row_join_type='inner') -> 'MatrixTable': | ||
"""Take the union of dataset columns. | ||
|
||
Examples | ||
|
@@ -3593,10 +3594,22 @@ def union_cols(self, other: 'MatrixTable') -> 'MatrixTable': | |
The row fields in the resulting dataset are the row fields from the | ||
first dataset; the row schemas do not need to match. | ||
|
||
This method performs an inner join on rows and concatenates entries | ||
from the two datasets for each row. Only distinct keys from each | ||
dataset are included (equivalent to calling :meth:`.distinct_by_row` | ||
on each dataset first). | ||
This method creates a :class:`.MatrixTable` which contains all columns | ||
from both input datasets. The set of rows included in the result is | ||
determined by the `row_join_type` parameter. | ||
|
||
- With the default ``row_join_type=inner``, an inner join is performed | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We generally don't format the arg bit as code: should be something like:
|
||
on rows, so that only rows whose row key exists in both input datasets | ||
are included. In this case, the entries for each row are the | ||
concatenation of all entries of the corresponding rows in the input | ||
datasets. | ||
- With ``row_join_type=outer``, an outer join is performed on rows, so | ||
that row keys which exist in only one input dataset are also included. | ||
For those rows, the entry fields for the columns coming from the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo: entrie |
||
other dataset will be missing. | ||
|
||
Only distinct row keys from each dataset are included (equivalent to | ||
calling :meth:`.distinct_by_row` on each dataset first). | ||
|
||
This method does not deduplicate; if a column key exists identically in | ||
two datasets, then it will be duplicated in the result. | ||
|
@@ -3605,6 +3618,9 @@ def union_cols(self, other: 'MatrixTable') -> 'MatrixTable': | |
---------- | ||
other : :class:`.MatrixTable` | ||
Dataset to concatenate. | ||
row_join_type : :class:`.str` | ||
If `outer`, perform an outer join on rows; if `inner`, perform an | ||
inner join. Default `inner`. | ||
|
||
Returns | ||
------- | ||
|
@@ -3628,7 +3644,7 @@ def union_cols(self, other: 'MatrixTable') -> 'MatrixTable': | |
f' left: {", ".join(self.row_key.dtype.values())}\n' | ||
f' right: {", ".join(other.row_key.dtype.values())}') | ||
|
||
return MatrixTable(MatrixUnionCols(self._mir, other._mir)) | ||
return MatrixTable(MatrixUnionCols(self._mir, other._mir, row_join_type)) | ||
|
||
@typecheck_method(n=nullable(int), n_cols=nullable(int)) | ||
def head(self, n: Optional[int], n_cols: Optional[int] = None) -> 'MatrixTable': | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -530,6 +530,22 @@ def test_union_cols_distinct(self): | |
mt = mt.key_rows_by(x = mt.row_idx // 2) | ||
assert mt.union_cols(mt).count_rows() == 5 | ||
|
||
def test_union_cols_outer(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a test for correct entry joining? |
||
r, c = 10, 10 | ||
mt = hl.utils.range_matrix_table(2*r, c) | ||
mt = mt.annotate_entries(entry=hl.tuple([mt.row_idx, mt.col_idx])) | ||
mt2 = hl.utils.range_matrix_table(2*r, c) | ||
mt2 = mt2.key_rows_by(row_idx=mt2.row_idx + r) | ||
mt2 = mt2.key_cols_by(col_idx=mt2.col_idx + c) | ||
mt2 = mt2.annotate_entries(entry=hl.tuple([mt2.row_idx, mt2.col_idx])) | ||
expected = hl.utils.range_matrix_table(3*r, 2*c) | ||
missing = hl.null(hl.ttuple(hl.tint, hl.tint)) | ||
expected = expected.annotate_entries(entry=hl.cond( | ||
expected.col_idx < c, | ||
hl.cond(expected.row_idx < 2*r, hl.tuple([expected.row_idx, expected.col_idx]), missing), | ||
hl.cond(expected.row_idx >= r, hl.tuple([expected.row_idx, expected.col_idx]), missing))) | ||
assert mt.union_cols(mt2, row_join_type='outer')._same(expected) | ||
|
||
def test_union_rows_different_col_schema(self): | ||
mt = hl.utils.range_matrix_table(10, 10) | ||
mt2 = hl.utils.range_matrix_table(10, 10) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you don't support left/right
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Eek, lazy copy/pasting
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that's what review is for!