Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hail] Promote localize_entries to public & tested #5247

Merged
merged 11 commits into from Feb 11, 2019
57 changes: 57 additions & 0 deletions hail/python/hail/matrixtable.py
Expand Up @@ -2533,6 +2533,63 @@ def _localize_entries(self, entries_field_name, cols_field_name) -> 'Table':
return Table(CastMatrixToTable(
self._mir, entries_field_name, cols_field_name))

@typecheck_method(entries_array_field_name=nullable(str),
columns_array_field_name=nullable(str))
def localize_entries(self,
entries_array_field_name=None,
columns_array_field_name=None) -> 'Table':
"""Represent this matrix as a table of entry-rows.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a very confusing sentence. How about:

"Convert the matrix table to a table with entries localized as an array of structs."


Examples
--------
Build a numpy ndarray from a small :class:`.MatrixTable`:

>>> mt = hl.utils.range_matrix_table(3,3)
>>> mt = mt.select_entries(x = mt.row_idx * mt.col_idx)
>>> mt.x.show()
+---------+---------+-------+
| row_idx | col_idx | x |
+---------+---------+-------+
| int32 | int32 | int32 |
+---------+---------+-------+
| 0 | 2 | 0 |
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are the col indices in reverse order?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doctest also flagged this. I must've had a buggy version of hail locally when I ran this.

| 0 | 0 | 0 |
| 0 | 1 | 0 |
| 1 | 0 | 0 |
| 1 | 1 | 1 |
| 1 | 2 | 2 |
| 2 | 2 | 4 |
| 2 | 1 | 2 |
| 2 | 0 | 0 |
+---------+---------+-------+
>>> t = mt.localize_entries('entry_structs', 'columns')
>>> t = t.select(entries = t.entry_structs.map(lambda entry: entry.x))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a .describe() here - that would help people understand what this is doing

>>> np.array(t.entries.collect())
array([[0, 0, 0],
[0, 1, 2],
[0, 2, 4]])

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a note that filtered entries are represented as a missing struct?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ditched the struct on the second to last line so this would blow up without latest numpy, I assume, b/c you can't represent a missing value in a numpy ndarray. I think this is doctested, no?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean add a "Notes" section to the docs, and describe that the array of entries always contains Ncols elements (structs), with filtered entries appearing as missing structs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the second comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still want a notes section with a description of the type!

Parameters
----------
entries_array_field_name : :obj:`str`
The name of the table field containing the array of entry structs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, you added it in the parameter description. Can we move the last sentence of both of these up?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'm generally a bit allergic to the proliferation of Notes that we have in the docs. Seems like the parameters or returns section is the right place to put this information, right?

for the given row
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: periods at the end of parameter descriptions

columns_array_field_name : :obj:`str`
The name of the global field containing the array of column structs

Returns
-------
:class:`.Table`
"""
entries = entries_array_field_name or Env.get_uid()
cols = columns_array_field_name or Env.get_uid()
t = self._localize_entries(self, entries, cols)
if entries_array_field_name is None:
t = t.drop(entries)
if columns_array_field_name is None:
t = t.drop_globals(cols)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

drop_globals isn't a thing. Just drop

return t

def _unfilter_entries(self):
entry_ir = hl.cond(hl.is_defined(self.entry), self.entry, hl.struct(**self.entry))._ir
return MatrixTable(MatrixMapEntries(self._mir, entry_ir))
Expand Down
37 changes: 35 additions & 2 deletions hail/python/test/hail/matrixtable/test_matrix_table.py
Expand Up @@ -5,6 +5,7 @@

import hail as hl
import hail.expr.aggregators as agg
from hail.utils.java import Env
from hail.utils.misc import new_temp_file
from ..helpers import *

Expand Down Expand Up @@ -605,7 +606,7 @@ def test_read_stored_globals(self):
self.assertTrue(ds.globals_table()._same(t))

def test_codecs_matrix(self):
from hail.utils.java import Env, scala_object
from hail.utils.java import scala_object
codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
ds = self.get_vds()
temp = new_temp_file(suffix='hmt')
Expand All @@ -615,7 +616,7 @@ def test_codecs_matrix(self):
self.assertTrue(ds._same(ds2))

def test_codecs_table(self):
from hail.utils.java import Env, scala_object
from hail.utils.java import scala_object
codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
rt = self.get_vds().rows()
temp = new_temp_file(suffix='ht')
Expand Down Expand Up @@ -1046,3 +1047,35 @@ def test_agg_cols_group_by(self):
]
for aggregation, expected in tests:
self.assertEqual(t.select_rows(result = aggregation).result.collect()[0], expected)

def localize_entries_with_both_none_is_rows_table(self):
mt = hl.utils.range_matrix_table(10, 10)
mt = mt.select_entries(x = mt.row_idx * mt.col_idx)
localized = mt.localize_entries(entries_array_field_name=None,
columns_array_field_name=None)
rows_table = mt.rows()
assert rows_table.collect() == localized.collect()
assert rows_table.globals_table().collect() == localized.globals_table().collect()

def localize_entries_with_none_cols_adds_no_globals(self):
mt = hl.utils.range_matrix_table(10, 10)
mt = mt.select_entries(x = mt.row_idx * mt.col_idx)
localized = mt.localize_entries(entries_array_field_name=Env.get_uid(),
columns_array_field_name=None)
assert mt.globals_table().collect() == localized.globals_table().collect()

def localize_entries_with_none_entries_changes_no_rows(self):
mt = hl.utils.range_matrix_table(10, 10)
mt = mt.select_entries(x = mt.row_idx * mt.col_idx)
localized = mt.localize_entries(entries_array_field_name=None,
columns_array_field_name=Env.get_uid())
rows_table = mt.rows()
assert rows_table.collect() == localized.collect()

def localize_entries_creates_arrays_of_entries_and_array_of_cols(self):
mt = hl.utils.range_matrix_table(10, 10)
mt = mt.select_entries(x = mt.row_idx * mt.col_idx)
localized = mt.localize_entries(entries_array_field_name='entries',
columns_array_field_name='cols')
assert [[x * y for x in range(0, 10)] for y in range(0, 10)] == localized.entries.collect()
assert range(0, 10) == localized.cols.collect()