From ab6f91ac9afbc15cf31f14c5604680fedcbd1dd2 Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Wed, 26 Jun 2019 17:57:31 -0400 Subject: [PATCH] [hail] Publicize and document Table.multi_way_zip_join It remains experimental. --- hail/python/hail/experimental/vcf_combiner.py | 2 +- hail/python/hail/table.py | 45 ++++++++++++++++--- hail/python/test/hail/table/test_table.py | 10 ++--- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/hail/python/hail/experimental/vcf_combiner.py b/hail/python/hail/experimental/vcf_combiner.py index d7e7743d6af..3102403d504 100644 --- a/hail/python/hail/experimental/vcf_combiner.py +++ b/hail/python/hail/experimental/vcf_combiner.py @@ -153,7 +153,7 @@ def renumber_entry(entry, old_to_new) -> StructExpression: def combine_gvcfs(mts): """merges vcfs using multi way join""" - ts = hl.Table._multi_way_zip_join([localize(mt) for mt in mts], 'data', 'g') + ts = hl.Table.multi_way_zip_join([localize(mt) for mt in mts], 'data', 'g') combined = combine(ts) return unlocalize(combined) diff --git a/hail/python/hail/table.py b/hail/python/hail/table.py index aea06dd2bd9..dc448c24a86 100644 --- a/hail/python/hail/table.py +++ b/hail/python/hail/table.py @@ -328,7 +328,7 @@ class Table(ExprContainer): @staticmethod def _from_java(jtir): return Table(JavaTable(jtir)) - + def __init__(self, tir): super(Table, self).__init__() @@ -2963,7 +2963,7 @@ def from_spark(df, key=[]) -> 'Table': ---------- df : :class:`.pyspark.sql.DataFrame` PySpark DataFrame. - + key : :obj:`str` or :obj:`list` of :obj:`str` Key fields. 
@@ -3056,14 +3056,14 @@ def _same(self, other, tolerance=1e-6, absolute=False): r = other r = r.select_globals(**{right_global_value: r.globals}) r = r.select(**{right_value: r._value}) - + t = l._zip_join(r) if not hl.eval(_values_similar(t[left_global_value], t[right_global_value], tolerance, absolute)): g = hl.eval(t.globals) print(f'Table._same: globals differ: {g[left_global_value]}, {g[right_global_value]}') return False - + if not t.all(_values_similar(t[left_value], t[right_value], tolerance, absolute)): print('Table._same: rows differ:') t = t.filter(~ _values_similar(t[left_value], t[right_value], tolerance, absolute)) @@ -3071,7 +3071,7 @@ def _same(self, other, tolerance=1e-6, absolute=False): for r in bad_rows: print(f' {r[left_value]}, {r[right_value]}') return False - + return True @@ -3212,7 +3212,40 @@ def _unlocalize_entries(self, entries_field_name, cols_field_name, col_key) -> ' @staticmethod @typecheck(tables=sequenceof(table_type), data_field_name=str, global_field_name=str) - def _multi_way_zip_join(tables, data_field_name, global_field_name) -> 'Table': + def multi_way_zip_join(tables, data_field_name, global_field_name) -> 'Table': + """Combine many tables in a zip join + + Notes + ----- + The row type of the returned table is a struct with the key fields, and + one extra field, `data_field_name`, which is an array of structs with + the non-key fields, one per input. The array elements are missing if + their corresponding input had no row with that key or possibly if there + is another input with more rows with that key than the corresponding + input. + + The global type of the returned table is an array of structs of the + global type of all of the inputs. + + The types for every input must be identical, not merely compatible, + including the keys. + + A zip join is similar to an outer join; however, rows are not duplicated + to create the full Cartesian product of duplicate keys. 
Instead, there + is exactly one entry in some `data_field_name` array for every row in + the inputs. + + Parameters + ---------- + tables : :obj:`List[Table]` + A list of tables to combine + data_field_name : :obj:`str` + The name of the resulting data field + global_field_name : :obj:`str` + The name of the resulting global field + + .. include:: _templates/experimental.rst + """ if not tables: raise ValueError('multi_way_zip_join must have at least one table as an argument') head = tables[0] diff --git a/hail/python/test/hail/table/test_table.py b/hail/python/test/hail/table/test_table.py index 3cc03bcc6b7..f38d06a1641 100644 --- a/hail/python/test/hail/table/test_table.py +++ b/hail/python/test/hail/table/test_table.py @@ -453,7 +453,7 @@ def test_multi_way_zip_join(self): {"id": 3, "name": "z", "data": 0.01}] s = hl.tstruct(id=hl.tint32, name=hl.tstr, data=hl.tfloat64) ts = [hl.Table.parallelize(r, schema=s, key='id') for r in [d1, d2, d3]] - joined = hl.Table._multi_way_zip_join(ts, '__data', '__globals').drop('__globals') + joined = hl.Table.multi_way_zip_join(ts, '__data', '__globals').drop('__globals') dexpected = [{"id": 0, "__data": [{"name": "a", "data": 0.0}, {"name": "d", "data": 1.1}, None]}, @@ -476,10 +476,10 @@ def test_multi_way_zip_join(self): self.assertTrue(expected._same(joined)) expected2 = expected.transmute(data=expected['__data']) - joined_same_name = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('globals') + joined_same_name = hl.Table.multi_way_zip_join(ts, 'data', 'globals').drop('globals') self.assertTrue(expected2._same(joined_same_name)) - joined_nothing = hl.Table._multi_way_zip_join(ts, 'data', 'globals').drop('data', 'globals') + joined_nothing = hl.Table.multi_way_zip_join(ts, 'data', 'globals').drop('data', 'globals') self.assertEqual(joined_nothing._force_count(), 5) def test_multi_way_zip_join_globals(self): @@ -490,14 +490,14 @@ def test_multi_way_zip_join_globals(self): hl.struct(x=hl.null(hl.tint32)), 
hl.struct(x=5), hl.struct(x=0)])) - joined = hl.Table._multi_way_zip_join([t1, t2, t3], '__data', '__globals') + joined = hl.Table.multi_way_zip_join([t1, t2, t3], '__data', '__globals') self.assertEqual(hl.eval(joined.globals), hl.eval(expected)) def test_multi_way_zip_join_key_downcast(self): mt = hl.import_vcf(resource('sample.vcf.bgz')) mt = mt.key_rows_by('locus') ht = mt.rows() - j = hl.Table._multi_way_zip_join([ht, ht], 'd', 'g') + j = hl.Table.multi_way_zip_join([ht, ht], 'd', 'g') j._force_count() def test_index_maintains_count(self):