Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hail] Better scaling on RVD.union #6943

Merged
merged 1 commit into from Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions hail/python/test/hail/table/test_table.py
Expand Up @@ -813,6 +813,13 @@ def test_union(self):
self.assertTrue(t1.key_by().union(t2.key_by(), t3.key_by())
._same(hl.utils.range_table(15).key_by()))

def test_nested_union(self):
    # Regression test for deeply/widely nested unions: unioning many copies
    # of the same table must still count every row exactly once per copy.
    n_rows = 100
    n_copies = 200
    t = hl.utils.range_table(n_rows, n_partitions=16)

    copies = [t] * n_copies
    assert hl.Table.union(*copies)._force_count() == n_rows * n_copies

def test_union_unify(self):
t1 = hl.utils.range_table(2)
t2 = t1.annotate(x=hl.int32(1), y='A')
Expand Down
2 changes: 1 addition & 1 deletion hail/src/main/scala/is/hail/rvd/RVD.scala
Expand Up @@ -1449,7 +1449,7 @@ object RVD {
val sc = first.sparkContext
RVD.unkeyed(first.rowPType, ContextRDD.union(sc, rvds.map(_.crdd)))
} else
rvds.reduce(_.orderedMerge(_, joinKey))
rvds.toArray.treeReduce(_.orderedMerge(_, joinKey))
}

def union(rvds: Seq[RVD]): RVD =
Expand Down
15 changes: 15 additions & 0 deletions hail/src/main/scala/is/hail/utils/richUtils/RichIndexedSeq.scala
@@ -1,5 +1,9 @@
package is.hail.utils.richUtils

import is.hail.utils._

import scala.reflect.ClassTag

/** Rich wrapper for an indexed sequence.
*
* Houses the generic binary search methods. All methods taking
Expand Down Expand Up @@ -153,4 +157,15 @@ class RichIndexedSeq[T](val a: IndexedSeq[T]) extends AnyVal {
}
notFound(left)
}

/** Reduces the sequence with `f` in a balanced, tree-shaped order: each pass
  * combines adjacent pairs, halving the number of elements, until one remains.
  * Unlike a left fold, the combination tree has O(log n) depth, which keeps
  * nested structures (e.g. merge trees) shallow.
  *
  * Throws if the sequence is empty, matching `reduce` semantics.
  */
def treeReduce(f: (T, T) => T)(implicit tct: ClassTag[T]): T = {
  @scala.annotation.tailrec
  def reducePass(level: IndexedSeq[T]): T =
    if (level.length <= 1)
      level.head
    else {
      // Pair up adjacent elements; a trailing odd element passes through unchanged.
      val next = level.iterator.grouped(2).map { pair =>
        if (pair.length == 2) f(pair(0), pair(1)) else pair(0)
      }.toFastIndexedSeq
      reducePass(next)
    }
  reducePass(a)
}
}