hail-is · danking · Jun 7, 2019 · Jun 5, 2019 · Jun 6, 2019 · Jun 6, 2019
diff --git a/hail/python/benchmark/matrix_table_benchmarks.py b/hail/python/benchmark/matrix_table_benchmarks.py
@@ -20,6 +20,12 @@ def matrix_table_entries_table():
     mt = hl.read_matrix_table(resource('profile.mt'))
     mt.entries()._force_count()
 
+
+@benchmark
+def matrix_table_entries_table_no_key():
+    """Force-count the entries table of profile.mt after re-keying its rows and columns by no fields."""
+    hl.read_matrix_table(resource('profile.mt')).key_rows_by().key_cols_by().entries()._force_count()
+
 @benchmark
 def matrix_table_rows_force_count():
     ht = hl.read_matrix_table(resource('profile.mt')).rows().key_by()

diff --git a/hail/src/main/scala/is/hail/expr/ir/IRBuilder.scala b/hail/src/main/scala/is/hail/expr/ir/IRBuilder.scala
@@ -230,6 +230,18 @@ object IRBuilder {
 
     def dropFields(fields: Symbol*): IRProxy = dropFieldList(fields.map(_.name))
 
+    // Copies every field of the struct built by `other` into this proxy's IR via InsertFields.
+    // The struct is bound with a Let so each GetField reads it through a Ref to that binding.
+    def insertStruct(other: IRProxy): IRProxy = (env: E) => {
+      val struct = other(env)
+      val structType = struct.typ
+      val uid = genUID()
+      val copied = structType.asInstanceOf[TStruct].fieldNames.map { name =>
+        name -> GetField(Ref(uid, structType), name)
+      }
+      Let(uid, struct, InsertFields(ir(env), copied, None))
+    }
+
     def len: IRProxy = (env: E) => ArrayLen(ir(env))
 
     def isNA: IRProxy = (env: E) => IsNA(ir(env))

diff --git a/hail/src/main/scala/is/hail/expr/ir/LowerMatrixIR.scala b/hail/src/main/scala/is/hail/expr/ir/LowerMatrixIR.scala
@@ -513,29 +513,54 @@ object LowerMatrixIR {
           .rename(Map(entriesFieldName -> entries), Map(colsFieldName -> cols))
 
       case MatrixEntriesTable(child) =>
-        val oldColIdx = Symbol(genUID())
-        val lambdaIdx1 = Symbol(genUID())
-        val currentColIdx = Symbol(genUID())
-        lower(child, ab)
-          .mapGlobals('global.insertFields(oldColIdx ->
-            irRange(0, 'global (colsField).len)
-              .map(lambdaIdx1 ~> makeStruct('key -> 'global (colsField)(lambdaIdx1).selectFields(child.typ.colKey: _*), 'value -> lambdaIdx1))
-              .sort(ascending = true, onKey = true)
-              .map(lambdaIdx1 ~> lambdaIdx1('value))))
-          .mapRows('row.insertFields(currentColIdx -> 'global (oldColIdx)
-            .filter(lambdaIdx1 ~> !'row (entriesField)(lambdaIdx1).isNA)))
-          .explode(currentColIdx)
-          .mapRows(let(
-            __current_idx = 'row (currentColIdx),
-            __col_struct = 'global (colsField)('__current_idx),
-            __entry_struct = 'row (entriesField)('__current_idx)) {
-            val newFields = child.typ.colType.fieldNames.map(Symbol(_)).map(f => f -> '__col_struct (f)) ++
-              child.typ.entryType.fieldNames.map(Symbol(_)).map(f => f -> '__entry_struct (f))
-            'row
-              .dropFields(entriesField, currentColIdx)
-              .insertFields(newFields: _*)
-          }).mapGlobals('global.dropFields(colsField, oldColIdx))
-          .keyBy(child.typ.rowKey ++ child.typ.colKey, isSorted = !(child.typ.rowKey.isEmpty && child.typ.colKey.nonEmpty))
+        val lc = lower(child, ab)
+
+        if (child.typ.colKey.nonEmpty) {
+          val oldColIdx = Symbol(genUID())
+          val lambdaIdx1 = Symbol(genUID())
+          val lambdaIdx2 = Symbol(genUID())
+          val lambdaIdx3 = Symbol(genUID())
+          val toExplode = Symbol(genUID())
+          val values = Symbol(genUID())
+          lc
+            .mapGlobals('global.insertFields(oldColIdx ->
+              irRange(0, 'global (colsField).len)
+                .map(lambdaIdx1 ~> makeStruct('key -> 'global (colsField)(lambdaIdx1).selectFields(child.typ.colKey: _*), 'value -> lambdaIdx1))
+                .sort(ascending = true, onKey = true)
+                .map(lambdaIdx1 ~> lambdaIdx1('value))))
+            .aggregateByKey(makeStruct(values -> applyAggOp(Collect(), seqOpArgs = FastIndexedSeq('row.selectFields(lc.typ.valueType.fieldNames: _*)))))
+            .mapRows('row.dropFields(values).insertFields(toExplode ->
+              'global (oldColIdx)
+                .flatMap(lambdaIdx1 ~> 'row (values)
+                  .filter(lambdaIdx2 ~> !lambdaIdx2(entriesField)(lambdaIdx1).isNA)
+                  .map(lambdaIdx3 ~> let(__col = 'global (colsField)(lambdaIdx1), __entry = lambdaIdx3(entriesField)(lambdaIdx1)) {
+                    makeStruct(
+                      child.typ.rowValueStruct.fieldNames.map(Symbol(_)).map(f => f -> lambdaIdx3(f)) ++
+                        child.typ.colType.fieldNames.map(Symbol(_)).map(f => f -> '__col (f)) ++
+                        child.typ.entryType.fieldNames.map(Symbol(_)).map(f => f -> '__entry (f)): _*
+                    )
+                  }))))
+
+            .explode(toExplode)
+            .mapRows('row.dropFields(toExplode).insertStruct('row (toExplode)))
+            .mapGlobals('global.dropFields(colsField, oldColIdx))
+            .keyBy(child.typ.rowKey ++ child.typ.colKey, isSorted = !(child.typ.rowKey.isEmpty && child.typ.colKey.nonEmpty))
+        } else {
+          val colIdx = Symbol(genUID())
+          val lambdaIdx = Symbol(genUID())
+          lc
+            .mapRows('row.insertFields(colIdx -> irRange(0, 'global (colsField).len)
+              .filter(lambdaIdx ~> !'row (entriesField)(lambdaIdx).isNA)))
+            .explode(colIdx)
+            .mapRows(let(__col_struct = 'global (colsField)('row (colIdx)),
+              __entry_struct = 'row (entriesField)('row (colIdx))) {
+              val newFields = child.typ.colType.fieldNames.map(Symbol(_)).map(f => f -> '__col_struct (f)) ++
+                child.typ.entryType.fieldNames.map(Symbol(_)).map(f => f -> '__entry_struct (f))
+
+              'row.dropFields(entriesField, colIdx).insertFields(newFields: _*)
+            })
+            .mapGlobals('global.dropFields(colsField))
+        }
 
       case MatrixToTableApply(child, function) =>
         val loweredChild = lower(child, ab)

diff --git a/hail/src/main/scala/is/hail/rvd/RVD.scala b/hail/src/main/scala/is/hail/rvd/RVD.scala
@@ -105,7 +105,7 @@ class RVD(
     if (nPreservedFields == newKey.length)
       this
     else if (isSorted)
-      truncateKey(newKey.take(nPreservedFields)).extendKeyPreservesPartitioning(newKey)
+      truncateKey(newKey.take(nPreservedFields)).extendKeyPreservesPartitioning(newKey).checkKeyOrdering()
     else
       changeKey(newKey)
   }
@@ -130,6 +130,61 @@ class RVD(
     }
   }
 
+  // Returns an RVD over the same rows that calls fatal() while iterating if a key is smaller
+  // than the previous key in its partition or lies outside that partition's range bounds.
+  def checkKeyOrdering(): RVD = {
+    val rvdType = typ
+    val keyPType = rvdType.kType
+    val bcPartitioner = partitioner.broadcast(crdd.sparkContext)
+    new RVD(
+      typ,
+      partitioner,
+      crdd.cmapPartitionsWithIndex { case (i, ctx, it) =>
+        val lastKey = WritableRegionValue(rvdType.kType, ctx.freshRegion)
+        val keyRow = new UnsafeRow(keyPType)
+        new Iterator[RegionValue] {
+          var sawRow = false
+
+          def hasNext: Boolean = it.hasNext
+
+          def next(): RegionValue = {
+            val rv = it.next()
+            // Only rows after the first have a previous key to compare against.
+            if (sawRow) {
+              if (rvdType.kRowOrd.gt(lastKey.value, rv)) {
+                keyRow.set(lastKey.value)
+                val prevKeyString = keyRow.toString()
+                // Overwrite lastKey with the offending row's key so it can be rendered too.
+                lastKey.setSelect(rvdType.rowType, rvdType.kFieldIdx, rv)
+                keyRow.set(lastKey.value)
+                val currKeyString = keyRow.toString()
+                fatal(
+                  s"""RVD error! Keys found out of order:
+                     |  Current key:  $currKeyString
+                     |  Previous key: $prevKeyString
+                     |This error can occur after a split_multi if the dataset
+                     |contains both multiallelic variants and duplicated loci.
+                   """.stripMargin)
+              }
+            } else
+              sawRow = true
+
+            // Remember this row's key for the next call; keyRow views it for the bounds test.
+            lastKey.setSelect(rvdType.rowType, rvdType.kFieldIdx, rv)
+            keyRow.set(lastKey.value)
+            val bounds = bcPartitioner.value.rangeBounds(i)
+            if (!bounds.contains(rvdType.kType.virtualType.ordering, keyRow))
+              fatal(
+                s"""RVD error! Unexpected key in partition $i
+                   |  Range bounds for partition $i: ${ bounds }
+                   |  Range of partition IDs for key: [${ bcPartitioner.value.lowerBound(keyRow) }, ${ bcPartitioner.value.upperBound(keyRow) })
+                   |  Invalid key: ${ keyRow.toString() }""".stripMargin)
+            rv
+          }
+        }
+      })
+  }
+
   def truncateKey(n: Int): RVD = {
     require(n <= typ.key.length)
     truncateKey(typ.key.take(n))
@@ -1275,62 +1330,8 @@ object RVD {
   ): RVD = {
     if (!HailContext.get.checkRVDKeys)
       return new RVD(typ, partitioner, crdd)
-
-    val sc = crdd.sparkContext
-
-    val partitionerBc = partitioner.broadcast(sc)
-    val localType = typ
-    val localKPType = typ.kType
-
-    new RVD(
-      typ,
-      partitioner,
-      crdd.cmapPartitionsWithIndex { case (i, ctx, it) =>
-        val prevK = WritableRegionValue(localType.kType, ctx.freshRegion)
-        val kUR = new UnsafeRow(localKPType)
-
-        new Iterator[RegionValue] {
-          var first = true
-
-          def hasNext: Boolean = it.hasNext
-
-          def next(): RegionValue = {
-            val rv = it.next()
-
-            if (first)
-              first = false
-            else {
-              if (localType.kRowOrd.gt(prevK.value, rv)) {
-                kUR.set(prevK.value)
-                val prevKeyString = kUR.toString()
-
-                prevK.setSelect(localType.rowType, localType.kFieldIdx, rv)
-                kUR.set(prevK.value)
-                val currKeyString = kUR.toString()
-                fatal(
-                  s"""RVD error! Keys found out of order:
-                     |  Current key:  $currKeyString
-                     |  Previous key: $prevKeyString
-                     |This error can occur after a split_multi if the dataset
-                     |contains both multiallelic variants and duplicated loci.
-                   """.stripMargin)
-              }
-            }
-
-            prevK.setSelect(localType.rowType, localType.kFieldIdx, rv)
-            kUR.set(prevK.value)
-
-            if (!partitionerBc.value.rangeBounds(i).contains(localType.kType.virtualType.ordering, kUR))
-              fatal(
-                s"""RVD error! Unexpected key in partition $i
-                   |  Range bounds for partition $i: ${ partitionerBc.value.rangeBounds(i) }
-                   |  Range of partition IDs for key: [${ partitionerBc.value.lowerBound(kUR) }, ${ partitionerBc.value.upperBound(kUR) })
-                   |  Invalid key: ${ kUR.toString() }""".stripMargin)
-
-            rv
-          }
-        }
-      })
+    else
+      return new RVD(typ, partitioner, crdd).checkKeyOrdering()
   }
 
   def union(rvds: Seq[RVD], joinKey: Int): RVD = rvds match {