Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[bugfix] Fix unicode string ordering #5770

Merged
merged 2 commits into from Apr 3, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions hail/python/test/hail/table/test_table.py
Expand Up @@ -1020,6 +1020,12 @@ def test_import_multiple_missing(self):

assert ht.f0.collect() == [None, None, 'gene5', 'gene4', 'gene3']

def test_unicode_ordering(self):
a = hl.literal(["é", "e"])
ht = hl.utils.range_table(1, 1)
ht = ht.annotate(fd=hl.sorted(a))
assert ht.fd.collect()[0] == ["e", "é"]

def test_large_number_of_fields(tmpdir):
ht = hl.utils.range_table(100)
ht = ht.annotate(**{
Expand Down
12 changes: 6 additions & 6 deletions hail/src/main/scala/is/hail/expr/types/physical/PBinary.scala
Expand Up @@ -32,10 +32,10 @@ class PBinary(override val required: Boolean) extends PType {
var i = 0

while (i < lim) {
val b1 = r1.loadByte(bOff1 + i)
val b2 = r2.loadByte(bOff2 + i)
val b1 = java.lang.Byte.toUnsignedInt(r1.loadByte(bOff1 + i))
val b2 = java.lang.Byte.toUnsignedInt(r2.loadByte(bOff2 + i))
if (b1 != b2)
return java.lang.Byte.compare(b1, b2)
return java.lang.Integer.compare(b1, b2)

i += 1
}
Expand All @@ -62,9 +62,9 @@ class PBinary(override val required: Boolean) extends PType {
i := 0,
cmp := 0,
Code.whileLoop(cmp.ceq(0) && i < lim,
cmp := Code.invokeStatic[java.lang.Byte, Byte, Byte, Int]("compare",
rx.loadByte(PBinary.bytesOffset(x) + i.toL),
ry.loadByte(PBinary.bytesOffset(y) + i.toL)),
cmp := Code.invokeStatic[java.lang.Integer, Int, Int, Int]("compare",
Code.invokeStatic[java.lang.Byte, Byte, Int]("toUnsignedInt", rx.loadByte(PBinary.bytesOffset(x) + i.toL)),
Code.invokeStatic[java.lang.Byte, Byte, Int]("toUnsignedInt", ry.loadByte(PBinary.bytesOffset(y) + i.toL))),
i += 1),
cmp.ceq(0).mux(Code.invokeStatic[java.lang.Integer, Int, Int, Int]("compare", l1, l2), cmp))
}
Expand Down
8 changes: 6 additions & 2 deletions hail/src/main/scala/is/hail/expr/types/virtual/TBinary.scala
Expand Up @@ -22,8 +22,12 @@ class TBinary(override val required: Boolean) extends Type {

override def scalaClassTag: ClassTag[Array[Byte]] = classTag[Array[Byte]]

val ordering: ExtendedOrdering =
ExtendedOrdering.extendToNull(Ordering.Iterable[Byte])
val ordering: ExtendedOrdering = ExtendedOrdering.iterableOrdering(new ExtendedOrdering {
override def compareNonnull(x: Any, y: Any): Int =
java.lang.Integer.compare(
java.lang.Byte.toUnsignedInt(x.asInstanceOf[Byte]),
java.lang.Byte.toUnsignedInt(y.asInstanceOf[Byte]))
})
}

object TBinary {
Expand Down
131 changes: 131 additions & 0 deletions hail/src/test/scala/is/hail/expr/ir/IRSuite.scala
Expand Up @@ -1834,4 +1834,135 @@ class IRSuite extends SparkSuite {

assertEvalsTo(ir, 61L)
}

@Test def testfoo() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a super huge fan of this as a test---do we need it? (At the very least can we rename it?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops meant to delete it

val s ="""
| (TableCollect
| (TableHead 11
| (TableOrderBy (Aoccupation)
| (TableKeyByAndAggregate None 50
| (TableKeyByAndAggregate None 50
| (TableMapRows
| (TableKeyBy () False
| (TableLeftJoinRightDistinct __uid_150
| (TableKeyBy (__uid_151) False
| (TableMapRows
| (TableKeyBy () False
| (TableLeftJoinRightDistinct __uid_144
| (TableKeyBy (__uid_145) False
| (TableMapRows
| (TableRead None False "{\"name\":\"TableNativeReader\",\"path\":\"/Users/tpoterba/misc/debug/rvd-error-movielens/data/ratings.ht\",\"_spec\":null}")
| (Let row
| (InsertFields
| (Ref row)
| None
| (__uid_145
| (GetField movie_id
| (Ref row))))
| (Let __iruid_3909
| (Ref row)
| (MakeStruct
| (user_id
| (GetField user_id
| (Ref __iruid_3909)))
| (rating
| (GetField rating
| (Ref __iruid_3909)))
| (__uid_145
| (GetField __uid_145
| (Ref __iruid_3909))))))))
| (TableRead Table{global:Struct{},key:[id],row:Struct{id:Int32,title:String}} False "{\"name\":\"TableNativeReader\",\"path\":\"/Users/tpoterba/misc/debug/rvd-error-movielens/data/movies.ht\",\"_spec\":null}")))
| (Let row
| (Let row
| (InsertFields
| (SelectFields (user_id rating)
| (Ref row))
| None
| (title
| (GetField title
| (GetField __uid_144
| (Ref row)))))
| (InsertFields
| (Ref row)
| None
| (__uid_151
| (GetField user_id
| (Ref row)))))
| (Let __iruid_3910
| (Ref row)
| (MakeStruct
| (rating
| (GetField rating
| (Ref __iruid_3910)))
| (title
| (GetField title
| (Ref __iruid_3910)))
| (__uid_151
| (GetField __uid_151
| (Ref __iruid_3910))))))))
| (TableRead Table{global:Struct{},key:[id],row:Struct{id:Int32,occupation:String}} False "{\"name\":\"TableNativeReader\",\"path\":\"/Users/tpoterba/misc/debug/rvd-error-movielens/data/users.ht\",\"_spec\":null}")))
| (InsertFields
| (SelectFields (rating title)
| (Ref row))
| None
| (occupation
| (GetField occupation
| (GetField __uid_150
| (Ref row))))))
| (MakeStruct
| (rating
| (ApplyBinaryPrimOp FloatingPointDivide
| (ApplyAggOp Sum
| ()
| None
| ( (ApplyIR toFloat64
| (GetField rating
| (Ref row)))))
| (ApplyIR toFloat64
| (ApplyAggOp Sum
| ()
| None
| ( (Apply toInt64
| (ApplyUnaryPrimOp Bang
| (IsNA
| (ApplyIR toFloat64
| (GetField rating
| (Ref row))))))))))))
| (InsertFields
| (SelectFields ()
| (Ref row))
| ("x")
| (x
| (MakeStruct
| (occupation
| (GetField occupation
| (Ref row)))
| (title
| (GetField title
| (Ref row)))))))
| (MakeStruct
| (favorite
| (ApplyAggOp TakeBy
| ( (I32 1))
| None
| ( (GetField title
| (GetField x
| (Ref row)))
| (ApplyUnaryPrimOp Negate
| (GetField rating
| (Ref row)))))))
| (InsertFields
| (SelectFields ()
| (Ref row))
| ("occupation")
| (occupation
| (GetField occupation
| (GetField x
| (Ref row))))))))
| )
""".stripMargin

val ir = IRParser.parse_value_ir(s)
println(Interpret(ir))
}
}