Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hail] variation of hl.float/hl.int that returns null on invalid string #7453

Merged
merged 8 commits into from Nov 6, 2019
Merged
6 changes: 6 additions & 0 deletions hail/python/hail/expr/__init__.py
Expand Up @@ -151,9 +151,15 @@
'float',
'float32',
'float64',
'parse_float',
'parse_float32',
'parse_float64',
'int',
'int32',
'int64',
'parse_int',
'parse_int32',
'parse_int64',
'bool',
'get_sequence',
'reverse_complement',
Expand Down
182 changes: 182 additions & 0 deletions hail/python/hail/expr/functions.py
Expand Up @@ -4197,6 +4197,35 @@ def float64(x) -> Float64Expression:
else:
return x._method("toFloat64", tfloat64)

@typecheck(x=expr_str)
def parse_float64(x) -> Float64Expression:
    """Parse a string as a 64-bit floating point number, or missing if invalid.

    Examples
    --------

    >>> hl.eval(hl.parse_float64('1.1')) # doctest: +SKIP_OUTPUT_CHECK
    1.1

    >>> hl.eval(hl.parse_float64('asdf'))
    None

    Notes
    -----
    If the input is not a valid floating point number, the result of this
    call is missing.

    Parameters
    ----------
    x : :class:`.StringExpression`
        String to parse.

    Returns
    -------
    :class:`.NumericExpression` of type :py:data:`.tfloat64`

    """
    parsed = x._method("toFloat64OrMissing", tfloat64)
    return parsed


@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def float32(x) -> Float32Expression:
"""Convert to a 32-bit floating point expression.
Expand Down Expand Up @@ -4226,6 +4255,35 @@ def float32(x) -> Float32Expression:
else:
return x._method("toFloat32", tfloat32)

@typecheck(x=expr_str)
def parse_float32(x) -> Float32Expression:
    """Parse a string as a 32-bit floating point number, or missing if invalid.

    Examples
    --------

    >>> hl.eval(hl.parse_float32('1.1')) # doctest: +SKIP_OUTPUT_CHECK
    1.1

    >>> hl.eval(hl.parse_float32('asdf'))
    None

    Notes
    -----
    If the input is not a valid floating point number, the result of this
    call is missing.

    Parameters
    ----------
    x : :class:`.StringExpression`
        String to parse.

    Returns
    -------
    :class:`.NumericExpression` of type :py:data:`.tfloat32`

    """
    parsed = x._method("toFloat32OrMissing", tfloat32)
    return parsed


@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def int64(x) -> Int64Expression:
"""Convert to a 64-bit integer expression.
Expand Down Expand Up @@ -4255,6 +4313,37 @@ def int64(x) -> Int64Expression:
else:
return x._method("toInt64", tint64)

@typecheck(x=expr_str)
def parse_int64(x) -> Int64Expression:
    """Parse a string as a 64-bit integer, or missing if invalid.

    Examples
    --------

    >>> hl.eval(hl.parse_int64('154'))
    154

    >>> hl.eval(hl.parse_int64('15.4'))
    None

    >>> hl.eval(hl.parse_int64('asdf'))
    None

    Notes
    -----
    If the input is not a valid integer, the result of this call is missing.

    Parameters
    ----------
    x : :class:`.StringExpression`
        String to parse.

    Returns
    -------
    :class:`.NumericExpression` of type :py:data:`.tint64`

    """
    parsed = x._method("toInt64OrMissing", tint64)
    return parsed


@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def int32(x) -> Int32Expression:
Expand Down Expand Up @@ -4285,6 +4374,38 @@ def int32(x) -> Int32Expression:
else:
return x._method("toInt32", tint32)

@typecheck(x=expr_str)
def parse_int32(x) -> Int32Expression:
    """Parse a string as a 32-bit integer, or missing if invalid.

    Examples
    --------

    >>> hl.eval(hl.parse_int32('154'))
    154

    >>> hl.eval(hl.parse_int32('15.4'))
    None

    >>> hl.eval(hl.parse_int32('asdf'))
    None

    Notes
    -----
    If the input is not a valid integer, the result of this call is missing.

    Parameters
    ----------
    x : :class:`.StringExpression`
        String to parse.

    Returns
    -------
    :class:`.NumericExpression` of type :py:data:`.tint32`

    """
    parsed = x._method("toInt32OrMissing", tint32)
    return parsed


@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def int(x) -> Int32Expression:
"""Convert to a 32-bit integer expression.
Expand Down Expand Up @@ -4316,6 +4437,38 @@ def int(x) -> Int32Expression:
return int32(x)


@typecheck(x=expr_str)
def parse_int(x) -> Int32Expression:
    """Parse a string as a 32-bit integer, or missing if invalid.

    Delegates to :func:`.parse_int32`.

    Examples
    --------

    >>> hl.eval(hl.parse_int('154'))
    154

    >>> hl.eval(hl.parse_int('15.4'))
    None

    >>> hl.eval(hl.parse_int('asdf'))
    None

    Notes
    -----
    If the input is not a valid integer, the result of this call is missing.

    Parameters
    ----------
    x : :class:`.StringExpression`
        String to parse.

    Returns
    -------
    :class:`.NumericExpression` of type :py:data:`.tint32`

    """
    as_int32 = parse_int32(x)
    return as_int32


@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def float(x) -> Float64Expression:
"""Convert to a 64-bit floating point expression.
Expand Down Expand Up @@ -4347,6 +4500,35 @@ def float(x) -> Float64Expression:
return float64(x)


@typecheck(x=expr_str)
def parse_float(x) -> Float64Expression:
    """Parse a string as a 64-bit floating point number, or missing if invalid.

    Delegates to :func:`.parse_float64`.

    Examples
    --------

    >>> hl.eval(hl.parse_float('1.1')) # doctest: +SKIP_OUTPUT_CHECK
    1.1

    >>> hl.eval(hl.parse_float('asdf'))
    None

    Notes
    -----
    If the input is not a valid floating point number, the result of this
    call is missing.

    Parameters
    ----------
    x : :class:`.StringExpression`
        String to parse.

    Returns
    -------
    :class:`.NumericExpression` of type :py:data:`.tfloat64`

    """
    as_float64 = parse_float64(x)
    return as_float64


@typecheck(x=expr_oneof(expr_numeric, expr_bool, expr_str))
def bool(x) -> BooleanExpression:
"""Convert to a Boolean expression.
Expand Down
43 changes: 25 additions & 18 deletions hail/python/test/hail/expr/test_expr.py
Expand Up @@ -1315,22 +1315,6 @@ def test_str_ops(self):
self.assertEqual(hl.eval(hl.float32(s)), 1.5)
self.assertEqual(hl.eval(hl.float64(s)), 1.5)

s1 = hl.literal('true')
s2 = hl.literal('True')
s3 = hl.literal('TRUE')

s4 = hl.literal('false')
s5 = hl.literal('False')
s6 = hl.literal('FALSE')

self.assertTrue(hl.eval(hl.bool(s1)))
self.assertTrue(hl.eval(hl.bool(s2)))
self.assertTrue(hl.eval(hl.bool(s3)))

self.assertFalse(hl.eval(hl.bool(s4)))
self.assertFalse(hl.eval(hl.bool(s5)))
self.assertFalse(hl.eval(hl.bool(s6)))

s = hl.literal('abcABC123')
self.assertEqual(hl.eval(s.lower()), 'abcabc123')
self.assertEqual(hl.eval(s.upper()), 'ABCABC123')
Expand All @@ -1352,18 +1336,41 @@ def test_str_ops(self):
self.assertFalse(hl.eval(s_whitespace.endswith('a')))

def test_str_parsing(self):
    """Exercise string-to-bool/float/int conversion, including the parse_* variants.

    The ``hl.parse_*`` functions must agree with ``hl.int*`` / ``hl.float*`` on
    valid input and evaluate to ``None`` (missing) on invalid input.
    """
    # Each group lists the strict converters followed by the missing-on-failure
    # parsers, including the un-suffixed aliases hl.parse_float / hl.parse_int.
    special_float_fns = (hl.float, hl.float32, hl.float64,
                         hl.parse_float, hl.parse_float32, hl.parse_float64)
    float_fns = (hl.float32, hl.float64,
                 hl.parse_float, hl.parse_float32, hl.parse_float64)
    int_fns = (hl.int32, hl.int64,
               hl.parse_int, hl.parse_int32, hl.parse_int64)
    parse_float_fns = (hl.parse_float, hl.parse_float32, hl.parse_float64)
    parse_int_fns = (hl.parse_int, hl.parse_int32, hl.parse_int64)

    for x in ('true', 'True', 'TRUE'):
        self.assertTrue(hl.eval(hl.bool(x)))

    for x in ('false', 'False', 'FALSE'):
        self.assertFalse(hl.eval(hl.bool(x)))

    for x in ('nan', 'Nan', 'naN', 'NaN'):
        for f in special_float_fns:
            self.assertTrue(hl.eval(hl.is_nan(f(x))))
            self.assertTrue(hl.eval(hl.is_nan(f('+' + x))))
            self.assertTrue(hl.eval(hl.is_nan(f('-' + x))))

    for x in ('inf', 'Inf', 'iNf', 'InF', 'infinity', 'InfiNitY', 'INFINITY'):
        for f in special_float_fns:
            self.assertTrue(hl.eval(hl.is_infinite(f(x))))
            self.assertTrue(hl.eval(hl.is_infinite(f('+' + x))))
            self.assertTrue(hl.eval(hl.is_infinite(f('-' + x))))
            self.assertTrue(hl.eval(f('-' + x) < 0.0))

    for x in ('0', '1', '-5', '12382421'):
        for f in int_fns:
            self.assertEqual(hl.eval(f(hl.literal(x))), int(x))
        for f in float_fns:
            self.assertEqual(hl.eval(f(hl.literal(x))), float(x))

    for x in ('-1.5', '0.0', '2.5'):
        for f in float_fns:
            self.assertEqual(hl.eval(f(hl.literal(x))), float(x))
        # Decimal strings are not valid integers, so the parsers yield missing.
        for f in parse_int_fns:
            self.assertEqual(hl.eval(f(hl.literal(x))), None)

    for x in ('abc', '1abc', ''):
        for f in parse_float_fns + parse_int_fns:
            self.assertEqual(hl.eval(f(hl.literal(x))), None)

def test_str_missingness(self):
self.assertEqual(hl.eval(hl.str(1)), '1')
self.assertEqual(hl.eval(hl.str(hl.null('int32'))), None)
Expand Down
66 changes: 42 additions & 24 deletions hail/src/main/scala/is/hail/expr/ir/functions/UtilFunctions.scala
Expand Up @@ -48,6 +48,25 @@ object UtilFunctions extends RegistryFunctions {
case _ => s.toDouble
}

/** Reports whether `s` matches "true" or "false" under `equalsCI`
  * (presumably a case-insensitive comparison; it is defined outside this view).
  */
def isValidBoolean(s: String): Boolean =
  if (s.equalsCI("true")) true
  else s.equalsCI("false")

/** Reports whether `s.toInt` succeeds, i.e. does not throw NumberFormatException. */
def isValidInt32(s: String): Boolean = {
  val parses =
    try {
      s.toInt
      true
    } catch {
      case _: NumberFormatException => false
    }
  parses
}

/** Reports whether `s.toLong` succeeds, i.e. does not throw NumberFormatException. */
def isValidInt64(s: String): Boolean = {
  val parses =
    try {
      s.toLong
      true
    } catch {
      case _: NumberFormatException => false
    }
  parses
}

/** Reports whether `s` can be converted to a Float.
  *
  * A nonzero result from `parseSpecialNum(s)` is accepted immediately
  * (presumably the nan/infinity spellings; the helper is defined outside this
  * view). Otherwise the string is valid exactly when `s.toFloat` does not
  * throw NumberFormatException.
  */
def isValidFloat32(s: String): Boolean =
  if (parseSpecialNum(s) != 0) true
  else
    try {
      s.toFloat
      true
    } catch {
      case _: NumberFormatException => false
    }

/** Reports whether `s` can be converted to a Double.
  *
  * A nonzero result from `parseSpecialNum(s)` is accepted immediately
  * (presumably the nan/infinity spellings -- TODO confirm against the helper,
  * which is defined outside this view). Otherwise the string is valid exactly
  * when `s.toDouble` does not throw NumberFormatException.
  */
def isValidFloat64(s: String): Boolean = parseSpecialNum(s) match {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're parsing twice here -- once for the check and once for the parse. That's not ideal, but I do see the reason why it's not easy to do it just once in generated code without boxing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah :I if we could catch exceptions in asm4s then that would be one nicer way to handle this.

// Ordinary number: the conversion is attempted only to see whether it throws; the value is discarded.
case 0 => try { s.toDouble; true } catch { case _: NumberFormatException => false }
case _ => true
}

/** Minimum of two possibly-missing ints, ignoring a missing operand.
  *
  * Returns `r` when `l` is missing (even if `r` is missing too), `l` when only
  * `r` is missing, and the smaller of the two when both are present.
  */
def min_ignore_missing(l: Int, lMissing: Boolean, r: Int, rMissing: Boolean): Int =
  (lMissing, rMissing) match {
    case (true, _) => r
    case (_, true) => l
    case _ => Math.min(l, r)
  }

Expand Down Expand Up @@ -120,30 +139,29 @@ object UtilFunctions extends RegistryFunctions {
// Boolean -> numeric conversions: each goes through x.toI and then applies toL / toF / toD
// to reach the registered return type (TInt64, TFloat32, TFloat64 respectively).
registerCode[Boolean]("toInt64", TBoolean(), TInt64(), null) { case (_, rt, (xT, x: Code[Boolean])) => x.toI.toL }
registerCode[Boolean]("toFloat32", TBoolean(), TFloat32(), null) { case (_, rt, (xT, x: Code[Boolean])) => x.toI.toF }
registerCode[Boolean]("toFloat64", TBoolean(), TFloat64(), null) { case (_, rt, (xT, x: Code[Boolean])) => x.toI.toD }
registerCode("toInt32", TString(), TInt32(), null) {
case (r, rt, (xT: PString, x: Code[Long])) =>
val s = asm4s.coerce[String](wrapArg(r, xT)(x))
Code.invokeScalaObject[String, Int](thisClass, "parseInt32", s)
}
registerCode("toInt64", TString(), TInt64(), null) {
case (r, rt, (xT: PString, x: Code[Long])) =>
val s = asm4s.coerce[String](wrapArg(r, xT)(x))
Code.invokeScalaObject[String, Long](thisClass, "parseInt64", s)
}
registerCode("toFloat32", TString(), TFloat32(), null) {
case (r, rt, (xT: PString, x: Code[Long])) =>
val s = asm4s.coerce[String](wrapArg(r, xT)(x))
Code.invokeScalaObject[String, Float](thisClass, "parseFloat32", s)
}
registerCode("toFloat64", TString(), TFloat64(), null) {
case (r, rt, (xT: PString, x: Code[Long])) =>
val s = asm4s.coerce[String](wrapArg(r, xT)(x))
Code.invokeScalaObject[String, Double](thisClass, "parseFloat64", s)
}
registerCode("toBoolean", TString(), TBoolean(), null) {
case (r, rt, (xT: PString, x: Code[Long])) =>
val s = asm4s.coerce[String](wrapArg(r, xT)(x))
Code.invokeScalaObject[String, Boolean](thisClass, "parseBoolean", s)

// Registers two string-conversion functions for each target type listed below:
//   to<Name>          -- invokes parse<Name> on thisClass with the input string
//   to<Name>OrMissing -- same value, but the result is flagged missing when the input is
//                        missing or isValid<Name> rejects the string
// `ct` is the ClassTag of the JVM return type, passed explicitly to invokeScalaObject.
for ((name, t, ct) <- Seq[(String, Type, ClassTag[_])](
("Boolean", TBoolean(), implicitly[ClassTag[Boolean]]),
("Int32", TInt32(), implicitly[ClassTag[Int]]),
("Int64", TInt64(), implicitly[ClassTag[Long]]),
("Float64", TFloat64(), implicitly[ClassTag[Double]]),
("Float32", TFloat32(), implicitly[ClassTag[Float]])
)) {
val ctString: ClassTag[String] = implicitly
registerCode(s"to$name", TString(), t, null) {
case (r, rt, (xT: PString, x: Code[Long])) =>
val s = asm4s.coerce[String](wrapArg(r, xT)(x))
Code.invokeScalaObject(thisClass, s"parse$name", s)(ctString, ct)
}
registerCodeWithMissingness(s"to${name}OrMissing", TString(), t, null) {
case (r, rt, (xT: PString, x: EmitTriplet)) =>
// Locals holding the decoded input string and the input's missingness.
val s = r.mb.newLocal[String]
val m = r.mb.newLocal[Boolean]
// NOTE(review): the three EmitTriplet arguments appear to be (setup, missing, value) -- confirm.
EmitTriplet(
// Run the argument's setup, record its missingness, and load the string only when it is not missing.
Code(x.setup, m := x.m, (!m).orEmpty(s := asm4s.coerce[String](wrapArg(r, xT)(x.v)))),
// `m ||` comes first, so isValid<Name> is only reached with a loaded string (assuming || short-circuits here).
(m || !Code.invokeScalaObject[String, Boolean](thisClass, s"isValid$name", s)),
// The string is handled twice: once by isValid<Name> above and once by parse<Name> here.
Code.invokeScalaObject(thisClass, s"parse$name", s)(ctString, ct))
}
}

Array(TInt32(), TInt64()).foreach { t =>
Expand Down