Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test, fix complex-valued TypeCast in PyOpenCL #846

Merged
merged 9 commits into from
Jun 27, 2024
40 changes: 20 additions & 20 deletions doc/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ ambiguous.
...
for (int j = 0; j <= -1 + n; ++j)
for (int i = 0; i <= -1 + n; ++i)
a[n * i + j] = 0.0f;
a[n * i + j] = (float) (0.0f);
...

No more warnings! Loop nesting is also reflected in the dependency graph:
Expand Down Expand Up @@ -563,7 +563,7 @@ Consider this example:
...
for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
a[16 * i_outer + i_inner] = 0.0f;
a[16 * i_outer + i_inner] = (float) (0.0f);
...

By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
Expand Down Expand Up @@ -594,7 +594,7 @@ relation to loop nesting. For example, it's perfectly possible to request
...
for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
a[16 * i_outer + i_inner] = 0.0f;
a[16 * i_outer + i_inner] = (float) (0.0f);
...

Notice how loopy has automatically generated guard conditionals to make
Expand Down Expand Up @@ -662,10 +662,10 @@ loop's tag to ``"unr"``:
...
for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
...

Expand Down Expand Up @@ -737,7 +737,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
__kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
{
if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
a[128 * gid(0) + lid(0)] = 0.0f;
a[128 * gid(0) + lid(0)] = (float) (0.0f);
}

Loopy requires that workgroup sizes are fixed and constant at compile time.
Expand Down Expand Up @@ -782,13 +782,13 @@ assumption:
...
for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[1 + 4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[2 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (-4 + -4 * i_outer + n >= 0)
a[3 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
...

Expand All @@ -812,24 +812,24 @@ enabling some cost savings:
/* bulk slab for 'i_outer' */
for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
{
a[4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
a[1 + 4 * i_outer] = (float) (0.0f);
a[2 + 4 * i_outer] = (float) (0.0f);
a[3 + 4 * i_outer] = (float) (0.0f);
}
/* final slab for 'i_outer' */
{
int const i_outer = -1 + n + -1 * ((3 * n) / 4);
<BLANKLINE>
if (-1 + n >= 0)
{
a[4 * i_outer] = 0.0f;
a[4 * i_outer] = (float) (0.0f);
if (-2 + -4 * i_outer + n >= 0)
a[1 + 4 * i_outer] = 0.0f;
a[1 + 4 * i_outer] = (float) (0.0f);
if (-3 + -4 * i_outer + n >= 0)
a[2 + 4 * i_outer] = 0.0f;
a[2 + 4 * i_outer] = (float) (0.0f);
if (4 + 4 * i_outer + -1 * n == 0)
a[3 + 4 * i_outer] = 0.0f;
a[3 + 4 * i_outer] = (float) (0.0f);
}
}
...
Expand Down
22 changes: 16 additions & 6 deletions loopy/target/c/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from loopy.symbolic import IdentityMapper
from loopy.target.execution import ExecutorBase
from loopy.translation_unit import FunctionIdT, TranslationUnit
from loopy.types import NumpyType, LoopyType
from loopy.types import NumpyType, LoopyType, to_loopy_type
from loopy.typing import ExpressionT
from loopy.kernel import LoopKernel
from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
Expand Down Expand Up @@ -120,7 +120,10 @@ def map_nan(self, expr):

def c99_preamble_generator(preamble_info):
if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes):
yield ("10_stdint", "#include <stdint.h>")
yield ("10_stdint", """
#include <stdint.h>
#include <stdbool.h>
""")
if any(dtype.numpy_dtype == np.dtype("bool")
for dtype in preamble_info.seen_dtypes
if isinstance(dtype, NumpyType)):
Expand Down Expand Up @@ -525,16 +528,19 @@ def with_types(self, arg_id_to_dtype, callables_table):
dtype))

if name in ["abs", "real", "imag"]:
dtype = real_dtype
result_dtype = real_dtype
else:
result_dtype = dtype

if dtype.kind == "c" or name in ["real", "imag", "abs"]:
if dtype.kind == "c":
if name != "conj":
name = "c" + name

return (
self.copy(name_in_target=name,
arg_id_to_dtype={0: NumpyType(dtype), -1:
NumpyType(dtype)}),
arg_id_to_dtype={
0: NumpyType(dtype),
-1: NumpyType(result_dtype)}),
callables_table)

# binary functions
Expand Down Expand Up @@ -1134,6 +1140,10 @@ def emit_assignment(self, codegen_state, insn):

lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)

if isinstance(insn.assignee, p.Lookup):
lhs_dtype = to_loopy_type(lhs_dtype.numpy_dtype[insn.assignee.name])

if lhs_atomicity is None:
from cgen import Assign
return Assign(
Expand Down
35 changes: 17 additions & 18 deletions loopy/target/c/codegen/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"""


from typing import Optional
import numpy as np

from pymbolic.mapper import RecursiveMapper, IdentityMapper
Expand All @@ -42,6 +43,7 @@
from loopy.types import LoopyType
from loopy.target.c import CExpression
from loopy.typing import ExpressionT
from loopy.symbolic import TypeCast


__doc__ = """
Expand Down Expand Up @@ -108,24 +110,23 @@ def find_array(self, expr):

return ary

def wrap_in_typecast(self, actual_type, needed_dtype, s):
return s
def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
if actual_type != needed_type:
registry = self.codegen_state.ast_builder.target.get_dtype_registry()
cast = var("(%s) " % registry.dtype_to_ctype(needed_type))
return cast(s)

def wrap_in_typecast_lazy(self, actual_type_func, needed_dtype, s):
"""This is similar to *wrap_in_typecast*, but takes a function for
the actual type argument instead of a type. This can be helpful
when actual type argument is expensive to calculate and is not
needed in some cases.
"""
return s

def rec(self, expr, type_context=None, needed_dtype=None):
if needed_dtype is None:
return RecursiveMapper.rec(self, expr, type_context)
def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None):
result = RecursiveMapper.rec(self, expr, type_context)

return self.wrap_in_typecast_lazy(
lambda: self.infer_type(expr), needed_dtype,
RecursiveMapper.rec(self, expr, type_context))
if needed_type is None:
return result
else:
return self.wrap_in_typecast(
self.infer_type(expr), needed_type,
result)

def __call__(self, expr, prec=None, type_context=None, needed_dtype=None):
if prec is None:
Expand Down Expand Up @@ -415,10 +416,8 @@ def map_comparison(self, expr, type_context):
expr.operator,
self.rec(expr.right, inner_type_context))

def map_type_cast(self, expr, type_context):
registry = self.codegen_state.ast_builder.target.get_dtype_registry()
cast = var("(%s)" % registry.dtype_to_ctype(expr.type))
return cast(self.rec(expr.child, type_context))
def map_type_cast(self, expr: TypeCast, type_context: str):
return self.rec(expr.child, type_context, expr.type)

def map_constant(self, expr, type_context):
from loopy.symbolic import Literal
Expand Down
6 changes: 3 additions & 3 deletions loopy/target/opencl.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,13 +521,13 @@ def opencl_preamble_generator(preamble_info):

class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):

def wrap_in_typecast_lazy(self, actual_dtype, needed_dtype, s):
if needed_dtype.dtype.kind == "b" and actual_dtype().dtype.kind == "f":
def wrap_in_typecast(self, actual_type, needed_dtype, s):
if needed_dtype.dtype.kind == "b" and actual_type.dtype.kind == "f":
# CL does not perform implicit conversion from float-type to a bool.
from pymbolic.primitives import Comparison
return Comparison(s, "!=", 0)

return super().wrap_in_typecast_lazy(actual_dtype, needed_dtype, s)
return super().wrap_in_typecast(actual_type, needed_dtype, s)

def map_group_hw_index(self, expr, type_context):
return var("gid")(expr.axis)
Expand Down
23 changes: 8 additions & 15 deletions loopy/target/pyopencl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations

"""OpenCL target integrated with PyOpenCL."""

__copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
Expand Down Expand Up @@ -209,23 +210,15 @@ def complex_type_name(self, dtype):
else:
raise RuntimeError

def wrap_in_typecast_lazy(self, actual_type_func, needed_dtype, s):
if needed_dtype.is_complex():
return self.wrap_in_typecast(actual_type_func(), needed_dtype, s)
else:
return super().wrap_in_typecast_lazy(actual_type_func,
needed_dtype, s)

def wrap_in_typecast(self, actual_type, needed_dtype, s):
if (actual_type.is_complex() and needed_dtype.is_complex()
and actual_type != needed_dtype):
return p.Variable("%s_cast" % self.complex_type_name(needed_dtype))(s)
elif not actual_type.is_complex() and needed_dtype.is_complex():
return p.Variable("%s_fromreal" % self.complex_type_name(needed_dtype))(
def wrap_in_typecast(self, actual_type, needed_type, s):
if (actual_type.is_complex() and needed_type.is_complex()
and actual_type != needed_type):
return p.Variable("%s_cast" % self.complex_type_name(needed_type))(s)
elif not actual_type.is_complex() and needed_type.is_complex():
return p.Variable("%s_fromreal" % self.complex_type_name(needed_type))(
s)
else:
return super().wrap_in_typecast_lazy(actual_type,
needed_dtype, s)
return super().wrap_in_typecast(actual_type, needed_type, s)

def map_sum(self, expr, type_context):
# I've added 'type_context == "i"' because of the following
Expand Down
8 changes: 3 additions & 5 deletions loopy/type_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,22 +522,20 @@ def map_lookup(self, expr):
return [NumpyType(dtype)]

def map_comparison(self, expr):
# "bool" is unusable because OpenCL's bool has indeterminate memory
# format.
self(expr.left, return_tuple=False, return_dtype_set=False)
self(expr.right, return_tuple=False, return_dtype_set=False)
return [NumpyType(np.dtype(np.int32))]
return [NumpyType(np.dtype(np.bool_))]

def map_logical_not(self, expr):
self.rec(expr.child)

return [NumpyType(np.dtype(np.int32))]
return [NumpyType(np.dtype(np.bool_))]

def map_logical_and(self, expr):
for child in expr.children:
self.rec(child)

return [NumpyType(np.dtype(np.int32))]
return [NumpyType(np.dtype(np.bool_))]

map_logical_or = map_logical_and

Expand Down
3 changes: 2 additions & 1 deletion test/test_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,8 +557,9 @@ def test_complex_support(ctx_factory, target):

kwargs = {"in1": in1, "in2": in2}

knl = lp.set_options(knl, write_code=True)

if target == lp.PyOpenCLTarget:
knl = lp.set_options(knl, write_code=True)
cl_ctx = ctx_factory()
with cl.CommandQueue(cl_ctx) as queue:
evt, out = knl(queue, **kwargs)
Expand Down
32 changes: 31 additions & 1 deletion test/test_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ def test_nan_support(ctx_factory, target):


@pytest.mark.parametrize("target", [lp.PyOpenCLTarget, lp.ExecutableCTarget])
def test_opencl_emits_ternary_operators_correctly(ctx_factory, target):
def test_emits_ternary_operators_correctly(ctx_factory, target):
# See: https://github.com/inducer/loopy/issues/390
ctx = ctx_factory()
queue = cl.CommandQueue(ctx)
Expand Down Expand Up @@ -805,6 +805,36 @@ def test_ispc_private_var():
print(cg_result.device_code())


def test_to_complex_casts(ctx_factory):
arith_dtypes = "bhilqpBHILQPfdFD"

out_type = lp.to_loopy_type(np.dtype(np.complex128))
other = np.complex64(7)
from pymbolic import var

knl = lp.make_kernel(
[],
[
lp.Assignment(
f"out_{typename}",
lp.TypeCast(out_type, var(f"in_{typename}"))
+
lp.TypeCast(out_type, other)
)
for typename in arith_dtypes
],
[
lp.GlobalArg(f"in_{typename}", dtype=np.dtype(typename), shape=())
for typename in arith_dtypes
] + [...]
)

ctx = ctx_factory()
code = lp.generate_code_v2(knl).device_code()
# just testing here that the generated code builds
cl.Program(ctx, code).build()


if __name__ == "__main__":
if len(sys.argv) > 1:
exec(sys.argv[1])
Expand Down
Loading