inducer · inducer · Jun 27, 2024 · Jun 26, 2024 · Jun 26, 2024 · Jun 26, 2024
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
@@ -505,7 +505,7 @@ ambiguous.
     ...
       for (int j = 0; j <= -1 + n; ++j)
         for (int i = 0; i <= -1 + n; ++i)
-          a[n * i + j] = 0.0f;
+          a[n * i + j] = (float) (0.0f);
     ...
 
 No more warnings! Loop nesting is also reflected in the dependency graph:
@@ -563,7 +563,7 @@ Consider this example:
     ...
       for (int i_outer = 0; i_outer <= -1 + (15 + n) / 16; ++i_outer)
         for (int i_inner = 0; i_inner <= ((-17 + n + -16 * i_outer >= 0) ? 15 : -1 + n + -16 * i_outer); ++i_inner)
-          a[16 * i_outer + i_inner] = 0.0f;
+          a[16 * i_outer + i_inner] = (float) (0.0f);
     ...
 
 By default, the new, split inames are named *OLD_outer* and *OLD_inner*,
@@ -594,7 +594,7 @@ relation to loop nesting. For example, it's perfectly possible to request
     ...
       for (int i_inner = 0; i_inner <= ((-17 + n >= 0) ? 15 : -1 + n); ++i_inner)
         for (int i_outer = 0; i_outer <= -1 + -1 * i_inner + (15 + n + 15 * i_inner) / 16; ++i_outer)
-          a[16 * i_outer + i_inner] = 0.0f;
+          a[16 * i_outer + i_inner] = (float) (0.0f);
     ...
 
 Notice how loopy has automatically generated guard conditionals to make
@@ -662,10 +662,10 @@ loop's tag to ``"unr"``:
     ...
       for (int i_outer = 0; i_outer <= loopy_floor_div_pos_b_int32(-4 + n, 4); ++i_outer)
       {
-        a[4 * i_outer] = 0.0f;
-        a[1 + 4 * i_outer] = 0.0f;
-        a[2 + 4 * i_outer] = 0.0f;
-        a[3 + 4 * i_outer] = 0.0f;
+        a[4 * i_outer] = (float) (0.0f);
+        a[1 + 4 * i_outer] = (float) (0.0f);
+        a[2 + 4 * i_outer] = (float) (0.0f);
+        a[3 + 4 * i_outer] = (float) (0.0f);
       }
     ...
 
@@ -737,7 +737,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
     {
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
-        a[128 * gid(0) + lid(0)] = 0.0f;
+        a[128 * gid(0) + lid(0)] = (float) (0.0f);
     }
 
 Loopy requires that workgroup sizes are fixed and constant at compile time.
@@ -782,13 +782,13 @@ assumption:
     ...
       for (int i_outer = 0; i_outer <= -1 + (3 + n) / 4; ++i_outer)
       {
-        a[4 * i_outer] = 0.0f;
+        a[4 * i_outer] = (float) (0.0f);
         if (-2 + -4 * i_outer + n >= 0)
-          a[1 + 4 * i_outer] = 0.0f;
+          a[1 + 4 * i_outer] = (float) (0.0f);
         if (-3 + -4 * i_outer + n >= 0)
-          a[2 + 4 * i_outer] = 0.0f;
+          a[2 + 4 * i_outer] = (float) (0.0f);
         if (-4 + -4 * i_outer + n >= 0)
-          a[3 + 4 * i_outer] = 0.0f;
+          a[3 + 4 * i_outer] = (float) (0.0f);
       }
     ...
 
@@ -812,24 +812,24 @@ enabling some cost savings:
       /* bulk slab for 'i_outer' */
       for (int i_outer = 0; i_outer <= -2 + (3 + n) / 4; ++i_outer)
       {
-        a[4 * i_outer] = 0.0f;
-        a[1 + 4 * i_outer] = 0.0f;
-        a[2 + 4 * i_outer] = 0.0f;
-        a[3 + 4 * i_outer] = 0.0f;
+        a[4 * i_outer] = (float) (0.0f);
+        a[1 + 4 * i_outer] = (float) (0.0f);
+        a[2 + 4 * i_outer] = (float) (0.0f);
+        a[3 + 4 * i_outer] = (float) (0.0f);
       }
       /* final slab for 'i_outer' */
       {
         int const i_outer = -1 + n + -1 * ((3 * n) / 4);
     <BLANKLINE>
         if (-1 + n >= 0)
         {
-          a[4 * i_outer] = 0.0f;
+          a[4 * i_outer] = (float) (0.0f);
           if (-2 + -4 * i_outer + n >= 0)
-            a[1 + 4 * i_outer] = 0.0f;
+            a[1 + 4 * i_outer] = (float) (0.0f);
           if (-3 + -4 * i_outer + n >= 0)
-            a[2 + 4 * i_outer] = 0.0f;
+            a[2 + 4 * i_outer] = (float) (0.0f);
           if (4 + 4 * i_outer + -1 * n == 0)
-            a[3 + 4 * i_outer] = 0.0f;
+            a[3 + 4 * i_outer] = (float) (0.0f);
         }
       }
     ...

diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
@@ -40,7 +40,7 @@
 from loopy.symbolic import IdentityMapper
 from loopy.target.execution import ExecutorBase
 from loopy.translation_unit import FunctionIdT, TranslationUnit
-from loopy.types import NumpyType, LoopyType
+from loopy.types import NumpyType, LoopyType, to_loopy_type
 from loopy.typing import ExpressionT
 from loopy.kernel import LoopKernel
 from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
@@ -120,7 +120,10 @@ def map_nan(self, expr):
 
 def c99_preamble_generator(preamble_info):
     if any(dtype.is_integral() for dtype in preamble_info.seen_dtypes):
-        yield ("10_stdint", "#include <stdint.h>")
+        yield ("10_stdint", """
+            #include <stdint.h>
+            #include <stdbool.h>
+            """)
     if any(dtype.numpy_dtype == np.dtype("bool")
            for dtype in preamble_info.seen_dtypes
            if isinstance(dtype, NumpyType)):
@@ -525,16 +528,19 @@ def with_types(self, arg_id_to_dtype, callables_table):
                     dtype))
 
             if name in ["abs", "real", "imag"]:
-                dtype = real_dtype
+                result_dtype = real_dtype
+            else:
+                result_dtype = dtype
 
-            if dtype.kind == "c" or name in ["real", "imag", "abs"]:
+            if dtype.kind == "c":
                 if name != "conj":
                     name = "c" + name
 
             return (
                     self.copy(name_in_target=name,
-                        arg_id_to_dtype={0: NumpyType(dtype), -1:
-                            NumpyType(dtype)}),
+                        arg_id_to_dtype={
+                            0: NumpyType(dtype),
+                            -1: NumpyType(result_dtype)}),
                     callables_table)
 
         # binary functions
@@ -1134,6 +1140,10 @@ def emit_assignment(self, codegen_state, insn):
 
         lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
         rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
+
+        if isinstance(insn.assignee, p.Lookup):
+            lhs_dtype = to_loopy_type(lhs_dtype.numpy_dtype[insn.assignee.name])
+
         if lhs_atomicity is None:
             from cgen import Assign
             return Assign(

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
@@ -21,6 +21,7 @@
 """
 
 
+from typing import Optional
 import numpy as np
 
 from pymbolic.mapper import RecursiveMapper, IdentityMapper
@@ -42,6 +43,7 @@
 from loopy.types import LoopyType
 from loopy.target.c import CExpression
 from loopy.typing import ExpressionT
+from loopy.symbolic import TypeCast
 
 
 __doc__ = """
@@ -108,24 +110,23 @@ def find_array(self, expr):
 
         return ary
 
-    def wrap_in_typecast(self, actual_type, needed_dtype, s):
-        return s
+    def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
+        if actual_type != needed_type:
+            registry = self.codegen_state.ast_builder.target.get_dtype_registry()
+            cast = var("(%s) " % registry.dtype_to_ctype(needed_type))
+            return cast(s)
 
-    def wrap_in_typecast_lazy(self, actual_type_func, needed_dtype, s):
-        """This is similar to *wrap_in_typecast*, but takes a function for
-        the actual type argument instead of a type. This can be helpful
-        when actual type argument is expensive to calculate and is not
-        needed in some cases.
-        """
         return s
 
-    def rec(self, expr, type_context=None, needed_dtype=None):
-        if needed_dtype is None:
-            return RecursiveMapper.rec(self, expr, type_context)
+    def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None):
+        result = RecursiveMapper.rec(self, expr, type_context)
 
-        return self.wrap_in_typecast_lazy(
-                lambda: self.infer_type(expr), needed_dtype,
-                RecursiveMapper.rec(self, expr, type_context))
+        if needed_type is None:
+            return result
+        else:
+            return self.wrap_in_typecast(
+                    self.infer_type(expr), needed_type,
+                    result)
 
     def __call__(self, expr, prec=None, type_context=None, needed_dtype=None):
         if prec is None:
@@ -415,10 +416,8 @@ def map_comparison(self, expr, type_context):
                     expr.operator,
                     self.rec(expr.right, inner_type_context))
 
-    def map_type_cast(self, expr, type_context):
-        registry = self.codegen_state.ast_builder.target.get_dtype_registry()
-        cast = var("(%s)" % registry.dtype_to_ctype(expr.type))
-        return cast(self.rec(expr.child, type_context))
+    def map_type_cast(self, expr: TypeCast, type_context: str):
+        return self.rec(expr.child, type_context, expr.type)
 
     def map_constant(self, expr, type_context):
         from loopy.symbolic import Literal

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
@@ -521,13 +521,13 @@ def opencl_preamble_generator(preamble_info):
 
 class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
 
-    def wrap_in_typecast_lazy(self, actual_dtype, needed_dtype, s):
-        if needed_dtype.dtype.kind == "b" and actual_dtype().dtype.kind == "f":
+    def wrap_in_typecast(self, actual_type, needed_dtype, s):
+        if needed_dtype.dtype.kind == "b" and actual_type.dtype.kind == "f":
             # CL does not perform implicit conversion from float-type to a bool.
             from pymbolic.primitives import Comparison
             return Comparison(s, "!=", 0)
 
-        return super().wrap_in_typecast_lazy(actual_dtype, needed_dtype, s)
+        return super().wrap_in_typecast(actual_type, needed_dtype, s)
 
     def map_group_hw_index(self, expr, type_context):
         return var("gid")(expr.axis)

diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+
 """OpenCL target integrated with PyOpenCL."""
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -209,23 +210,15 @@ def complex_type_name(self, dtype):
         else:
             raise RuntimeError
 
-    def wrap_in_typecast_lazy(self, actual_type_func, needed_dtype, s):
-        if needed_dtype.is_complex():
-            return self.wrap_in_typecast(actual_type_func(), needed_dtype, s)
-        else:
-            return super().wrap_in_typecast_lazy(actual_type_func,
-                                                 needed_dtype, s)
-
-    def wrap_in_typecast(self, actual_type, needed_dtype, s):
-        if (actual_type.is_complex() and needed_dtype.is_complex()
-                and actual_type != needed_dtype):
-            return p.Variable("%s_cast" % self.complex_type_name(needed_dtype))(s)
-        elif not actual_type.is_complex() and needed_dtype.is_complex():
-            return p.Variable("%s_fromreal" % self.complex_type_name(needed_dtype))(
+    def wrap_in_typecast(self, actual_type, needed_type, s):
+        if (actual_type.is_complex() and needed_type.is_complex()
+                and actual_type != needed_type):
+            return p.Variable("%s_cast" % self.complex_type_name(needed_type))(s)
+        elif not actual_type.is_complex() and needed_type.is_complex():
+            return p.Variable("%s_fromreal" % self.complex_type_name(needed_type))(
                     s)
         else:
-            return super().wrap_in_typecast_lazy(actual_type,
-                                                 needed_dtype, s)
+            return super().wrap_in_typecast(actual_type, needed_type, s)
 
     def map_sum(self, expr, type_context):
         # I've added 'type_context == "i"' because of the following

diff --git a/loopy/type_inference.py b/loopy/type_inference.py
@@ -522,22 +522,20 @@ def map_lookup(self, expr):
         return [NumpyType(dtype)]
 
     def map_comparison(self, expr):
-        # "bool" is unusable because OpenCL's bool has indeterminate memory
-        # format.
         self(expr.left, return_tuple=False, return_dtype_set=False)
         self(expr.right, return_tuple=False, return_dtype_set=False)
-        return [NumpyType(np.dtype(np.int32))]
+        return [NumpyType(np.dtype(np.bool_))]
 
     def map_logical_not(self, expr):
         self.rec(expr.child)
 
-        return [NumpyType(np.dtype(np.int32))]
+        return [NumpyType(np.dtype(np.bool_))]
 
     def map_logical_and(self, expr):
         for child in expr.children:
             self.rec(child)
 
-        return [NumpyType(np.dtype(np.int32))]
+        return [NumpyType(np.dtype(np.bool_))]
 
     map_logical_or = map_logical_and
 

diff --git a/test/test_expression.py b/test/test_expression.py
@@ -557,8 +557,9 @@ def test_complex_support(ctx_factory, target):
 
     kwargs = {"in1": in1, "in2": in2}
 
+    knl = lp.set_options(knl, write_code=True)
+
     if target == lp.PyOpenCLTarget:
-        knl = lp.set_options(knl, write_code=True)
         cl_ctx = ctx_factory()
         with cl.CommandQueue(cl_ctx) as queue:
             evt, out = knl(queue, **kwargs)

diff --git a/test/test_target.py b/test/test_target.py
@@ -431,7 +431,7 @@ def test_nan_support(ctx_factory, target):
 
 
 @pytest.mark.parametrize("target", [lp.PyOpenCLTarget, lp.ExecutableCTarget])
-def test_opencl_emits_ternary_operators_correctly(ctx_factory, target):
+def test_emits_ternary_operators_correctly(ctx_factory, target):
     # See: https://github.com/inducer/loopy/issues/390
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -805,6 +805,36 @@ def test_ispc_private_var():
     print(cg_result.device_code())
 
 
+def test_to_complex_casts(ctx_factory):
+    arith_dtypes = "bhilqpBHILQPfdFD"
+
+    out_type = lp.to_loopy_type(np.dtype(np.complex128))
+    other = np.complex64(7)
+    from pymbolic import var
+
+    knl = lp.make_kernel(
+        [],
+        [
+            lp.Assignment(
+                f"out_{typename}",
+                lp.TypeCast(out_type, var(f"in_{typename}"))
+                +
+                lp.TypeCast(out_type, other)
+            )
+            for typename in arith_dtypes
+        ],
+        [
+            lp.GlobalArg(f"in_{typename}", dtype=np.dtype(typename), shape=())
+            for typename in arith_dtypes
+        ] + [...]
+    )
+
+    ctx = ctx_factory()
+    code = lp.generate_code_v2(knl).device_code()
+    # just testing here that the generated code builds
+    cl.Program(ctx, code).build()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])