diff --git a/experimental/cuda2/graph_command_buffer.c b/experimental/cuda2/graph_command_buffer.c
index fd58217e2e44..fd4310fc1ce8 100644
--- a/experimental/cuda2/graph_command_buffer.c
+++ b/experimental/cuda2/graph_command_buffer.c
@@ -591,7 +591,7 @@ static iree_status_t iree_hal_cuda2_graph_command_buffer_push_descriptor_set(
           iree_hal_buffer_allocated_buffer(binding->buffer));
       iree_device_size_t offset = iree_hal_buffer_byte_offset(binding->buffer);
       device_ptr = device_buffer + offset + binding->offset;
-    };
+    }
     current_bindings[binding->binding] = device_ptr;
   }
 
@@ -665,6 +665,7 @@ static iree_status_t iree_hal_cuda2_graph_command_buffer_dispatch(
   iree_host_size_t set_count =
       iree_hal_cuda2_pipeline_layout_descriptor_set_count(kernel_info.layout);
   for (iree_host_size_t i = 0; i < set_count; ++i) {
+    // TODO: cache this information in the kernel info to avoid recomputation.
     iree_host_size_t binding_count =
         iree_hal_cuda2_descriptor_set_layout_binding_count(
             iree_hal_cuda2_pipeline_layout_descriptor_set_layout(
@@ -678,6 +679,10 @@ static iree_status_t iree_hal_cuda2_graph_command_buffer_dispatch(
   // Append the push constants to the kernel arguments.
   iree_host_size_t base_index =
       iree_hal_cuda2_pipeline_layout_push_constant_index(kernel_info.layout);
+  // As commented above, each kernel parameter points to a CUdeviceptr, which
+  // has the size of a pointer on the target machine. Here we are storing just
+  // a 32-bit value for the push constant instead, so we must process one
+  // element at a time on 64-bit machines.
   for (iree_host_size_t i = 0; i < push_constant_count; i++) {
     *((uint32_t*)params_ptr[base_index + i]) =
         command_buffer->push_constants[i];
diff --git a/experimental/cuda2/stream_command_buffer.c b/experimental/cuda2/stream_command_buffer.c
index 9fe5bd9a2bd9..f43f6f0470d9 100644
--- a/experimental/cuda2/stream_command_buffer.c
+++ b/experimental/cuda2/stream_command_buffer.c
@@ -491,7 +491,7 @@ static iree_status_t iree_hal_cuda2_stream_command_buffer_push_descriptor_set(
           iree_hal_buffer_allocated_buffer(binding->buffer));
       iree_device_size_t offset = iree_hal_buffer_byte_offset(binding->buffer);
       device_ptr = device_buffer + offset + binding->offset;
-    };
+    }
     current_bindings[binding->binding] = device_ptr;
   }
 
@@ -571,6 +571,7 @@ static iree_status_t iree_hal_cuda2_stream_command_buffer_dispatch(
   iree_host_size_t set_count =
       iree_hal_cuda2_pipeline_layout_descriptor_set_count(kernel_info.layout);
   for (iree_host_size_t i = 0; i < set_count; ++i) {
+    // TODO: cache this information in the kernel info to avoid recomputation.
     iree_host_size_t binding_count =
         iree_hal_cuda2_descriptor_set_layout_binding_count(
             iree_hal_cuda2_pipeline_layout_descriptor_set_layout(
@@ -584,6 +585,10 @@ static iree_status_t iree_hal_cuda2_stream_command_buffer_dispatch(
   // Append the push constants to the kernel arguments.
   iree_host_size_t base_index =
       iree_hal_cuda2_pipeline_layout_push_constant_index(kernel_info.layout);
+  // As commented above, each kernel parameter points to a CUdeviceptr, which
+  // has the size of a pointer on the target machine. Here we are storing just
+  // a 32-bit value for the push constant instead, so we must process one
+  // element at a time on 64-bit machines.
   for (iree_host_size_t i = 0; i < push_constant_count; i++) {
     *((uint32_t*)params_ptr[base_index + i]) =
         command_buffer->push_constants[i];