From 3adaa9c562af58eafc9ddbad14850ce2329aa8c2 Mon Sep 17 00:00:00 2001 From: Juniper Tyree Date: Sat, 3 Dec 2022 14:13:25 -0800 Subject: [PATCH] Some progress on shared slices --- src/lib.rs | 1 + src/safety/stack_only.rs | 6 +- src/utils/shared/mod.rs | 2 +- src/utils/shared/slice.rs | 160 +++++++++++++++++++++++++++---------- src/utils/shared/static.rs | 72 ++++++++--------- 5 files changed, 160 insertions(+), 81 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 3185fbbd..f496bb3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ any(all(not(feature = "host"), target_os = "cuda"), doc), feature(asm_const) )] +#![cfg_attr(target_os = "cuda", feature(ptr_metadata))] #![cfg_attr(any(feature = "alloc", doc), feature(allocator_api))] #![feature(doc_cfg)] #![feature(cfg_version)] diff --git a/src/safety/stack_only.rs b/src/safety/stack_only.rs index ce8887bb..eb3a6970 100644 --- a/src/safety/stack_only.rs +++ b/src/safety/stack_only.rs @@ -37,8 +37,10 @@ mod sealed { impl !StackOnly for &mut T {} impl !StackOnly for crate::utils::shared::r#static::ThreadBlockShared {} - // impl !StackOnly for - // crate::utils::shared::slice::ThreadBlockSharedSlice {} + impl !StackOnly + for crate::utils::shared::slice::ThreadBlockSharedSlice + { + } impl StackOnly for core::marker::PhantomData {} } diff --git a/src/utils/shared/mod.rs b/src/utils/shared/mod.rs index dcfe3b00..88a586ad 100644 --- a/src/utils/shared/mod.rs +++ b/src/utils/shared/mod.rs @@ -1,2 +1,2 @@ -// pub mod slice; +pub mod slice; pub mod r#static; diff --git a/src/utils/shared/slice.rs b/src/utils/shared/slice.rs index 098670fb..238b1aac 100644 --- a/src/utils/shared/slice.rs +++ b/src/utils/shared/slice.rs @@ -1,73 +1,151 @@ +#[cfg(not(target_os = "cuda"))] +use core::marker::PhantomData; + +use const_type_layout::TypeGraphLayout; use rustacuda_core::DeviceCopy; +use crate::common::{CudaAsRust, DeviceAccessible, RustToCuda}; + +#[cfg(not(target_os = "cuda"))] +#[allow(clippy::module_name_repetitions)] +#[repr(transparent)] +pub struct ThreadBlockSharedSlice { + len: usize, + marker: PhantomData, +} + +#[cfg(target_os = "cuda")] #[allow(clippy::module_name_repetitions)] +#[repr(transparent)] +pub struct ThreadBlockSharedSlice { + shared: *mut [T], +} + +#[doc(hidden)] #[derive(TypeLayout)] +#[layout(bound = "T: 'static + ~const TypeGraphLayout")] #[repr(C)] -pub struct ThreadBlockSharedSlice { +pub struct ThreadBlockSharedSliceCudaRepresentation { len: usize, - byte_offset: usize, + // Note: uses a zero-element array instead of PhantomData here so that + // TypeLayout can still observe T's layout marker: [T; 0], } -unsafe impl DeviceCopy for ThreadBlockSharedSlice {} +unsafe impl DeviceCopy + for ThreadBlockSharedSliceCudaRepresentation +{ +} -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockSharedSlice { +// #[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] +// #[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] +impl ThreadBlockSharedSlice { + #[cfg(any(not(target_os = "cuda"), doc))] + #[doc(cfg(not(target_os = "cuda")))] #[must_use] - pub fn with_len(len: usize) -> Self { + pub fn new_uninit_with_len(len: usize) -> Self { Self { len, - byte_offset: 0, - marker: [], + marker: PhantomData::, } } + #[cfg(not(target_os = "cuda"))] #[must_use] pub fn len(&self) -> usize { self.len } + #[cfg(target_os = "cuda")] + #[must_use] + pub fn len(&self) -> usize { + core::ptr::metadata(self.shared) + } + #[must_use] pub fn is_empty(&self) -> bool { - self.len == 0 + self.len() == 0 + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_slice_ptr(&self) -> *mut [T] { + self.shared + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_ptr(&self) -> *mut T { + self.shared.cast() } } -#[cfg(all(not(feature = "host"), target_os = "cuda"))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -impl ThreadBlockSharedSlice { - /// # Safety - /// - /// The thread-block shared dynamic memory must be initialised once and - /// only once per kernel. - pub unsafe fn init() { - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", - align = const(core::mem::align_of::()), - ); - } +unsafe impl RustToCuda for ThreadBlockSharedSlice { + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + type CudaAllocation = crate::host::NullCudaAlloc; + type CudaRepresentation = ThreadBlockSharedSliceCudaRepresentation; + + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn borrow( + &self, + alloc: A, + ) -> rustacuda::error::CudaResult<( + DeviceAccessible, + crate::host::CombinedCudaAlloc, + )> { + Ok(( + DeviceAccessible::from(ThreadBlockSharedSliceCudaRepresentation { + len: self.len, + marker: [], + }), + crate::host::CombinedCudaAlloc::new(crate::host::NullCudaAlloc, alloc), + )) } - /// # Safety - /// - /// Exposing the [`ThreadBlockSharedSlice`] must be preceded by exactly one - /// call to [`ThreadBlockSharedSlice::init`] for the type `T` amongst - /// all `ThreadBlockSharedSlice` that has the largest alignment. - pub unsafe fn with_uninit Q, Q>(self, inner: F) -> Q { - let base: *mut u8; - - unsafe { - core::arch::asm!( - "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", - reg = out(reg64) base, - ); - } + #[cfg(feature = "host")] + #[doc(cfg(feature = "host"))] + unsafe fn restore( + &mut self, + alloc: crate::host::CombinedCudaAlloc, + ) -> rustacuda::error::CudaResult { + let (_null, alloc): (crate::host::NullCudaAlloc, A) = alloc.split(); + + Ok(alloc) + } +} + +unsafe impl CudaAsRust + for ThreadBlockSharedSliceCudaRepresentation +{ + type RustRepresentation = ThreadBlockSharedSlice; + + #[cfg(any(not(feature = "host"), doc))] + #[doc(cfg(not(feature = "host")))] + unsafe fn as_rust(_this: &DeviceAccessible) -> Self::RustRepresentation { + todo!() + + // unsafe { + // core::arch::asm!( + // ".shared .align {align} .b8 rust_cuda_dynamic_shared[];", + // align = const(core::mem::align_of::()), + // ); + // } + + // let base: *mut u8; - let slice = - core::ptr::slice_from_raw_parts_mut(base.add(self.byte_offset).cast(), self.len); + // unsafe { + // core::arch::asm!( + // "cvta.shared.u64 {reg}, rust_cuda_dynamic_shared;", + // reg = out(reg64) base, + // ); + // } - inner(slice) + // let slice = core::ptr::slice_from_raw_parts_mut( + // base.add(self.byte_offset).cast(), self.len, + // ); } } diff --git a/src/utils/shared/static.rs b/src/utils/shared/static.rs index fc3e86b3..b93e2452 100644 --- a/src/utils/shared/static.rs +++ b/src/utils/shared/static.rs @@ -29,6 +29,41 @@ pub struct ThreadBlockSharedCudaRepresentation { unsafe impl DeviceCopy for ThreadBlockSharedCudaRepresentation {} +impl ThreadBlockShared { + #[cfg(not(target_os = "cuda"))] + #[must_use] + pub fn new_uninit() -> Self { + Self { + marker: PhantomData::, + } + } + + #[cfg(target_os = "cuda")] + #[must_use] + pub fn new_uninit() -> Self { + let shared: *mut T; + + unsafe { + core::arch::asm!( + ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", + "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", + reg = out(reg64) shared, + align = const(core::mem::align_of::()), + size = const(core::mem::size_of::()), + ); + } + + Self { shared } + } + + #[cfg(any(target_os = "cuda", doc))] + #[doc(cfg(target_os = "cuda"))] + #[must_use] + pub fn as_mut_ptr(&self) -> *mut T { + self.shared + } +} + unsafe impl RustToCuda for ThreadBlockShared { #[cfg(feature = "host")] #[doc(cfg(feature = "host"))] @@ -73,40 +108,3 @@ unsafe impl CudaAsRust ThreadBlockShared::new_uninit() } } - -#[cfg(not(any(all(not(feature = "host"), target_os = "cuda"), doc)))] -#[doc(cfg(not(all(not(feature = "host"), target_os = "cuda"))))] -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - Self { - marker: PhantomData::, - } - } -} - -#[cfg(any(all(not(feature = "host"), target_os = "cuda"), doc))] -#[doc(cfg(all(not(feature = "host"), target_os = "cuda")))] -impl ThreadBlockShared { - #[must_use] - pub fn new_uninit() -> Self { - let shared: *mut T; - - unsafe { - core::arch::asm!( - ".shared .align {align} .b8 {reg}_rust_cuda_static_shared[{size}];", - "cvta.shared.u64 {reg}, {reg}_rust_cuda_static_shared;", - reg = out(reg64) shared, - align = const(core::mem::align_of::()), - size = const(core::mem::size_of::()), - ); - } - - Self { shared } - } - - #[must_use] - pub fn as_mut_ptr(&self) -> *mut T { - self.shared - } -}