diff --git a/examples/single-source/src/main.rs b/examples/single-source/src/main.rs index cac35273..b8acc2c3 100644 --- a/examples/single-source/src/main.rs +++ b/examples/single-source/src/main.rs @@ -15,6 +15,7 @@ extern crate alloc; +#[cfg(target_os = "cuda")] use rc::utils::shared::r#static::ThreadBlockShared; #[cfg(not(target_os = "cuda"))] @@ -50,23 +51,25 @@ pub fn kernel<'a, T: rc::common::RustToCuda>( #[kernel(pass = LendRustToCuda)] _z: &ShallowCopy>, #[kernel(pass = SafeDeviceCopy, jit)] _v @ _w: &'a core::sync::atomic::AtomicU64, #[kernel(pass = LendRustToCuda)] _: Wrapper, - #[kernel(pass = SafeDeviceCopy)] Tuple(_s, mut __t): Tuple, - #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, + #[kernel(pass = SafeDeviceCopy)] Tuple(s, mut __t): Tuple, + // #[kernel(pass = LendRustToCuda)] shared3: ThreadBlockShared, ) where ::CudaRepresentation: rc::safety::StackOnly, { let shared: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); let shared2: ThreadBlockShared<[Tuple; 3]> = ThreadBlockShared::new_uninit(); + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] unsafe { - (*shared.as_mut_ptr().cast::().add(1)).0 = 42; + (*shared.as_mut_ptr().cast::().add(1)).0 = (f64::from(s) * 2.0) as u32; } unsafe { (*shared2.as_mut_ptr().cast::().add(2)).1 = 24; } - unsafe { - *shared3.as_mut_ptr() = 12; - } + unsafe { core::arch::asm!("hi") } + // unsafe { + // *shared3.as_mut_ptr() = 12; + // } } #[cfg(not(target_os = "cuda"))] diff --git a/rust-cuda-derive/Cargo.toml b/rust-cuda-derive/Cargo.toml index dee5afe1..ef7b5441 100644 --- a/rust-cuda-derive/Cargo.toml +++ b/rust-cuda-derive/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Juniper Tyree "] license = "MIT OR Apache-2.0" edition = "2021" +links = "libnvptxcompiler_static" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -24,3 +25,4 @@ colored = "2.0" seahash = "4.1" ptx-builder = { git = "https://github.com/juntyr/rust-ptx-builder", rev = "1f1f49d" } +ptx_compiler = "0.1" diff --git a/rust-cuda-derive/build.rs b/rust-cuda-derive/build.rs new file mode 100644 index 00000000..27d940ad --- /dev/null +++ b/rust-cuda-derive/build.rs @@ -0,0 +1,3 @@ +fn main() { + println!("cargo:rustc-link-lib=nvptxcompiler_static"); +} diff --git a/rust-cuda-derive/src/kernel/link/config.rs b/rust-cuda-derive/src/kernel/link/config.rs index cdfd0b57..bb5f011d 100644 --- a/rust-cuda-derive/src/kernel/link/config.rs +++ b/rust-cuda-derive/src/kernel/link/config.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; #[allow(clippy::module_name_repetitions)] pub(super) struct LinkKernelConfig { pub(super) kernel: syn::Ident, + pub(super) kernel_hash: syn::Ident, pub(super) args: syn::Ident, pub(super) crate_name: String, pub(super) crate_path: PathBuf, @@ -12,6 +13,7 @@ pub(super) struct LinkKernelConfig { impl syn::parse::Parse for LinkKernelConfig { fn parse(input: syn::parse::ParseStream) -> syn::Result { let kernel: syn::Ident = input.parse()?; + let kernel_hash: syn::Ident = input.parse()?; let args: syn::Ident = input.parse()?; let name: syn::LitStr = input.parse()?; let path: syn::LitStr = input.parse()?; @@ -37,6 +39,7 @@ impl syn::parse::Parse for LinkKernelConfig { Ok(Self { kernel, + kernel_hash, args, crate_name: name.value(), crate_path: PathBuf::from(path.value()), diff --git a/rust-cuda-derive/src/kernel/link/mod.rs b/rust-cuda-derive/src/kernel/link/mod.rs index 8f544967..6ce73d84 100644 --- a/rust-cuda-derive/src/kernel/link/mod.rs +++ b/rust-cuda-derive/src/kernel/link/mod.rs @@ -1,7 +1,12 @@ use std::{ - env, fs, + env, + ffi::CString, + fs, io::{Read, Write}, + mem::MaybeUninit, + os::raw::c_int, path::{Path, PathBuf}, + ptr::addr_of_mut, sync::atomic::{AtomicBool, Ordering}, }; @@ -11,6 +16,7 @@ use ptx_builder::{ builder::{BuildStatus, Builder, MessageFormat, Profile}, error::{BuildErrorKind, Error, Result}, }; +use ptx_compiler::sys::size_t; use super::utils::skip_kernel_compilation; @@ -56,6 +62,7 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { let LinkKernelConfig { kernel, + kernel_hash, args, crate_name, crate_path, @@ -192,6 +199,119 @@ pub fn link_kernel(tokens: TokenStream) -> TokenStream { kernel_ptx.replace_range(type_layout_start..type_layout_end, ""); } + let mut compiler = MaybeUninit::uninit(); + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerCreate( + compiler.as_mut_ptr(), + kernel_ptx.len() as size_t, + kernel_ptx.as_ptr().cast(), + ) + }; + emit_call_site_warning!("PTX compiler create result {}", r); + let compiler = unsafe { compiler.assume_init() }; + + let mut major = 0; + let mut minor = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetVersion(addr_of_mut!(major), addr_of_mut!(minor)) + }; + emit_call_site_warning!("PTX version result {}", r); + emit_call_site_warning!("PTX compiler version {}.{}", major, minor); + + let kernel_name = if specialisation.is_empty() { + format!("{kernel_hash}_kernel") + } else { + format!( + "{kernel_hash}_kernel_{:016x}", + seahash::hash(specialisation.as_bytes()) + ) + }; + + let options = vec![ + CString::new("--entry").unwrap(), + CString::new(kernel_name).unwrap(), + CString::new("--verbose").unwrap(), + CString::new("--warn-on-double-precision-use").unwrap(), + CString::new("--warn-on-local-memory-usage").unwrap(), + CString::new("--warn-on-spills").unwrap(), + ]; + let options_ptrs = options.iter().map(|o| o.as_ptr()).collect::>(); + + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerCompile( + compiler, + options_ptrs.len() as c_int, + options_ptrs.as_ptr().cast(), + ) + }; + emit_call_site_warning!("PTX compile result {}", r); + + let mut info_log_size = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetInfoLogSize(compiler, addr_of_mut!(info_log_size)) + }; + emit_call_site_warning!("PTX info log size result {}", r); + #[allow(clippy::cast_possible_truncation)] + let mut info_log: Vec = Vec::with_capacity(info_log_size as usize); + if info_log_size > 0 { + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetInfoLog(compiler, info_log.as_mut_ptr().cast()) + }; + emit_call_site_warning!("PTX info log content result {}", r); + #[allow(clippy::cast_possible_truncation)] + unsafe { + info_log.set_len(info_log_size as usize); + } + } + let info_log = String::from_utf8_lossy(&info_log); + + let mut error_log_size = 0; + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetErrorLogSize(compiler, addr_of_mut!(error_log_size)) + }; + emit_call_site_warning!("PTX error log size result {}", r); + #[allow(clippy::cast_possible_truncation)] + let mut error_log: Vec = Vec::with_capacity(error_log_size as usize); + if error_log_size > 0 { + let r = unsafe { + ptx_compiler::sys::nvPTXCompilerGetErrorLog(compiler, error_log.as_mut_ptr().cast()) + }; + emit_call_site_warning!("PTX error log content result {}", r); + #[allow(clippy::cast_possible_truncation)] + unsafe { + error_log.set_len(error_log_size as usize); + } + } + let error_log = String::from_utf8_lossy(&error_log); + + // Ensure the compiler is not dropped + let mut compiler = MaybeUninit::new(compiler); + let r = unsafe { ptx_compiler::sys::nvPTXCompilerDestroy(compiler.as_mut_ptr()) }; + emit_call_site_warning!("PTX compiler destroy result {}", r); + + if !info_log.is_empty() { + emit_call_site_warning!("PTX compiler info log:\n{}", info_log); + } + if !error_log.is_empty() { + let mut max_lines = kernel_ptx.chars().filter(|c| *c == '\n').count() + 1; + let mut indent = 0; + while max_lines > 0 { + max_lines /= 10; + indent += 1; + } + + abort_call_site!( + "PTX compiler error log:\n{}\nPTX source:\n{}", + error_log, + kernel_ptx + .lines() + .enumerate() + .map(|(i, l)| format!("{:indent$}| {l}", i + 1)) + .collect::>() + .join("\n") + ); + } + (quote! { const PTX_STR: &'static str = #kernel_ptx; #(#type_layouts)* }).into() } diff --git a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs index 179ba7ee..d412bd31 100644 --- a/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs +++ b/rust-cuda-derive/src/kernel/wrapper/generate/cpu_linker_macro/get_ptx_str.rs @@ -84,7 +84,7 @@ pub(super) fn quote_get_ptx_str( quote! { fn get_ptx_str() -> &'static str { #crate_path::host::link_kernel!{ - #func_ident #args #crate_name #crate_manifest_dir #generic_start_token + #func_ident #func_ident_hash #args #crate_name #crate_manifest_dir #generic_start_token #($#macro_type_ids),* #generic_close_token } diff --git a/src/safety/device_copy.rs b/src/safety/device_copy.rs index a67a27e1..2acb17e1 100644 --- a/src/safety/device_copy.rs +++ b/src/safety/device_copy.rs @@ -19,4 +19,11 @@ mod sealed { for crate::utils::device_copy::SafeDeviceCopyWrapper { } + + // Only unsafe aliasing is possible since both only expose raw pointers + // impl SafeDeviceCopy for + // crate::utils::shared::r#static::ThreadBlockShared {} + // impl + // SafeDeviceCopy for crate::utils::shared::slice::ThreadBlockSharedSlice + // {} } diff --git a/src/safety/no_aliasing.rs b/src/safety/no_aliasing.rs index 22488efb..dbc163e5 100644 --- a/src/safety/no_aliasing.rs +++ b/src/safety/no_aliasing.rs @@ -22,4 +22,10 @@ mod private { { } impl NoAliasing for crate::utils::aliasing::SplitSliceOverCudaThreadsDynamicStride {} + + // Only unsafe aliasing is possible since both only expose raw pointers + // impl NoAliasing for + // crate::utils::shared::r#static::ThreadBlockShared {} + // impl NoAliasing + // for crate::utils::shared::slice::ThreadBlockSharedSlice {} }