diff --git a/Cargo.toml b/Cargo.toml index 13fbf2e05cf..2d7578f4e5c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ bench = false [dependencies] num-traits = "0.2" +bytemuck = { version = "1", features = ["derive"] } chrono = { version = "0.4", default_features = false, features = ["std"] } chrono-tz = { version = "0.6", optional = true } # To efficiently cast numbers to strings diff --git a/src/io/avro/mod.rs b/src/io/avro/mod.rs index aff28da0c6e..5f267abcb60 100644 --- a/src/io/avro/mod.rs +++ b/src/io/avro/mod.rs @@ -1,5 +1,4 @@ #![deny(missing_docs)] -#![forbid(unsafe_code)] //! Read and write from and to Apache Avro pub mod read; diff --git a/src/io/csv/mod.rs b/src/io/csv/mod.rs index 037e243b8a1..00b14185051 100644 --- a/src/io/csv/mod.rs +++ b/src/io/csv/mod.rs @@ -1,5 +1,4 @@ #![deny(missing_docs)] -#![forbid(unsafe_code)] //! Convert data between the Arrow and CSV (comma-separated values). use crate::error::ArrowError; diff --git a/src/io/ipc/read/read_basic.rs b/src/io/ipc/read/read_basic.rs index d442a82b665..d634e2f6e52 100644 --- a/src/io/ipc/read/read_basic.rs +++ b/src/io/ipc/read/read_basic.rs @@ -72,14 +72,8 @@ fn read_uncompressed_buffer( if is_native_little_endian() == is_little_endian { // fast case where we can just copy the contents as is - unsafe { - // transmute T to bytes. - let slice = std::slice::from_raw_parts_mut( - buffer.as_mut_ptr() as *mut u8, - length * std::mem::size_of::(), - ); - reader.read_exact(slice)?; - } + let slice = bytemuck::cast_slice_mut(&mut buffer); + reader.read_exact(slice)?; } else { read_swapped(reader, length, &mut buffer, is_little_endian)?; } @@ -108,14 +102,7 @@ fn read_compressed_buffer( let mut slice = vec![0u8; buffer_length]; reader.read_exact(&mut slice)?; - // Safety: - // This is safe because T is NativeType, which by definition can be transmuted to u8 - let out_slice = unsafe { - std::slice::from_raw_parts_mut( - buffer.as_mut_ptr() as *mut u8, - length * std::mem::size_of::(), - ) - }; + let out_slice = bytemuck::cast_slice_mut(&mut buffer); match compression.codec() { CompressionType::LZ4_FRAME => { diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 114583d21f6..aa6d3d31f26 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -765,12 +765,7 @@ fn _write_compressed_buffer_from_iter>( fn _write_buffer(buffer: &[T], arrow_data: &mut Vec, is_little_endian: bool) { if is_little_endian == is_native_little_endian() { // in native endianess we can use the bytes directly. - let buffer = unsafe { - std::slice::from_raw_parts( - buffer.as_ptr() as *const u8, - buffer.len() * std::mem::size_of::(), - ) - }; + let buffer = bytemuck::cast_slice(buffer); arrow_data.extend_from_slice(buffer); } else { _write_buffer_from_iter(buffer.iter().copied(), arrow_data, is_little_endian) @@ -784,12 +779,7 @@ fn _write_compressed_buffer( compression: Compression, ) { if is_little_endian == is_native_little_endian() { - let bytes = unsafe { - std::slice::from_raw_parts( - buffer.as_ptr() as *const u8, - buffer.len() * std::mem::size_of::(), - ) - }; + let bytes = bytemuck::cast_slice(buffer); arrow_data.extend_from_slice(&(bytes.len() as i64).to_le_bytes()); match compression { Compression::LZ4 => { diff --git a/src/io/json/mod.rs b/src/io/json/mod.rs index d7a1a862046..e52d8a9f45d 100644 --- a/src/io/json/mod.rs +++ b/src/io/json/mod.rs @@ -1,5 +1,4 @@ #![deny(missing_docs)] -#![forbid(unsafe_code)] //! Convert data between the Arrow memory format and JSON line-delimited records. pub mod read; diff --git a/src/io/mod.rs b/src/io/mod.rs index 7591b655200..cb4ff4bd00a 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -1,3 +1,4 @@ +#![forbid(unsafe_code)] //! Contains modules to interface with other formats such as [`csv`], //! [`parquet`], [`json`], [`ipc`], [`mod@print`] and [`avro`]. #[cfg(any( diff --git a/src/io/parquet/read/primitive/basic.rs b/src/io/parquet/read/primitive/basic.rs index 883ca314908..e661e0173cc 100644 --- a/src/io/parquet/read/primitive/basic.rs +++ b/src/io/parquet/read/primitive/basic.rs @@ -5,7 +5,7 @@ use parquet2::{ }; use super::super::utils as other_utils; -use super::utils::ExactChunksIter; +use super::utils::chunks; use super::ColumnDescriptor; use crate::{ bitmap::{utils::BitmapIter, MutableBitmap}, @@ -110,7 +110,7 @@ fn read_nullable( F: Fn(T) -> A, { let length = additional + values.len(); - let mut chunks = ExactChunksIter::::new(values_buffer); + let mut chunks = chunks(values_buffer); let validity_iterator = hybrid_rle::Decoder::new(validity_buffer, 1); @@ -153,7 +153,7 @@ where F: Fn(T) -> A, { assert_eq!(values_buffer.len(), additional * std::mem::size_of::()); - let iterator = ExactChunksIter::::new(values_buffer); + let iterator = chunks(values_buffer); let iterator = iterator.map(op); diff --git a/src/io/parquet/read/primitive/nested.rs b/src/io/parquet/read/primitive/nested.rs index 84303cf2056..67da7ffcd89 100644 --- a/src/io/parquet/read/primitive/nested.rs +++ b/src/io/parquet/read/primitive/nested.rs @@ -7,7 +7,7 @@ use parquet2::{ use super::super::nested_utils::extend_offsets; use super::ColumnDescriptor; -use super::{super::utils, utils::ExactChunksIter, Nested}; +use super::{super::utils, utils::chunks, Nested}; use crate::{ bitmap::MutableBitmap, error::Result, trusted_len::TrustedLen, types::NativeType as ArrowNativeType, @@ -66,7 +66,7 @@ fn read( A: ArrowNativeType, F: Fn(T) -> A, { - let new_values = ExactChunksIter::::new(values_buffer); + let new_values = chunks(values_buffer); let max_rep_level = rep_level_encoding.1 as u32; let max_def_level = def_level_encoding.1 as u32; diff --git a/src/io/parquet/read/primitive/utils.rs b/src/io/parquet/read/primitive/utils.rs index 8dce995a656..3215c21165b 100644 --- a/src/io/parquet/read/primitive/utils.rs +++ b/src/io/parquet/read/primitive/utils.rs @@ -1,44 +1,17 @@ -use crate::trusted_len::TrustedLen; - -use std::{convert::TryInto, hint::unreachable_unchecked}; +use std::convert::TryInto; use parquet2::types::NativeType; -pub struct ExactChunksIter<'a, T: NativeType> { - chunks: std::slice::ChunksExact<'a, u8>, - phantom: std::marker::PhantomData, -} - -impl<'a, T: NativeType> ExactChunksIter<'a, T> { - #[inline] - pub fn new(slice: &'a [u8]) -> Self { - assert_eq!(slice.len() % std::mem::size_of::(), 0); - let chunks = slice.chunks_exact(std::mem::size_of::()); - Self { - chunks, - phantom: std::marker::PhantomData, - } - } -} - -impl<'a, T: NativeType> Iterator for ExactChunksIter<'a, T> { - type Item = T; - - #[inline] - fn next(&mut self) -> Option { - self.chunks.next().map(|chunk| { - let chunk: ::Bytes = match chunk.try_into() { - Ok(v) => v, - Err(_) => unsafe { unreachable_unchecked() }, - }; - T::from_le_bytes(chunk) - }) - } +use crate::trusted_len::TrustedLen; - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.chunks.size_hint() - } +pub fn chunks(bytes: &[u8]) -> impl TrustedLen + '_ { + assert_eq!(bytes.len() % std::mem::size_of::(), 0); + let chunks = bytes.chunks_exact(std::mem::size_of::()); + chunks.map(|chunk| { + let chunk: ::Bytes = match chunk.try_into() { + Ok(v) => v, + Err(_) => unreachable!(), + }; + T::from_le_bytes(chunk) + }) } - -unsafe impl<'a, T: NativeType> TrustedLen for ExactChunksIter<'a, T> {} diff --git a/src/types/native.rs b/src/types/native.rs index bd3aed42827..5a3884275ed 100644 --- a/src/types/native.rs +++ b/src/types/native.rs @@ -1,6 +1,8 @@ use std::convert::TryFrom; use std::ops::Neg; +use bytemuck::{Pod, Zeroable}; + use super::PrimitiveType; /// Sealed trait implemented by all physical types that can be allocated, @@ -8,15 +10,14 @@ use super::PrimitiveType; /// All O(N) allocations in this crate are done for this trait alone. pub trait NativeType: super::private::Sealed + + Pod + Send + Sync + Sized - + Copy + std::fmt::Debug + std::fmt::Display + PartialEq + Default - + 'static { /// The corresponding variant of [`PrimitiveType`]. const PRIMITIVE: PrimitiveType; @@ -84,8 +85,9 @@ native_type!(f64, PrimitiveType::Float64); native_type!(i128, PrimitiveType::Int128); /// The in-memory representation of the DayMillisecond variant of arrow's "Interval" logical type. -#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash)] +#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash, Zeroable, Pod)] #[allow(non_camel_case_types)] +#[repr(C)] pub struct days_ms([i32; 2]); impl days_ms { @@ -176,7 +178,7 @@ impl NativeType for days_ms { } /// The in-memory representation of the MonthDayNano variant of the "Interval" logical type. -#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash)] +#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash, Zeroable, Pod)] #[allow(non_camel_case_types)] #[repr(C)] pub struct months_days_ns(i32, i32, i64);