diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index fa336d2dc8c..a268d010957 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -56,7 +56,7 @@ impl BinaryArray { assert_eq!(offsets.len() - 1, validity.len()); } - if data_type != Self::default_data_type() { + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary") } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index d9aecc80acc..8c1207e3902 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -1,4 +1,7 @@ -use crate::{bitmap::Bitmap, datatypes::DataType}; +use crate::{ + bitmap::Bitmap, + datatypes::{DataType, PhysicalType}, +}; use super::{display_fmt, Array}; @@ -40,6 +43,9 @@ impl BooleanArray { if let Some(ref validity) = validity { assert_eq!(values.len(), validity.len()); } + if data_type.to_physical_type() != PhysicalType::Boolean { + panic!("BooleanArray can only be initialized with DataType::Boolean") + } Self { data_type, values, diff --git a/src/array/boolean/mutable.rs b/src/array/boolean/mutable.rs index 4be152a3515..8ab6d01faa1 100644 --- a/src/array/boolean/mutable.rs +++ b/src/array/boolean/mutable.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use crate::{ array::{Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, - datatypes::DataType, + datatypes::{DataType, PhysicalType}, error::Result, trusted_len::TrustedLen, }; @@ -74,6 +74,9 @@ impl MutableBooleanArray { values: MutableBitmap, validity: Option, ) -> Self { + if data_type.to_physical_type() != PhysicalType::Boolean { + panic!("MutableBooleanArray can only be initialized with DataType::Boolean") + } Self { data_type, values, diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 71a9a687513..2a91db09ce6 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -40,7 +40,7 @@ impl 
FixedSizeBinaryArray { Self { size, - data_type: DataType::FixedSizeBinary(size), + data_type, values, validity, offset: 0, @@ -97,10 +97,9 @@ impl FixedSizeBinaryArray { impl FixedSizeBinaryArray { pub(crate) fn get_size(data_type: &DataType) -> &i32 { - if let DataType::FixedSizeBinary(size) = data_type { - size - } else { - panic!("Wrong DataType") + match data_type { + DataType::FixedSizeBinary(size) => size, + _ => panic!("Wrong DataType"), } } } diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 4147de5b6da..6b5e1e4e7b1 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -26,13 +26,18 @@ pub struct FixedSizeListArray { impl FixedSizeListArray { /// Returns a new empty [`FixedSizeListArray`]. pub fn new_empty(data_type: DataType) -> Self { - let values = new_empty_array(Self::get_child_and_size(&data_type).0.clone()).into(); + let values = + new_empty_array(Self::get_child_and_size(&data_type).0.data_type().clone()).into(); Self::from_data(data_type, values, None) } /// Returns a new null [`FixedSizeListArray`]. 
pub fn new_null(data_type: DataType, length: usize) -> Self { - let values = new_null_array(Self::get_child_and_size(&data_type).0.clone(), length).into(); + let values = new_null_array( + Self::get_child_and_size(&data_type).0.data_type().clone(), + length, + ) + .into(); Self::from_data(data_type, values, Some(Bitmap::new_zeroed(length))) } @@ -88,11 +93,10 @@ impl FixedSizeListArray { } impl FixedSizeListArray { - pub(crate) fn get_child_and_size(data_type: &DataType) -> (&DataType, &i32) { - if let DataType::FixedSizeList(field, size) = data_type { - (field.data_type(), size) - } else { - panic!("Wrong DataType") + pub(crate) fn get_child_and_size(data_type: &DataType) -> (&Field, &i32) { + match data_type { + DataType::FixedSizeList(child, size) => (child.as_ref(), size), + _ => panic!("Wrong DataType"), } } diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 25993dfdb4a..4a67b90a02f 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -96,9 +96,10 @@ pub fn make_growable<'a>( let data_type = arrays[0].data_type(); assert!(arrays.iter().all(|&item| item.data_type() == data_type)); - match data_type { - DataType::Null => Box::new(null::GrowableNull::new(data_type.clone())), - DataType::Boolean => { + use PhysicalType::*; + match data_type.to_physical_type() { + Null => Box::new(null::GrowableNull::new(data_type.clone())), + Boolean => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -109,33 +110,19 @@ pub fn make_growable<'a>( capacity, )) } - DataType::Int8 => dyn_growable!(i8, arrays, use_validity, capacity), - DataType::Int16 => dyn_growable!(i16, arrays, use_validity, capacity), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - dyn_growable!(i32, arrays, use_validity, capacity) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - 
dyn_growable!(i64, arrays, use_validity, capacity) - } - DataType::Interval(IntervalUnit::DayTime) => { - dyn_growable!(days_ms, arrays, use_validity, capacity) - } - DataType::Decimal(_, _) => dyn_growable!(i128, arrays, use_validity, capacity), - DataType::UInt8 => dyn_growable!(u8, arrays, use_validity, capacity), - DataType::UInt16 => dyn_growable!(u16, arrays, use_validity, capacity), - DataType::UInt32 => dyn_growable!(u32, arrays, use_validity, capacity), - DataType::UInt64 => dyn_growable!(u64, arrays, use_validity, capacity), - DataType::Float16 => unreachable!(), - DataType::Float32 => dyn_growable!(f32, arrays, use_validity, capacity), - DataType::Float64 => dyn_growable!(f64, arrays, use_validity, capacity), - DataType::Utf8 => { + Int8 => dyn_growable!(i8, arrays, use_validity, capacity), + Int16 => dyn_growable!(i16, arrays, use_validity, capacity), + Int32 => dyn_growable!(i32, arrays, use_validity, capacity), + Int64 => dyn_growable!(i64, arrays, use_validity, capacity), + Int128 => dyn_growable!(i128, arrays, use_validity, capacity), + DaysMs => dyn_growable!(days_ms, arrays, use_validity, capacity), + UInt8 => dyn_growable!(u8, arrays, use_validity, capacity), + UInt16 => dyn_growable!(u16, arrays, use_validity, capacity), + UInt32 => dyn_growable!(u32, arrays, use_validity, capacity), + UInt64 => dyn_growable!(u64, arrays, use_validity, capacity), + Float32 => dyn_growable!(f32, arrays, use_validity, capacity), + Float64 => dyn_growable!(f64, arrays, use_validity, capacity), + Utf8 => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -146,7 +133,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::LargeUtf8 => { + LargeUtf8 => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -157,7 +144,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::Binary => { + Binary => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -168,7 +155,7 @@ 
pub fn make_growable<'a>( capacity, )) } - DataType::LargeBinary => { + LargeBinary => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -179,7 +166,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::FixedSizeBinary(_) => { + FixedSizeBinary => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -190,8 +177,7 @@ pub fn make_growable<'a>( capacity, )) } - - DataType::List(_) => { + List => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -202,7 +188,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::LargeList(_) => { + LargeList => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -213,7 +199,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::Struct(_) => { + Struct => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -224,10 +210,10 @@ pub fn make_growable<'a>( capacity, )) } - DataType::FixedSizeList(_, _) => todo!(), - DataType::Union(_, _, _) => todo!(), - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + FixedSizeList => todo!(), + Union => todo!(), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { dyn_dict_growable!($T, arrays, use_validity, capacity) }) } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 8e767dda0a8..95808e2c191 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -133,15 +133,15 @@ impl ListArray { #[inline] pub fn get_child_field(data_type: &DataType) -> &Field { if O::is_large() { - if let DataType::LargeList(child) = data_type { - child.as_ref() - } else { - panic!("Wrong DataType") + match data_type { + DataType::LargeList(child) => child.as_ref(), + _ => panic!("Wrong DataType"), } - } else if let DataType::List(child) = data_type { - child.as_ref() } else { - panic!("Wrong DataType") + match data_type { + 
DataType::List(child) => child.as_ref(), + _ => panic!("Wrong DataType"), + } } } diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 488df1097d5..a18ee3bdd56 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -24,7 +24,7 @@ use super::{ffi::ToFfi, new_empty_array, new_null_array, Array, FromFfi}; /// Field::new("c", DataType::Int32, false), /// ]; /// -/// let array = StructArray::from_data(fields, vec![boolean, int], None); +/// let array = StructArray::from_data(DataType::Struct(fields), vec![boolean, int], None); /// ``` #[derive(Debug, Clone)] pub struct StructArray { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 2e022b51439..a5cb62b57a2 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -72,7 +72,7 @@ impl Utf8Array { assert_eq!(offsets.len() - 1, validity.len()); } - if data_type != Self::default_data_type() { + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") } @@ -105,6 +105,10 @@ impl Utf8Array { ) -> Self { check_offsets(&offsets, values.len()); + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") + } + Self { data_type, offsets, diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index fc7eb61c898..80e1e292ae6 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -46,11 +46,7 @@ impl MutableUtf8Array { let mut offsets = MutableBuffer::::new(); offsets.push(O::default()); Self { - data_type: if O::is_large() { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }, + data_type: Self::default_data_type(), offsets, values: MutableBuffer::::new(), validity: None, @@ -73,10 +69,8 @@ impl MutableUtf8Array { if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } - if O::is_large() { - 
assert_eq!(data_type, DataType::LargeUtf8) - } else { - assert_eq!(data_type, DataType::Utf8) + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") } Self { data_type, @@ -103,10 +97,8 @@ impl MutableUtf8Array { if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } - if O::is_large() { - assert_eq!(data_type, DataType::LargeUtf8) - } else { - assert_eq!(data_type, DataType::Utf8) + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") } Self { data_type, @@ -117,11 +109,7 @@ impl MutableUtf8Array { } fn default_data_type() -> DataType { - if O::is_large() { - DataType::LargeUtf8 - } else { - DataType::Utf8 - } + Utf8Array::::default_data_type() } /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots. diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index c955b274b23..01e5d03c2cb 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -4,7 +4,7 @@ mod physical_type; mod schema; pub use field::Field; -pub(crate) use physical_type::*; +pub use physical_type::*; pub use schema::Schema; /// The set of datatypes that are supported by this implementation of Apache Arrow. @@ -172,8 +172,8 @@ impl DataType { } } - /// Returns the physical type of the logical type - pub(crate) fn to_physical_type(&self) -> PhysicalType { + /// Returns the [`PhysicalType`] of this [`DataType`]. + pub fn to_physical_type(&self) -> PhysicalType { use DataType::*; match self { Null => PhysicalType::Null, diff --git a/src/datatypes/physical_type.rs b/src/datatypes/physical_type.rs index 701bf237018..6c5f47fa7b8 100644 --- a/src/datatypes/physical_type.rs +++ b/src/datatypes/physical_type.rs @@ -1,5 +1,4 @@ -/// Represents a physical type: a unique in-memory representation. 
-/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`]. +/// The set of valid indices used to index a dictionary-encoded array. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum DictionaryIndexType { /// A signed 8-bit integer. @@ -20,13 +19,14 @@ pub enum DictionaryIndexType { UInt64, } -/// Represents a physical type: a unique in-memory representation of an Arrow array. +/// The set of physical types: unique in-memory representations of an Arrow array. /// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and -/// a one-to-one mapping with all structs in this crate that implement [`crate::array::Array`]. +/// a one-to-one mapping with each struct in this crate that implements [`crate::array::Array`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum PhysicalType { + /// A Null with no allocation. Null, - /// A boolean datatype representing the values `true` and `false`. + /// A boolean represented as a single bit. Boolean, /// A signed 8-bit integer. Int8, @@ -55,7 +55,6 @@ pub enum PhysicalType { /// Opaque binary data of variable length. Binary, /// Opaque binary data of fixed size. - /// Enum parameter specifies the number of bytes per value. FixedSizeBinary, /// Opaque binary data of variable length and 64-bit offsets. LargeBinary, @@ -63,26 +62,16 @@ pub enum PhysicalType { Utf8, /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. LargeUtf8, - /// A list of some logical data type with variable length. + /// A list of some data type with variable length. List, - /// A list of some logical data type with fixed length. + /// A list of some data type with fixed length. FixedSizeList, - /// A list of some logical data type with variable length and 64-bit offsets. + /// A list of some data type with variable length and 64-bit offsets. LargeList, - /// A nested datatype that contains a number of sub-fields. 
+ /// A nested type that contains an arbitrary number of fields. Struct, - /// A nested datatype that can represent slots of differing types. - /// Third argument represents sparsness + /// A nested type that represents slots of differing types. Union, - /// A dictionary encoded array (`key_type`, `value_type`), where - /// each array element is an index of `key_type` into an - /// associated dictionary of `value_type`. - /// - /// Dictionary arrays are used to store columns of `value_type` - /// that contain many repeated values using less memory, but with - /// a higher CPU overhead for some operations. - /// - /// This type mostly used to represent low cardinality string - /// arrays or a limited set of primitive types as integers. + /// A dictionary-encoded array whose keys are of type `DictionaryIndexType`. Dictionary(DictionaryIndexType), } diff --git a/src/io/ipc/read/array/fixed_size_list.rs b/src/io/ipc/read/array/fixed_size_list.rs index 8fb9b45cbcd..3527665fcc0 100644 --- a/src/io/ipc/read/array/fixed_size_list.rs +++ b/src/io/ipc/read/array/fixed_size_list.rs @@ -37,7 +37,7 @@ pub fn read_fixed_size_list( let values = read( field_nodes, - value_data_type.clone(), + value_data_type.data_type().clone(), buffers, reader, block_offset, @@ -57,7 +57,7 @@ pub fn skip_fixed_size_list( let _ = buffers.pop_front().unwrap(); - let (data_type, _) = FixedSizeListArray::get_child_and_size(data_type); + let (field, _) = FixedSizeListArray::get_child_and_size(data_type); - skip(field_nodes, data_type, buffers) + skip(field_nodes, field.data_type(), buffers) } diff --git a/src/types/mod.rs b/src/types/mod.rs index 31b2996aadc..fe030d73424 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -31,12 +31,11 @@ pub unsafe trait Relation { } macro_rules! 
create_relation { - ($native_ty:ty, $($impl_pattern:pat)|+) => { + ($native_ty:ty, $physical_ty:expr) => { unsafe impl Relation for $native_ty { #[inline] fn is_valid(data_type: &DataType) -> bool { - let physical_type = data_type.to_physical_type(); - matches!(physical_type, $($impl_pattern)|+) + data_type.to_physical_type() == $physical_ty } } };