Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added interoperability with arrow-schema #1442

Merged
merged 1 commit into from
Mar 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ ahash = "0.8"

# Support conversion to/from arrow-rs
arrow-buffer = { version = "35.0.0", optional = true }
arrow-schema = { version = "35.0.0", optional = true }

[target.wasm32-unknown-unknown.dependencies]
getrandom = { version = "0.2", features = ["js"] }
Expand Down Expand Up @@ -158,7 +159,7 @@ full = [
# parses timezones used in timestamp conversions
"chrono-tz",
]
arrow = ["arrow-buffer"]
arrow = ["arrow-buffer", "arrow-schema"]
io_odbc = ["odbc-api"]
io_csv = ["io_csv_read", "io_csv_write"]
io_csv_async = ["io_csv_read_async"]
Expand Down
21 changes: 21 additions & 0 deletions src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,24 @@ impl Field {
&self.data_type
}
}

#[cfg(feature = "arrow")]
impl From<Field> for arrow_schema::Field {
fn from(value: Field) -> Self {
Self::new(value.name, value.data_type.into(), value.is_nullable)
.with_metadata(value.metadata.into_iter().collect())
}
}

#[cfg(feature = "arrow")]
impl From<arrow_schema::Field> for Field {
fn from(value: arrow_schema::Field) -> Self {
let data_type = value.data_type().clone().into();
let metadata = value
.metadata()
.iter()
.map(|(k, v)| (k.clone(), v.clone()))
.collect();
Self::new(value.name(), data_type, value.is_nullable()).with_metadata(metadata)
}
}
186 changes: 186 additions & 0 deletions src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,126 @@ pub enum DataType {
Extension(String, Box<DataType>, Option<String>),
}

#[cfg(feature = "arrow")]
impl From<DataType> for arrow_schema::DataType {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This conversion uses truncating as _ casts, as provided the type is valid these cannot overflow / underflow. I'm not sure having a sensible behaviour for things like negative size FixedSizeList is necessary, garbage in garbage out was my thoughts

fn from(value: DataType) -> Self {
match value {
DataType::Null => Self::Null,
DataType::Boolean => Self::Boolean,
DataType::Int8 => Self::Int8,
DataType::Int16 => Self::Int16,
DataType::Int32 => Self::Int32,
DataType::Int64 => Self::Int64,
DataType::UInt8 => Self::UInt8,
DataType::UInt16 => Self::UInt16,
DataType::UInt32 => Self::UInt32,
DataType::UInt64 => Self::UInt64,
DataType::Float16 => Self::Float16,
DataType::Float32 => Self::Float32,
DataType::Float64 => Self::Float64,
DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz),
DataType::Date32 => Self::Date32,
DataType::Date64 => Self::Date64,
DataType::Time32(unit) => Self::Time32(unit.into()),
DataType::Time64(unit) => Self::Time64(unit.into()),
DataType::Duration(unit) => Self::Duration(unit.into()),
DataType::Interval(unit) => Self::Interval(unit.into()),
DataType::Binary => Self::Binary,
DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(size as _),
DataType::LargeBinary => Self::LargeBinary,
DataType::Utf8 => Self::Utf8,
DataType::LargeUtf8 => Self::LargeUtf8,
DataType::List(f) => Self::List(Box::new((*f).into())),
DataType::FixedSizeList(f, size) => {
Self::FixedSizeList(Box::new((*f).into()), size as _)
}
DataType::LargeList(f) => Self::LargeList(Box::new((*f).into())),
DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()),
DataType::Union(fields, Some(ids), mode) => {
let ids = ids.into_iter().map(|x| x as _).collect();
let fields = fields.into_iter().map(Into::into).collect();
Self::Union(fields, ids, mode.into())
}
DataType::Union(fields, None, mode) => {
let ids = (0..fields.len() as i8).collect();
let fields = fields.into_iter().map(Into::into).collect();
Self::Union(fields, ids, mode.into())
}
DataType::Map(f, ordered) => Self::Map(Box::new((*f).into()), ordered),
DataType::Dictionary(key, value, _) => Self::Dictionary(
Box::new(DataType::from(key).into()),
Box::new((*value).into()),
),
DataType::Decimal(precision, scale) => Self::Decimal128(precision as _, scale as _),
DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _),
DataType::Extension(_, d, _) => (*d).into(),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious why arrow2 chose to represent extension types as an explicit data type, as opposed to just field metadata?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is so that the Array has the logical type on it. It allows the user of a Box<dyn Array> to use .data_type() to perform the downcast and have the necessary information to build the extension.

For example, polars uses it to store arbitrary Python objects on the type. In theory this could be kept in the Field's metadata.

Pyarrow does the same: https://arrow.apache.org/docs/python/generated/pyarrow.ExtensionType.html

}
}
}

#[cfg(feature = "arrow")]
impl From<arrow_schema::DataType> for DataType {
fn from(value: arrow_schema::DataType) -> Self {
use arrow_schema::DataType;
match value {
DataType::Null => Self::Null,
DataType::Boolean => Self::Boolean,
DataType::Int8 => Self::Int8,
DataType::Int16 => Self::Int16,
DataType::Int32 => Self::Int32,
DataType::Int64 => Self::Int64,
DataType::UInt8 => Self::UInt8,
DataType::UInt16 => Self::UInt16,
DataType::UInt32 => Self::UInt32,
DataType::UInt64 => Self::UInt64,
DataType::Float16 => Self::Float16,
DataType::Float32 => Self::Float32,
DataType::Float64 => Self::Float64,
DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz),
DataType::Date32 => Self::Date32,
DataType::Date64 => Self::Date64,
DataType::Time32(unit) => Self::Time32(unit.into()),
DataType::Time64(unit) => Self::Time64(unit.into()),
DataType::Duration(unit) => Self::Duration(unit.into()),
DataType::Interval(unit) => Self::Interval(unit.into()),
DataType::Binary => Self::Binary,
DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(size as _),
DataType::LargeBinary => Self::LargeBinary,
DataType::Utf8 => Self::Utf8,
DataType::LargeUtf8 => Self::LargeUtf8,
DataType::List(f) => Self::List(Box::new((*f).into())),
DataType::FixedSizeList(f, size) => {
Self::FixedSizeList(Box::new((*f).into()), size as _)
}
DataType::LargeList(f) => Self::LargeList(Box::new((*f).into())),
DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()),
DataType::Union(fields, ids, mode) => {
let ids = ids.into_iter().map(|x| x as _).collect();
let fields = fields.into_iter().map(Into::into).collect();
Self::Union(fields, Some(ids), mode.into())
}
DataType::Map(f, ordered) => Self::Map(Box::new((*f).into()), ordered),
DataType::Dictionary(key, value) => {
let key = match *key {
DataType::Int8 => IntegerType::Int8,
DataType::Int16 => IntegerType::Int16,
DataType::Int32 => IntegerType::Int32,
DataType::Int64 => IntegerType::Int64,
DataType::UInt8 => IntegerType::UInt8,
DataType::UInt16 => IntegerType::UInt16,
DataType::UInt32 => IntegerType::UInt32,
DataType::UInt64 => IntegerType::UInt64,
d => panic!("illegal dictionary key type: {d}"),
};
Self::Dictionary(key, Box::new((*value).into()), false)
}
DataType::Decimal128(precision, scale) => Self::Decimal(precision as _, scale as _),
DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _),
DataType::RunEndEncoded(_, _) => panic!("Run-end encoding not supported by arrow2"),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could instead implement TryFrom, but it seemed a touch excessive for a single error case

}
}
}

/// Mode of [`DataType::Union`]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Serialize, Deserialize))]
Expand All @@ -170,6 +290,26 @@ pub enum UnionMode {
Sparse,
}

#[cfg(feature = "arrow")]
impl From<UnionMode> for arrow_schema::UnionMode {
fn from(value: UnionMode) -> Self {
match value {
UnionMode::Dense => Self::Dense,
UnionMode::Sparse => Self::Sparse,
}
}
}

#[cfg(feature = "arrow")]
impl From<arrow_schema::UnionMode> for UnionMode {
fn from(value: arrow_schema::UnionMode) -> Self {
match value {
arrow_schema::UnionMode::Dense => Self::Dense,
arrow_schema::UnionMode::Sparse => Self::Sparse,
}
}
}

impl UnionMode {
/// Constructs a [`UnionMode::Sparse`] if the input bool is true,
/// or otherwise constructs a [`UnionMode::Dense`]
Expand Down Expand Up @@ -206,6 +346,30 @@ pub enum TimeUnit {
Nanosecond,
}

#[cfg(feature = "arrow")]
impl From<TimeUnit> for arrow_schema::TimeUnit {
fn from(value: TimeUnit) -> Self {
match value {
TimeUnit::Nanosecond => Self::Nanosecond,
TimeUnit::Millisecond => Self::Millisecond,
TimeUnit::Microsecond => Self::Microsecond,
TimeUnit::Second => Self::Second,
}
}
}

#[cfg(feature = "arrow")]
impl From<arrow_schema::TimeUnit> for TimeUnit {
fn from(value: arrow_schema::TimeUnit) -> Self {
match value {
arrow_schema::TimeUnit::Nanosecond => Self::Nanosecond,
arrow_schema::TimeUnit::Millisecond => Self::Millisecond,
arrow_schema::TimeUnit::Microsecond => Self::Microsecond,
arrow_schema::TimeUnit::Second => Self::Second,
}
}
}

/// Interval units defined in Arrow
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde_types", derive(Serialize, Deserialize))]
Expand All @@ -219,6 +383,28 @@ pub enum IntervalUnit {
MonthDayNano,
}

#[cfg(feature = "arrow")]
impl From<IntervalUnit> for arrow_schema::IntervalUnit {
fn from(value: IntervalUnit) -> Self {
match value {
IntervalUnit::YearMonth => Self::YearMonth,
IntervalUnit::DayTime => Self::DayTime,
IntervalUnit::MonthDayNano => Self::MonthDayNano,
}
}
}

#[cfg(feature = "arrow")]
impl From<arrow_schema::IntervalUnit> for IntervalUnit {
fn from(value: arrow_schema::IntervalUnit) -> Self {
match value {
arrow_schema::IntervalUnit::YearMonth => Self::YearMonth,
arrow_schema::IntervalUnit::DayTime => Self::DayTime,
arrow_schema::IntervalUnit::MonthDayNano => Self::MonthDayNano,
}
}
}

impl DataType {
/// the [`PhysicalType`] of this [`DataType`].
pub fn to_physical_type(&self) -> PhysicalType {
Expand Down