Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Read parquet file with BinaryArray in ListArray panic #977

Closed
b41sh opened this issue May 5, 2022 · 1 comment
Closed

Read parquet file with BinaryArray in ListArray panic #977

b41sh opened this issue May 5, 2022 · 1 comment
Labels
bug Something isn't working no-changelog Issues whose changes are covered by a PR and thus should not be shown in the changelog

Comments

@b41sh
Copy link
Contributor

b41sh commented May 5, 2022

Reading a parquet file that contains a BinaryArray nested inside a ListArray panics. It seems that the offset into the data values is not handled properly: starting the read four bytes further forward yields the correct data.

sample code

use std::fs::File;
use std::sync::Arc;

use arrow2::{
    array::{Array, ListArray, BinaryArray},
    chunk::Chunk,
    datatypes::{DataType, Field, Schema},
    error::Result,
    io::parquet::read,
    io::parquet::write::{
        CompressionOptions, Encoding, FileWriter, RowGroupIterator, Version, WriteOptions,
    },
};

type LargeBinaryArray = BinaryArray<i64>;

/// Writes a single chunk of columns to a new Parquet file at `path`.
///
/// The file is written uncompressed, with statistics enabled, using the V2
/// format version and `Plain` encoding for the one column.
///
/// # Errors
/// Returns an error if the row-group iterator cannot be built, the file
/// cannot be created, or any write step fails.
fn write_batch(path: &str, schema: Schema, columns: Chunk<Arc<dyn Array>>) -> Result<()> {
    let write_options = WriteOptions {
        write_statistics: true,
        compression: CompressionOptions::Uncompressed,
        version: Version::V2,
    };
    let encodings = vec![Encoding::Plain];

    // The row groups are prepared before the file is created, so a failure
    // here leaves nothing on disk.
    let chunks = vec![Ok(columns)].into_iter();
    let row_groups = RowGroupIterator::try_new(chunks, &schema, write_options, encodings)?;

    // Create a new empty file
    let mut writer = FileWriter::try_new(File::create(path)?, schema, write_options)?;

    writer.start()?;
    row_groups.into_iter().try_for_each(|row_group| -> Result<()> {
        writer.write(row_group?)?;
        Ok(())
    })?;

    // The value returned by `end` is not needed by callers.
    writer.end(None).map(|_size| ())
}

/// Round-trips a `List<LargeBinary>` column through `test.parquet`.
///
/// Builds a list array holding `[["aaa", "bbb"], ["uuu", "vvv"], ["www"]]`,
/// writes it with [`write_batch`], then reads the file back and asserts that
/// every chunk is non-empty.
///
/// # Errors
/// Returns any error raised while writing, opening, or reading the file.
fn main() -> Result<()> {
    let data_field = Field::new(
        "test",
        DataType::List(Box::new(Field::new("list", DataType::LargeBinary, false))),
        false,
    );
    let schema = Schema::from(vec![data_field]);

    let vals = vec![vec!["aaa", "bbb"], vec!["uuu", "vvv"], vec!["www"]];

    // Offsets into the flattened byte buffer: a leading 0 plus one entry per string.
    let mut last_size: usize = 0;
    let mut offsets: Vec<i64> = Vec::with_capacity(6);
    let mut values: Vec<u8> = Vec::with_capacity(15);
    offsets.push(0);

    // Offsets into the flattened string array: a leading 0 plus one entry per list.
    let mut list_last_size: usize = 0;
    let mut list_offsets: Vec<i32> = Vec::with_capacity(vals.len() + 1);
    list_offsets.push(0);

    for val in vals {
        list_last_size += val.len();
        list_offsets.push(list_last_size as i32);
        for v in val {
            last_size += v.len();
            offsets.push(last_size as i64);
            values.extend_from_slice(v.as_bytes());
        }
    }

    let binary_array = Arc::new(LargeBinaryArray::from_data(
        DataType::LargeBinary,
        offsets.into(),
        values.into(),
        None,
    ));

    let field = Field::new("list".to_string(), DataType::LargeBinary, false);
    let data_type = DataType::List(Box::new(field));

    let list_array = Arc::new(ListArray::<i32>::from_data(
        data_type,
        list_offsets.into(),
        binary_array,
        None,
    ));

    let columns = Chunk::new(vec![list_array as Arc<dyn Array>]);

    let file_path = "test.parquet";
    // Propagate write failures: if they were ignored, a failed write would only
    // surface later as a misleading error while opening or reading the file.
    write_batch(file_path, schema, columns)?;

    let reader = File::open(file_path)?;
    let reader = read::FileReader::try_new(reader, None, None, None, None)?;

    for maybe_chunk in reader {
        let columns = maybe_chunk?;
        assert!(!columns.is_empty());
    }

    Ok(())
}

output

thread 'main' panicked at 'range end index 520292867 out of range for slice of length 35', /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/binary/utils.rs:112:23
stack backtrace:
   0:        0x104433d31 - std::backtrace_rs::backtrace::libunwind::trace::h72854132b122d638
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/../../backtrace/src/backtrace/libunwind.rs:93:5
   1:        0x104433d31 - std::backtrace_rs::backtrace::trace_unsynchronized::ha93153aa050311ab
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5
   2:        0x104433d31 - std::sys_common::backtrace::_print_fmt::h5b1eb3d7be0b24ab
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/sys_common/backtrace.rs:66:5
   3:        0x104433d31 - <std::sys_common::backtrace::_print::DisplayBacktrace as core::fmt::Display>::fmt::h0af9e61123fe45db
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/sys_common/backtrace.rs:45:22
   4:        0x10445214b - core::fmt::write::h2584048b445b9b8c
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/fmt/mod.rs:1190:17
   5:        0x104430ade - std::io::Write::write_fmt::h8e09e661881c2d10
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/io/mod.rs:1657:15
   6:        0x104435cb0 - std::sys_common::backtrace::_print::h8da79e38123fe8dc
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/sys_common/backtrace.rs:48:5
   7:        0x104435cb0 - std::sys_common::backtrace::print::h8c81a78009132972
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/sys_common/backtrace.rs:35:9
   8:        0x104435cb0 - std::panicking::default_hook::{{closure}}::hbd3431964909b1f1
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:295:22
   9:        0x104435994 - std::panicking::default_hook::h121f3ab9a5b3b08e
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:314:9
  10:        0x10443640e - std::panicking::rust_panic_with_hook::hb48cf4f9933cc169
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:698:17
  11:        0x104436133 - std::panicking::begin_panic_handler::{{closure}}::h424426f012771223
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:588:13
  12:        0x1044341c7 - std::sys_common::backtrace::__rust_end_short_backtrace::ha4959f8921e37983
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/sys_common/backtrace.rs:138:18
  13:        0x104435dfa - rust_begin_unwind
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:584:5
  14:        0x1044825a3 - core::panicking::panic_fmt::h592cc8ebeaa4ecb9
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/panicking.rs:143:14
  15:        0x104482776 - core::slice::index::slice_end_index_len_fail::hc8d3fcd01408ee5b
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/slice/index.rs:43:5
  16:        0x104309111 - <core::ops::range::Range<usize> as core::slice::index::SliceIndex<[T]>>::index::hae747d7708eb219c
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/slice/index.rs:245:13
  17:        0x104299c36 - <core::ops::range::RangeTo<usize> as core::slice::index::SliceIndex<[T]>>::index::hec0be2c9708b686b
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/slice/index.rs:291:9
  18:        0x1042bfa97 - core::slice::index::<impl core::ops::index::Index<I> for [T]>::index::h6a759b6beaf5ac46
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/slice/index.rs:15:9
  19:        0x103d3286f - <arrow2::io::parquet::read::deserialize::binary::utils::BinaryIter as core::iter::traits::iterator::Iterator>::next::haeee35db1016b494
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/binary/utils.rs:112:23
  20:        0x103d32a59 - <arrow2::io::parquet::read::deserialize::binary::utils::SizedBinaryIter as core::iter::traits::iterator::Iterator>::next::h2d51feea57ecc97b
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/binary/utils.rs:146:9
  21:        0x103d32ae4 - <&mut I as core::iter::traits::iterator::Iterator>::next::h0ef610d8aecbd044
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/traits/iterator.rs:3598:9
  22:        0x103eafe75 - <core::iter::adapters::take::Take<I> as core::iter::traits::iterator::Iterator>::next::h85223c4410768d9e
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/take.rs:37:13
  23:        0x103c94dfe - <arrow2::io::parquet::read::deserialize::binary::nested::BinaryDecoder<O> as arrow2::io::parquet::read::deserialize::utils::Decoder>::extend_from_state::he5c5d2e8b4742c35
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/binary/nested.rs:115:26
  24:        0x103c99dd3 - arrow2::io::parquet::read::deserialize::nested_utils::extend_from_new_page::h7ec07a88d6caf080
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/nested_utils.rs:341:5
  25:        0x103ca13b9 - arrow2::io::parquet::read::deserialize::nested_utils::next::hf4274a5ae719a2bb
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/nested_utils.rs:516:13
  26:        0x103c95c07 - <arrow2::io::parquet::read::deserialize::binary::nested::ArrayIterator<O,A,I> as core::iter::traits::iterator::Iterator>::next::h5253ce192dab2788
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/deserialize/binary/nested.rs:181:27
  27:        0x103b2f210 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next::h097f44b5b0068fa7
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/map.rs:103:9
  28:        0x103fc7910 - <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next::hade73a6e5639e68b
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/alloc/src/boxed.rs:1788:9
  29:        0x103b2fb91 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next::h1340bce65fbbb99a
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/map.rs:103:9
  30:        0x103fc7910 - <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next::hade73a6e5639e68b
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/alloc/src/boxed.rs:1788:9
  31:        0x103b314a1 - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::next::h3bb0ce3207877a55
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/map.rs:103:9
  32:        0x103fc78b0 - <alloc::boxed::Box<I,A> as core::iter::traits::iterator::Iterator>::next::h6f559621372b0d52
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/alloc/src/boxed.rs:1788:9
  33:        0x103d34169 - <arrow2::io::parquet::read::row_group::RowGroupDeserializer as core::iter::traits::iterator::Iterator>::next::{{closure}}::h98bec87f7d675af6
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/row_group.rs:72:29
  34:        0x103b2e008 - core::iter::adapters::map::map_try_fold::{{closure}}::hb9a00aae4acc3ef7
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/map.rs:91:28
  35:        0x103bce2d5 - core::iter::traits::iterator::Iterator::try_fold::h832ba73083b21ade
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/traits/iterator.rs:2027:21
  36:        0x103b4054f - <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold::h2739fa0a555b49f4
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/map.rs:117:9
  37:        0x103ebb430 - <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold::hfe14a163cb80e55e
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/mod.rs:182:9
  38:        0x103ec013e - core::iter::traits::iterator::Iterator::try_for_each::h5abe25b104bd6a73
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/traits/iterator.rs:2088:9
  39:        0x103ebae91 - <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next::h0facd6aca01972cc
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/mod.rs:165:9
  40:        0x103b8687b - <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter::h5db4820506a85391
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/alloc/src/vec/spec_from_iter_nested.rs:26:32
  41:        0x103b97647 - <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter::hc57979e52459082d
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/alloc/src/vec/spec_from_iter.rs:33:9
  42:        0x103bbc209 - <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter::hc4d047ed3b6536ef
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/alloc/src/vec/mod.rs:2547:9
  43:        0x103ebfe47 - core::iter::traits::iterator::Iterator::collect::h61fdceeb67761775
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/traits/iterator.rs:1777:9
  44:        0x103c5ccdb - <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}::h6566e0754822f83f
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/result.rs:2031:49
  45:        0x103eba6f2 - core::iter::adapters::try_process::h8b72f91360960f0a
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/adapters/mod.rs:151:17
  46:        0x103c5cb81 - <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::h0092a1f94303df57
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/result.rs:2031:9
  47:        0x103b7bc6f - core::iter::traits::iterator::Iterator::collect::h5bf38db8887f4d00
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/iter/traits/iterator.rs:1777:9
  48:        0x103d33fcd - <arrow2::io::parquet::read::row_group::RowGroupDeserializer as core::iter::traits::iterator::Iterator>::next::hcaa38836ef88dc21
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/row_group.rs:68:21
  49:        0x103a678b8 - <arrow2::io::parquet::read::file::FileReader<R> as core::iter::traits::iterator::Iterator>::next::hbfd7a67bfc8f1ae0
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/file.rs:138:19
  50:        0x103a67abf - <arrow2::io::parquet::read::file::FileReader<R> as core::iter::traits::iterator::Iterator>::next::hbfd7a67bfc8f1ae0
                               at /Users/baishen/.cargo/git/checkouts/arrow2-8a2ad61d97265680/9a38663/src/io/parquet/read/file.rs:158:21
  51:        0x103a4c290 - narr::main::h64464b2a0ebd25e6
                               at /Users/baishen/open/db/arrow/jorgecarleitao/narr/src/main.rs:103:24
  52:        0x103b11755 - core::ops::function::FnOnce::call_once::h4742c941aa96157f
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/ops/function.rs:227:5
  53:        0x103b05dfc - std::sys_common::backtrace::__rust_begin_short_backtrace::h8f87968410dab4a2
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/sys_common/backtrace.rs:122:18
  54:        0x103b05c88 - std::rt::lang_start::{{closure}}::hede066fa325eafb5
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/rt.rs:145:18
  55:        0x104433465 - core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once::hc1d3d50a2969f814
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/core/src/ops/function.rs:259:13
  56:        0x104433465 - std::panicking::try::do_call::h15ae273978729b4e
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:492:40
  57:        0x104433465 - std::panicking::try::h98c6926a09da0b2c
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:456:19
  58:        0x104433465 - std::panic::catch_unwind::haf2cfdcecc390882
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panic.rs:137:14
  59:        0x104433465 - std::rt::lang_start_internal::{{closure}}::h649686bdf2c787e0
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/rt.rs:128:48
  60:        0x104433465 - std::panicking::try::do_call::hac182aca074c79df
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:492:40
  61:        0x104433465 - std::panicking::try::ha9a6b4a7f1c326fb
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panicking.rs:456:19
  62:        0x104433465 - std::panic::catch_unwind::hdadacb445bf600d0
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/panic.rs:137:14
  63:        0x104433465 - std::rt::lang_start_internal::h29dd124c00a602c9
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/rt.rs:128:20
  64:        0x103b05c5e - std::rt::lang_start::h44dc482cb2675b71
                               at /rustc/30b3f35c420694a4f24e5a4df00f06073f4f3a37/library/std/src/rt.rs:144:17
  65:        0x103a4c6d6 - _main
@jorgecarleitao
Copy link
Owner

Closed by #984. Thanks again for the report!

@jorgecarleitao jorgecarleitao added the no-changelog Issues whose changes are covered by a PR and thus should not be shown in the changelog label May 7, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
bug Something isn't working no-changelog Issues whose changes are covered by a PR and thus should not be shown in the changelog
Projects
None yet
Development

No branches or pull requests

2 participants