Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved docs for bitmap #1022

Merged
merged 1 commit into from
May 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 119 additions & 78 deletions src/bitmap/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,51 @@ use either::Either;
use std::iter::FromIterator;
use std::sync::Arc;

use crate::{buffer::bytes::Bytes, trusted_len::TrustedLen};
use crate::{buffer::bytes::Bytes, error::Error, trusted_len::TrustedLen};

use super::{
utils::{count_zeros, fmt, get_bit, get_bit_unchecked, BitChunk, BitChunks, BitmapIter},
MutableBitmap,
};

/// An immutable container whose API is optimized to handle bitmaps. All quantities on this
/// container's API are measured in bits.
/// # Implementation
/// * memory on this container is sharable across thread boundaries
/// * Cloning [`Bitmap`] is `O(1)`
/// * Slicing [`Bitmap`] is `O(1)`
/// An immutable container semantically equivalent to `Arc<Vec<bool>>` but represented as `Arc<Vec<u8>>` where
/// each boolean is represented as a single bit.
///
/// # Examples
/// ```
/// use arrow2::bitmap::{Bitmap, MutableBitmap};
///
/// let bitmap = Bitmap::from([true, false, true]);
/// assert_eq!(bitmap.iter().collect::<Vec<_>>(), vec![true, false, true]);
///
/// // creation directly from bytes
/// let bitmap = Bitmap::try_new(vec![0b00001101], 5).unwrap();
/// // note: the first bit is the least-significant (right-most) bit of the first byte
/// assert_eq!(bitmap.iter().collect::<Vec<_>>(), vec![true, false, true, true, false]);
/// // we can also get the slice:
/// assert_eq!(bitmap.as_slice(), ([0b00001101u8].as_ref(), 0, 5));
/// // debug helps :)
/// assert_eq!(format!("{:?}", bitmap), "[0b___01101]".to_string());
///
/// // it supports copy-on-write semantics (to a `MutableBitmap`)
/// let bitmap: MutableBitmap = bitmap.into_mut().right().unwrap();
/// assert_eq!(bitmap, MutableBitmap::from([true, false, true, true, false]));
///
/// // slicing is 'O(1)' (data is shared)
/// let bitmap = Bitmap::try_new(vec![0b00001101], 5).unwrap();
/// let sliced = bitmap.slice(1, 4);
/// assert_eq!(sliced.as_slice(), ([0b00001101u8].as_ref(), 1, 4)); // 1 here is the offset:
/// assert_eq!(format!("{:?}", sliced), "[0b___0110_]".to_string());
/// // when sliced (or cloned), it is no longer possible to `into_mut`.
/// let same: Bitmap = sliced.into_mut().left().unwrap();
/// ```
#[derive(Clone)]
pub struct Bitmap {
bytes: Arc<Bytes<u8>>,
// both are measured in bits. They are used to bound the bitmap to a region of Bytes.
offset: usize,
length: usize,
// this is a cache: it must be computed on initialization
// this is a cache: it is computed on initialization
null_count: usize,
}

Expand All @@ -45,10 +70,25 @@ impl Bitmap {
Self::default()
}

/// Initializes an new [`Bitmap`] filled with unset values.
/// Initializes a new [`Bitmap`] from vector of bytes and a length.
/// # Errors
/// This function errors iff `length > bytes.len() * 8`
#[inline]
pub fn new_zeroed(length: usize) -> Self {
MutableBitmap::from_len_zeroed(length).into()
pub fn try_new(bytes: Vec<u8>, length: usize) -> Result<Self, Error> {
if length > bytes.len().saturating_mul(8) {
return Err(Error::InvalidArgumentError(format!(
"The length of the bitmap ({}) must be `<=` to the number of bytes times 8 ({})",
length,
bytes.len().saturating_mul(8)
)));
}
let null_count = count_zeros(&bytes, 0, length);
Ok(Self {
length,
offset: 0,
bytes: Arc::new(bytes.into()),
null_count,
})
}

/// Returns the length of the [`Bitmap`].
Expand All @@ -63,6 +103,18 @@ impl Bitmap {
self.len() == 0
}

/// Returns a new iterator of `bool` over the bits of this [`Bitmap`].
///
/// The iterator is built from the underlying bytes together with this bitmap's
/// `offset` and `length`, both of which are measured in bits.
pub fn iter(&self) -> BitmapIter {
BitmapIter::new(&self.bytes, self.offset, self.length)
}

/// Returns an iterator over the bits of this [`Bitmap`] grouped in chunks of type `T`
/// (a [`BitChunk`]).
///
/// This iterator is useful to operate over multiple bits at once, e.g. via bitwise operations.
pub fn chunks<T: BitChunk>(&self) -> BitChunks<T> {
BitChunks::new(&self.bytes, self.offset, self.length)
}

/// Creates a new [`Bitmap`] from [`Bytes`] and a length.
/// # Panic
/// Panics iff `length <= bytes.len() * 8`
Expand All @@ -78,28 +130,22 @@ impl Bitmap {
}
}

/// Creates a new [`Bitmap`] from a [`Vec`] of bytes and a length measured in bits.
///
/// The vector is converted with `into()` and handed to [`Bitmap::from_bytes`].
/// This function is `O(1)`
/// # Panic
/// Panics under the same conditions as [`Bitmap::from_bytes`], to which this function delegates.
// NOTE(review): this was previously documented as "Panics iff `length <= buffer.len() * 8`",
// which reads inverted; presumably the panic happens when `length > vec.len() * 8` —
// confirm against `from_bytes`.
#[inline]
pub fn from_u8_vec(vec: Vec<u8>, length: usize) -> Self {
Bitmap::from_bytes(vec.into(), length)
}

/// Creates a new [`Bitmap`] from a slice and length.
/// # Panic
/// Panics iff `length <= bytes.len() * 8`
#[inline]
pub fn from_u8_slice<T: AsRef<[u8]>>(buffer: T, length: usize) -> Self {
let buffer = Vec::<u8>::from(buffer.as_ref());
Bitmap::from_u8_vec(buffer, length)
}

/// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits.
/// Returns the byte slice of this [`Bitmap`].
///
/// The returned tuple contains:
/// * `.1`: The byte slice, truncated to the start of the first bit. So the start of the slice
/// is within the first 8 bits.
/// * `.2`: The start offset in bits on a range `0 <= offsets < 8`.
/// * `.3`: The length in number of bits.
#[inline]
pub fn null_count_range(&self, offset: usize, length: usize) -> usize {
count_zeros(&self.bytes, self.offset + offset, length)
pub fn as_slice(&self) -> (&[u8], usize, usize) {
let start = self.offset / 8;
let len = (self.offset % 8 + self.length).saturating_add(7) / 8;
(
&self.bytes[start..start + len],
self.offset % 8,
self.length,
)
}

/// Returns the number of unset bits on this [`Bitmap`].
Expand All @@ -110,9 +156,10 @@ impl Bitmap {

/// Slices `self`, offsetting by `offset` and truncating up to `length` bits.
/// # Panic
/// Panics iff `self.offset + offset + length >= self.bytes.len() * 8`, i.e. if the offset and `length`
/// Panics iff `offset + length > self.length`, i.e. if the offset and `length`
/// exceeds the allocated capacity of `self`.
#[inline]
#[must_use]
pub fn slice(self, offset: usize, length: usize) -> Self {
assert!(offset + length <= self.length);
unsafe { self.slice_unchecked(offset, length) }
Expand Down Expand Up @@ -147,16 +194,6 @@ impl Bitmap {
get_bit(&self.bytes, self.offset + i)
}

/// Returns whether the bit at position `i` is set, or `None` when `i` is out of bounds
/// (`i >= self.len()`).
#[inline]
pub fn get(&self, i: usize) -> Option<bool> {
    if i >= self.len() {
        return None;
    }
    // SAFETY: the guard above established `i < self.len()`.
    let bit = unsafe { self.get_bit_unchecked(i) };
    Some(bit)
}

/// Unsafely returns whether the bit at position `i` is set.
/// # Safety
/// Unsound iff `i >= self.len()`.
Expand Down Expand Up @@ -196,6 +233,45 @@ impl Bitmap {
_ => Either::Left(self),
}
}

/// Initializes a new [`Bitmap`] of `length` bits filled with unset values.
///
/// The bitmap is built via [`MutableBitmap::from_len_zeroed`] and then converted
/// into a [`Bitmap`].
#[inline]
pub fn new_zeroed(length: usize) -> Self {
MutableBitmap::from_len_zeroed(length).into()
}

/// Counts the nulls (unset bits) starting from `offset` bits and for `length` bits.
///
/// `offset` is relative to the start of this [`Bitmap`]: it is added to the bitmap's own
/// offset before the bits are counted.
// NOTE(review): no bounds check on `offset + length` is visible here; presumably callers must
// keep the range within `self.len()` — confirm against `count_zeros`.
#[inline]
pub fn null_count_range(&self, offset: usize, length: usize) -> usize {
count_zeros(&self.bytes, self.offset + offset, length)
}

/// Creates a new [`Bitmap`] from a slice and a length measured in bits.
///
/// The bytes of `slice` are copied into a new [`Vec`], which is then passed to
/// [`Bitmap::try_new`]; the result is unwrapped.
/// # Panic
/// Panics iff `length > slice.len() * 8`, i.e. whenever [`Bitmap::try_new`] errors.
#[inline]
pub fn from_u8_slice<T: AsRef<[u8]>>(slice: T, length: usize) -> Self {
Bitmap::try_new(slice.as_ref().to_vec(), length).unwrap()
}

/// Creates a new [`Bitmap`] from a vector of bytes and a length measured in bits.
///
/// Alias for `Bitmap::try_new(vec, length).unwrap()`.
/// # Panic
/// This function panics iff `length > vec.len() * 8`, i.e. whenever [`Bitmap::try_new`] errors.
// NOTE(review): this was documented as `O(1)`, but `try_new` calls `count_zeros` over the
// bitmap to compute the null count — confirm the cost before restoring that claim.
#[inline]
pub fn from_u8_vec(vec: Vec<u8>, length: usize) -> Self {
Bitmap::try_new(vec, length).unwrap()
}

/// Returns whether the bit at position `i` is set, or `None` when `i` is out of bounds
/// (`i >= self.len()`).
#[inline]
pub fn get(&self, i: usize) -> Option<bool> {
    if i >= self.len() {
        return None;
    }
    // SAFETY: the guard above established `i < self.len()`.
    let bit = unsafe { self.get_bit_unchecked(i) };
    Some(bit)
}
}

impl<P: AsRef<[bool]>> From<P> for Bitmap {
Expand All @@ -213,14 +289,6 @@ impl FromIterator<bool> for Bitmap {
}
}

impl Bitmap {
/// Returns an iterator over the bits of this [`Bitmap`] in chunks of `T`, which is useful
/// for bit operations.
///
/// The iterator is built from the underlying bytes and this bitmap's `offset` and `length`.
pub fn chunks<T: BitChunk>(&self) -> BitChunks<T> {
BitChunks::new(&self.bytes, self.offset, self.length)
}
}

impl Bitmap {
/// Creates a new [`Bitmap`] from an iterator of booleans.
/// # Safety
Expand Down Expand Up @@ -258,26 +326,6 @@ impl Bitmap {
}
}

impl Bitmap {
    /// Exposes the bytes backing this [`Bitmap`] together with its bounds in bits.
    ///
    /// The returned tuple contains:
    /// * `.0`: the byte slice, starting at the byte that holds the first bit, so the first
    ///   bit is located within the first 8 bits of the slice.
    /// * `.1`: the offset of the first bit inside that first byte, i.e. `0 <= offset < 8`.
    /// * `.2`: the length in bits.
    #[inline]
    pub fn as_slice(&self) -> (&[u8], usize, usize) {
        let bit_offset = self.offset % 8;
        let first_byte = self.offset / 8;
        // number of bytes needed to cover `bit_offset + length` bits, rounded up
        let byte_count = (bit_offset + self.length).saturating_add(7) / 8;
        let last_byte = first_byte + byte_count;
        (&self.bytes[first_byte..last_byte], bit_offset, self.length)
    }
}

impl<'a> IntoIterator for &'a Bitmap {
type Item = bool;
type IntoIter = BitmapIter<'a>;
Expand All @@ -286,10 +334,3 @@ impl<'a> IntoIterator for &'a Bitmap {
BitmapIter::<'a>::new(&self.bytes, self.offset, self.length)
}
}

impl<'a> Bitmap {
/// Constructs a new iterator over the bits of this [`Bitmap`].
///
/// The returned [`BitmapIter`] borrows the bitmap for `'a` and is built from the underlying
/// bytes together with this bitmap's `offset` and `length`.
pub fn iter(&'a self) -> BitmapIter<'a> {
BitmapIter::<'a>::new(&self.bytes, self.offset, self.length)
}
}
4 changes: 2 additions & 2 deletions src/bitmap/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,15 +264,15 @@ impl MutableBitmap {
impl From<MutableBitmap> for Bitmap {
#[inline]
fn from(buffer: MutableBitmap) -> Self {
Bitmap::from_bytes(buffer.buffer.into(), buffer.length)
Bitmap::try_new(buffer.buffer, buffer.length).unwrap()
}
}

impl From<MutableBitmap> for Option<Bitmap> {
#[inline]
fn from(buffer: MutableBitmap) -> Self {
if buffer.null_count() > 0 {
Some(Bitmap::from_bytes(buffer.buffer.into(), buffer.length))
Some(Bitmap::try_new(buffer.buffer, buffer.length).unwrap())
} else {
None
}
Expand Down
8 changes: 5 additions & 3 deletions src/bitmap/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,12 @@ pub unsafe fn set_bit_unchecked(data: &mut [u8], i: usize, value: bool) {
*byte = set(*byte, i % 8, value);
}

/// Returns whether bit at position `i` in `data` is set or not
/// Returns whether bit at position `i` in `data` is set
/// # Panic
/// This function panics iff `i / 8 >= bytes.len()`
#[inline]
pub fn get_bit(data: &[u8], i: usize) -> bool {
is_set(data[i / 8], i % 8)
pub fn get_bit(bytes: &[u8], i: usize) -> bool {
is_set(bytes[i / 8], i % 8)
}

/// Returns whether bit at position `i` in `data` is set or not.
Expand Down
2 changes: 1 addition & 1 deletion src/io/ipc/read/read_basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ pub fn read_bitmap<R: Read + Seek>(
read_uncompressed_bitmap(length, bytes, reader)
}?;

Ok(Bitmap::from_bytes(buffer.into(), length))
Bitmap::try_new(buffer, length)
}

pub fn read_validity<R: Read + Seek>(
Expand Down
2 changes: 1 addition & 1 deletion src/types/bit_chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use num_traits::PrimInt;
use super::NativeType;

/// A chunk of bits. This is used to create masks of a given length
/// whose width is `1` bit. In `simd_packed` notation, this corresponds to `m1xY`.
/// whose width is `1` bit. In `portable_simd` notation, this corresponds to `m1xY`.
pub trait BitChunk:
super::private::Sealed
+ PrimInt
Expand Down