Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chg: [v2] stabilized APIs #3

Merged
merged 1 commit into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions poppy/src/bloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ pub enum Error {
OptLevel(#[from] OptLevelError),
#[error("too many entries, false positive rate cannot be met")]
TooManyEntries,
#[error("entry index iterator must be initialized")]
UninitIter,
}

/// Structure used for easy filter creation from a set
Expand Down
5 changes: 5 additions & 0 deletions poppy/src/bloom/v1.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ impl Iterator for Fingerprint {
}
type Error = crate::Error;

/// This structure implements a bloom filter compatible with the
/// [DCSO implementation](https://github.com/DCSO/bloom). It must
/// be used for compatibility purposes only. Its use is deprecated
/// as its fpp is not guaranteed to be correct. Use [crate::v2::BloomFilter]
/// instead.
#[derive(Debug, Clone)]
pub struct BloomFilter {
flags: Flags,
Expand Down
166 changes: 140 additions & 26 deletions poppy/src/bloom/v2.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,29 @@
use std::{
hash::Hasher,
io::{self, BufWriter, Read, Write},
marker::PhantomData,
};

use crate::{
bitset::{array::BitSet, vec::VecBitSet},
hash::{wyhash::WyHasher, PoppyHasher},
hash::{digest, wyhash::WyHasher, PoppyHash, PoppyHasher},
read_flags,
utils::{read_le_f64, read_le_u64},
Flags, OptLevel, Params,
};
use core::mem::size_of;

/// Feeds the bytes of `s` to a freshly defaulted hasher `H` and returns
/// the resulting 64-bit digest.
fn digest<H, S>(s: S) -> u64
where
    H: Hasher + Default,
    S: AsRef<[u8]>,
{
    let mut state = H::default();
    state.write(s.as_ref());
    state.finish()
}

#[derive(Debug, Default, Clone, Copy)]
/// State from which the bit indexes of a single filter entry are derived.
///
/// `H` is the hasher used to digest the entry. It is only a type-level
/// marker here (see the `PhantomData` field), no hasher value is stored.
///
/// NOTE(review): `M` is presumably the modulus applied to the generated
/// indexes (the v2 filter instantiates it with `BIT_SET_MOD`) — confirm
/// against the `Iterator` implementation.
#[derive(Debug, Default, Clone)]
pub struct IndexIterator<H: PoppyHasher, const M: u64> {
/// Set to `true` by the `init_with_*` methods; `is_init` checks it.
init: bool,
/// Primary hash of the entry, assigned by the `init_with_*` methods.
h1: u64,
/// Secondary hash; `init_with_hashable` resets it to zero.
h2: u64,
/// Reset to zero on initialization; `is_init` requires it to still be zero.
/// NOTE(review): presumably the number of indexes produced so far — confirm.
i: u64,
/// NOTE(review): presumably the number of indexes to produce — confirm
/// against the constructor and the `Iterator` implementation.
count: u64,
/// Zero-sized marker tying the iterator to its hasher type `H`.
h: PhantomData<H>,
}

// `Copy` is implemented by hand for every `H: PoppyHasher + Clone` rather than
// derived; the `H: Clone` bound is needed because `Copy` requires `Clone`,
// and the derived `Clone` on `IndexIterator` is itself conditional on `H: Clone`.
impl<H: PoppyHasher + Clone, const M: u64> std::marker::Copy for IndexIterator<H, M> {}

#[inline(always)]
fn xorshift_star(mut seed: u64) -> u64 {
seed ^= seed.wrapping_shl(12);
Expand All @@ -52,7 +48,9 @@ impl<H: PoppyHasher, const M: u64> IndexIterator<H, M> {
}

#[inline(always)]
fn init_with_data<S: AsRef<[u8]>>(mut self, data: S) -> Self {
#[allow(dead_code)] // this method is used to check compatibility with former implementation
fn init_with_slice<S: AsRef<[u8]>>(mut self, data: S) -> Self {
self.init = true;
if data.as_ref().len() > size_of::<u64>() {
self.h1 = digest::<H, S>(data);
} else {
Expand All @@ -68,6 +66,20 @@ impl<H: PoppyHasher, const M: u64> IndexIterator<H, M> {
self.i = 0;
self
}

#[inline(always)]
fn init_with_hashable<D: PoppyHash>(mut self, data: D) -> Self {
self.init = true;
self.h1 = data.hash_pop::<H>();
self.h2 = 0;
self.i = 0;
self
}

/// Tells whether the iterator has been initialized and is still at its
/// starting position (`i == 0`).
#[inline(always)]
const fn is_init(&self) -> bool {
    if !self.init {
        return false;
    }
    self.i == 0
}
}

impl<H: PoppyHasher, const M: u64> Iterator for IndexIterator<H, M> {
Expand Down Expand Up @@ -108,6 +120,8 @@ const BUCKET_SIZE: usize = 4096;
// Eight times `BUCKET_SIZE`; the filter passes it as the `M` parameter of
// the `IndexIterator` it works with.
// NOTE(review): presumably the number of bits in one bucket, assuming
// `BUCKET_SIZE` counts bytes — confirm against `BitSet`.
const BIT_SET_MOD: u64 = (BUCKET_SIZE * 8) as u64;
// One bucket of the filter: a `BitSet` sized by `BUCKET_SIZE`.
type Bucket = BitSet<BUCKET_SIZE>;

/// Faster and more accurate implementation than [crate::v1::BloomFilter].
/// Until further notice, this is the structure to use by default.
#[derive(Debug, Clone)]
pub struct BloomFilter {
flags: Flags,
Expand Down Expand Up @@ -276,21 +290,26 @@ impl BloomFilter {
self.buckets.len().is_power_of_two()
}

/// Clears out the bloom filter: every bucket is cleared and the entry
/// count is reset to zero.
#[inline(always)]
pub fn clear(&mut self) {
    for bucket in self.buckets.iter_mut() {
        bucket.clear();
    }
    self.count = 0;
}

#[inline]
pub fn insert_bytes<D: AsRef<[u8]>>(&mut self, data: D) -> Result<bool, Error> {
#[inline(always)]
/// Function implementing hash-once, insert-many use cases. An [IndexIterator] can
/// be obtained from the [Self::prep_index_iter] method.
pub fn insert_iter(&mut self, it: IndexIterator<WyHasher, BIT_SET_MOD>) -> Result<bool, Error> {
if !it.is_init() {
return Err(Error::UninitIter);
}

if self.capacity == 0 {
return Err(Error::TooManyEntries);
}

let mut new = false;
let it = self.index_iter().init_with_data(data);

let h = it.bucket_hash();
let ibucket = {
Expand Down Expand Up @@ -331,12 +350,51 @@ impl BloomFilter {
}

#[inline]
pub fn contains_bytes<D: AsRef<[u8]>>(&self, data: D) -> bool {
if self.capacity == 0 {
return false;
fn _insert_bytes_old<D: AsRef<[u8]>>(&mut self, data: D) -> Result<bool, Error> {
    // Legacy insertion path: the iterator is armed from the raw byte slice
    // (`init_with_slice`) instead of going through `PoppyHash`.
    let it = self.index_iter().init_with_slice(data);
    self.insert_iter(it)
}

/// Inserts a byte slice into the filter.
///
/// Kept for backward compatibility with the old API: the bytes are simply
/// forwarded to [Self::insert].
///
/// # Errors
///
/// Returns whatever error [Self::insert] reports.
#[inline]
pub fn insert_bytes<D: AsRef<[u8]>>(&mut self, data: D) -> Result<bool, Error> {
    let bytes: &[u8] = data.as_ref();
    self.insert(bytes)
}

/// Generic insertion of any data implementing the [PoppyHash] trait.
///
/// The data is hashed once by [Self::prep_index_iter] and the prepared
/// iterator is handed over to [Self::insert_iter].
///
/// # Errors
///
/// Returns whatever error [Self::insert_iter] reports.
#[inline]
pub fn insert<H: PoppyHash>(&mut self, data: H) -> Result<bool, Error> {
    let it = self.prep_index_iter(data);
    self.insert_iter(it)
}

#[inline]
/// Function implementing hash-once, check-many use cases. An [IndexIterator] can
/// be obtained from the [Self::prep_index_iter] method.
///
/// # Example
///
/// ```
/// use poppy_filters::v2::BloomFilter;
///
/// let mut b: BloomFilter = BloomFilter::with_capacity(10000, 0.001);
///
/// // we prepare the data to be inserted and/or checked
/// // this way, the cost of hashing the data is paid only once
/// let prep = (0..1000).map(|d| b.prep_index_iter(d)).collect::<Vec<_>>();
///
/// for p in prep {
/// b.insert_iter(p).unwrap();
/// b.contains_iter(p).unwrap();
/// }
/// ```
pub fn contains_iter(&self, it: IndexIterator<WyHasher, BIT_SET_MOD>) -> Result<bool, Error> {
if !it.is_init() {
return Err(Error::UninitIter);
}

let it = self.index_iter().init_with_data(data);
if self.capacity == 0 {
return Ok(false);
}

let h = it.bucket_hash();

Expand All @@ -345,7 +403,7 @@ impl BloomFilter {
.index_cache
.get_nth_bit(h as usize & (self.index_cache.bit_len() - 1))
{
return false;
return Ok(false);
}

let ibucket = {
Expand All @@ -363,11 +421,34 @@ impl BloomFilter {

for ibit in it {
if !bucket.get_nth_bit(ibit as usize) {
return false;
return Ok(false);
}
}

true
Ok(true)
}

/// Membership test going through the legacy slice-based initialization
/// (`init_with_slice`) rather than through [PoppyHash].
#[inline]
fn _contains_bytes_old<D: AsRef<[u8]>>(&self, data: D) -> bool {
    // this cannot panic as our iterator has been initialized
    self.contains_iter(self.index_iter().init_with_slice(data))
        .unwrap()
}

/// Checks whether a byte slice is in the filter by forwarding the bytes
/// to [Self::contains].
#[inline]
pub fn contains_bytes<S: AsRef<[u8]>>(&self, data: S) -> bool {
    let bytes: &[u8] = data.as_ref();
    self.contains(bytes)
}

/// Generic membership test for any data implementing the [PoppyHash] trait.
///
/// The data is hashed once by [Self::prep_index_iter] and the prepared
/// iterator is checked with [Self::contains_iter].
#[inline]
pub fn contains<H: PoppyHash>(&self, data: H) -> bool {
    let it = self.prep_index_iter(data);
    // this cannot panic as our iterator has been initialized
    self.contains_iter(it).unwrap()
}

/// Hashes `data` once and returns the initialized [IndexIterator], ready to
/// be passed to [Self::insert_iter] and/or [Self::contains_iter].
#[inline]
pub fn prep_index_iter<H: PoppyHash>(&self, data: H) -> IndexIterator<WyHasher, BIT_SET_MOD> {
    let blank = self.index_iter();
    blank.init_with_hashable(data)
}

/// counts all the set bits in the bloom filter
Expand Down Expand Up @@ -528,7 +609,7 @@ mod test {
($cap:expr, $proba:expr, [$($values:literal),*]) => {
{
let mut b=bloom!($cap, $proba);
$(b.insert_bytes($values).unwrap();)*
$(b.insert($values).unwrap();)*
b
}
};
Expand Down Expand Up @@ -587,11 +668,44 @@ mod test {
#[test]
fn test_bloom() {
let mut b = BloomFilter::with_capacity(10000, 0.001);
assert!(!b.contains_bytes("value"));
b.insert_bytes("value").unwrap();
assert!(b.contains_bytes("value"));
assert!(!b._contains_bytes_old("hello"));
b.insert_bytes("hello").unwrap();
assert!(b._contains_bytes_old("hello"));
assert!(b.contains("hello"));
assert!(b.contains_bytes("hello"));
assert_eq!(b.count, 1);
assert!(!b.contains_bytes("unknown"));
assert!(!b.contains("unknown"));
}

/// Values inserted through the `PoppyHash` path and through the legacy
/// slice path must be found by both lookup paths.
#[test]
fn test_poppy_hash_compatibility() {
    // every value inserted below, whichever insertion path it went through
    const INSERTED: [&str; 3] = ["hello", "some string", "some old string"];

    let mut b: BloomFilter = BloomFilter::with_capacity(10000, 0.001);
    assert!(!b.contains("hello"));

    b.insert("hello").unwrap();
    b.insert(String::from("some string")).unwrap();
    b._insert_bytes_old("some old string").unwrap();

    // legacy lookup path first ...
    for value in INSERTED.iter().copied() {
        assert!(b._contains_bytes_old(value));
    }
    // ... then the `PoppyHash` based one
    for value in INSERTED.iter().copied() {
        assert!(b.contains(value));
    }

    assert!(!b.contains("unknown"));
}

/// Prepared index iterators can be reused for both insertion and lookup.
#[test]
fn test_insert_contains_by_iter() {
    let mut b: BloomFilter = BloomFilter::with_capacity(10000, 0.001);

    // hash every value once, up front
    let prep: Vec<_> = (0..1000).map(|d| b.prep_index_iter(d)).collect();

    prep.into_iter().for_each(|p| {
        b.insert_iter(p).unwrap();
        b.contains_iter(p).unwrap();
    });
}

#[test]
Expand Down
58 changes: 57 additions & 1 deletion poppy/src/hash.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::hash::Hasher;
use std::hash::{Hash, Hasher};
use std::mem::size_of;

pub(crate) mod fnv;
pub(crate) mod wyhash;
Expand All @@ -11,3 +12,58 @@ pub trait PoppyHasher: Hasher + Default {
h.finish()
}
}

/// Values that can be reduced to a 64-bit hash with any [PoppyHasher].
pub trait PoppyHash: Hash {
    /// Hashes `self` with a freshly defaulted hasher `H`.
    ///
    /// The default implementation goes through the standard [Hash]
    /// implementation of the type; implementors may override it.
    fn hash_pop<H: PoppyHasher>(&self) -> u64 {
        let mut state = H::default();
        Hash::hash(self, &mut state);
        state.finish()
    }
}

// Any `Option` over a `Hash` type gets the default `hash_pop`, i.e. it is
// hashed through its standard `Hash` implementation.
impl<T> PoppyHash for Option<T> where T: Hash {}

// Generates an empty `impl PoppyHash for $t {}` for every listed type, so
// each of them relies on the trait's default `hash_pop`.
macro_rules! impl_poppy_hash {
($($t:ty),*) => {
$(impl PoppyHash for $t {})*
};
}

// All primitive integer types use the default, `Hash`-based implementation.
impl_poppy_hash!(u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize);

impl PoppyHash for &[u8] {
    /// Hashes a byte slice.
    ///
    /// Slices longer than a `u64` are digested with `H`; shorter or equal
    /// ones are zero-padded and read as a little-endian `u64`.
    fn hash_pop<H: PoppyHasher>(&self) -> u64 {
        const WORD: usize = size_of::<u64>();

        if self.len() <= WORD {
            // if data is smaller than u64 we don't need to hash it
            let mut padded = [0u8; WORD];
            padded[..self.len()].copy_from_slice(self);
            return u64::from_le_bytes(padded);
        }

        digest::<H, &[u8]>(self)
    }
}

impl PoppyHash for Vec<u8> {
    /// Hashes the vector exactly like the byte slice it holds.
    fn hash_pop<H: PoppyHasher>(&self) -> u64 {
        let bytes: &[u8] = self.as_slice();
        <&[u8] as PoppyHash>::hash_pop::<H>(&bytes)
    }
}

impl PoppyHash for &str {
    /// Hashes the string exactly like its UTF-8 bytes.
    fn hash_pop<H: PoppyHasher>(&self) -> u64 {
        let bytes: &[u8] = self.as_bytes();
        <&[u8] as PoppyHash>::hash_pop::<H>(&bytes)
    }
}

impl PoppyHash for String {
    /// Hashes the string exactly like its UTF-8 bytes.
    fn hash_pop<H: PoppyHasher>(&self) -> u64 {
        let bytes: &[u8] = self.as_bytes();
        <&[u8] as PoppyHash>::hash_pop::<H>(&bytes)
    }
}

/// Feeds the bytes of `s` to a freshly defaulted hasher `H` and returns
/// the resulting 64-bit digest.
pub(crate) fn digest<H, S>(s: S) -> u64
where
    H: Hasher + Default,
    S: AsRef<[u8]>,
{
    let bytes = s.as_ref();
    let mut state = H::default();
    state.write(bytes);
    state.finish()
}
1 change: 1 addition & 0 deletions poppy/src/hash/wyhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use super::PoppyHasher;
// poppy seed
const SEED: u64 = 0x706f707079533d42;

/// Hasher wrapping a `WyHash` state.
///
/// `Clone` is derived so that the v2 `IndexIterator` parameterized with this
/// hasher can be `Copy` (its `Copy` impl requires `H: Clone`).
#[derive(Clone)]
pub struct WyHasher {
// underlying wyhash state
// NOTE(review): presumably seeded with `SEED` — confirm in the constructor
h: WyHash,
}
Expand Down
Loading