From 5e69a0ca0476a17cc17b25092684294910bfaede Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20S=C3=B8e=20S=C3=B8rensen?= Date: Tue, 25 Sep 2012 14:15:59 +0200 Subject: [PATCH 1/6] Bloom filter: Prepare for having other representations. --- src/hanoidb_bloom.erl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/hanoidb_bloom.erl b/src/hanoidb_bloom.erl index 14a6b13..a7a928c 100644 --- a/src/hanoidb_bloom.erl +++ b/src/hanoidb_bloom.erl @@ -148,9 +148,12 @@ masked_pair(Mask, X, Y) -> {X band Mask, Y band Mask}. all_set(_Mask, _I1, _I, []) -> true; all_set(Mask, I1, I, [H|T]) -> - case bitarray_get(I, H) of - true -> all_set(Mask, I1, (I+I1) band Mask, T); - false -> false + case element(1, H) of + array -> + case bitarray_get(I, H) of + true -> all_set(Mask, I1, (I+I1) band Mask, T); + false -> false + end end. %% Adds element to set @@ -184,6 +187,7 @@ set_bits(_Mask, _I1, _I, [], Acc) -> lists:reverse(Acc); set_bits(Mask, I1, I, [H|T], Acc) -> set_bits(Mask, I1, (I+I1) band Mask, T, [bitarray_set(I, H) | Acc]). +%%%========== Bitarray representation - suitable for sparse arrays ========== bitarray_new(N) -> array:new((N-1) div ?W + 1, {default, 0}). bitarray_set(I, A) -> @@ -197,6 +201,8 @@ bitarray_get(I, A) -> V = array:get(AI, A), V band (1 bsl (I rem ?W)) =/= 0. +%%%^^^^^^^^^^ Bitarray representation - suitable for sparse arrays ^^^^^^^^^^ + encode(Bloom) -> zlib:gzip(term_to_binary(Bloom)). From a475b5f5beffbf5a48fda801836b5d664ea7f27f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20S=C3=B8e=20S=C3=B8rensen?= Date: Tue, 25 Sep 2012 14:28:31 +0200 Subject: [PATCH 2/6] Bloom filter: Introduce dispatch for handling multiple representations. --- src/hanoidb_bloom.erl | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/hanoidb_bloom.erl b/src/hanoidb_bloom.erl index a7a928c..369b865 100644 --- a/src/hanoidb_bloom.erl +++ b/src/hanoidb_bloom.erl @@ -87,7 +87,7 @@ bloom(Mode, N, E) -> M = 1 bsl Mb, D = trunc(log(1-P) / log(1-1/M)), #bloom{e=E, n=D, mb=Mb, size = 0, - a = [bitarray_new(1 bsl Mb) || _ <- lists:seq(1, K)]}. + a = [bitmask_new(Mb) || _ <- lists:seq(1, K)]}. log2(X) -> log(X) / log(2). @@ -150,7 +150,7 @@ all_set(_Mask, _I1, _I, []) -> true; all_set(Mask, I1, I, [H|T]) -> case element(1, H) of array -> - case bitarray_get(I, H) of + case bitmask_get(I, H) of true -> all_set(Mask, I1, (I+I1) band Mask, T); false -> false end @@ -185,7 +185,22 @@ hash_add(Hashes, #bloom{mb=Mb, a=A, size=Size} = B) -> set_bits(_Mask, _I1, _I, [], Acc) -> lists:reverse(Acc); set_bits(Mask, I1, I, [H|T], Acc) -> - set_bits(Mask, I1, (I+I1) band Mask, T, [bitarray_set(I, H) | Acc]). + set_bits(Mask, I1, (I+I1) band Mask, T, [bitmask_set(I, H) | Acc]). + + +%%%========== Dispatch to appropriate representation: +bitmask_new(LogN) -> + bitarray_new(1 bsl LogN). + +bitmask_set(I, BM) -> + case element(1,BM) of + array -> bitarray_set(I, BM) + end. + +bitmask_get(I, BM) -> + case element(1,BM) of + array -> bitarray_get(I, BM) + end. %%%========== Bitarray representation - suitable for sparse arrays ========== bitarray_new(N) -> array:new((N-1) div ?W + 1, {default, 0}). From d727be2fa7b062e3862119123f4d07bf208c3b4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20S=C3=B8e=20S=C3=B8rensen?= Date: Tue, 25 Sep 2012 14:51:45 +0200 Subject: [PATCH 3/6] Bloom filter: remove double-testing introduced in dispatch. --- src/hanoidb_bloom.erl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/hanoidb_bloom.erl b/src/hanoidb_bloom.erl index 369b865..29f11d9 100644 --- a/src/hanoidb_bloom.erl +++ b/src/hanoidb_bloom.erl @@ -148,12 +148,9 @@ masked_pair(Mask, X, Y) -> {X band Mask, Y band Mask}. all_set(_Mask, _I1, _I, []) -> true; all_set(Mask, I1, I, [H|T]) -> - case element(1, H) of - array -> - case bitmask_get(I, H) of - true -> all_set(Mask, I1, (I+I1) band Mask, T); - false -> false - end + case bitmask_get(I, H) of + true -> all_set(Mask, I1, (I+I1) band Mask, T); + false -> false end. %% Adds element to set From 12148a7af92c0158e6d19efdbc44a193c1c114c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20S=C3=B8e=20S=C3=B8rensen?= Date: Tue, 25 Sep 2012 15:05:43 +0200 Subject: [PATCH 4/6] Bloom filter: Add faster, ETS-based build representation for dense bitmaps. --- src/hanoidb_bloom.erl | 28 ++++++++++++++++++++++---- src/hanoidb_dense_bitmap.erl | 39 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 src/hanoidb_dense_bitmap.erl diff --git a/src/hanoidb_bloom.erl b/src/hanoidb_bloom.erl index 29f11d9..154bc9d 100644 --- a/src/hanoidb_bloom.erl +++ b/src/hanoidb_bloom.erl @@ -187,16 +187,30 @@ set_bits(Mask, I1, I, [H|T], Acc) -> %%%========== Dispatch to appropriate representation: bitmask_new(LogN) -> - bitarray_new(1 bsl LogN). + if LogN >= 20 -> % Use sparse representation. + bitarray_new(1 bsl LogN); + true -> % Use dense representation. + hanoidb_dense_bitmap:new(1 bsl LogN) + end. bitmask_set(I, BM) -> case element(1,BM) of - array -> bitarray_set(I, BM) + array -> bitarray_set(I, BM); + dense_bitmap_ets -> hanoidb_dense_bitmap:set(I, BM) + end. + +%%% Convert to external form. +bitmask_build(BM) -> + case element(1,BM) of + array -> BM; + dense_bitmap_ets -> hanoidb_dense_bitmap:build(BM) end. bitmask_get(I, BM) -> case element(1,BM) of - array -> bitarray_get(I, BM) + array -> bitarray_get(I, BM); + dense_bitmap_ets -> hanoidb_dense_bitmap:member(I, BM); + dense_bitmap -> hanoidb_dense_bitmap:member(I, BM) end. %%%========== Bitarray representation - suitable for sparse arrays ========== @@ -216,11 +230,17 @@ bitarray_get(I, A) -> %%%^^^^^^^^^^ Bitarray representation - suitable for sparse arrays ^^^^^^^^^^ encode(Bloom) -> - zlib:gzip(term_to_binary(Bloom)). + zlib:gzip(term_to_binary(bloom_build(Bloom))). decode(Bin) -> binary_to_term(zlib:gunzip(Bin)). +%%% Convert to external form. +bloom_build(Bloom=#bloom{a=Bitmasks}) -> + Bloom#bloom{a=[bitmask_build(X) || X <- Bitmasks]}; +bloom_build(Sbf=#sbf{b=Blooms}) -> + Sbf#sbf{b=[bloom_build(X) || X <- Blooms]}. + %% UNIT TESTS -ifdef(TEST). diff --git a/src/hanoidb_dense_bitmap.erl b/src/hanoidb_dense_bitmap.erl new file mode 100644 index 0000000..3bf25f4 --- /dev/null +++ b/src/hanoidb_dense_bitmap.erl @@ -0,0 +1,39 @@ +-module(hanoidb_dense_bitmap). + +-export([new/1, set/2, build/1, member/2]). +-define(BITS_PER_CELL, 32). + +-define(REPR_NAME, dense_bitmap). + +new(N) -> + Tab = ets:new(dense_bitmap, [private, set]), + Width = 1 + (N-1) div ?BITS_PER_CELL, + Value = erlang:make_tuple(Width+1, 0, [{1,?REPR_NAME}]), + ets:insert(Tab, Value), + %io:format("DB| create(): ~p of width ~p\n", [Tab, Width]), + {dense_bitmap_ets, N, Width, Tab}. + +%% Set a bit. +set(I, {dense_bitmap_ets, _,_, Tab}=DBM) -> + Cell = 2 + I div ?BITS_PER_CELL, + BitInCell = I rem ?BITS_PER_CELL, + Old = ets:lookup_element(Tab, ?REPR_NAME, Cell), + New = Old bor (1 bsl BitInCell), + ets:update_element(Tab, ?REPR_NAME, {Cell,New}), + DBM. + +build({dense_bitmap_ets, _, _, Tab}) -> + [Row] = ets:lookup(Tab, ?REPR_NAME), + ets:delete(Tab), + Row. + +member(I, Row) when element(1,Row)==?REPR_NAME -> + Cell = 2 + I div ?BITS_PER_CELL, + BitInCell = I rem ?BITS_PER_CELL, + CellValue = element(Cell, Row), + CellValue band (1 bsl BitInCell) =/= 0; +member(I, {dense_bitmap_ets, _,_, Tab}) -> + Cell = 2 + I div ?BITS_PER_CELL, + BitInCell = I rem ?BITS_PER_CELL, + CellValue = ets:lookup_element(Tab, ?REPR_NAME, Cell), + CellValue band (1 bsl BitInCell) =/= 0. From 1c6e0df15cf19b8cd7eb03ab76ce5d7afd275a3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20S=C3=B8e=20S=C3=B8rensen?= Date: Tue, 25 Sep 2012 15:17:24 +0200 Subject: [PATCH 5/6] Bloom filter: Handle hibernate situation in dense_bitmap. --- src/hanoidb_bloom.erl | 5 ++++- src/hanoidb_dense_bitmap.erl | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/hanoidb_bloom.erl b/src/hanoidb_bloom.erl index 154bc9d..8d385d7 100644 --- a/src/hanoidb_bloom.erl +++ b/src/hanoidb_bloom.erl @@ -196,7 +196,10 @@ bitmask_new(LogN) -> bitmask_set(I, BM) -> case element(1,BM) of array -> bitarray_set(I, BM); - dense_bitmap_ets -> hanoidb_dense_bitmap:set(I, BM) + dense_bitmap_ets -> hanoidb_dense_bitmap:set(I, BM); + dense_bitmap -> + %% Surprise - we need to mutate a built representation: + hanoidb_dense_bitmap:set(I, hanoidb_dense_bitmap:unbuild(BM)) end. %%% Convert to external form. diff --git a/src/hanoidb_dense_bitmap.erl b/src/hanoidb_dense_bitmap.erl index 3bf25f4..300ccf8 100644 --- a/src/hanoidb_dense_bitmap.erl +++ b/src/hanoidb_dense_bitmap.erl @@ -1,6 +1,6 @@ -module(hanoidb_dense_bitmap). --export([new/1, set/2, build/1, member/2]). +-export([new/1, set/2, build/1, unbuild/1, member/2]). -define(BITS_PER_CELL, 32). -define(REPR_NAME, dense_bitmap). @@ -10,7 +10,6 @@ new(N) -> Width = 1 + (N-1) div ?BITS_PER_CELL, Value = erlang:make_tuple(Width+1, 0, [{1,?REPR_NAME}]), ets:insert(Tab, Value), - %io:format("DB| create(): ~p of width ~p\n", [Tab, Width]), {dense_bitmap_ets, N, Width, Tab}. %% Set a bit. @@ -27,6 +26,11 @@ build({dense_bitmap_ets, _, _, Tab}) -> ets:delete(Tab), Row. +unbuild(Row) when element(1,Row)==?REPR_NAME -> + Tab = ets:new(dense_bitmap, [private, set]), + ets:insert(Tab, Row), + {dense_bitmap_ets, undefined, undefined, Tab}. + member(I, Row) when element(1,Row)==?REPR_NAME -> Cell = 2 + I div ?BITS_PER_CELL, BitInCell = I rem ?BITS_PER_CELL, From ed65b5a468710a2e11b906e678e0195f1052dd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20S=C3=B8e=20S=C3=B8rensen?= Date: Tue, 25 Sep 2012 17:12:10 +0200 Subject: [PATCH 6/6] Bloom filter: replace array with faster custom representation. --- src/hanoidb_bloom.erl | 5 +++- src/hanoidb_sparse_bitmap.erl | 43 +++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 src/hanoidb_sparse_bitmap.erl diff --git a/src/hanoidb_bloom.erl b/src/hanoidb_bloom.erl index 8d385d7..c5d357c 100644 --- a/src/hanoidb_bloom.erl +++ b/src/hanoidb_bloom.erl @@ -188,7 +188,7 @@ set_bits(Mask, I1, I, [H|T], Acc) -> %%%========== Dispatch to appropriate representation: bitmask_new(LogN) -> if LogN >= 20 -> % Use sparse representation. - bitarray_new(1 bsl LogN); + hanoidb_sparse_bitmap:new(LogN); true -> % Use dense representation. hanoidb_dense_bitmap:new(1 bsl LogN) end. @@ -196,6 +196,7 @@ bitmask_new(LogN) -> bitmask_set(I, BM) -> case element(1,BM) of array -> bitarray_set(I, BM); + sparse_bitmap -> hanoidb_sparse_bitmap:set(I, BM); dense_bitmap_ets -> hanoidb_dense_bitmap:set(I, BM); dense_bitmap -> %% Surprise - we need to mutate a built representation: @@ -206,12 +207,14 @@ bitmask_set(I, BM) -> bitmask_build(BM) -> case element(1,BM) of array -> BM; + sparse_bitmap -> BM; dense_bitmap_ets -> hanoidb_dense_bitmap:build(BM) end. bitmask_get(I, BM) -> case element(1,BM) of array -> bitarray_get(I, BM); + sparse_bitmap -> hanoidb_sparse_bitmap:member(I, BM); dense_bitmap_ets -> hanoidb_dense_bitmap:member(I, BM); dense_bitmap -> hanoidb_dense_bitmap:member(I, BM) end. diff --git a/src/hanoidb_sparse_bitmap.erl b/src/hanoidb_sparse_bitmap.erl new file mode 100644 index 0000000..0b662dd --- /dev/null +++ b/src/hanoidb_sparse_bitmap.erl @@ -0,0 +1,43 @@ +-module(hanoidb_sparse_bitmap). +-export([new/1, set/2, member/2]). + +-define(REPR_NAME, sparse_bitmap). + +new(Bits) when is_integer(Bits), Bits>0 -> + {?REPR_NAME, Bits, []}. + +set(N, {?REPR_NAME, Bits, Tree}) -> + {?REPR_NAME, Bits, set_to_tree(N, 1 bsl (Bits-1), Tree)}. + +set_to_tree(N, HighestBit, Mask) when HighestBit<32 -> + Nbit = 1 bsl N, + case Mask of + []-> Nbit; + _ -> Nbit bor Mask + end; +set_to_tree(N, _HighestBit, []) -> N; +set_to_tree(N, HighestBit, [TLo|THi]) -> + pushdown(N, HighestBit, TLo, THi); +set_to_tree(N, _HighestBit, N) -> N; +set_to_tree(N, HighestBit, M) when is_integer(M) -> + set_to_tree(N, HighestBit, pushdown(M, HighestBit, [], [])). + +pushdown(N, HighestBit, TLo, THi) -> + NHigh = N band HighestBit, + if NHigh =:= 0 -> [set_to_tree(N, HighestBit bsr 1, TLo) | THi]; + true -> [TLo | set_to_tree(N bxor NHigh, HighestBit bsr 1, THi)] + end. + +member(N, {?REPR_NAME, Bits, Tree}) -> + member_in_tree(N, 1 bsl (Bits-1), Tree). + +member_in_tree(_N, _HighestBit, []) -> false; +member_in_tree(N, HighestBit, Mask) when HighestBit<32 -> + Nbit = 1 bsl N, + Nbit band Mask > 0; +member_in_tree(N, _HighestBit, M) when is_integer(M) -> N =:= M; +member_in_tree(N, HighestBit, [TLo|THi]) -> + NHigh = N band HighestBit, + if NHigh =:= 0 -> member_in_tree(N, HighestBit bsr 1, TLo); + true -> member_in_tree(N bxor NHigh, HighestBit bsr 1, THi) + end.