# ---- src/similarities.jl (added after `jaccard(A::Set, B::Set)`) ----

@doc raw"""
    function jaccard(x::BitArray{1}, y::BitArray{1})

Computes the Jaccard similarity between a pair of binary vectors. Here, Jaccard similarity is defined as

``J(x, y) = \frac{\sum_{i} \min{(x_i,y_i)}}{\sum_{i} \max{(x_i,y_i)}}``

When neither `x` nor `y` contains a `true` entry (including when both are empty), the
similarity is defined to be zero.

# Arguments
- `x::BitArray{1}`, `y::BitArray{1}`: two binary vectors of equal length, in the form of `BitArray`s.

# Returns
- `Float64`: the number of positions where both vectors are `true`, divided by the number
  of positions where at least one of them is `true`.

# Throws
- `DimensionMismatch`: if `x` and `y` do not have the same length.

# Examples
```jldoctest; setup = :(using LSHFunctions)
julia> x = BitArray([true, false, true, true, false]);

julia> y = BitArray([false, false, true, true, true]);

julia> jaccard(x,y)
0.5
```
"""
function jaccard(x::BitArray{1}, y::BitArray{1}) :: Float64
    # Broadcasting would silently stretch a length-1 vector to match the other
    # argument and yield a meaningless ratio, so reject mismatched lengths up front.
    if length(x) != length(y)
        throw(DimensionMismatch(
            "x and y must have the same length (got $(length(x)) and $(length(y)))"
        ))
    end

    n_intersection = count(x .& y)
    # |x ∪ y| = |x| + |y| - |x ∩ y|, which avoids materializing `x .| y`.
    n_union = count(x) + count(y) - n_intersection

    # To avoid corner cases where x and y are both full of zeros
    return n_union == 0 ? 0.0 : n_intersection / n_union
end

# ---- test/test_similarities.jl (added inside the Jaccard similarity test set) ----

@testset "Compute Jaccard similarity between binary vectors" begin
    x = BitArray([true, false, true, true, false])
    y = BitArray([false, false, true, true, true])

    @test jaccard(x, y) == jaccard(y, x) == 2 / 4

    # When x and y are both full of false values, we define the
    # Jaccard similarity between them to be zero.
    x = falses(5)
    y = falses(5)
    @test jaccard(x, y) == 0

    # Vectors of different lengths must be rejected rather than broadcast.
    @test_throws DimensionMismatch jaccard(trues(1), trues(3))
end