<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#hash_unique.unique-2-10-times-faster-than-numpy.unique-and-pandas.unique" data-toc-modified-id="hash_unique.unique-2-10-times-faster-than-numpy.unique-and-pandas.unique-1"><span class="toc-item-num">1&nbsp;&nbsp;</span><code>hash_unique.unique</code> 2-10 times faster than <code>numpy.unique</code> and <code>pandas.unique</code></a></span></li><li><span><a href="#implementation" data-toc-modified-id="implementation-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>implementation</a></span></li><li><span><a href="#Simple-test" data-toc-modified-id="Simple-test-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Simple test</a></span></li><li><span><a href="#Check-hash-quality" data-toc-modified-id="Check-hash-quality-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Check hash quality</a></span></li><li><span><a href="#performance-for-tiny-array" data-toc-modified-id="performance-for-tiny-array-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>performance for tiny array</a></span></li><li><span><a href="#performance-for-medium-array" data-toc-modified-id="performance-for-medium-array-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>performance for medium array</a></span></li><li><span><a href="#performance-for-large-array" data-toc-modified-id="performance-for-large-array-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>performance for large array</a></span></li></ul></div>

# `hash_unique.unique` 2-10 times faster than `numpy.unique` and `pandas.unique`

for random integer arrays (see the benchmarks below).

# implementation

In [1]:
%%file hash_unique.py

# don't remove the follows for your use
# Author: https://lhprojects.github.io/blog/
# don't remove above lines

import numba
import numpy as np


def unique(ar, return_counts = False, return_hit_accuracy = False):
    '''
    Find the distinct values of ``ar`` using the hash table in unique_impl.

    ar: integer array
    return_counts: also return how many times each distinct value occurs
    return_hit_accuracy: also return the probe hit accuracy reported by
        unique_impl
    return:
        uniques                                  (no flag set)
        uniques, uniques_counts                  (return_counts)
        uniques, hit_accuracy                    (return_hit_accuracy)
        uniques, uniques_counts, hit_accuracy    (both flags)
    '''
    values, counts, accuracy = unique_impl(ar)

    # Without any optional output the bare array is returned, not a 1-tuple.
    if not (return_counts or return_hit_accuracy):
        return values

    result = (values,)
    if return_counts:
        result += (counts,)
    if return_hit_accuracy:
        result += (accuracy,)
    return result

@numba.njit
def length(l):
    '''
    Hash-table size for an input of ``l`` elements (same rule as the
    table sizing in unique_impl).

    l: number of elements to be stored
    return:
        2 * p, where p is the smallest power of two with p >= l.
        For l >= 1 this gives 2*l <= result < 4*l, i.e. a load factor of at
        most 0.5. For l <= 1 (including 0) the result is 2.
    '''
    # Integer doubling instead of ceil(log2(l)): identical for l >= 1, and
    # well defined for l == 0 where log2 would yield -inf.
    size = 1
    while size < l:
        size <<= 1
    return 2 * size

@numba.njit
def FNV_1(v):
    '''
    FNV-1 style hash (multiply by the prime, then XOR in a byte) of ``v``.
    https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function

    v: value to hash; it is converted with np.uint64(v) first
    return:
        hash accumulator after mixing in the four low-order bytes of v,
        lowest byte first. Bits 32 and above of v do not affect the result.
    '''
    low_byte = np.uint64(255)
    prime = np.uint64(1099511628211)
    offset_basis = np.uint64(14695981039346656037)

    word = np.uint64(v)

    # One multiply/XOR round per byte, bytes 0..3 of the 64-bit word.
    acc = offset_basis * prime
    acc = acc ^ (word & low_byte)
    acc = acc * prime
    acc = acc ^ ((word >> 8) & low_byte)
    acc = acc * prime
    acc = acc ^ ((word >> 16) & low_byte)
    acc = acc * prime
    acc = acc ^ ((word >> 24) & low_byte)
    return acc
    
    
@numba.njit
def unique_impl(ar):
    '''
    Count the distinct values of ``ar`` with an open-addressing hash table
    (linear probing, slots chosen by FNV_1).

    ar: integer array, iterated element by element
    return:
        uniques_: the distinct values, in hash-table slot order (not sorted),
            with the dtype of ar
        uniques_cnt_: number of occurrences of each distinct value (np.int_)
        hit_accuracy: len(ar)/(len(ar) + miss_hits), where miss_hits counts
            probes that landed on a slot holding a different value;
            1.0 for an empty input
    '''

    n = len(ar)
    # Power-of-two table with at least 2*n slots, so the probe loop below
    # always reaches a free slot. max(n, 1) keeps the size well defined for
    # an empty input.
    l = length(max(n, 1))

    mask = l - 1
    uniques = np.empty(l, dtype=ar.dtype)
    # A count of 0 marks a free slot, so no separate "occupied" flag is needed.
    uniques_cnt = np.zeros(l, dtype=np.int_)

    total = 0
    miss_hits = 0

    for v in ar:
        h = FNV_1(v)

        index = (h & mask)

        # open address hash
        # great cache performance
        while True:
            if uniques_cnt[index] == 0:
                # free slot: first occurrence of v
                uniques_cnt[index] += 1
                uniques[index] = v
                total += 1
                break
            elif uniques[index] == v:
                # v already stored here
                uniques_cnt[index] += 1
                break
            else:
                # slot taken by another value: probe the next one, wrapping
                miss_hits += 1
                index += 1
                index = index & mask


    # flush the results in a concrete array
    uniques_ = np.empty(total, dtype=ar.dtype)
    uniques_cnt_ = np.empty(total, dtype=np.int_)
    t = 0
    for i in range(l):
        if uniques_cnt[i] > 0:
            uniques_[t] = uniques[i]
            uniques_cnt_[t] = uniques_cnt[i]
            t += 1

    # Guard the ratio: an empty input would otherwise divide 0 by 0.
    if n > 0:
        hit_accuracy = n/(n + miss_hits)
    else:
        hit_accuracy = 1.0
    return uniques_, uniques_cnt_, hit_accuracy


Overwriting hash_unique.py


# Simple test

In [2]:
import hash_unique
import pandas
import numpy as np
import importlib

# Reload so the module file written by the %%file cell is picked up even if
# hash_unique was already imported in this kernel. importlib.reload replaces
# the deprecated imp.reload (the imp module was removed in Python 3.12).
importlib.reload(hash_unique)

x = np.random.randint(10, size=100)

# Reference result from numpy (sorted by value).
u, c = np.unique(x, return_counts=True)

# hash_unique returns values in hash-table order, so sort before comparing.
hash_u, hash_c = hash_unique.unique(x, return_counts=True)
index = np.argsort(hash_u)
hash_u = hash_u[index]
hash_c = hash_c[index]

# Fail loudly on any mismatch instead of relying on eyeballing the printout.
np.testing.assert_array_equal(hash_u, u)
np.testing.assert_array_equal(hash_c, c)

for u_, c_ in zip(u, c):
    print((u_, c_), end=" ")
print()

for u_, c_ in zip(hash_u, hash_c):
    print((u_, c_), end=" ")
print()

(0, 10) (1, 10) (2, 12) (3, 8) (4, 8) (5, 7) (6, 7) (7, 11) (8, 11) (9, 16) 
(0, 10) (1, 10) (2, 12) (3, 8) (4, 8) (5, 7) (6, 7) (7, 11) (8, 11) (9, 16) 


# Check hash quality

In [3]:
# 10000 random draws below 5000; run the hash-based unique and ask for the
# probe hit accuracy.
x = np.random.randint(5000, size=10000)
f, hit_accuracy = hash_unique.unique(x, return_hit_accuracy=True)

# Recompute the initial table slot of every element: hash masked with
# (table size - 1), using the module's own sizing helper.
length = hash_unique.length(len(x))
y = np.array([hash_unique.FNV_1(v) for v in x], dtype=np.uint64) & np.uint64(length - 1)

print("%.3f"%hit_accuracy)
print(len(set(x)))  # number of distinct input values
print(len(set(y)))  # number of distinct initial slots those values map to

0.947
4345
4063


# performance for tiny array

In [4]:
# Tiny input: 10 random draws below 5.
# pandas.unique is called without any counts option, unlike the other two.
x = np.random.randint(5, size=10)
%timeit np.unique(x, return_counts=True)
%timeit hash_unique.unique(x, return_counts=True)
%timeit pandas.unique(x)

13 µs ± 201 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1.39 µs ± 3.92 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
25 µs ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# performance for medium array

In [5]:
# Medium input: 10000 random draws below 5000 (many repeated values).
x = np.random.randint(5000, size=10000)
%timeit np.unique(x, return_counts=True)
%timeit hash_unique.unique(x, return_counts=True)
%timeit pandas.unique(x)

464 µs ± 47.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
87.2 µs ± 378 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
127 µs ± 583 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [8]:
# Medium input with no repeats, in descending order (10000 down to 1).
# NOTE: hash_unique.unique is timed without return_counts here while np.unique
# is asked for counts; hash_unique.unique always runs unique_impl, which
# computes the counts either way.
x = np.arange(10000,0,-1)
%timeit np.unique(x, return_counts=True)
%timeit hash_unique.unique(x)
%timeit pandas.unique(x)

108 µs ± 474 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
74.1 µs ± 268 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
124 µs ± 428 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [9]:
# Medium input with no repeats, in ascending order (0 up to 9999).
# NOTE: hash_unique.unique is timed without return_counts here while np.unique
# is asked for counts; hash_unique.unique always runs unique_impl, which
# computes the counts either way.
x = np.arange(0,10000)
%timeit np.unique(x, return_counts=True)
%timeit hash_unique.unique(x)
%timeit pandas.unique(x)

81.7 µs ± 363 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
74 µs ± 205 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
123 µs ± 465 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


# performance for large array

In [7]:
# Large input: 10,000,000 random draws below 5,000,000.
x = np.random.randint(10000000//2, size=10000000)
%timeit np.unique(x, return_counts=True)
%timeit hash_unique.unique(x, return_counts=True)
%timeit pandas.unique(x)

717 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
407 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
583 ms ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
