In [1]:
import itertools, functools
import numpy as np

### Pairwise independent hash functions
Reference: 
- https://cseweb.ucsd.edu/~slovett/teaching/SP15-CSE190/pairwise_hash_functions.pdf
- https://people.csail.mit.edu/ronitt/COURSE/S12/handouts/lec5.pdf

A family $H=\{h:U \to R\}$ is said to be pairwise independent, if for any two distinct elements $x_1 \neq x_2 \in U$ and two (possibly equal) values $y_1, y_2 \in R$,
$$ Pr_{h\in H} [h(x_1)=y_1 \text{ and } h(x_2)=y_2] = \frac{1}{|R|^2} $$

### Example
$U=\{0,1\}^k, R=\{0,1\}$

In [2]:
k = 3  # number of bits in each input vector
# U holds every k-bit vector as one row (shape 2**k x k), in lexicographic order.
# The builtin `int` is used as the dtype: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so `astype(np.int)` fails on current NumPy.
U = np.array(list(itertools.product([0, 1], repeat=k)), dtype=int)
R = [0,1]  # the two possible hash values

Family $H$:
$$ H = \{h_{a,b}(x)=\langle a,x\rangle+b \pmod 2 : a\in\{0,1\}^k,\ b\in\{0,1\}\} $$

In [3]:
# Materialise the whole family as a list of 2**(k+1) callables, ordered by the
# coefficient vector a (lexicographically) and then by the offset b.
H = []
for a in itertools.product([0, 1], repeat=k):
  # Convert a to an integer array once per coefficient vector, outside the
  # b-loop. The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
  a = np.asarray(a, dtype=int)
  for b in [0,1]:
    # The immediately-called outer lambda binds the current a and b as y and z;
    # a bare `lambda x: ...` would read a and b only when it is finally called.
    h_ab = (lambda y,z: (lambda x: (np.dot(y,x)+z)%2))(a,b)
    H.append(h_ab)

A lambda created inside a loop is a lexical closure: it refers to `a` and `b` by reference, not by the values they had when the lambda was created, so every lambda would end up using the values from the last iteration. To capture the current values, each lambda is wrapped in an outer lambda that is called immediately with `(a, b)`.

In [4]:
# Two distinct inputs: pick two different row indices of U without replacement.
x1, x2 = U[np.random.choice(len(U), size=2, replace=False)]
# Two target outputs, drawn independently from R (they may be equal).
y1, y2 = np.random.choice(R, size=2)

In [5]:
# Monte-Carlo estimate of Pr[h(x1)=y1 and h(x2)=y2] over 1000 functions drawn
# (with replacement) from H.
sampled_H = np.random.choice(H, size=1000)
joint_hits = [h(x1)==y1 and h(x2)==y2 for h in sampled_H]
np.mean(joint_hits)

0.269

which is close to $\frac{1}{|R|^2} = \frac{1}{4}$

### More about universal hashing
Reference:
- https://en.wikipedia.org/wiki/Universal_hashing
- https://en.wikipedia.org/wiki/K-independent_hashing

(minhash)
- https://github.com/4d55397500/learning-scraps/blob/master/minhash/minhash.py
- https://stackoverflow.com/questions/2255604/hash-functions-family-generator-in-python
- https://stackoverflow.com/questions/19701052/how-many-hash-functions-are-required-in-a-minhash-algorithm/25104050#25104050


Properties of a family of hashing functions:

universality < uniform difference (distance) property < pairwise independence (strong universality), where each property is implied by the one to its right

In [6]:
import random
import numpy as np
import sys
from collections import Counter

### Hashing integers (machine words)

In [7]:
# Carter and Wegman's method: h_{a,b}(x) = ((a*x + b) mod PRIME) mod m

PRIME = 131071 # prime modulus (2**17 - 1); must exceed every key and m
m = 10 # number of buckets: hash values lie in {0, ..., m-1}
rang = 100 # size of the key universe: keys are drawn from {0, ..., rang-1}
assert PRIME > max(m,rang), "The PRIME is too small"

# generate the universal family for hashing integers
# The map x -> (a*x + b) mod PRIME has the same form as one step of a
# linear congruential generator (LCG).
def hash_generator_int():
  """Yield an endless stream of random hash functions for integers.

  Each yielded function maps an integer x to ((a*x + b) % PRIME) % m, with
  a drawn uniformly from {1, ..., PRIME-1} and b from {0, ..., PRIME-1}.
  """
  while True:
    # random.randint includes both endpoints, so the upper bound is PRIME-1:
    # a == PRIME would be 0 mod PRIME and collapse the hash to a constant.
    a,b = random.randint(1, PRIME - 1), random.randint(0, PRIME - 1)
    # The immediately-called outer lambda freezes this iteration's a and b.
    h_ab = (lambda a,b: lambda x: (a*x+b)%PRIME%m)(a,b)
    yield h_ab
family = hash_generator_int()

In [8]:
# Draw two distinct keys from the universe {0, ..., rang-1}.
x, y = random.sample(range(rang), 2)
# A single pre-formatted string inside print(...) prints the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("x %s y %s" % (x, y))

x 3 y 85


In [9]:
# universality: the fraction of 10000 randomly drawn hash functions under
# which x and y collide is compared against the bound 1/m
hash_sample = [next(family) for _ in range(10000)]
collision_rate = np.mean([h(x)==h(y) for h in hash_sample])
# print(...) with one formatted string works on both Python 2 and Python 3
print("<= 1/%s = %s" % (m, collision_rate))

<= 1/10 = 0.1058


In [10]:
# uniform difference property: tally (h(x) - h(y)) mod m over 10000 randomly
# drawn hash functions; each residue should appear about equally often
hash_sample = [next(family) for _ in range(10000)]
Counter((h(x) - h(y)) % m for h in hash_sample)

Counter({0: 1010,
         1: 1040,
         2: 1012,
         3: 996,
         4: 1009,
         5: 1004,
         6: 979,
         7: 983,
         8: 973,
         9: 994})

In [11]:
# pairwise independent: estimate Pr[h(x)=z1 and h(y)=z2] over 10000 randomly
# drawn hash functions and compare it with 1/m^2
# (random.sample draws without replacement, so z1 and z2 are always different)
z1, z2 = random.sample(range(m), 2)
hash_sample = [next(family) for _ in range(10000)]
joint_rate = np.mean([h(x)==z1 and h(y)==z2 for h in hash_sample])
# print(...) with one formatted string works on both Python 2 and Python 3
print("1/m^2 = 1/%s = %s" % (m**2, joint_rate))

1/m^2 = 1/100 = 0.0097


### Hashing vectors (fixed-length sequence of machine words)

In [12]:
k = 7 # length of sequence

def hash_generator_int():
  """Yield an endless stream of random integer hash functions.

  Each yielded function maps x to ((a*x + b) % PRIME) % m, with a drawn from
  {1, ..., PRIME-1} and b from {0, ..., PRIME-1}.
  """
  while True:
    # random.randint includes both endpoints; a == PRIME would be 0 mod PRIME.
    a,b = random.randint(1, PRIME - 1), random.randint(0, PRIME - 1)
    h_ab = (lambda a,b: lambda x: (a*x+b)%PRIME%m)(a,b)
    yield h_ab
family_int = hash_generator_int()

def hash_generator_vector():
  """Yield an endless stream of random hash functions for length-k vectors.

  Each yielded function computes h(x) = (h_1(x_1) + ... + h_k(x_k)) % m, where
  h_1..h_k are drawn from family_int once, when h is created. Inputs longer
  than k are truncated by zip; shorter inputs use only the first len(x) hashes.
  """
  while True:
    # Draw the coordinate hashes up front so that h is a fixed function:
    # calling next(family_int) inside h would hash every call (and therefore
    # x and y) with a different random function.
    coord_hashes = [next(family_int) for _ in range(k)]
    # Bind through an immediately-called outer lambda; the generator reassigns
    # coord_hashes on its next iteration, which a bare closure would observe.
    h = (lambda hs: lambda x: sum(hi(xi) for hi, xi in zip(hs, x))%m)(coord_hashes)
    yield h
family = hash_generator_vector()

In [13]:
# Two random vectors of length k, each made of k distinct keys from
# {0, ..., rang-1} (random.sample draws without replacement).
x = random.sample(range(rang), k)
y = random.sample(range(rang), k)
# A single pre-formatted string inside print(...) prints the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("x %s y %s" % (x, y))

x [56, 29, 80, 28, 13, 58, 17] y [68, 54, 28, 13, 52, 26, 25]


In [14]:
# universality: the fraction of 10000 randomly drawn hash functions under
# which the vectors x and y collide is compared against the bound 1/m
hash_sample = [next(family) for _ in range(10000)]
collision_rate = np.mean([h(x)==h(y) for h in hash_sample])
# print(...) with one formatted string works on both Python 2 and Python 3
print("<= 1/%s = %s" % (m, collision_rate))

<= 1/10 = 0.0997


In [15]:
# uniform difference property: tally (h(x) - h(y)) mod m over 10000 randomly
# drawn vector hash functions; each residue should appear about equally often
hash_sample = [next(family) for _ in range(10000)]
Counter((h(x) - h(y)) % m for h in hash_sample)

Counter({0: 1015,
         1: 970,
         2: 1043,
         3: 1035,
         4: 1016,
         5: 984,
         6: 992,
         7: 974,
         8: 976,
         9: 995})

In [16]:
# pairwise independent: estimate Pr[h(x)=z1 and h(y)=z2] over 10000 randomly
# drawn vector hash functions and compare it with 1/m^2
# (random.sample draws without replacement, so z1 and z2 are always different)
z1, z2 = random.sample(range(m), 2)
hash_sample = [next(family) for _ in range(10000)]
joint_rate = np.mean([h(x)==z1 and h(y)==z2 for h in hash_sample])
# print(...) with one formatted string works on both Python 2 and Python 3
print("1/m^2 = 1/%s = %s" % (m**2, joint_rate))

1/m^2 = 1/100 = 0.0101


### Hashing strings (variable-sized sequence of machine words)

In [17]:
LARGER_PRIME = 2147483647 # 2**31 - 1; modulus of the outer integer hash
def hash_generator_int():
  """Yield an endless stream of random integer hash functions.

  Each yielded function maps x to ((a*x + b) % LARGER_PRIME) % m, with a drawn
  from {1, ..., LARGER_PRIME-1} and b from {0, ..., LARGER_PRIME-1}.
  """
  while True:
    # random.randint includes both endpoints; a == LARGER_PRIME would be
    # 0 mod LARGER_PRIME and collapse the hash to a constant.
    a,b = random.randint(1, LARGER_PRIME - 1), random.randint(0, LARGER_PRIME - 1)
    h_ab = (lambda a,b: lambda x: (a*x+b)%LARGER_PRIME%m)(a,b)
    yield h_ab
family_int = hash_generator_int()

def hash_generator_string():
  """Yield an endless stream of random hash functions for variable-length sequences.

  Each yielded function first evaluates the polynomial
  sum_i x[i] * a**(len(x)-i) modulo PRIME at a random point a, then feeds that
  value to an integer hash drawn from family_int. Both a and the integer hash
  are chosen once, when h is created.

  NOTE: the polynomial has no constant term and weights x[0] with the highest
  power, so sequences that differ only by leading zeros (e.g. [0, 5] and [5])
  always receive the same value.
  """
  while True:
    # random.randint includes both endpoints; PRIME itself is 0 mod PRIME.
    a = random.randint(0, PRIME - 1)
    # Draw the outer hash now so h is a fixed function; drawing it inside h
    # would hash every call (and therefore x and y) with a different function.
    outer = next(family_int)
    # Bind a and outer through an immediately-called outer lambda: the
    # generator reassigns both on its next iteration, which a bare closure
    # would observe. pow(a, e, PRIME) keeps the powers reduced modulo PRIME.
    h = (lambda a, outer: lambda x: outer(sum([xi*pow(a, len(x)-i, PRIME) for i,xi in enumerate(x)])%PRIME))(a, outer)
    yield h
family = hash_generator_string()

In [18]:
k1, k2 = 10, 5  # lengths of the two sequences
# Each sequence consists of distinct keys from {0, ..., rang-1}
# (random.sample draws without replacement).
x = random.sample(range(rang), k1)
y = random.sample(range(rang), k2)
# A single pre-formatted string inside print(...) prints the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print("x %s y %s" % (x, y))

x [20, 79, 63, 86, 47, 31, 93, 83, 90, 3] y [97, 28, 42, 56, 8]


In [19]:
# universality: the fraction of 10000 randomly drawn hash functions under
# which the sequences x and y collide is compared against the bound 1/m
hash_sample = [next(family) for _ in range(10000)]
collision_rate = np.mean([h(x)==h(y) for h in hash_sample])
# print(...) with one formatted string works on both Python 2 and Python 3
print("<= 1/%s = %s" % (m, collision_rate))

<= 1/10 = 0.1027


In [20]:
# uniform difference property: tally (h(x) - h(y)) mod m over 10000 randomly
# drawn sequence hash functions; each residue should appear about equally often
hash_sample = [next(family) for _ in range(10000)]
Counter((h(x) - h(y)) % m for h in hash_sample)

Counter({0L: 937,
         1L: 1052,
         2L: 975,
         3L: 998,
         4L: 1014,
         5L: 1013,
         6L: 989,
         7L: 1046,
         8L: 987,
         9L: 989})

In [21]:
# pairwise independent: estimate Pr[h(x)=z1 and h(y)=z2] over 10000 randomly
# drawn sequence hash functions and compare it with 1/m^2
# (random.sample draws without replacement, so z1 and z2 are always different)
z1, z2 = random.sample(range(m), 2)
hash_sample = [next(family) for _ in range(10000)]
joint_rate = np.mean([h(x)==z1 and h(y)==z2 for h in hash_sample])
# print(...) with one formatted string works on both Python 2 and Python 3
print("1/m^2 = 1/%s = %s" % (m**2, joint_rate))

1/m^2 = 1/100 = 0.0098


### Padding
Reference:
- https://crypto.stackexchange.com/questions/2753/in-the-sha-hash-algorithm-why-is-the-message-always-padded
- Strongly universal string hashing is fast

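Padding can be used to convert a variable-length string into a fixed-length one, after which the hashing scheme for vectors applies. However, padding with zeros alone does not work: two strings that differ only by trailing zeros (e.g. `[1]` and `[1, 0]`) become identical after padding, so they always collide and universality breaks. To avoid this, first append an extra non-zero marker (e.g. a 1) to the end of every string, and only then pad with zeros.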