In [2]:
from pprint import pprint

# Five sample records: a name (the lookup key) plus three value columns.
keys = ['guido', 'sarah', 'barry', 'rachel', 'tim']
values1 = ['blue', 'orange', 'green', 'yellow', 'red']
values2 = ['austin', 'dallas', 'tuscon', 'reno', 'portland']
values3 = ['apple', 'banana', 'orange', 'pear', 'peach']

# Non-negative hash of every key.
# NOTE: str hashes are salted per interpreter process unless PYTHONHASHSEED
# is fixed, so the concrete numbers (and the slot layouts derived from them)
# differ from one kernel session to the next.
hashes = [abs(hash(name)) for name in keys]

# (hash, key, value) triples for the single-column demos ...
entries = [(h, name, color) for h, name, color in zip(hashes, keys, values1)]
# ... and (hash, key, value1, value2, value3) rows for the shared-key demo.
comb_entries = [
    (h, name, color, city, fruit)
    for h, name, color, city, fruit in zip(hashes, keys, values1, values2, values3)
]

In [5]:
pprint(comb_entries)

[(6263957734076381623, 'guido', 'blue', 'austin', 'apple'),
 (4670100726627187108, 'sarah', 'orange', 'dallas', 'banana'),
 (4505871006362405169, 'barry', 'green', 'tuscon', 'orange'),
 (8301609630502316535, 'rachel', 'yellow', 'reno', 'pear'),
 (6421035295685486569, 'tim', 'red', 'portland', 'peach')]


In [6]:
# How would a database store these entries? Welcome to the 1960s.
# 4 columns: primary_key, color, city, fruit
# They stored it like that because memory was precious.


def database_linear_search():
    """Pretty-print the records as flat rows: the key followed by its three values."""
    rows = [
        (name, color, city, fruit)
        for name, color, city, fruit in zip(keys, values1, values2, values3)
    ]
    pprint(rows)

In [7]:
database_linear_search()

[('guido', 'blue', 'austin', 'apple'),
 ('sarah', 'orange', 'dallas', 'banana'),
 ('barry', 'green', 'tuscon', 'orange'),
 ('rachel', 'yellow', 'reno', 'pear'),
 ('tim', 'red', 'portland', 'peach')]


In [None]:
# Things were stored as flat files

In [8]:
# How LISP Would Do It 
# Store lists of pairs


def association_lists():
    """Pretty-print one association list of (key, value) pairs per value column."""
    columns = (values1, values2, values3)
    pprint([list(zip(keys, column)) for column in columns])
association_lists()

[[('guido', 'blue'),
  ('sarah', 'orange'),
  ('barry', 'green'),
  ('rachel', 'yellow'),
  ('tim', 'red')],
 [('guido', 'austin'),
  ('sarah', 'dallas'),
  ('barry', 'tuscon'),
  ('rachel', 'reno'),
  ('tim', 'portland')],
 [('guido', 'apple'),
  ('sarah', 'banana'),
  ('barry', 'orange'),
  ('rachel', 'pear'),
  ('tim', 'peach')]]


In [None]:
# The association lists occupy more memory: the keys are stored
# 3 times instead of once. But we can do better (worse)!
# Lookup is still a linear search - let us speed it up by using separate chaining.

In [9]:
def separate_chaining(n):
    """Distribute ``entries`` over ``n`` buckets by hash and pretty-print them.

    Every ``(hash, key, value)`` entry is appended, whole, to bucket number
    ``hash % n``; entries that share a bucket stay in insertion order.
    """
    chains = []
    for _ in range(n):
        chains.append([])
    for record in entries:
        hashvalue, _key, _value = record
        chains[hashvalue % n].append(record)
    pprint(chains)
    
separate_chaining(2)  # That gives about 25% improved lookup

[[(4670100726627187108, 'sarah', 'orange')],
 [(6263957734076381623, 'guido', 'blue'),
  (4505871006362405169, 'barry', 'green'),
  (8301609630502316535, 'rachel', 'yellow'),
  (6421035295685486569, 'tim', 'red')]]


In [19]:
separate_chaining(8)

[[],
 [(4505871006362405169, 'barry', 'green'), (6421035295685486569, 'tim', 'red')],
 [],
 [],
 [(4670100726627187108, 'sarah', 'orange')],
 [],
 [],
 [(6263957734076381623, 'guido', 'blue'),
  (8301609630502316535, 'rachel', 'yellow')]]


In [13]:
# We can throw even more memory at it
separate_chaining(7)

[[(4505871006362405169, 'barry', 'green')],
 [(8301609630502316535, 'rachel', 'yellow')],
 [(6421035295685486569, 'tim', 'red')],
 [],
 [(6263957734076381623, 'guido', 'blue')],
 [(4670100726627187108, 'sarah', 'orange')],
 []]


In [None]:
# Everyone gets found in just one probe!
# What about memory? There are several lists, each of which needs to be
# overallocated with room to grow, and some of them are empty.
# To avoid overallocating memory for these lists we can create one
# big table: welcome to Open Addressing.

In [14]:
def open_addressing_linear(n):
    """Place ``entries`` into one flat table of ``n`` slots using linear probing.

    Each ``(hash, key, value)`` entry starts at slot ``hash % n``; while that
    slot is taken, the next slot (wrapping around at the end) is tried.  The
    finished table of ``(key, value)`` pairs, with ``None`` marking unused
    slots, is pretty-printed.

    Args:
        n: Number of slots in the table; must be at least ``len(entries)``.

    Raises:
        ValueError: If ``n`` is too small to hold every entry.  Without this
            check the probe loop would never find a free slot and spin
            forever once the table is full.
    """
    if n < len(entries):
        raise ValueError(
            f'table of size {n} cannot hold {len(entries)} entries')
    table = [None] * n
    for h, key, value in entries:
        i = h % n
        while table[i] is not None:
            # Collision: step to the neighbouring slot, wrapping at the end.
            i = (i + 1) % n
        table[i] = (key, value)
    pprint(table)

In [18]:
open_addressing_linear(8)

[('rachel', 'yellow'),
 ('barry', 'green'),
 ('tim', 'red'),
 None,
 ('sarah', 'orange'),
 None,
 None,
 ('guido', 'blue')]


In [None]:
# Guido and Rachel wanted the same slot, so Rachel had to wrap around.
# Once in a while this leads to a catastrophic linear pile-up (circa 1972).

In [27]:
# We can do better than falling back on a linear probe: let's use a
# linear congruential random number generator, i = 5 * i + 1


def open_addressing_multihash(n):
    """Place ``entries`` into a flat table of ``n`` slots using perturbed probing.

    Each ``(hash, key, value)`` entry starts at slot ``hash % n``.  On a
    collision a message naming both keys is printed and the next slot is
    computed as ``(5 * i + perturb + 1) % n``, where ``perturb`` starts as
    the full hash and is shifted right by 5 bits after every probe.  The
    finished table of ``(key, value)`` pairs (``None`` = unused slot) is
    pretty-printed.

    Args:
        n: Number of slots in the table; must be at least ``len(entries)``.

    Raises:
        ValueError: If ``n`` is too small to hold every entry.  Without this
            check the probe loop could never find a free slot and would
            spin forever once the table is full.

    Note:
        Once ``perturb`` has been shifted down to 0 the probe sequence is
        the plain recurrence ``i = (5 * i + 1) % n``.  That recurrence is
        only guaranteed to reach every slot when ``n`` is a power of two
        (for ``n = 7``, for example, slot 5 maps back to itself), so other
        table sizes may still fail to terminate.  This also assumes the
        stored hashes are non-negative, as they are when built with ``abs``.
    """
    if n < len(entries):
        raise ValueError(
            f'table of size {n} cannot hold {len(entries)} entries')
    table = [None] * n
    for h, key, value in entries:
        perturb = h
        i = h % n
        while table[i] is not None:
            print(f'{key!r} collided with {table[i][0]!r}')
            # Mix the not-yet-used high bits of the hash into the next probe.
            i = (5 * i + perturb + 1) % n
            perturb >>= 5
        table[i] = (key, value)
    pprint(table)

In [28]:
open_addressing_multihash(8)

'rachel' collided with 'guido'
'tim' collided with 'barry'
'tim' collided with 'guido'
'tim' collided with 'rachel'
'tim' collided with 'barry'
[None,
 ('barry', 'green'),
 None,
 ('rachel', 'yellow'),
 ('sarah', 'orange'),
 None,
 ('tim', 'red'),
 ('guido', 'blue')]


In [None]:
"""
Performance slowed down but we avoid the catastrophic pileup

If we use more space we avoid collisions which speeds up performance
but use more memory. This implementation was in Python for quite a
while with some size changes.
"""

In [29]:
def compact_and_ordered(n):
    """Build a sparse index of ``n`` slots that points into the dense ``entries`` list.

    ``entries`` itself stays in insertion order; the table only stores, for
    each occupied slot, the position of the corresponding entry.  Slots are
    chosen by starting at ``hash % n`` and, on collision, probing with
    ``(5 * i + perturb + 1) % n`` while shifting ``perturb`` right by 5 bits.
    Both ``entries`` and the index table are pretty-printed.

    Args:
        n: Number of slots in the index table; must be at least
            ``len(entries)``.

    Raises:
        ValueError: If ``n`` is too small to index every entry.  Without
            this check the probe loop could never find a free slot and
            would spin forever once the table is full.

    Note:
        After ``perturb`` reaches 0 the probe sequence is
        ``i = (5 * i + 1) % n``, which is only guaranteed to reach every
        slot when ``n`` is a power of two; other sizes may not terminate.
    """
    if n < len(entries):
        raise ValueError(
            f'table of size {n} cannot hold {len(entries)} entries')
    table = [None] * n
    for pos, entry in enumerate(entries):
        h = perturb = entry[0]   # the hash is the first field of every entry
        i = h % n
        while table[i] is not None:
            i = (5 * i + perturb + 1) % n
            perturb >>= 5
        table[i] = pos           # store only the entry's position, not the entry
    pprint(entries)
    pprint(table)

In [30]:
compact_and_ordered(8)

[(6263957734076381623, 'guido', 'blue'),
 (4670100726627187108, 'sarah', 'orange'),
 (4505871006362405169, 'barry', 'green'),
 (8301609630502316535, 'rachel', 'yellow'),
 (6421035295685486569, 'tim', 'red')]
[None, 2, None, 3, 1, None, 4, 0]


In [None]:
# In the meantime the key-sharing dict was introduced.
# Together with the compact dict, it makes up the current Python 3.6 dict
# implementation.

In [31]:
def shared_and_compact(n):
    """Compact, ordered, and shared: one index table for multi-value rows.

    Works on ``comb_entries``, whose rows carry the hash, the key and
    several values.  The rows stay in insertion order; the table of ``n``
    slots only stores the position of each row.  Slots are chosen by
    starting at ``hash % n`` and, on collision, probing with
    ``(5 * i + perturb + 1) % n`` while shifting ``perturb`` right by 5 bits.
    Both ``comb_entries`` and the index table are pretty-printed.

    Args:
        n: Number of slots in the index table; must be at least
            ``len(comb_entries)``.

    Raises:
        ValueError: If ``n`` is too small to index every row.  Without this
            check the probe loop could never find a free slot and would
            spin forever once the table is full.

    Note:
        After ``perturb`` reaches 0 the probe sequence is
        ``i = (5 * i + 1) % n``, which is only guaranteed to reach every
        slot when ``n`` is a power of two; other sizes may not terminate.
    """
    if n < len(comb_entries):
        raise ValueError(
            f'table of size {n} cannot hold {len(comb_entries)} entries')
    table = [None] * n
    for pos, entry in enumerate(comb_entries):
        h = perturb = entry[0]   # the hash is the first field of every row
        i = h % n
        while table[i] is not None:
            i = (5 * i + perturb + 1) % n
            perturb >>= 5
        table[i] = pos           # store only the row's position, not the row
    pprint(comb_entries)
    pprint(table)

In [32]:
shared_and_compact(8)

[(6263957734076381623, 'guido', 'blue', 'austin', 'apple'),
 (4670100726627187108, 'sarah', 'orange', 'dallas', 'banana'),
 (4505871006362405169, 'barry', 'green', 'tuscon', 'orange'),
 (8301609630502316535, 'rachel', 'yellow', 'reno', 'pear'),
 (6421035295685486569, 'tim', 'red', 'portland', 'peach')]
[None, 2, None, 3, 1, None, 4, 0]


In [33]:
# We can make the dict more sparse without moving any of the hash /
# key / value entries. The additional sparsity costs only 8 bytes
# and avoids all collisions.

In [34]:
shared_and_compact(16)

[(6263957734076381623, 'guido', 'blue', 'austin', 'apple'),
 (4670100726627187108, 'sarah', 'orange', 'dallas', 'banana'),
 (4505871006362405169, 'barry', 'green', 'tuscon', 'orange'),
 (8301609630502316535, 'rachel', 'yellow', 'reno', 'pear'),
 (6421035295685486569, 'tim', 'red', 'portland', 'peach')]
[None,
 2,
 None,
 None,
 1,
 None,
 None,
 0,
 None,
 4,
 None,
 3,
 None,
 None,
 None,
 None]
