From f7eb85c22d1eeb3b04dfee58552dae7227eed989 Mon Sep 17 00:00:00 2001
From: Karsten Blees
Date: Wed, 24 Oct 2012 20:35:51 +0200
Subject: [PATCH] add a hashtable implementation that supports O(1) removal

The existing hashtable implementation (in hash.[ch]) uses open addressing
(i.e. resolve hash collisions by distributing entries across the table).
Thus, removal is difficult to implement with less than O(n) complexity.
Resolving collisions of entries with identical hashes (e.g. via chaining)
is left to the client code.

Add a hashtable implementation that supports O(1) removal and is slightly
easier to use due to builtin entry chaining.

Supports all basic operations init, free, get, put, remove and iteration.

Also includes ready-to-use hash functions based on the public domain FNV-1
algorithm (http://www.isthe.com/chongo/tech/comp/fnv).

The per-entry data structure (hashmap_entry) is meant to be piggybacked
onto the client's data structure to save memory. See test-hashmap.c for
usage examples.

The hashtable is resized by a factor of four when 80% full. With these
settings, average memory consumption is about 2/3 of hash.[ch], and
insertion is twice as fast (due to less frequent resizing).

Signed-off-by: Karsten Blees
---
 Makefile           |   3 +
 hashmap.c          | 209 +++++++++++++++++++++++++++++++++++++++++
 hashmap.h          | 161 ++++++++++++++++++++++++++++++++
 t/t0007-hashmap.sh | 204 ++++++++++++++++++++++++++++++++++++++++
 test-hashmap.c     | 227 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 804 insertions(+)
 create mode 100644 hashmap.c
 create mode 100644 hashmap.h
 create mode 100644 t/t0007-hashmap.sh
 create mode 100644 test-hashmap.c

diff --git a/Makefile b/Makefile
index 1cff01ecbde7ef..5efe5807b79344 100644
--- a/Makefile
+++ b/Makefile
@@ -511,6 +511,7 @@ TEST_PROGRAMS_NEED_X += test-date
 TEST_PROGRAMS_NEED_X += test-delta
 TEST_PROGRAMS_NEED_X += test-dump-cache-tree
 TEST_PROGRAMS_NEED_X += test-genrandom
+TEST_PROGRAMS_NEED_X += test-hashmap
 TEST_PROGRAMS_NEED_X += test-index-version
 TEST_PROGRAMS_NEED_X += test-line-buffer
 TEST_PROGRAMS_NEED_X += test-match-trees
@@ -644,6 +645,7 @@ LIB_H += gpg-interface.h
 LIB_H += graph.h
 LIB_H += grep.h
 LIB_H += hash.h
+LIB_H += hashmap.h
 LIB_H += help.h
 LIB_H += http.h
 LIB_H += kwset.h
@@ -755,6 +757,7 @@ LIB_OBJS += gpg-interface.o
 LIB_OBJS += graph.o
 LIB_OBJS += grep.o
 LIB_OBJS += hash.o
+LIB_OBJS += hashmap.o
 LIB_OBJS += help.o
 LIB_OBJS += hex.o
 LIB_OBJS += ident.o
diff --git a/hashmap.c b/hashmap.c
new file mode 100644
index 00000000000000..03439e79c2da81
--- /dev/null
+++ b/hashmap.c
@@ -0,0 +1,209 @@
+/*
+ * Generic implementation of hash-based key value mappings.
+ */
+#include "cache.h"
+#include "hashmap.h"
+
+#define FNV32_BASE ((unsigned int) 0x811c9dc5)
+#define FNV32_PRIME ((unsigned int) 0x01000193)
+
+unsigned int strhash(const char *str)
+{
+	unsigned int c, hash = FNV32_BASE;
+	while ((c = (unsigned char) *str++))
+		hash = (hash * FNV32_PRIME) ^ c;
+	return hash;
+}
+
+unsigned int strihash(const char *str)
+{
+	unsigned int c, hash = FNV32_BASE;
+	while ((c = (unsigned char) *str++)) {
+		if (c >= 'a' && c <= 'z')
+			c -= 'a' - 'A';
+		hash = (hash * FNV32_PRIME) ^ c;
+	}
+	return hash;
+}
+
+unsigned int memhash(const void *buf, size_t len)
+{
+	unsigned int hash = FNV32_BASE;
+	const unsigned char *ucbuf = buf;
+	while (len--) {
+		unsigned int c = *ucbuf++;
+		hash = (hash * FNV32_PRIME) ^ c;
+	}
+	return hash;
+}
+
+unsigned int memihash(const void *buf, size_t len)
+{
+	unsigned int hash = FNV32_BASE;
+	const unsigned char *ucbuf = buf;
+	while (len--) {
+		unsigned int c = *ucbuf++;
+		if (c >= 'a' && c <= 'z')
+			c -= 'a' - 'A';
+		hash = (hash * FNV32_PRIME) ^ c;
+	}
+	return hash;
+}
+
+#define HASHMAP_INITIAL_SIZE 64
+/* grow / shrink by 2^2 */
+#define HASHMAP_GROW 2
+/* grow if > 80% full (to 20%) */
+#define HASHMAP_GROW_AT 1.25
+/* shrink if < 16.6% full (to 66.6%) */
+#define HASHMAP_SHRINK_AT 6
+
+static inline int entry_equals(const hashmap *map, const hashmap_entry *e1,
+		const hashmap_entry *e2)
+{
+	return (e1 == e2) || (e1->hash == e2->hash && !(*map->cmpfn)(e1, e2));
+}
+
+static inline unsigned int bucket(const hashmap *map, const hashmap_entry *key)
+{
+	return key->hash & (map->tablesize - 1);
+}
+
+static void rehash(hashmap *map, unsigned int newsize)
+{
+	unsigned int i, oldsize = map->tablesize;
+	hashmap_entry **oldtable = map->table;
+
+	map->tablesize = newsize;
+	map->table = xcalloc(map->tablesize, sizeof(hashmap_entry*));
+	for (i = 0; i < oldsize; i++) {
+		hashmap_entry *e = oldtable[i];
+		while (e) {
+			hashmap_entry *next = e->next;
+			unsigned int b = bucket(map, e);
+			e->next = map->table[b];
+			map->table[b] = e;
+			e = next;
+		}
+	}
+	free(oldtable);
+}
+
+void hashmap_init(hashmap *map, hashmap_cmp_fn equals_function,
+		size_t initial_size)
+{
+	map->size = 0;
+	map->cmpfn = equals_function;
+	/* calculate initial table size and allocate the table */
+	map->tablesize = HASHMAP_INITIAL_SIZE;
+	initial_size *= HASHMAP_GROW_AT;
+	while (initial_size > map->tablesize)
+		map->tablesize <<= HASHMAP_GROW;
+	map->table = xcalloc(map->tablesize, sizeof(hashmap_entry*));
+}
+
+void hashmap_free(hashmap *map, hashmap_free_fn free_function)
+{
+	if (!map || !map->table)
+		return;
+	if (free_function) {
+		hashmap_iter iter;
+		hashmap_entry *e;
+		hashmap_iter_init(map, &iter);
+		while ((e = hashmap_iter_next(&iter)))
+			(*free_function)(e);
+	}
+	free(map->table);
+	memset(map, 0, sizeof(*map));
+}
+
+hashmap_entry *hashmap_get(const hashmap *map, const hashmap_entry *key)
+{
+	hashmap_entry *e = map->table[bucket(map, key)];
+	while (e && !entry_equals(map, e, key))
+		e = e->next;
+	return e;
+}
+
+hashmap_entry *hashmap_put(hashmap *map, hashmap_entry *entry)
+{
+	unsigned int b = bucket(map, entry);
+	hashmap_entry *last = NULL, *e = map->table[b];
+
+	/* find entry */
+	while (e && !entry_equals(map, e, entry)) {
+		last = e;
+		e = e->next;
+	}
+
+	if (!e) {
+		/* not found, add entry */
+		entry->next = map->table[b];
+		map->table[b] = entry;
+
+		/* fix size and rehash if appropriate */
+		map->size++;
+		if (map->size * HASHMAP_GROW_AT > map->tablesize)
+			rehash(map, map->tablesize << HASHMAP_GROW);
+	} else if (e != entry) {
+		/* replace found entry */
+		if (last)
+			last->next = entry;
+		else
+			map->table[b] = entry;
+		entry->next = e->next;
+		e->next = NULL;
+	}
+	return e;
+}
+
+hashmap_entry *hashmap_remove(hashmap *map, const hashmap_entry *key)
+{
+	unsigned int b = bucket(map, key);
+	hashmap_entry *last = NULL, *e = map->table[b];
+
+	/* find entry */
+	while (e && !entry_equals(map, e, key)) {
+		last = e;
+		e = e->next;
+	}
+
+	if (e) {
+		/* remove found entry */
+		if (last)
+			last->next = e->next;
+		else
+			map->table[b] = e->next;
+		e->next = NULL;
+
+		/* fix size and rehash if appropriate */
+		map->size--;
+		if (map->tablesize > HASHMAP_INITIAL_SIZE && map->size
+				* HASHMAP_SHRINK_AT < map->tablesize)
+			rehash(map, map->tablesize >> HASHMAP_GROW);
+	}
+	return e;
+}
+
+void hashmap_iter_init(hashmap *map, hashmap_iter *iter)
+{
+	iter->map = map;
+	iter->tablepos = 0;
+	iter->next = NULL;
+}
+
+hashmap_entry *hashmap_iter_next(hashmap_iter *iter)
+{
+	hashmap_entry *current = iter->next;
+	for (;;) {
+		if (current) {
+			iter->next = current->next;
+			return current;
+		}
+
+		if (iter->tablepos >= iter->map->tablesize)
+			return NULL;
+
+		current = iter->map->table[iter->tablepos++];
+	}
+}
diff --git a/hashmap.h b/hashmap.h
new file mode 100644
index 00000000000000..30fa4da2c879e5
--- /dev/null
+++ b/hashmap.h
@@ -0,0 +1,161 @@
+#ifndef HASHMAP_H
+#define HASHMAP_H
+
+/*
+ * Generic implementation of hash-based key value mappings.
+ * Supports basic operations get, put, remove and iteration.
+ *
+ * Also contains a set of ready-to-use hash functions for strings, using the
+ * FNV-1 algorithm (see http://www.isthe.com/chongo/tech/comp/fnv).
+ */
+
+/*
+ * Case-sensitive FNV-1 hash of 0-terminated string.
+ * str: the string
+ * returns hash code
+ */
+extern unsigned int strhash(const char *str);
+
+/*
+ * Case-insensitive FNV-1 hash of 0-terminated string.
+ * str: the string
+ * returns hash code
+ */
+extern unsigned int strihash(const char *str);
+
+/*
+ * Case-sensitive FNV-1 hash of a memory block.
+ * buf: start of the memory block
+ * len: length of the memory block
+ * returns hash code
+ */
+extern unsigned int memhash(const void *buf, size_t len);
+
+/*
+ * Case-insensitive FNV-1 hash of a memory block.
+ * buf: start of the memory block
+ * len: length of the memory block
+ * returns hash code
+ */
+extern unsigned int memihash(const void *buf, size_t len);
+
+/*
+ * Hashmap entry data structure, intended to be used as first member of user
+ * data structures. Consists of a pointer and an int. Ideally it should be
+ * followed by an int-sized member to prevent unused memory on 64-bit systems
+ * due to alignment.
+ */
+typedef struct hashmap_entry {
+	struct hashmap_entry *next;
+	unsigned int hash;
+} hashmap_entry;
+
+/*
+ * User-supplied function to test two hashmap entries for equality, shall
+ * return 0 if the entries are equal. This function is always called with
+ * non-NULL parameters that have the same hash code.
+ */
+typedef int (*hashmap_cmp_fn)(const hashmap_entry*, const hashmap_entry*);
+
+/*
+ * User-supplied function to free a hashmap entry.
+ */
+typedef void (*hashmap_free_fn)(const hashmap_entry*);
+
+/*
+ * Hashmap data structure, use with hashmap_* functions.
+ */
+typedef struct hashmap {
+	hashmap_entry **table;
+	hashmap_cmp_fn cmpfn;
+	unsigned int size, tablesize;
+} hashmap;
+
+/*
+ * Hashmap iterator data structure, use with hashmap_iter_* functions.
+ */
+typedef struct hashmap_iter {
+	hashmap *map;
+	hashmap_entry *next;
+	unsigned int tablepos;
+} hashmap_iter;
+
+/*
+ * Initializes a hashmap_entry structure.
+ * entry: pointer to the entry to initialize
+ * hash: hash code of the entry
+ */
+static inline void hashmap_entry_init(hashmap_entry *entry, unsigned int hash)
+{
+	entry->hash = hash;
+	entry->next = NULL;
+}
+
+/*
+ * Initializes a hashmap structure.
+ * map: hashmap to initialize
+ * equals_function: function to test equality of hashmap entries
+ * initial_size: number of initial entries, or 0 if unknown
+ */
+extern void hashmap_init(hashmap *map, hashmap_cmp_fn equals_function,
+		size_t initial_size);
+
+/*
+ * Frees a hashmap structure and allocated memory.
+ * map: hashmap to free
+ * free_function: optional function to free the hashmap entries
+ */
+extern void hashmap_free(hashmap *map, hashmap_free_fn free_function);
+
+/*
+ * Returns the hashmap entry for the specified key, or NULL if not found.
+ * map: the hashmap
+ * key: key of the entry to look up
+ * returns matching hashmap entry, or NULL if not found
+ */
+extern hashmap_entry *hashmap_get(const hashmap *map, const hashmap_entry *key);
+
+/*
+ * Adds or replaces a hashmap entry.
+ * map: the hashmap
+ * entry: the entry to add or replace
+ * returns previous entry, or NULL if the entry is new
+ */
+extern hashmap_entry *hashmap_put(hashmap *map, hashmap_entry *entry);
+
+/*
+ * Removes a hashmap entry matching the specified key.
+ * map: the hashmap
+ * key: key of the entry to remove
+ * returns removed entry, or NULL if not found
+ */
+extern hashmap_entry *hashmap_remove(hashmap *map, const hashmap_entry *key);
+
+/*
+ * Initializes a hashmap iterator structure.
+ * map: the hashmap
+ * iter: hashmap iterator structure
+ */
+extern void hashmap_iter_init(hashmap *map, hashmap_iter *iter);
+
+/*
+ * Returns the next hashmap entry.
+ * iter: hashmap iterator
+ * returns next entry, or NULL if there are no more entries
+ */
+extern hashmap_entry *hashmap_iter_next(hashmap_iter *iter);
+
+/*
+ * Initializes a hashmap iterator and returns the first hashmap entry.
+ * map: the hashmap
+ * iter: hashmap iterator
+ * returns first entry, or NULL if there are no entries
+ */
+static inline hashmap_entry *hashmap_iter_first(hashmap *map,
+		hashmap_iter *iter)
+{
+	hashmap_iter_init(map, iter);
+	return hashmap_iter_next(iter);
+}
+
+#endif
diff --git a/t/t0007-hashmap.sh b/t/t0007-hashmap.sh
new file mode 100644
index 00000000000000..8d616cf6fd3d3b
--- /dev/null
+++ b/t/t0007-hashmap.sh
@@ -0,0 +1,204 @@
+#!/bin/sh
+
+test_description='test hashmap and string hash functions'
+. ./test-lib.sh
+
+test_hashmap() {
+	echo "$1" | test-hashmap $3 > actual &&
+	echo "$2" > expect &&
+	test_cmp expect actual
+}
+
+test_expect_success 'hash functions' '
+
+test_hashmap "hash key1" "2215982743 2215982743 116372151 116372151" &&
+test_hashmap "hash key2" "2215982740 2215982740 116372148 116372148" &&
+test_hashmap "hash fooBarFrotz" "1383912807 1383912807 3189766727 3189766727" &&
+test_hashmap "hash foobarfrotz" "2862305959 2862305959 3189766727 3189766727"
+
+'
+
+test_expect_success 'add' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+put foobarfrotz value4
+size" "NULL
+NULL
+NULL
+NULL
+64 4"
+
+'
+
+test_expect_success 'add (case insensitive)' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+size" "NULL
+NULL
+NULL
+64 3" ignorecase
+
+'
+
+test_expect_success 'replace' '
+
+test_hashmap "put key1 value1
+put key1 value2
+put fooBarFrotz value3
+put fooBarFrotz value4
+size" "NULL
+value1
+NULL
+value3
+64 2"
+
+'
+
+test_expect_success 'replace (case insensitive)' '
+
+test_hashmap "put key1 value1
+put Key1 value2
+put fooBarFrotz value3
+put foobarfrotz value4
+size" "NULL
+value1
+NULL
+value3
+64 2" ignorecase
+
+'
+
+test_expect_success 'get' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+put foobarfrotz value4
+get key1
+get key2
+get fooBarFrotz
+get notInMap" "NULL
+NULL
+NULL
+NULL
+value1
+value2
+value3
+NULL"
+
+'
+
+test_expect_success 'get (case insensitive)' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+get Key1
+get keY2
+get foobarfrotz
+get notInMap" "NULL
+NULL
+NULL
+value1
+value2
+value3
+NULL" ignorecase
+
+'
+
+test_expect_success 'remove' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+remove key1
+remove key2
+remove notInMap
+size" "NULL
+NULL
+NULL
+value1
+value2
+NULL
+64 1"
+
+'
+
+test_expect_success 'remove (case insensitive)' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+remove Key1
+remove keY2
+remove notInMap
+size" "NULL
+NULL
+NULL
+value1
+value2
+NULL
+64 1" ignorecase
+
+'
+
+test_expect_success 'iterate' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+iterate" "NULL
+NULL
+NULL
+key2 value2
+key1 value1
+fooBarFrotz value3"
+
+'
+
+test_expect_success 'iterate (case insensitive)' '
+
+test_hashmap "put key1 value1
+put key2 value2
+put fooBarFrotz value3
+iterate" "NULL
+NULL
+NULL
+fooBarFrotz value3
+key2 value2
+key1 value1" ignorecase
+
+'
+
+test_expect_success 'grow / shrink' '
+
+	rm -f in &&
+	rm -f expect &&
+	for n in $(test_seq 0 50)
+	do
+		echo put key$n value$n >> in &&
+		echo NULL >> expect
+	done &&
+	echo size >> in &&
+	echo 64 51 >> expect &&
+	echo put key51 value51 >> in &&
+	echo NULL >> expect &&
+	echo size >> in &&
+	echo 256 52 >> expect &&
+	for n in $(test_seq 0 9)
+	do
+		echo remove key$n >> in &&
+		echo value$n >> expect
+	done &&
+	echo size >> in &&
+	echo 64 42 >> expect &&
+	cat in | test-hashmap > out &&
+	test_cmp expect out
+
+'
+
+test_done
diff --git a/test-hashmap.c b/test-hashmap.c
new file mode 100644
index 00000000000000..0621a7cb8eed63
--- /dev/null
+++ b/test-hashmap.c
@@ -0,0 +1,227 @@
+#include "cache.h"
+#include "hashmap.h"
+#include <stdio.h>
+
+typedef struct test_entry
+{
+	hashmap_entry ent;
+	char *key;
+	char *value;
+} test_entry;
+
+static int test_entry_cmp(const test_entry *e1, const test_entry *e2)
+{
+	return strcmp(e1->key, e2->key);
+}
+
+static int test_entry_cmp_icase(const test_entry *e1, const test_entry *e2)
+{
+	return strcasecmp(e1->key, e2->key);
+}
+
+static void perf_hashmap(unsigned int size, unsigned int rounds)
+{
+	hashmap map;
+	char buf[16];
+	char **strings;
+	test_entry *entries, *e;
+	unsigned int i, j;
+
+	strings = malloc(size * sizeof(char*));
+	entries = malloc(size * sizeof(test_entry));
+	for (i = 0; i < size; i++) {
+		snprintf(buf, sizeof(buf), "%u", i);
+		strings[i] = strdup(buf);
+		entries[i].key = strings[i];
+		entries[i].value = strings[i];
+	}
+
+	for (j = 0; j < rounds; j++) {
+		/* initialize the map */
+		hashmap_init(&map, (hashmap_cmp_fn) test_entry_cmp, 0);
+
+		/* add entries */
+		for (i = 0; i < size; i++) {
+			hashmap_entry_init(&entries[i].ent, strhash(strings[i]));
+			e = (test_entry*) hashmap_put(&map, &entries[i].ent);
+			if (e)
+				printf("duplicate: %s\n", strings[i]);
+		}
+
+		hashmap_free(&map, NULL);
+	}
+}
+
+typedef struct hash_entry
+{
+	char *key;
+	char *value;
+	struct hash_entry *next;
+} hash_entry;
+
+static void perf_hashtable(unsigned int size, unsigned int rounds)
+{
+	struct hash_table map;
+	char buf[16];
+	char **strings;
+	hash_entry *entries, **res, *e;
+	unsigned int i, j;
+
+	strings = malloc(size * sizeof(char*));
+	entries = malloc(size * sizeof(hash_entry));
+	for (i = 0; i < size; i++) {
+		snprintf(buf, sizeof(buf), "%u", i);
+		strings[i] = strdup(buf);
+		entries[i].key = strings[i];
+		entries[i].value = strings[i];
+	}
+
+	for (j = 0; j < rounds; j++) {
+		/* initialize the map */
+		init_hash(&map);
+
+		/* add entries */
+		for (i = 0; i < size; i++) {
+			res = (hash_entry**) insert_hash(
+					strhash(entries[i].key), &entries[i], &map);
+			if (res) {
+				e = *res;
+				while (e && strcmp(e->key, strings[i]))
+					e = e->next;
+				if (e)
+					printf("duplicate: %s\n", strings[i]);
+
+				entries[i].next = *res;
+				*res = &entries[i];
+			}
+		}
+
+		free_hash(&map);
+	}
+}
+
+
+#define DELIM " \t\r\n"
+
+/*
+ * Read stdin line by line and print result of commands to stdout:
+ *
+ * hash key -> strhash(key) memhash(key) strihash(key) memihash(key)
+ * put key value -> NULL / old value
+ * get key -> NULL / value
+ * remove key -> NULL / old value
+ * iterate -> key1 value1\nkey2 value2\n...
+ * size -> tablesize numentries
+ */
+int main(int argc, const char *argv[])
+{
+	char line[1024];
+	hashmap map;
+	int icase;
+
+	/* init hash map */
+	icase = argc > 1 && !strcmp("ignorecase", argv[1]);
+	hashmap_init(&map, (hashmap_cmp_fn) (icase ? test_entry_cmp_icase
+			: test_entry_cmp), 0);
+
+	/* process commands from stdin */
+	while (fgets(line, sizeof(line), stdin)) {
+		char *cmd, *p1 = NULL, *p2 = NULL;
+		int l1 = 0, l2 = 0, hash = 0;
+		test_entry *entry;
+
+		/* break line into command and up to two parameters */
+		cmd = strtok(line, DELIM);
+		/* ignore empty lines and '#' comment lines */
+		if (!cmd || *cmd == '#')
+			continue;
+
+		p1 = strtok(NULL, DELIM);
+		if (p1) {
+			l1 = strlen(p1);
+			hash = icase ? strihash(p1) : strhash(p1);
+			p2 = strtok(NULL, DELIM);
+			if (p2)
+				l2 = strlen(p2);
+		}
+
+		if (!strcmp("hash", cmd) && l1) {
+
+			/* print results of different hash functions */
+			printf("%u %u %u %u\n", strhash(p1), memhash(p1, l1),
+					strihash(p1), memihash(p1, l1));
+
+		} else if (!strcmp("put", cmd) && l1 && l2) {
+
+			/* create entry with key = p1, value = p2 */
+			entry = malloc(sizeof(test_entry) + l1 + l2 + 2);
+			hashmap_entry_init(&entry->ent, hash);
+			entry->key = ((char*)entry) + sizeof(test_entry);
+			entry->value = entry->key + l1 + 1;
+			memcpy(entry->key, p1, l1 + 1);
+			memcpy(entry->value, p2, l2 + 1);
+
+			/* add to hashmap */
+			entry = (test_entry*) hashmap_put(&map, &entry->ent);
+
+			/* print and free replaced entry, if any */
+			puts(entry ? entry->value : "NULL");
+			free(entry);
+
+		} else if (!strcmp("get", cmd) && l1) {
+
+			/* setup static key */
+			test_entry key;
+			hashmap_entry_init(&key.ent, hash);
+			key.key = p1;
+
+			/* lookup entry in hashmap */
+			entry = (test_entry*) hashmap_get(&map, &key.ent);
+
+			/* print result */
+			puts(entry ? entry->value : "NULL");
+
+		} else if (!strcmp("remove", cmd) && l1) {
+
+			/* setup static key */
+			test_entry key;
+			hashmap_entry_init(&key.ent, hash);
+			key.key = p1;
+
+			/* remove entry from hashmap */
+			entry = (test_entry*) hashmap_remove(&map, &key.ent);
+
+			/* print result and free entry */
+			puts(entry ? entry->value : "NULL");
+			free(entry);
+
+		} else if (!strcmp("iterate", cmd)) {
+
+			hashmap_iter iter;
+			hashmap_iter_init(&map, &iter);
+			while ((entry = (test_entry*) hashmap_iter_next(&iter)))
+				printf("%s %s\n", entry->key, entry->value);
+
+		} else if (!strcmp("size", cmd)) {
+
+			/* print table sizes */
+			printf("%u %u\n", map.tablesize, map.size);
+
+		} else if (!strcmp("perfhashmap", cmd) && l1 && l2) {
+
+			perf_hashmap(atoi(p1), atoi(p2));
+
+		} else if (!strcmp("perfhashtable", cmd) && l1 && l2) {
+
+			perf_hashtable(atoi(p1), atoi(p2));
+
+		} else {
+
+			printf("Unknown command %s\n", cmd);
+
+		}
+	}
+
+	hashmap_free(&map, (hashmap_free_fn) free);
+	return 0;
+}