From bc9c6cacac18b7b96472421aac4071cff08e889a Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sun, 25 Jan 2015 15:36:08 +0100 Subject: [PATCH] Optimized duplicate word checks The greater the input wordlist the greater the performance increase --- src/pp.c | 209 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 122 insertions(+), 87 deletions(-) diff --git a/src/pp.c b/src/pp.c index 032cd45..d931ef0 100644 --- a/src/pp.c +++ b/src/pp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "mpz_int128.h" @@ -41,6 +42,8 @@ #define ALLOC_NEW_CHAINS 0x10 #define ALLOC_DUPES 0x100000 +#define ENTRY_END_HASH 0xFFFFFFFF + #define MIN(a,b) (((a) < (b)) ? (a) : (b)) #define MAX(a,b) (((a) > (b)) ? (a) : (b)) @@ -72,6 +75,25 @@ typedef struct } chain_t; +typedef struct +{ + u32 next; + + char *element; + +} uniq_data_t; + +typedef struct +{ + u32 index; + u32 alloc; + + u32 *hash; + + uniq_data_t *data; + +} uniq_t; + typedef struct { elem_t *elems_buf; @@ -84,6 +106,9 @@ typedef struct int chains_alloc; u64 cur_chain_ks_poses[OUT_LEN_MAX]; + + uniq_t *uniq; + } db_entry_t; typedef struct @@ -198,6 +223,7 @@ static void *malloc_tiny (const size_t size) #else #define MEM_ALLOC_SIZE 0x10000 #endif + if (size > MEM_ALLOC_SIZE) { // we can't handle it here @@ -535,7 +561,7 @@ static void chain_gen_with_idx (chain_t *chain_buf, const int len1, const int ch chain_buf->cnt++; } -static char *add_elem(db_entry_t *db_entry, char *input_buf, int input_len) +static char *add_elem (db_entry_t *db_entry, char *input_buf, int input_len) { check_realloc_elems (db_entry); @@ -547,90 +573,64 @@ static char *add_elem(db_entry_t *db_entry, char *input_buf, int input_len) db_entry->elems_cnt++; - return (char*)elem_buf->buf; + return (char *) elem_buf->buf; } -static unsigned int hash_log, hash_size, hash_mask, hash_alloc; -#define ENTRY_END_HASH 0xFFFFFFFF -#define ENTRY_END_LIST 0xFFFFFFFE - -static inline unsigned int line_hash(char *line) +static u32 input_hash (char *input_buf, int input_len) { - unsigned int hash, extra; - char *p; - - p = line + 2; - hash = (unsigned char)line[0]; - if (!hash) - goto out; - extra = (unsigned char)line[1]; - if (!extra) - goto out; - - while (*p) { - hash <<= 3; extra <<= 2; - hash += (unsigned char)p[0]; - if (!p[1]) break; - extra += (unsigned char)p[1]; - p += 2; - if (hash & 0xe0000000) { - hash ^= hash >> hash_log; - extra ^= extra >> hash_log; - hash &= hash_mask; - } - } + u32 h = 0; - hash -= extra; - hash ^= extra << (hash_log / 2); + for (int i = 0; i < input_len; i++) + { + h = (h * 33) + input_buf[i]; + } - hash ^= hash >> hash_log; + #define HASH_MASK ((1 << DUPE_HASH_LOG) - 1) -out: - hash &= hash_mask; - return hash; + return h & HASH_MASK; } -typedef struct { - u32 next; - char *element; -} element_st; +static void add_uniq (db_entry_t *db_entry, char *input_buf, int input_len) +{ + const u32 h = input_hash (input_buf, input_len); -static struct { - u32 *hash; - element_st *data; -} uniq_buf; + uniq_t *uniq = db_entry->uniq; -static inline int add_uniq(db_entry_t *db_entry, char *line, int len) -{ - static unsigned int index; - unsigned int current, last, linehash; - - linehash = line_hash(line); - current = uniq_buf.hash[linehash]; - last = current; - while (current != ENTRY_END_HASH) { - if (!strncmp(line, uniq_buf.data[current].element, len)) - break; - last = current; - current = uniq_buf.data[current].next; + u32 cur = uniq->hash[h]; + + u32 prev = cur; + + while (cur != ENTRY_END_HASH) + { + if (memcmp (input_buf, uniq->data[cur].element, input_len) == 0) return; + + prev = cur; + + cur = uniq->data[cur].next; } - if (current != ENTRY_END_HASH) - return 0; - if (last == ENTRY_END_HASH) - uniq_buf.hash[linehash] = index; + const u32 index = uniq->index; + + if (prev == ENTRY_END_HASH) + { + uniq->hash[h] = index; + } else - uniq_buf.data[last].next = index; + { + uniq->data[prev].next = index; + } + + if (index == uniq->alloc) + { + uniq->alloc += ALLOC_DUPES; - if (index == hash_alloc) { - hash_alloc += ALLOC_DUPES; - uniq_buf.data = realloc(uniq_buf.data, hash_alloc * sizeof(element_st)); + uniq->data = realloc (uniq->data, uniq->alloc * sizeof (uniq_data_t)); } - uniq_buf.data[index].element = add_elem(db_entry, line, len); - uniq_buf.data[index].next = ENTRY_END_HASH; - index++; - return 1; + uniq->data[index].element = add_elem (db_entry, input_buf, input_len); + uniq->data[index].next = ENTRY_END_HASH; + + uniq->index++; } int main (int argc, char *argv[]) @@ -829,6 +829,28 @@ int main (int argc, char *argv[]) out->fp = stdout; out->len = 0; + if (dupe_check) + { + for (int pw_len = pw_min; pw_len <= pw_max; pw_len++) + { + db_entry_t *db_entry = &db_entries[pw_len]; + + const u32 hash_size = 1 << DUPE_HASH_LOG; + const u32 hash_alloc = ALLOC_DUPES; + + uniq_t *uniq = mem_alloc (sizeof (uniq_t)); + + uniq->data = mem_alloc (hash_alloc * sizeof (uniq_data_t)); + uniq->hash = mem_alloc (hash_size * sizeof (u32)); + uniq->index = 0; + uniq->alloc = hash_alloc; + + memset (uniq->hash, 0xff, hash_size * sizeof (u32)); + + db_entry->uniq = uniq; + } + } + /** * files */ @@ -845,16 +867,6 @@ int main (int argc, char *argv[]) } } - if (dupe_check) { - hash_log = DUPE_HASH_LOG; - hash_size = (1 << hash_log); - hash_mask = (hash_size - 1); - hash_alloc = ALLOC_DUPES; - uniq_buf.data = mem_alloc(hash_alloc * sizeof(element_st)); - uniq_buf.hash = mem_alloc(hash_size * sizeof(unsigned int)); - memset(uniq_buf.hash, 0xff, hash_size * sizeof(unsigned int)); - } - /** * load elems from stdin */ @@ -877,9 +889,13 @@ int main (int argc, char *argv[]) db_entry_t *db_entry = &db_entries[input_len]; if (!dupe_check) - add_elem(db_entry, input_buf, input_len); + { + add_elem (db_entry, input_buf, input_len); + } else - add_uniq(db_entry, input_buf, input_len); + { + add_uniq (db_entry, input_buf, input_len); + } if (case_permute) { @@ -893,9 +909,13 @@ int main (int argc, char *argv[]) input_buf[0] = new_cu; if (!dupe_check) - add_elem(db_entry, input_buf, input_len); + { + add_elem (db_entry, input_buf, input_len); + } else - add_uniq(db_entry, input_buf, input_len); + { + add_uniq (db_entry, input_buf, input_len); + } } if (old_c != new_cl) @@ -903,15 +923,30 @@ int main (int argc, char *argv[]) input_buf[0] = new_cl; if (!dupe_check) - add_elem(db_entry, input_buf, input_len); + { + add_elem (db_entry, input_buf, input_len); + } else - add_uniq(db_entry, input_buf, input_len); + { + add_uniq (db_entry, input_buf, input_len); + } } } } - if (uniq_buf.hash) free(uniq_buf.hash); - if (uniq_buf.data) free(uniq_buf.data); + if (dupe_check) + { + for (int pw_len = pw_min; pw_len <= pw_max; pw_len++) + { + db_entry_t *db_entry = &db_entries[pw_len]; + + uniq_t *uniq = db_entry->uniq; + + free (uniq->hash); + free (uniq->data); + free (uniq); + } + } /** * init chains