diff --git a/tripe.py b/tripe.py index 5fa6a75..a33f24d 100644 --- a/tripe.py +++ b/tripe.py @@ -1,8 +1,9 @@ #!/usr/bin/env python '''Tripe - an information retrieval experiment''' -import re +import re, os from struct import pack, unpack, calcsize +from mmap import mmap, MAP_SHARED, PROT_READ, PROT_WRITE STEMRE = re.compile(r'\W') def stem(term): @@ -18,13 +19,126 @@ def tokenize(text): yield (off, stem(term), term) off = off + len(term) + len(post) +BLOCKSIZE = calcsize('Q') # size of ints +HEADERCOUNT = 16 # number of ints in header +HEADERSIZE = BLOCKSIZE * HEADERCOUNT # size of header in bytes + +HEADER_MAGIC = 0 # where the magic number resides +HEADER_ROOT = 1 * BLOCKSIZE # root node +HEADER_FIRST_FREE = 2 * BLOCKSIZE # first free block + +class TripeStore(object): + def __init__(self, filename, writable=False): + self.filename = filename + if writable: + open_mode = 'r+' + mmap_mode = PROT_READ | PROT_WRITE + else: + open_mode = 'r' + mmap_mode = PROT_READ + if not os.path.exists(filename): + # create empty file + open(filename, 'w').write(pack('Q'*16, Tripe.MAGIC, 0, + HEADERSIZE, *((0,)*(HEADERCOUNT-3)))) + # open the file + self.file = open(filename, open_mode) + # map the file + self.mmap = mmap(self.file.fileno(), 0, MAP_SHARED, mmap_mode) + + def __load_number(self, offset): + '''load a number from the map''' + return unpack('Q', self.mmap[offset:offset+BLOCKSIZE])[0] + + def __store_number(self, offset, number): + '''store a number in the map''' + self.mmap[offset:offset+BLOCKSIZE] = pack('Q', number) + + def __allocate(self, size): + '''allocate a block of the given size''' + # where's the end of the used space? + first_free = self.__load_number(HEADER_FIRST_FREE) + # where will it come to once we've allocated + new_size = first_free + BLOCKSIZE + size + # make sure we have enough space + if new_size > len(self.mmap): + self.mmap.resize(new_size) + # store the new end + self.__store_number(HEADER_FIRST_FREE, new_size) + # store the length + self.__store_number(first_free, size) + # return the offset to put data in + return first_free + BLOCKSIZE + + def __handle_size(self, handle): + '''return the size of the handle''' + return self.__load_number(handle-BLOCKSIZE) + + def __get_handle(self, handle): + return self.mmap[handle:handle+self.__handle_size(handle)] + + def __set_handle(self, handle, bytes): + self.mmap[handle:handle+self.__handle_size(handle)] = bytes + + def get_root(self): + '''the handle of the root node''' + return self.__load_number(HEADER_ROOT) + + def set_root(self, handle): + '''the handle of the root node''' + return self.__store_number(HEADER_ROOT, handle) + + def store_numbers(self, numbers): + '''allocate storage and store numbers. return a handle.''' + fmt = 'Q' * len(numbers) + size = calcsize(fmt) + handle = self.__allocate(size) + self.__set_handle(handle, pack(fmt, *numbers)) + return handle + + def store_text(self, text): + '''allocate storage and store text. return a handle.''' + # FIXME: pad to 64-bit boundary for performance? + utf8 = text.encode('utf-8') + utf8_len = len(utf8) + handle = self.__allocate(utf8_len) + self.__set_handle(handle, utf8) + return handle + + def update_numbers(self, handle, numbers): + '''update numbers in an existing handle''' + fmt = 'Q' * len(numbers) + size = calcsize(fmt) + assert size == self.__handle_size(handle) + self.__set_handle(handle, pack(fmt, *numbers)) + + def load_numbers(self, handle): + '''fetch numbers stored at a handle''' + num_numbers = self.__handle_size(handle) / BLOCKSIZE + return unpack('Q'*num_numbers, self.__get_handle(handle)) + + def load_text(self, handle): + '''fetch text stored at a handle''' + # FIXME: drop trailing '\0'? + return self.__get_handle(handle).decode('utf-8') + + def free(self, handle): + '''mark a handle as unused''' + pass class Tripe(object): '''a text index''' MAGIC = unpack('Q', 'Tripe001')[0] - def __init__(self): - self.root = TrieNode() + def __init__(self, store): + self.store = store + if store.get_root() == 0: + # no root found - we should make one + self.root = TrieNode(self, None) + # store its location + store.set_root(self.root.handle) + else: + # load the root node + self.root = TrieNode(self, store.get_root()) def search(self, phrase, exact=False): '''match the phrase''' @@ -46,13 +160,12 @@ def search(self, phrase, exact=False): return instances - def add(self, text, docid): - doc = Document(docid, text) + def add(self, text, doc): tokens = list(tokenize(text)) tokens.reverse() next = None for off, stemmed, raw in tokens: - ti = TermInstance(doc, off, raw, next) + ti = TermInstance(self, None, doc, off, raw, next) self.root.add(stemmed, ti) next = ti @@ -61,42 +174,85 @@ def dot(self): self.root.dot() print '}' - def write(self, file): - '''make sure we're at the start of the file''' - assert file.tell() == 0 - # write 16 64bit ints - # magic 'Tripe001" - # root node offset - # 14 x 0 for future space - file.write(pack('QQQQQQQQQQQQQQQQ', Tripe.MAGIC, 0, *((0,)*14))) - # write all of the nodes, get the offset of the root node - root_off = self.root.write(file) - # write it to the second place in the file - file.seek(calcsize('Q')) - file.write(pack('Q', root_off)) - - class TrieNode(object): '''a node in the trie''' - def __init__(self): - self.matches = [] - self.children = {} - self.dotname = None - self.offset = 0 + def __init__(self, tripe, handle): + self.tripe = tripe + # if this is a new node, allocate it + if handle == None: + handle = tripe.store.store_numbers((0,0)) + self.handle = handle + + def __matches(self): + '''returns list of match handles''' + matches_handle, children_handle = \ + self.tripe.store.load_numbers(self.handle) + if matches_handle == 0: return [] + return self.tripe.store.load_numbers(matches_handle) + + def __children(self): + '''returns list of pairs of (key, handle)''' + matches_handle, children_handle = \ + self.tripe.store.load_numbers(self.handle) + if children_handle == 0: return [] + children = self.tripe.store.load_numbers(children_handle) + return zip(children[::2], children[1::2]) + + def __find_child(self, name): + # FIXME: binary search, or something better + for key, offset in self.__children(): + if key == name: return TrieNode(self.tripe, offset) + return None def search(self, term): if term == '': - return self.matches + return [TermInstance(self.tripe, h) for h in self.__matches()] else: - return self.children[term[0]].search(term[1:]) + child = self.__find_child(ord(term[0])) + if child: return child.search(term[1:]) + else: return [] def add(self, term, value): if term == '': - self.matches.append(value) + # get the current list of matches + matches = list(self.__matches()) + # append this new match + matches.append(value.handle) + # store the new list + new_matches = self.tripe.store.store_numbers(matches) + # update the record + old_matches, children_handle = \ + self.tripe.store.load_numbers(self.handle) + self.tripe.store.update_numbers(self.handle, (new_matches, children_handle)) + # free the old matches array + if old_matches: + self.tripe.store.free(old_matches) else: - if not self.children.has_key(term[0]): - self.children[term[0]] = TrieNode() - self.children[term[0]].add(term[1:], value) + # look for an existing child node + character = ord(term[0]) + child = self.__find_child(character) + if child == None: + # create a new child + child = TrieNode(self.tripe, None) + # none, time to rewrite the children list + children = self.__children() + children.append((character, child.handle)) + children.sort() + # flatten the list for storage + children_flat = [] + for n,v in children: + children_flat.append(n) + children_flat.append(v) + # store the new list of children + new_children = self.tripe.store.store_numbers(children_flat) + # update the record + matches_handle, old_children = \ + self.tripe.store.load_numbers(self.handle) + self.tripe.store.update_numbers(self.handle, (matches_handle, new_children)) + # free the old children array + if old_children: + self.tripe.store.free(old_children) + child.add(term[1:], value) def dot(self, label=''): if self.dotname: @@ -109,43 +265,48 @@ def dot(self, label=''): print '%s -> %s' % (self.dotname, m.dot()) return self.dotname - def write(self, file): - if self.offset: - return self.offset - - # force all matches and children to be written, collect offsets - match_offsets = [m.write(file) for m in self.matches] - child_offsets = [(k, v.write(file)) for k,v in self.children.items()] - - # now prepare to write the record for this trie node - self.offset = file.tell() # remember where we're writing - # write the number of matches and children - file.write(pack('QQ', len(self.matches), len(self.children))) - # write the matches offsets - file.write(pack('Q'*len(match_offsets), *match_offsets)) - # write each of the char/offset pairs for the children - child_offsets.sort() - for k,o in child_offsets: - file.write(pack('QQ', ord(k), o)) - - return self.offset - class TermInstance(object): '''an instance of a term in a document''' - def __init__(self, doc, off, raw, next): - self.doc = doc # document - self.off = off # offset within document - self.raw = raw # original text of the term - self.next = next # next term instance in the document + def __init__(self, tripe, handle, doc=None, offset=None, raw=None, next=None): + self.tripe = tripe + # if this is a new term instance, allocate it + if handle == None: + # store the supplied values + self.doc = doc + self.offset = offset + self.__next = next + if next: + self.next_handle = next.handle + else: + self.next_handle = 0 + self.raw_handle = tripe.store.store_text(raw) + handle = tripe.store.store_numbers((doc, offset, self.raw_handle, + self.next_handle)) + else: + # load the values + self.doc, self.offset, self.raw_handle, self.next_handle = \ + tripe.store.load_numbers(handle) + assert self.raw_handle != None + self.raw = tripe.store.load_text(self.raw_handle) + self.__next = None + self.handle = handle + self.dotname = None - self.offset = 0 + + def next(self): + if self.__next != None: return self.__next + if self.next_handle != 0: + self.__next = TermInstance(self.tripe, self.next_handle) + return self.__next + else: + return None def matches_exact(self, raw): return self.raw == raw def matches_phrase(self, phrase, exact=False): - instance = self.next + instance = self.next() for off, stemmed, raw in phrase: # if we run out of words in the document we fail if not instance: return False @@ -162,11 +323,11 @@ def matches_phrase(self, phrase, exact=False): return False # this word looked good, next please - instance = instance.next + instance = instance.next() return True def __repr__(self): - return 'TermInstance<%s, off=%s, raw=%s>' % (`self.doc`, self.off, `self.raw`) + return 'TermInstance' % (`self.doc`, self.offset, `self.raw`) def dot(self): if self.dotname: @@ -177,46 +338,17 @@ def dot(self): print '%s -> %s [style=dashed]' % (self.dotname, self.next.dot()) return self.dotname - def write(self, file): - if self.offset: return self.offset - - # write (and get the offset for) the next term instance - next_offset = 0 - if self.next: - next_offset = self.next.write(file) - - # write the raw string version for reference - # FIXME: avoid duplication, other instances of the same term may be the same - raw_offset = file.tell() - utf8 = self.raw.encode('utf8') + '\0' - # pad to 64 bits - while len(utf8) % calcsize('Q'): utf8 = utf8 + '\0' - # write it out - file.write(utf8) - - # write the record for this instance - self.offset = file.tell() - file.write(pack('QQQQ', self.doc.docid, self.off, raw_offset, next_offset)) - - return self.offset - - -class Document(object): - '''a document contains terms''' - def __init__(self, docid, text): - self.docid = docid - self.text = text - def __repr__(self): - return 'Document<%s>' % self.docid -tripe = Tripe() -tripe.add('Hello world', 1) -tripe.add('Hello, World', 2) -tripe.add('Goodbye, cruel world...', 3) -tripe.add('This is a test.', 4) -tripe.add('This is not a pipe', 5) -tripe.add('Thistle, bristle and whistle!', 6) -tripe.add('A bird in the hand is worth two in the bush.', 7) +tripe = Tripe(TripeStore('/tmp/test.tripe', False)) +#tripe = Tripe(TripeStore('/tmp/test.tripe', True)) +#tripe.add('Hello world', 1) +#tripe.add('Hello, World', 2) +#tripe.add('Goodbye, cruel world...', 3) +#tripe.add('This is a test.', 4) +#tripe.add('This is not a pipe', 5) +#tripe.add('Thistle, bristle and whistle!', 6) +#tripe.add('A bird in the hand is worth two in the bush.', 7) #tripe.dot() -tripe.write(open('/tmp/test.tripe', 'w')) +#tripe.write(TripeStore(open('/tmp/test.tripe', 'w'))) +#tripe = Tripe.read(TripeStore(open('/tmp/test.tripe')))