Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

backend to mmap, full persistence round-tripping

  • Loading branch information...
commit 9c10af08537268e4dfcc73e9e9761f608983e9ed 1 parent c9bba26
@ianloic authored
Showing with 234 additions and 102 deletions.
  1. +234 −102 tripe.py
View
336 tripe.py
@@ -1,8 +1,9 @@
#!/usr/bin/env python
'''Tripe - an information retrieval experiment'''
-import re
+import re, os
from struct import pack, unpack, calcsize
+from mmap import mmap, MAP_SHARED, PROT_READ, PROT_WRITE
STEMRE = re.compile(r'\W')
def stem(term):
@@ -18,13 +19,126 @@ def tokenize(text):
yield (off, stem(term), term)
off = off + len(term) + len(post)
BLOCKSIZE = calcsize('Q')  # size in bytes of one stored integer
HEADERCOUNT = 16  # number of integers in the file header
HEADERSIZE = BLOCKSIZE * HEADERCOUNT  # size of the header in bytes

# byte offsets of the header fields
HEADER_MAGIC = 0  # where the magic number resides
HEADER_ROOT = 1 * BLOCKSIZE  # handle of the root node (0 means none yet)
HEADER_FIRST_FREE = 2 * BLOCKSIZE  # offset of the first unallocated byte


class TripeStore(object):
    '''block storage for an index, backed by a memory-mapped file

    The file starts with a header of HEADERCOUNT integers (magic number,
    root handle, first free offset, the rest zero). Every allocation is a
    length word followed by that many payload bytes; a handle is the file
    offset of the payload, so the length sits BLOCKSIZE bytes before it.
    '''

    def __init__(self, filename, writable=False):
        '''map filename, first creating an empty store if it is missing

        The file is opened read-write when writable is true, read-only
        otherwise. A missing file is created in either case.
        '''
        self.filename = filename
        if writable:
            open_mode = 'r+b'
            mmap_mode = PROT_READ | PROT_WRITE
        else:
            open_mode = 'rb'
            mmap_mode = PROT_READ
        if not os.path.exists(filename):
            # create a file holding only the header: no root node yet, and
            # the free space starts right after the header
            header = pack('Q' * HEADERCOUNT, Tripe.MAGIC, 0, HEADERSIZE,
                          *((0,) * (HEADERCOUNT - 3)))
            # close the file explicitly so the header is on disk before
            # the file is mapped
            with open(filename, 'wb') as new_file:
                new_file.write(header)
        # open the file
        self.file = open(filename, open_mode)
        # map the file; don't leak the descriptor if mapping fails
        try:
            self.mmap = mmap(self.file.fileno(), 0, MAP_SHARED, mmap_mode)
        except Exception:
            self.file.close()
            raise

    def close(self):
        '''release the memory map and the underlying file'''
        self.mmap.close()
        self.file.close()

    def __load_number(self, offset):
        '''load a number from the map'''
        return unpack('Q', self.mmap[offset:offset + BLOCKSIZE])[0]

    def __store_number(self, offset, number):
        '''store a number in the map'''
        self.mmap[offset:offset + BLOCKSIZE] = pack('Q', number)

    def __allocate(self, size):
        '''allocate a block of size payload bytes, return its handle'''
        # where's the end of the used space?
        first_free = self.__load_number(HEADER_FIRST_FREE)
        # where will it come to once we've allocated (length word + payload)
        new_size = first_free + BLOCKSIZE + size
        # make sure we have enough space
        if new_size > len(self.mmap):
            self.mmap.resize(new_size)
        # store the new end
        self.__store_number(HEADER_FIRST_FREE, new_size)
        # store the length in the word just before the payload
        self.__store_number(first_free, size)
        # return the offset to put data in
        return first_free + BLOCKSIZE

    def __handle_size(self, handle):
        '''return the payload size in bytes of the handle'''
        return self.__load_number(handle - BLOCKSIZE)

    def __get_handle(self, handle):
        '''return the raw payload bytes of the handle'''
        return self.mmap[handle:handle + self.__handle_size(handle)]

    def __set_handle(self, handle, data):
        '''overwrite the payload of the handle; data must be the same size'''
        self.mmap[handle:handle + self.__handle_size(handle)] = data

    def get_root(self):
        '''return the handle of the root node (0 if there is none)'''
        return self.__load_number(HEADER_ROOT)

    def set_root(self, handle):
        '''record the handle of the root node in the header'''
        return self.__store_number(HEADER_ROOT, handle)

    def store_numbers(self, numbers):
        '''allocate storage and store numbers. return a handle.'''
        fmt = 'Q' * len(numbers)
        handle = self.__allocate(calcsize(fmt))
        self.__set_handle(handle, pack(fmt, *numbers))
        return handle

    def store_text(self, text):
        '''allocate storage and store text as UTF-8. return a handle.'''
        # FIXME: pad to 64-bit boundary for performance?
        utf8 = text.encode('utf-8')
        handle = self.__allocate(len(utf8))
        self.__set_handle(handle, utf8)
        return handle

    def update_numbers(self, handle, numbers):
        '''overwrite the numbers in an existing handle

        The count of numbers must match what the handle already holds.
        '''
        fmt = 'Q' * len(numbers)
        assert calcsize(fmt) == self.__handle_size(handle)
        self.__set_handle(handle, pack(fmt, *numbers))

    def load_numbers(self, handle):
        '''fetch the tuple of numbers stored at a handle'''
        # floor division keeps the count an integer on every Python version
        num_numbers = self.__handle_size(handle) // BLOCKSIZE
        return unpack('Q' * num_numbers, self.__get_handle(handle))

    def load_text(self, handle):
        '''fetch text stored at a handle, decoded from UTF-8'''
        # FIXME: drop trailing '\0'?
        return self.__get_handle(handle).decode('utf-8')

    def free(self, handle):
        '''mark a handle as unused (not implemented: space is not reused)'''
        pass
class Tripe(object):
    '''a text index'''
    # magic number stored first in the file header; a bytes literal unpacks
    # to the same value on Python 2 and also works on Python 3
    MAGIC = unpack('Q', b'Tripe001')[0]

    def __init__(self, store):
        '''attach the index to store, creating the root node if needed

        store must provide get_root/set_root and the number/text storage
        calls used by TrieNode and TermInstance.
        '''
        self.store = store
        # read the header once; 0 means no root has been stored yet
        root_handle = store.get_root()
        if root_handle == 0:
            # no root found - we should make one
            self.root = TrieNode(self, None)
            # store its location
            store.set_root(self.root.handle)
        else:
            # load the root node
            self.root = TrieNode(self, root_handle)
def search(self, phrase, exact=False):
'''match the phrase'''
@@ -46,13 +160,12 @@ def search(self, phrase, exact=False):
return instances
- def add(self, text, docid):
- doc = Document(docid, text)
+ def add(self, text, doc):
tokens = list(tokenize(text))
tokens.reverse()
next = None
for off, stemmed, raw in tokens:
- ti = TermInstance(doc, off, raw, next)
+ ti = TermInstance(self, None, doc, off, raw, next)
self.root.add(stemmed, ti)
next = ti
@@ -61,42 +174,85 @@ def dot(self):
self.root.dot()
print '}'
- def write(self, file):
- '''make sure we're at the start of the file'''
- assert file.tell() == 0
- # write 16 64bit ints
- # magic 'Tripe001"
- # root node offset
- # 14 x 0 for future space
- file.write(pack('QQQQQQQQQQQQQQQQ', Tripe.MAGIC, 0, *((0,)*14)))
- # write all of the nodes, get the offset of the root node
- root_off = self.root.write(file)
- # write it to the second place in the file
- file.seek(calcsize('Q'))
- file.write(pack('Q', root_off))
-
-
class TrieNode(object):
    '''a node in the trie

    A node is stored as a record of two numbers (matches_handle,
    children_handle). Each is 0 while empty, otherwise the handle of a
    number list: the handles of matching term instances, or flattened
    (character code, child node handle) pairs sorted by character code.
    '''

    def __init__(self, tripe, handle):
        '''wrap the node stored at handle, or allocate one if handle is None'''
        self.tripe = tripe
        # dot() tests this attribute, so it has to exist on every node
        self.dotname = None
        # if this is a new node, allocate it with no matches and no children
        if handle is None:
            handle = tripe.store.store_numbers((0, 0))
        self.handle = handle

    def __record(self):
        '''returns the stored (matches_handle, children_handle) pair'''
        return self.tripe.store.load_numbers(self.handle)

    def __matches(self):
        '''returns list of match handles'''
        matches_handle, _ = self.__record()
        if matches_handle == 0:
            return []
        return self.tripe.store.load_numbers(matches_handle)

    def __children(self):
        '''returns list of pairs of (key, handle)'''
        _, children_handle = self.__record()
        if children_handle == 0:
            return []
        flat = self.tripe.store.load_numbers(children_handle)
        # a real list, because add() appends to and sorts the result
        return list(zip(flat[::2], flat[1::2]))

    def __find_child(self, name):
        '''returns the child node keyed by character code name, or None'''
        # FIXME: binary search, or something better
        for key, handle in self.__children():
            if key == name:
                return TrieNode(self.tripe, handle)
        return None

    def search(self, term):
        '''returns the term instances stored under term, [] if none'''
        if term == '':
            return [TermInstance(self.tripe, h) for h in self.__matches()]
        child = self.__find_child(ord(term[0]))
        if child is None:
            return []
        return child.search(term[1:])

    def add(self, term, value):
        '''record value (an object with a handle) under term'''
        store = self.tripe.store
        if term == '':
            # the stored list can't grow in place: write a longer copy
            matches = list(self.__matches())
            matches.append(value.handle)
            new_matches = store.store_numbers(matches)
            # point the record at the new list
            old_matches, children_handle = self.__record()
            store.update_numbers(self.handle, (new_matches, children_handle))
            # free the old matches array
            if old_matches:
                store.free(old_matches)
            return
        # look for an existing child node
        character = ord(term[0])
        child = self.__find_child(character)
        if child is None:
            # none, so create one and rewrite the children list
            child = TrieNode(self.tripe, None)
            children = self.__children()
            children.append((character, child.handle))
            children.sort()
            # flatten the pairs for storage
            children_flat = [number for pair in children for number in pair]
            new_children = store.store_numbers(children_flat)
            # point the record at the new list
            matches_handle, old_children = self.__record()
            store.update_numbers(self.handle, (matches_handle, new_children))
            # free the old children array
            if old_children:
                store.free(old_children)
        child.add(term[1:], value)
def dot(self, label=''):
if self.dotname:
@@ -109,43 +265,48 @@ def dot(self, label=''):
print '%s -> %s' % (self.dotname, m.dot())
return self.dotname
- def write(self, file):
- if self.offset:
- return self.offset
-
- # force all matches and children to be written, collect offsets
- match_offsets = [m.write(file) for m in self.matches]
- child_offsets = [(k, v.write(file)) for k,v in self.children.items()]
-
- # now prepare to write the record for this trie node
- self.offset = file.tell() # remember where we're writing
- # write the number of matches and children
- file.write(pack('QQ', len(self.matches), len(self.children)))
- # write the matches offsets
- file.write(pack('Q'*len(match_offsets), *match_offsets))
- # write each of the char/offset pairs for the children
- child_offsets.sort()
- for k,o in child_offsets:
- file.write(pack('QQ', ord(k), o))
-
- return self.offset
-
class TermInstance(object):
    '''an instance of a term in a document

    Stored as a record of four numbers: (doc, offset, raw_handle,
    next_handle). raw_handle points at the original text of the term and
    next_handle at the following term instance (0 if there is none).
    '''

    def __init__(self, tripe, handle, doc=None, offset=None, raw=None, next=None):
        '''load the instance stored at handle, or store a new one

        When handle is None a new record is written from doc, offset, raw
        and next (the following TermInstance, or None). Otherwise those
        arguments are ignored and the values are read from the store.
        '''
        self.tripe = tripe
        self.dotname = None
        # if this is a new term instance, allocate it
        if handle is None:
            # keep the supplied values
            self.doc = doc
            self.offset = offset
            # keep the text too, so matches_exact() and repr() work on a
            # freshly created instance just as on a loaded one
            self.raw = raw
            self.__next = next
            if next is not None:
                self.next_handle = next.handle
            else:
                self.next_handle = 0
            self.raw_handle = tripe.store.store_text(raw)
            handle = tripe.store.store_numbers(
                (doc, offset, self.raw_handle, self.next_handle))
        else:
            # load the values
            self.doc, self.offset, self.raw_handle, self.next_handle = \
                tripe.store.load_numbers(handle)
            self.raw = tripe.store.load_text(self.raw_handle)
            # the following instance is loaded on demand by next()
            self.__next = None
        self.handle = handle

    def next(self):
        '''return the following term instance in the document, or None'''
        if self.__next is not None:
            return self.__next
        if self.next_handle == 0:
            return None
        # load it once and remember it for later calls
        self.__next = TermInstance(self.tripe, self.next_handle)
        return self.__next

    def matches_exact(self, raw):
        '''is raw equal to the original text of this term?'''
        return self.raw == raw
def matches_phrase(self, phrase, exact=False):
- instance = self.next
+ instance = self.next()
for off, stemmed, raw in phrase:
# if we run out of words in the document we fail
if not instance: return False
@@ -162,11 +323,11 @@ def matches_phrase(self, phrase, exact=False):
return False
# this word looked good, next please
- instance = instance.next
+ instance = instance.next()
return True
def __repr__(self):
- return 'TermInstance<%s, off=%s, raw=%s>' % (`self.doc`, self.off, `self.raw`)
+ return 'TermInstance<doc=%s, offset=%s, raw=%s>' % (`self.doc`, self.offset, `self.raw`)
def dot(self):
if self.dotname:
@@ -177,46 +338,17 @@ def dot(self):
print '%s -> %s [style=dashed]' % (self.dotname, self.next.dot())
return self.dotname
- def write(self, file):
- if self.offset: return self.offset
-
- # write (and get the offset for) the next term instance
- next_offset = 0
- if self.next:
- next_offset = self.next.write(file)
-
- # write the raw string version for reference
- # FIXME: avoid duplication, other instances of the same term may be the same
- raw_offset = file.tell()
- utf8 = self.raw.encode('utf8') + '\0'
- # pad to 64 bits
- while len(utf8) % calcsize('Q'): utf8 = utf8 + '\0'
- # write it out
- file.write(utf8)
-
- # write the record for this instance
- self.offset = file.tell()
- file.write(pack('QQQQ', self.doc.docid, self.off, raw_offset, next_offset))
-
- return self.offset
-
-
-class Document(object):
- '''a document contains terms'''
- def __init__(self, docid, text):
- self.docid = docid
- self.text = text
- def __repr__(self):
- return 'Document<%s>' % self.docid
-tripe = Tripe()
-tripe.add('Hello world', 1)
-tripe.add('Hello, World', 2)
-tripe.add('Goodbye, cruel world...', 3)
-tripe.add('This is a test.', 4)
-tripe.add('This is not a pipe', 5)
-tripe.add('Thistle, bristle and whistle!', 6)
-tripe.add('A bird in the hand is worth two in the bush.', 7)
# Demo: open the index kept in /tmp/test.tripe read-only.
# NOTE: Tripe() allocates a root node when the store has none, which
# writes to the store - so build the index with the writable variant
# below before opening it read-only.
tripe = Tripe(TripeStore('/tmp/test.tripe', False))
#tripe = Tripe(TripeStore('/tmp/test.tripe', True))
#tripe.add('Hello world', 1)
#tripe.add('Hello, World', 2)
#tripe.add('Goodbye, cruel world...', 3)
#tripe.add('This is a test.', 4)
#tripe.add('This is not a pipe', 5)
#tripe.add('Thistle, bristle and whistle!', 6)
#tripe.add('A bird in the hand is worth two in the bush.', 7)
#tripe.dot()
#tripe.write(TripeStore(open('/tmp/test.tripe', 'w')))
#tripe = Tripe.read(TripeStore(open('/tmp/test.tripe')))
Please sign in to comment.
Something went wrong with that request. Please try again.