Permalink
Browse files

simple inverted index via TC's key-value store

  • Loading branch information...
igrigorik committed Jul 8, 2009
1 parent 7d6e8b1 commit 00d3f1110288a114d066863322752905129693e5
Showing with 260 additions and 0 deletions.
  1. +37 −0 inverted-index/README.rdoc
  2. +212 −0 inverted-index/inverted-index.lua
  3. +11 −0 inverted-index/inverted-index.rb
View
@@ -0,0 +1,37 @@
+= Inverted Index with Lua and Tokyo Cabinet
+
+An inverted index is an index data structure storing a mapping from content, such as words or numbers,
+to its locations in a database file, or in a document or a set of documents, in this case allowing full
+text search [more on Wikipedia].
+
+== Starting server with the inverted-index extension
+ > ttserver -ext inverted-index.lua test.tch
+
+== Executing from command line
+ > tcrmgr ext localhost search "hello" 5 -- search for "hello" in empty DB, returns 0 results
+ 0
+ > tcrmgr ext localhost put 1 "hello awesome world" -- store document with ID 1
+ ok
+ > tcrmgr ext localhost search "hello" 5 -- finds 1 document, and list of ids: 1
+ 1
+ 1
+ > tcrmgr ext localhost put 2 "hello world" -- store another document with ID 2
+ ok
+ > tcrmgr ext localhost search "hello" 5 -- search for "hello": 2 results, ids: 2,1
+ 2
+ 2
+ 1
+ > tcrmgr ext localhost search "awesome" 5 -- search for "awesome": 1 result, ids: 1
+ 1
+ 1
+
+== Executing via Ruby
+
+ > ruby inverted-index.rb
+ > ok
+ > ok
+ > 2
+ > 124
+ > 123
+
+Source: http://opensource.plurk.com/LightCloud/Inverted_index_by_the_Lua_extension_of_Tokyo_Tyrant/
@@ -0,0 +1,212 @@
+--
+-- Inverted index by the Lua extension of Tokyo Tyrant
+--
+
+
+-- Tunable constants (globals read by the functions below)
+DELIMS = " \t\r\n" -- token delimiters; spliced into the character class that _tokenize builds
+LIMNUM = 2000 -- upper bound on the IDs kept per token when put() rewrites a posting list
+DEFMAX = 10 -- number of IDs search() returns when no valid maximum is given
+
+
+-- Startup callback: passes a start-up message to _log.
+-- NOTE(review): presumably invoked by the Tokyo Tyrant script extension when
+-- the server starts -- confirm against the host documentation.
+function _begin()
+ _log("Inverted index started")
+end
+
+-- Shutdown callback: passes a shutdown message to _log.
+-- NOTE(review): presumably invoked by the Tokyo Tyrant script extension when
+-- the server stops -- confirm against the host documentation.
+function _end()
+ _log("Inverted index finished")
+end
+
+-- Register a text in the index.
+--
+-- The text is split with _tokenize and `id` is appended to the packed ID list
+-- stored under every distinct token.
+--
+-- id:   document ID; anything tonumber() accepts, must be >= 1
+-- text: document text
+-- Returns "ok" on success; nil on invalid arguments or when a lock or write
+-- fails. A failure part-way through leaves the tokens processed so far updated.
+function put(id, text)
+  id = tonumber(id)
+  if not id or id < 1 then
+    return nil
+  end
+  if not text then
+    return nil
+  end
+  local tokens = _tokenize(text)
+  -- With probability 5 / LIMNUM this call takes the slow path: each list is
+  -- read, cut down to its newest LIMNUM - 1 IDs and rewritten with `id`
+  -- appended. All other calls simply concatenate the packed ID.
+  local trim = math.random() < 5 / LIMNUM
+  local packed_id
+  if not trim then
+    packed_id = _pack("w", id)
+  end
+  for i = 1, #tokens do
+    -- `local` matters here: the original assigned a global named `token`.
+    local token = tokens[i]
+    if not _lock(token) then
+      _log("lock error")
+      return nil
+    end
+    local ok, errmsg
+    if trim then
+      local ids = {}
+      local idsel = _get(token)
+      if idsel then
+        ids = _unpack("w*", idsel)
+      end
+      local top = #ids - LIMNUM + 2
+      if top < 1 then
+        top = 1
+      end
+      local nids = {}
+      for j = top, #ids do
+        nids[#nids + 1] = ids[j]
+      end
+      nids[#nids + 1] = id
+      ok, errmsg = _put(token, _pack("w*", nids)), "put error"
+    else
+      ok, errmsg = _putcat(token, packed_id), "putcat error"
+    end
+    if not ok then
+      _log(errmsg)
+      _unlock(token)
+      return nil
+    end
+    _unlock(token)
+  end
+  return "ok"
+end
+
+-- Remove a text from the index.
+--
+-- For every distinct token of `text`, the stored ID list is read, every
+-- occurrence of `id` is dropped, and the remaining list is written back.
+--
+-- id:   document ID; anything tonumber() accepts, must be >= 1
+-- text: the text the document was indexed with
+-- Returns "ok" on success; nil on invalid arguments or when a lock or write
+-- fails. A failure part-way through leaves the tokens processed so far updated.
+function out(id, text)
+  id = tonumber(id)
+  if not id or id < 1 then
+    return nil
+  end
+  if not text then
+    return nil
+  end
+  local tokens = _tokenize(text)
+  for i = 1, #tokens do
+    -- `local` matters here: the original assigned a global named `token`.
+    local token = tokens[i]
+    if not _lock(token) then
+      _log("lock error")
+      return nil
+    end
+    local ids = {}
+    local idsel = _get(token)
+    if idsel then
+      ids = _unpack("w*", idsel)
+    end
+    local kept = {}
+    -- Lua sequences start at 1; the original started at 0 and pushed the
+    -- always-nil ids[0] through table.insert.
+    for j = 1, #ids do
+      if ids[j] ~= id then
+        kept[#kept + 1] = ids[j]
+      end
+    end
+    if not _put(token, _pack("w*", kept)) then
+      _log("put error")
+      _unlock(token)
+      return nil
+    end
+    _unlock(token)
+  end
+  return "ok"
+end
+
+-- Re-index a document whose text has changed.
+--
+-- id:     document ID; anything tonumber() accepts, must be >= 1
+-- befaft: the previous text, then a newline, then the new text; the first
+--         newline is taken as the separator
+-- Returns "ok" when both out() on the old text and put() on the new text
+-- succeed; nil on invalid arguments, a missing separator, or if either fails.
+function replace(id, befaft)
+  id = tonumber(id)
+  if not id or id < 1 or not befaft then
+    return nil
+  end
+  local sep = string.find(befaft, "\n", 1, true)
+  if not sep then
+    return nil
+  end
+  local old_text = string.sub(befaft, 1, sep - 1)
+  local new_text = string.sub(befaft, sep + 1)
+  -- put() only runs when out() succeeded
+  if out(id, old_text) and put(id, new_text) then
+    return "ok"
+  end
+  return nil
+end
+
+-- Search the index for documents containing every token of a phrase.
+--
+-- phrase: query text; split with _tokenize, the ID lists of all tokens are
+--         intersected
+-- max:    maximum number of IDs to list; DEFMAX is used when it is missing,
+--         not numeric, or negative
+-- Returns nil when `phrase` is missing. Otherwise returns a string whose first
+-- line is the total number of matching IDs, followed by up to `max` IDs, one
+-- per line, largest first; every line ends with "\n". A phrase without tokens
+-- yields "0\n".
+function search(phrase, max)
+  if not phrase then
+    return nil
+  end
+  max = tonumber(max)
+  if not max or max < 0 then
+    max = DEFMAX
+  end
+  local tokens = _tokenize(phrase)
+  local tnum = #tokens
+  if tnum < 1 then
+    return "0\n"
+  end
+  -- Intersect pairwise, one token at a time. (The original also carried a
+  -- commented-out variant that collected every ID list into one table and
+  -- called _isect once on it.)
+  local result = _unpack("w*", _get(tokens[1]))
+  for i = 2, tnum do
+    result = _isect(result, _unpack("w*", _get(tokens[i])))
+  end
+  table.sort(result)
+  local total = #result
+  -- Collect the output lines and join them once instead of growing a string
+  -- with ".." on every iteration.
+  local lines = { total }
+  local last = total - max + 1
+  if last < 1 then
+    last = 1
+  end
+  for i = total, last, -1 do
+    lines[#lines + 1] = result[i]
+  end
+  lines[#lines + 1] = "" -- gives the final line its trailing "\n"
+  return table.concat(lines, "\n")
+end
+
+-- Split a text into its distinct tokens.
+--
+-- A token is a maximal run of characters that are not in DELIMS. Each token
+-- is returned once, in order of first appearance.
+--
+-- text: string to split
+-- Returns an array of token strings (empty when the text has no tokens).
+function _tokenize(text)
+  local pattern = "[^" .. DELIMS .. "]+"
+  local seen = {}
+  local list = {}
+  for word in string.gmatch(text, pattern) do
+    -- the "+" in the pattern guarantees word is never empty
+    if not seen[word] then
+      seen[word] = true
+      list[#list + 1] = word
+    end
+  end
+  return list
+end
+
+-- END OF FILE
@@ -0,0 +1,11 @@
+require 'rubygems'
+require 'rufus/tokyo/tyrant' # sudo gem install rufus-tokyo
+
+t = Rufus::Tokyo::Tyrant.new('127.0.0.1', 1978)
+
+puts t.ext(:put, '123', 'hello world')
+puts t.ext(:put, '124', 'world of icecream')
+
+puts t.ext(:search, 'world', 5)
+
+t.close

0 comments on commit 00d3f11

Please sign in to comment.