Permalink
Browse files

Add overview comments for each module

Ignore-this: e10c63c78b17025f443fe0abd98e1cf4

darcs-hash:20110129081647-1e580-6ca7ccd160493031e47a2148b53646bba3e4c4aa.gz
  • Loading branch information...
jamii committed Jan 29, 2011
1 parent a3323aa commit d858c3ae1e0b6df4bd706ee82c4d20d6c5b7a694
Showing with 42 additions and 0 deletions.
  1. +3 −0 scripts/db.py
  2. +2 −0 scripts/preprocessor.py
  3. +3 −0 scripts/replay.py
  4. +3 −0 scripts/search.py
  5. +3 −0 scripts/test.py
  6. +3 −0 scripts/times.py
  7. +2 −0 src/dynArray.ml
  8. +2 −0 src/hashset.ml
  9. +2 −0 src/pid.ml
  10. +2 −0 src/suffix.ml
  11. +17 −0 src/suffix_array.ml
View
@@ -1,4 +1,7 @@
#!/usr/bin/env python
+
+""" Handles parsing Springer documents and adding/deleting document entries to/from couchdb """
+
import re
import sys, httplib, urllib
from xml.dom import minidom
View
@@ -1,5 +1,7 @@
#!/bin/env python
+""" Parses and preprocesses LaTeX formulae using PlasTeX """
+
import string, re
from plasTeX import TeXFragment, TeXDocument
import plasTeX.Context
View
@@ -1,4 +1,7 @@
#!/usr/bin/env python
+
+""" Replay existing searches found in couchdb logs """
+
import sys
import re
import search
View
@@ -1,4 +1,7 @@
#!/bin/env python
+
+""" Python interface to the index external service running on couchdb """
+
import urllib
import time
from xml.dom import minidom
View
@@ -1,4 +1,7 @@
#!/bin/env python
+
+""" Whitebox testing of the index external service running on couchdb """
+
import os, sys, httplib, urllib, socket
from xml.dom import minidom
from util import decodeDoi
View
@@ -1,4 +1,7 @@
#!/bin/env python
+
+""" Benchmarking for the index external service running on couchdb """
+
import os, sys, httplib, urllib, socket
import random
import couchdb.client
View
@@ -1,3 +1,5 @@
+(* Modified version of ExtLib DynArray - contains no functional values, so it is safer to use with Marshal *)
+
(*
* DynArray - Resizeable Ocaml arrays
* Copyright (C) 2003 Brian Hurt
View
@@ -1,3 +1,5 @@
+(* Simple sets using Hashtbl *)
+
type 'a t = ('a, unit) Hashtbl.t
let create = Hashtbl.create
View
@@ -1,3 +1,5 @@
+(* Prevents multiple update processes from running in parallel *)
+
let lock () =
try
Util.flush_line "Checking pid file";
View
@@ -1,3 +1,5 @@
+(* Packed representations of suffixes of strings. Used by suffix_array *)
+
type id = int
type pos = int
View
@@ -1,3 +1,8 @@
+(*
+Suffix arrays storing compressed latex formulae.
+Allows neighbourhood search by Latex.distance
+*)
+
open Util
type id = Suffix.id
@@ -43,6 +48,7 @@ let insert sa (opaque, latex) =
DynArray.add sa.deleted false;
id
+(* a little convoluted to keep memory usage as low as possible *)
let prepare sa =
let ids = List.map (insert sa) sa.unsorted in
sa.unsorted <- [];
@@ -85,6 +91,8 @@ let leq sa latexL (id,pos) =
let latexR = DynArray.get sa.latexs id in
(Latex.compare_suffix (latexL,0) (latexR,pos)) <= 0
+(* Exact searching *)
+
(* binary search *)
let gather_exact ids sa latex =
(* find beginning of region *)
@@ -118,6 +126,13 @@ let find_exact sa latex =
filter_deleted sa ids;
List.map (exact_match sa) (Hashset.to_list ids)
+(* Searching by Latex.distance *)
+
+(*
+The logic behind the approx search is as follows:
+Suppose Latex.distance latex corpus_term < k
+Then List.exists (fun fragment -> Latex.distance fragment corpus_term = 0) (Latex.fragments latex k)
+*)
let gather_approx sa precision latex =
let k = Latex.cutoff precision latex in
let ids = Hashset.create 0 in
@@ -138,6 +153,8 @@ let find_approx sa precision latex =
filter_deleted sa ids;
Util.filter_map (approx_match sa precision latex) (Hashset.to_list ids)
+(* Searching by Query.distance *)
+
let rec gather_query sa precision query =
match query with
| Query.Latex (latex, _) -> gather_approx sa precision latex

0 comments on commit d858c3a

Please sign in to comment.