Permalink
Browse files

Memoization of the computation of the edit distance: all calls to computeEditDistance() with the same params are really computed only once.
  • Loading branch information...
1 parent a3622a8 commit b3092ed728fda12deda7318d37dd0a7d15ce886b @julosaure committed Apr 9, 2012
Showing with 65 additions and 10 deletions.
  1. +5 −1 batcher.py
  2. +35 −0 memoization.py
  3. +12 −9 multiAligner.py
  4. +1 −0 pyLatticeAligner.py
  5. +12 −0 sentence.py
View
@@ -1,12 +1,15 @@
#!/usr/bin/python
-import glob, fileinput, subprocess, sys, argparse, os.path
+import glob, fileinput, subprocess, sys, argparse, os.path, datetime
MALIGNER = "/Users/julien/workspaces/xp/hcomp12/pyLatticeAligner/pyLatticeAligner.py"
def main(directory, refFile, opts=None):
+ begin = datetime.datetime.now()
+ print "End after "+ str(datetime.datetime.now()-begin)
+
outFile = os.path.join(directory,refFile[:-4] + ".aligned")
out = open(outFile, "w")
@@ -37,6 +40,7 @@ def main(directory, refFile, opts=None):
i += 1
out.close()
+ print "Total time: "+ str(datetime.datetime.now()-begin)
if __name__ == "__main__":
View
@@ -0,0 +1,35 @@
+#!/usr/bin/python
+
+from functools import wraps
+import cPickle
+
def memo(func):
    """Memoize the wrapped function.

    The result of each call to ``func`` is stored in a dictionary that is
    private to this decorated function.  Subsequent calls with the same
    parameters cost only a dictionary lookup.  Entries are never evicted,
    and the very same result object is handed back on every cache hit.

    The cache key is built from the string representation (``str``) of
    every positional argument and of every keyword argument (name *and*
    value).  Going through ``str`` lets mutable, unhashable parameters
    such as lists be used, but it means two arguments that print the same
    are treated as the same call.  Here we use it for Sentence and
    Alignment data structures, for which we have a string representation.

    If such a representation is not easily available, a key built with
    ``cPickle.dumps(args, 1) + cPickle.dumps(kwds, 1)`` can be used
    instead; it works but is very slow.

    Args:
        func: the callable to memoize.

    Returns:
        A wrapper with the same name and docstring as ``func`` that
        consults the cache before calling ``func``.
    """
    cache = {}

    @wraps(func)
    def wrapper(*args, **kwds):
        # Build explicit lists instead of relying on map() returning a list,
        # so the key holds the arguments' text on every Python version.
        positional = [str(arg) for arg in args]
        # Keyword values must be part of the key, otherwise calls differing
        # only in a keyword value would collide.  Sorting by name makes the
        # key independent of the order the keywords were passed in.
        named = sorted((name, str(value)) for name, value in kwds.items())
        keystr = str(positional) + str(named)

        # Call func only if it was not previously computed with these args.
        if keystr not in cache:
            cache[keystr] = func(*args, **kwds)

        return cache[keystr]

    return wrapper
+
View
@@ -5,6 +5,7 @@
from sentence import *
from lalign import *
from editDistance import *
+from memoization import memo
class MultiAligner:
@@ -56,7 +57,8 @@ def computeDistanceMatrix2(self, lAlign):
for j in xrange(i+1, nbSentence):
s1 = lAlign.getSentOrAlignAtPos(i)
s2 = lAlign.getSentOrAlignAtPos(j)
- editMat, finalCell = self.computeEditDistance(s1, s2)
+ #editMat, finalCell = self.computeEditDistance(s1, s2)
+ finalCell = self.computeEditDistance(s1, s2)
distMat[i,j] = finalCell.val
return distMat
@@ -107,15 +109,15 @@ def alignItems(self, a1, a2, sentencesToAlign):
return align
def alignAlignments(self, a1, a2):
- editMat, finalCell = self.computeEditDistance(a1, a2)
- print editMat
+ finalCell = self.computeEditDistance(a1, a2)
+ #print editMat
cell = finalCell
while cell.i > 0 or cell.j > 0:
s = cell.pp()
if cell.prev is not None:
s += " " + cell.prev.pp()
- print s
+ #print s
prevPos = max(0, cell.i-1)
@@ -154,7 +156,7 @@ def alignAlignments(self, a1, a2):
for nSent in a2.alignedSentences:
a1[prevPos].add(SentPos(nSent, -1))
- print a1.sentAlign(a2.alignedSentences)
+ #print a1.sentAlign(a2.alignedSentences)
cell = cell.prev
a1.alignedSentences.extend(a2.alignedSentences)
@@ -164,7 +166,7 @@ def alignAlignments(self, a1, a2):
def alignSentenceVsAlignment(self, a1, a2):
n2, s2 = a2 #align.lSentence[n2]
#print s2
- editMat, finalCell = self.computeEditDistance(a1, s2)
+ finalCell = self.computeEditDistance(a1, s2)
#print editMat
cell = finalCell
@@ -213,7 +215,7 @@ def alignSentenceVsAlignment(self, a1, a2):
def alignSentencePair(self, a1, a2, sentencesToAlign):
n1, s1 = a1 #sentencesToAlign[n1]
n2, s2 = a2 #sentencesToAlign[n2]
- editMat, finalCell = self.computeEditDistance(s1, s2)
+ finalCell = self.computeEditDistance(s1, s2)
#print editMat
align = Alignment(sentencesToAlign)
@@ -256,6 +258,7 @@ def computeDistanceMatrix(self, sentenceToAlign):
distMat[i,j] = finalCell.val
return distMat
+ @memo
def computeEditDistance(self, s1, s2):
""" Compute the edit distance betweem to items, either Sentences or Alignments.
"""
@@ -277,8 +280,8 @@ def computeEditDistance(self, s1, s2):
mat[i,j] = DistCell(i, j, prev[0], prev[1])
#print i, j, prev[0], prev[1]
#print mat
- return mat, mat[l1,l2]
-
+ #return mat, mat[l1,l2]
+ return mat[l1,l2]
class DistCell():
""" A cell of the edit distance matrix, composed of its positions i and j in the matrix, of its value val, and of a pointer to its predecessor cell (chosen during the dynamic programming step that created this cell).
View
@@ -3,6 +3,7 @@
import fileinput, argparse
import nltk, nltk.data
import sentence, multiAligner, lattice
+import cProfile
# default tagger in NLTK
POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
View
@@ -23,6 +23,18 @@ def __str__(self):
def pp(self):
return str([tok.pp() for tok in self])
def __eq__(self, other):
    """Tell whether ``other`` is a Sentence equal to this one.

    Two sentences are equal when ``other`` is a Sentence, both have the
    same length, and no pair of tokens at the same position compares
    as different.
    """
    if not isinstance(other, Sentence):
        return False
    if len(self) != len(other):
        return False
    # any() stops at the first differing pair of tokens.
    return not any(mine != theirs for mine, theirs in zip(self, other))
+
+
@total_ordering
class Token:

0 comments on commit b3092ed

Please sign in to comment.