This repository has been archived by the owner on Feb 15, 2024. It is now read-only.

Commit

Merge pull request #33 from afader/master
fixed bug in computeOffsets
Rob Bart committed Dec 3, 2013
2 parents 65e1c66 + b07975e, commit e8246d0
Showing 3 changed files with 15 additions and 21 deletions.
core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala (28 changes: 7 additions & 21 deletions)
@@ -17,28 +17,14 @@ object Tokenizer {
   * It adds offset information to the strings by tracing through
   * the source sentence and skipping whitespace. */
  def computeOffsets(strings: TraversableOnce[String], sentence: String) = {
-    var sent: Array[Char] = sentence.toCharArray()
-    var offset: Int = 0
-    var tokens: Seq[Token] = Seq.empty
-
-    // remove leading spaces
-    val (spaces, rest) = sent.span(c => c.isWhitespace || c.isControl)
-    offset += spaces.size
-    sent = rest
-
-    for (string <- strings) {
-      val leftOffset = offset
-      assume(sent startsWith string, "Wrong sentence prefix: '" + string + "' of " + "'" + sentence + "'")
-
-      sent = sent.drop(string.length)
-      val skip = sent.takeWhile(c => c.isWhitespace || c.isControl).length
-      sent = sent.drop(skip)
-
-      offset += string.length + skip
-      tokens = tokens :+ Token(string, leftOffset)
+    var searchIndex = 0
+    val tokens = for (s <- strings) yield {
+      val startIndex = sentence.indexOf(s, searchIndex)
+      assume(startIndex >= 0, s"Could not find offset of '$s' in '$sentence' starting at $searchIndex")
+      searchIndex = startIndex + s.size
+      Token(s, startIndex)
    }

-    tokens
+    tokens.toSeq
  }

  /** Rebuild the original text from tokens. This will maintain
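To see the fix end to end: the old version consumed a mutable prefix of the sentence and skipped anything matching isWhitespace || isControl between tokens, while the new version simply searches forward with indexOf. Below is a self-contained sketch of the new strategy; the simplified Token case class is a stand-in for the library's Token (which tracks an offsets interval rather than a plain Int), and the Seq[String] signature is a simplification of the TraversableOnce one in the patch.

  object OffsetsDemo extends App {
    case class Token(string: String, offset: Int)

    // Same strategy as the patched computeOffsets: search forward from the end
    // of the previous match instead of consuming a mutable copy of the sentence.
    def computeOffsets(strings: Seq[String], sentence: String): Seq[Token] = {
      var searchIndex = 0
      for (s <- strings) yield {
        val startIndex = sentence.indexOf(s, searchIndex)
        assume(startIndex >= 0, s"Could not find offset of '$s' in '$sentence' starting at $searchIndex")
        searchIndex = startIndex + s.size
        Token(s, startIndex)
      }
    }

    println(computeOffsets(Seq("hello", "\u0097"), "hello \u0097"))
    // prints List(Token(hello,0), Token(?,6)), where ? is the invisible control
    // character. The old implementation threw an AssertionError on this input
    // because its whitespace/control skip consumed the "\u0097" token itself.
  }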
@@ -45,4 +45,11 @@ object TokenizerSpecTest extends Specification {
    // make sure we can go back to the original sentence
    Tokenizer.originalText(tokens, tokens.head.offset) must_== trimmedSentence
  }
+
+  "offsets are computed correctly when a token is a unicode control symbol" in {
+    val sentence = "hello \u0097"
+    val tokens = Tokenizer.computeOffsets(Seq("hello", "\u0097"), sentence)
+    tokens.map(_.offsets.start) must_== Seq(0, 6)
+    Tokenizer.originalText(tokens) must_== sentence
+  }
}
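The choice of U+0097 pins down the old bug: it is an ISO control character, so the old skip of c.isWhitespace || c.isControl consumed the token itself. A quick check of the predicates involved (an illustration, not part of the commit):

  '\u0097'.isWhitespace  // false
  '\u0097'.isControl     // true: 0x97 is in the ISO control range 0x7F-0x9F

  // Old code, after matching "hello": takeWhile(c => c.isWhitespace || c.isControl)
  // swallowed both the space and "\u0097", so the startsWith assumption failed on
  // the next token. New code: "hello \u0097".indexOf("\u0097", 5) == 6, correct.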
@@ -21,4 +21,5 @@ object ClearTokenizerTest extends Specification {
    val tokenizer = new ClearTokenizer()
    tokenizer(text).mkString(" ") must_== "rough@0 straight@7 and@16"
  }
+
}
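For context on the expected string: each token apparently renders as string@offset. A hedged illustration using the same simplified Token as above (the library's actual toString may differ in detail):

  case class Token(string: String, offset: Int) {
    override def toString = string + "@" + offset
  }

  Seq(Token("rough", 0), Token("straight", 7), Token("and", 16)).mkString(" ")
  // "rough@0 straight@7 and@16"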
