This repository has been archived by the owner on Feb 15, 2024. It is now read-only.

Commit

Merge pull request #33 from afader/master
fixed bug in computeOffsets
Rob Bart committed Dec 3, 2013
2 parents 65e1c66 + b07975e, commit e8246d0
Showing 3 changed files with 15 additions and 21 deletions.
core/src/main/scala/edu/knowitall/tool/tokenize/Tokenizer.scala (28 changes: 7 additions & 21 deletions)
@@ -17,28 +17,14 @@ object Tokenizer {
   * It adds offset information to the strings by tracing through
   * the source sentence and skipping whitespace. */
  def computeOffsets(strings: TraversableOnce[String], sentence: String) = {
-    var sent: Array[Char] = sentence.toCharArray()
-    var offset: Int = 0
-    var tokens: Seq[Token] = Seq.empty
-
-    // remove leading spaces
-    val (spaces, rest) = sent.span(c => c.isWhitespace || c.isControl)
-    offset += spaces.size
-    sent = rest
-
-    for (string <- strings) {
-      val leftOffset = offset
-      assume(sent startsWith string, "Wrong sentence prefix: '" + string + "' of " + "'" + sentence + "'")
-
-      sent = sent.drop(string.length)
-      val skip = sent.takeWhile(c => c.isWhitespace || c.isControl).length
-      sent = sent.drop(skip)
-
-      offset += string.length + skip
-      tokens = tokens :+ Token(string, leftOffset)
+    var searchIndex = 0
+    val tokens = for (s <- strings) yield {
+      val startIndex = sentence.indexOf(s, searchIndex)
+      assume(startIndex >= 0, s"Could not find offset of '$s' in '$sentence' starting at $searchIndex")
+      searchIndex = startIndex + s.size
+      Token(s, startIndex)
    }

-    tokens
+    tokens.toSeq
  }

  /** Rebuild the original text from tokens. This will maintain
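To see the fix end to end: the old version consumed a mutable prefix of the sentence and skipped anything matching isWhitespace || isControl between tokens, while the new version simply searches forward with indexOf. Below is a self-contained sketch of the new strategy; the simplified Token case class is a stand-in for the library's Token (which tracks an offsets interval rather than a plain Int), and the Seq[String] signature is a simplification of the TraversableOnce one in the patch.

  object OffsetsDemo extends App {
    case class Token(string: String, offset: Int)

    // Same strategy as the patched computeOffsets: search forward from the end
    // of the previous match instead of consuming a mutable copy of the sentence.
    def computeOffsets(strings: Seq[String], sentence: String): Seq[Token] = {
      var searchIndex = 0
      for (s <- strings) yield {
        val startIndex = sentence.indexOf(s, searchIndex)
        assume(startIndex >= 0, s"Could not find offset of '$s' in '$sentence' starting at $searchIndex")
        searchIndex = startIndex + s.size
        Token(s, startIndex)
      }
    }

    println(computeOffsets(Seq("hello", "\u0097"), "hello \u0097"))
    // prints List(Token(hello,0), Token(?,6)), where ? is the invisible control
    // character. The old implementation threw an AssertionError on this input
    // because its whitespace/control skip consumed the "\u0097" token itself.
  }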
@@ -45,4 +45,11 @@ object TokenizerSpecTest extends Specification {
    // make sure we can go back to the original sentence
    Tokenizer.originalText(tokens, tokens.head.offset) must_== trimmedSentence
  }
+
+  "offsets are computed correctly when a token is a unicode control symbol" in {
+    val sentence = "hello \u0097"
+    val tokens = Tokenizer.computeOffsets(Seq("hello", "\u0097"), sentence)
+    tokens.map(_.offsets.start) must_== Seq(0, 6)
+    Tokenizer.originalText(tokens) must_== sentence
+  }
}
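The choice of U+0097 pins down the old bug: it is an ISO control character, so the old skip of c.isWhitespace || c.isControl consumed the token itself. A quick check of the predicates involved (an illustration, not part of the commit):

  '\u0097'.isWhitespace  // false
  '\u0097'.isControl     // true: 0x97 is in the ISO control range 0x7F-0x9F

  // Old code, after matching "hello": takeWhile(c => c.isWhitespace || c.isControl)
  // swallowed both the space and "\u0097", so the startsWith assumption failed on
  // the next token. New code: "hello \u0097".indexOf("\u0097", 5) == 6, correct.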
@@ -21,4 +21,5 @@ object ClearTokenizerTest extends Specification {
    val tokenizer = new ClearTokenizer()
    tokenizer(text).mkString(" ") must_== "rough@0 straight@7 and@16"
  }
+
}
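For context on the expected string: each token apparently renders as string@offset. A hedged illustration using the same simplified Token as above (the library's actual toString may differ in detail):

  case class Token(string: String, offset: Int) {
    override def toString = string + "@" + offset
  }

  Seq(Token("rough", 0), Token("straight", 7), Token("and", 16)).mkString(" ")
  // "rough@0 straight@7 and@16"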
