Permalink
Browse files

Working version.

  • Loading branch information...
1 parent 5212cde commit a0084bfcac9780db5b5eeeb4f13a069904992b7d @schmmd schmmd committed Oct 4, 2012
@@ -3,6 +3,7 @@ package tool
package tokenize
import edu.washington.cs.knowitall.common.HashCodeHelper
+import edu.washington.cs.knowitall.collection.immutable.Interval
/** The most simple representation of a token. A token has a string
* and a character offset in the original text.
@@ -21,6 +22,8 @@ class Token(val string: String, val offset: Int) {
this.offset == that.offset
case _ => false
}
+
+ def interval = Interval.open(offset, offset + string.length)
}
object Token {
@@ -10,5 +10,5 @@ abstract class Typer[E <: Token](val name: String, val source: String) {
/** A typed annotation over a span of tokens.
  *
  * @param name     the label of this type (e.g. an entity tag)
  * @param source   the tool that produced this type
  * @param interval the span this type covers
  * @param text     the surface text of the span
  */
case class Type(val name: String, val source: String, val interval: Interval, val text: String) {
  /** Strings of the covered tokens, joined with single spaces. */
  def matchText[E <: Token](seq: Seq[E]): String = {
    val covered = seq.iterator.slice(interval.start, interval.end)
    covered.map(_.string).mkString(" ")
  }

  /** The sub-sequence of tokens covered by this type's interval. */
  def tokens[E <: Token](seq: Seq[E]): Seq[E] =
    seq.slice(interval.start, interval.end)
}
View
@@ -2,12 +2,12 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>nlptools-typer-stanford_2.9.2</artifactId>
- <version>2.2.2-SNAPSHOT</version>
+ <version>2.2.3-SNAPSHOT</version>
<name>nlptools-typer-stanford</name>
<parent>
<groupId>edu.washington.cs.knowitall.nlptools</groupId>
<artifactId>nlptools-parent_2.9.2</artifactId>
- <version>2.2.2-SNAPSHOT</version>
+ <version>2.2.2</version>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -0,0 +1,56 @@
+package edu.washington.cs.knowitall
+package tool
+package typer
+
+import java.net.URL
+import edu.stanford.nlp.ie.AbstractSequenceClassifier
+import edu.stanford.nlp.ie.crf.CRFClassifier
+import edu.stanford.nlp.util.Triple
+import edu.washington.cs.knowitall.collection.immutable.Interval
+import edu.washington.cs.knowitall.common.Resource.using
+import edu.washington.cs.knowitall.tool.tokenize._
+import java.io.BufferedInputStream
+import java.io.FileInputStream
+import java.util.zip.GZIPInputStream
+
/** A [[Typer]] backed by a Stanford sequence classifier (NER).
  *
  * @param classifier the Stanford classifier used to tag entities
  */
class StanfordNer(private val classifier: AbstractSequenceClassifier[_]) extends Typer[Token]("Stanford", "Stanford") {
  /** Run the classifier over `text` and align its character offsets with
    * the supplied tokens.
    *
    * NOTE(review): the resulting Type.interval holds *character* offsets
    * (the test expects Interval.open(36, 44) for "Stanford"), but
    * Type.matchText/tokens slice the token sequence by *index* — these two
    * conventions conflict; confirm which is intended before relying on
    * Type.interval downstream.
    *
    * @param text the original text the token offsets refer to
    * @param seq  tokens whose character intervals are matched against the
    *             classifier's entity spans
    * @return one Type per entity whose span aligns exactly with token
    *         boundaries; entities that do not align are dropped
    */
  def apply(text: String, seq: Seq[Token]) = {
    // Local import: only this method needs the Java<->Scala converters.
    import scala.collection.JavaConverters._

    val response = classifier.classifyToCharacterOffsets(text).asScala

    // Build the result purely instead of mutating a var; the original
    // prepended with ::=, so reverse to preserve its output order.
    val tags = response.iterator.flatMap { triple =>
      // Character span and label of the recognized entity.
      val nerInterval = Interval.open(triple.second, triple.third)
      val nerType = triple.first

      // Keep the entity only when its span starts and ends exactly on
      // token boundaries.
      val start = seq.find(_.interval.start == nerInterval.start).map(_.interval.start)
      val end = seq.find(_.interval.end == nerInterval.end).map(_.interval.end)

      for (s <- start; e <- end)
        yield new Type(this.name + nerType, "Stanford", Interval.open(s, e), text.substring(nerInterval.start, nerInterval.end))
    }.toList.reverse

    tags
  }

  /** Convenience overload: joins the token strings with single spaces.
    * Assumes the tokens' offsets were computed against a space-joined
    * rendering of the sentence — TODO confirm against callers. */
  def apply(seq: Seq[Token]) = apply(seq.iterator.map(_.string).mkString(" "), seq)
}
+
object StanfordNer {
  /** Default model resource located on the classpath.
    *
    * NOTE(review): this points at the `.prop` file, yet [[fromModelUrl]]
    * gunzips and deserializes a CRF model — the `.ser.gz` resource
    * (english.all.3class.distsim.crf.ser.gz) looks intended; confirm
    * against the packaged Stanford models. */
  final val defaultModelUrl = StanfordNer.getClass().getResource("/edu/stanford/nlp/models/ner/english.all.3class.distsim.prop")
  require(defaultModelUrl != null, "resource could not be found")

  /** Load a gzipped, serialized CRF classifier from `url`.
    *
    * Bug fix: the original ignored `url` entirely and opened a hard-coded
    * developer-local path (/home/michael/...); it now reads from the
    * supplied URL, so classpath and file URLs both work.
    *
    * @param url location of a gzipped serialized CRF model
    * @return a StanfordNer wrapping the deserialized classifier
    */
  def fromModelUrl(url: URL) = {
    using (url.openStream()) { stream =>
      new StanfordNer(CRFClassifier.getClassifier(new BufferedInputStream(new GZIPInputStream(stream))))
    }
  }

  /** A StanfordNer built from [[defaultModelUrl]]. */
  def withDefaultModel = {
    fromModelUrl(defaultModelUrl)
  }
}
@@ -1,34 +0,0 @@
-package edu.washington.cs.knowitall
-package tool
-package parse
-
-class StanfordNer(private val classifier: AbstractSequenceClassifier[_]) extends Typer[Token] {
- val DEFAULT_MODEL = "edu/stanford/nlp/models/ner/all.3class.distsim.crf.ser.gz"
-
- override def apply(seq: Seq[Token]) = {
- val response = classifier.classifyToCharacterOffsets(sentence.originalText)
-
- var tags = List.empty[Type]
- for (triple <- response) {
- val nerInterval = Interval.open(triple.second, triple.third)
- val nerType = triple.first
-
- // find actual token offsets from NER offsets
- val start = seq.find(_.offset.start == nerInterval.start)
- val end = seq.find(_.offset.end == nerInterval.end)
-
- for (s <- start; e <- end) {
- val typ = new Type(this.descriptor + nerType, "Stanford", Interval.open(s, e), seq.slice(s, e))
- tags ::= new Type(this.descriptor + nerType, "Stanford", range)
- }
- }
-
- tags
- }
-}
-
-object StanfordNer {
- def withDefaultModel = {
- new StanfordNer(this.classifier = CRFClassifier.getClassifierNoExceptions(model))
- }
-}
@@ -0,0 +1,24 @@
+package edu.washington.cs.knowitall
+package tool
+package typer
+
+import org.junit.runner.RunWith
+import org.specs2.mutable.Specification
+import org.specs2.runner.JUnitRunner
+import edu.washington.cs.knowitall.tool.tokenize.Tokenizer
+import edu.washington.cs.knowitall.collection.immutable.Interval
+
@RunWith(classOf[JUnitRunner])
object StanfordNerTest extends Specification {
  "stanford ner example sentence" in {
    // One named entity ("Stanford") in an otherwise plain sentence.
    val text = "This is an example sentence for the Stanford named entity recognizer ."
    val split = text.split(" ")
    val tokens = Tokenizer.computeOffsets(split, text)

    val ner = StanfordNer.withDefaultModel
    val types = ner(tokens)

    // Bug fix: the original used bare boolean expressions (==) whose
    // results were discarded, so this example could never fail.  Use
    // specs2 matchers so mismatches actually fail the spec.
    types.size must_== 1
    // Character offsets of "Stanford" within `text`.
    types.head.interval must_== Interval.open(36, 44)
  }
}
@@ -1,29 +0,0 @@
-package edu.washington.cs.knowitall
-package tool
-package parse
-
-import org.junit._
-import org.junit.Assert._
-import org.specs2.mutable.Specification
-import org.junit.runner.RunWith
-import org.specs2.runner.JUnitRunner
-
-@RunWith(classOf[JUnitRunner])
-object StanfordParserTest extends Specification {
- "constituency parse example sentence" in {
- val text = "This is a test of the Stanford Parser."
- val parser = new StanfordParser
-
- val constituency = parser.parse(text)
- constituency.toString must_== "(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT a) (NN test)) (PP (IN of) (NP (DT the) (NNP Stanford) (NNP Parser))))) (. .)))"
- }
-
- "dependency parse example sentence" in {
- val text = "This is a test of the Stanford Parser."
- val parser = new StanfordParser
-
- val dependency = parser.dependencyGraph(text, BaseStanfordParser.CCCompressed)
- dependency.toString must_== "(of_IN_4_15); (._._8_37); nsubj(test_NN_3_10, This_DT_0_0); cop(test_NN_3_10, is_VBZ_1_5); det(test_NN_3_10, a_DT_2_8); prep_of(test_NN_3_10, Parser_NNP_7_31); det(Parser_NNP_7_31, the_DT_5_18); nn(Parser_NNP_7_31, Stanford_NNP_6_22)"
- }
-}
-

0 comments on commit a0084bf

Please sign in to comment.