Skip to content
Browse files

Experiments with a specific configuration of the Stanford Parser.

  • Loading branch information...
0 parents commit 454d6cb59e95b99f8ddc8b8c57330095a586b819 @jimtyhurst committed Apr 9, 2011
10 .classpath
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src"/>
+ <classpathentry kind="src" path="test/src"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
+ <classpathentry kind="lib" path="resources"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
+ <classpathentry kind="lib" path="lib/stanford-parser-2010-11-30.jar" sourcepath="/stanford-parser/src"/>
+ <classpathentry kind="output" path="bin"/>
+</classpath>
8 .gitignore
@@ -0,0 +1,8 @@
+bin
+classes
+lib
+*.jar
+*.gz
+
+# OS-generated files
+.DS_Store
17 .project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>stanford-parser-util</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ </natures>
+</projectDescription>
12 .settings/org.eclipse.jdt.core.prefs
@@ -0,0 +1,12 @@
+#Sat Apr 09 16:14:53 CDT 2011
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.6
21 README
@@ -0,0 +1,21 @@
+# stanford-parser-util-java
+
+Java utility classes to simplify use of the Stanford Parser for a specific configuration for parsing English sentences.
+
+My goal is to provide access to a parser that can handle simple English sentences, so that the surface structure can be used to generate a logical form that can be interpreted in a model-theoretic semantics. See http://github.com/jimtyhurst/generalized-quantifiers for the beginnings of such a semantic analyzer written in Clojure.
+
+## Usage
+
+See the unit tests in LexicalizedParserTest for examples.
+
+## Dependencies
+
+Available from http://nlp.stanford.edu/software/lex-parser.shtml:
+lib/stanford-parser-2010-11-30.jar
+resources/englishPCFG.ser.gz
+
+## License
+
+Copyright (C) 2011 Jim Tyhurst
+
+Distributed under the Eclipse Public License.
39 src/com/tyhurst/stanfordparser/util/EnglishParserModule.java
@@ -0,0 +1,39 @@
+package com.tyhurst.stanfordparser.util;
+
+import java.io.StringReader;
+
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.objectbank.TokenizerFactory;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.process.Tokenizer;
+import edu.stanford.nlp.process.WordTokenFactory;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+
+
+/**
+ * {@link ParserModule} implementation that assembles the Stanford Parser
+ * pieces for English: a {@link LexicalizedParser} loaded from a serialized
+ * grammar file, a PTB tokenizer producing {@link Word} tokens, and a
+ * {@link TreeUtil} backed by the Penn Treebank language pack.
+ */
+public class EnglishParserModule implements ParserModule {
+
+    /**
+     * Default grammar location. The path is relative, so it only resolves
+     * when the process is started from a directory that contains
+     * {@code resources/englishPCFG.ser.gz}.
+     */
+    private static final String PARSER_CONFIGURATION_FILE = "./resources/englishPCFG.ser.gz";
+
+    /** Grammar file handed to every parser built by this module. */
+    private final String parserConfigurationFile;
+
+    /**
+     * Creates a module that loads the grammar from the default location.
+     */
+    public EnglishParserModule() {
+        this(PARSER_CONFIGURATION_FILE);
+    }
+
+    /**
+     * Creates a module that loads the grammar from the given location, for
+     * callers whose working directory does not contain the default file.
+     *
+     * @param parserConfigurationFile path of the serialized grammar
+     * @throws IllegalArgumentException if {@code parserConfigurationFile} is null
+     */
+    public EnglishParserModule(String parserConfigurationFile) {
+        if (parserConfigurationFile == null) {
+            throw new IllegalArgumentException("parserConfigurationFile cannot be null.");
+        }
+        this.parserConfigurationFile = parserConfigurationFile;
+    }
+
+    /**
+     * Builds a new parser from the configured grammar file. A new instance
+     * is constructed on every call; callers that parse repeatedly may want
+     * to keep the result.
+     *
+     * @return a newly constructed parser
+     */
+    @Override
+    public LexicalizedParser buildParser() {
+        return new LexicalizedParser(parserConfigurationFile);
+    }
+
+    /**
+     * Builds a tokenizer that reads the given sentence.
+     *
+     * @param sentence text to tokenize; must not be null because it is
+     *        wrapped in a {@link StringReader}
+     * @return a tokenizer producing {@link Word} tokens
+     */
+    @Override
+    public Tokenizer<Word> buildTokenizer(String sentence) {
+        // NOTE(review): the 'false' flag is presumably PTBTokenizer's
+        // "tokenize newlines" switch -- confirm against the library docs.
+        TokenizerFactory<Word> factory = PTBTokenizer.factory(false, new WordTokenFactory());
+        return factory.getTokenizer(new StringReader(sentence));
+    }
+
+    /**
+     * Builds a {@link TreeUtil} preconfigured with a grammatical structure
+     * factory for the Penn Treebank language pack.
+     *
+     * @return a new tree utility
+     */
+    @Override
+    public TreeUtil buildTreeUtil() {
+        return new TreeUtil(buildGrammaticalStructureFactory());
+    }
+
+    /**
+     * Creates the grammatical structure factory from a fresh
+     * {@link PennTreebankLanguagePack}, passing it the language pack's own
+     * punctuation-word reject filter.
+     */
+    private GrammaticalStructureFactory buildGrammaticalStructureFactory() {
+        TreebankLanguagePack languagePack = new PennTreebankLanguagePack();
+        return languagePack.grammaticalStructureFactory(languagePack.punctuationWordRejectFilter());
+    }
+
+}
15 src/com/tyhurst/stanfordparser/util/ParserModule.java
@@ -0,0 +1,15 @@
+package com.tyhurst.stanfordparser.util;
+
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.process.Tokenizer;
+
+/**
+ * Factory for the collaborating objects needed to parse a sentence: the
+ * parser itself, a tokenizer for the input text, and a {@link TreeUtil}
+ * for post-processing the resulting trees.
+ */
+public interface ParserModule {
+
+    /**
+     * @return a parser ready to accept tokenized sentences
+     */
+    LexicalizedParser buildParser();
+
+    /**
+     * @param sentence the text to be split into words
+     * @return a tokenizer over {@code sentence} that yields {@link Word} tokens
+     */
+    Tokenizer<Word> buildTokenizer(String sentence);
+
+    /**
+     * @return a tree utility suitable for trees produced by {@link #buildParser()}
+     */
+    TreeUtil buildTreeUtil();
+
+}
49 src/com/tyhurst/stanfordparser/util/TreeUtil.java
@@ -0,0 +1,49 @@
+package com.tyhurst.stanfordparser.util;
+
+import java.util.List;
+
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreeFunctions;
+import edu.stanford.nlp.trees.TypedDependency;
+
+/**
+ * Helper for post-processing parse trees: converts trees to string-labeled
+ * form and extracts typed dependencies through a
+ * {@link GrammaticalStructureFactory}.
+ */
+public class TreeUtil {
+
+    /** Factory used by {@link #getTypedDependencies(Tree)}; may be null until set. */
+    private GrammaticalStructureFactory grammaticalStructureFactory;
+
+    /**
+     * Creates a utility without a factory. A factory must be supplied via
+     * {@link #setGrammaticalStructureFactory(GrammaticalStructureFactory)}
+     * or passed explicitly before dependencies can be extracted.
+     */
+    public TreeUtil() {
+    }
+
+    /**
+     * Creates a utility that uses the given factory by default.
+     *
+     * @param grammaticalStructureFactory factory used to build grammatical structures
+     */
+    public TreeUtil(GrammaticalStructureFactory grammaticalStructureFactory) {
+        this.grammaticalStructureFactory = grammaticalStructureFactory;
+    }
+
+    /**
+     * Extracts typed dependencies from the tree using this instance's factory.
+     *
+     * @param tree parse tree to analyze
+     * @return the typed dependencies of {@code tree}
+     * @throws IllegalArgumentException if no factory has been configured or
+     *         {@code tree} is null
+     */
+    public List<TypedDependency> getTypedDependencies(Tree tree) {
+        return getTypedDependencies(tree, getGrammaticalStructureFactory());
+    }
+
+    /**
+     * Extracts typed dependencies from the tree using the supplied factory.
+     * The tree is converted to string-labeled form before the grammatical
+     * structure is built.
+     *
+     * @param tree parse tree to analyze
+     * @param grammaticalStructureFactory factory used to build the grammatical structure
+     * @return the CC-processed typed dependencies of {@code tree}
+     * @throws IllegalArgumentException if either argument is null
+     */
+    public List<TypedDependency> getTypedDependencies(Tree tree, GrammaticalStructureFactory grammaticalStructureFactory) {
+        if (grammaticalStructureFactory == null) {
+            throw new IllegalArgumentException("grammaticalStructureFactory cannot be null.");
+        }
+        // Fail here with a clear message instead of somewhere inside the library.
+        if (tree == null) {
+            throw new IllegalArgumentException("tree cannot be null.");
+        }
+        GrammaticalStructure surfaceStructure =
+                grammaticalStructureFactory.newGrammaticalStructure(toStringLabeled(tree));
+        // NOTE(review): 'true' presumably asks for the "extra" dependencies as well -- confirm.
+        return surfaceStructure.typedDependenciesCCprocessed(true);
+    }
+
+    /**
+     * Converts a tree to its string-labeled equivalent.
+     *
+     * @param tree tree to convert
+     * @return the result of applying the library's labeled-tree to
+     *         string-labeled-tree function to {@code tree}
+     */
+    public Tree treeToStringLabeledTree(Tree tree) {
+        return toStringLabeled(tree);
+    }
+
+    /**
+     * @return the factory used by {@link #getTypedDependencies(Tree)}, or null if none is set
+     */
+    public GrammaticalStructureFactory getGrammaticalStructureFactory() {
+        return grammaticalStructureFactory;
+    }
+
+    /**
+     * @param grammaticalStructureFactory factory to use for subsequent calls to
+     *        {@link #getTypedDependencies(Tree)}
+     */
+    public void setGrammaticalStructureFactory(GrammaticalStructureFactory grammaticalStructureFactory) {
+        this.grammaticalStructureFactory = grammaticalStructureFactory;
+    }
+
+    /** Single place for the conversion shared by the public methods. */
+    private static Tree toStringLabeled(Tree tree) {
+        return TreeFunctions.getLabeledTreeToStringLabeledTreeFunction().apply(tree);
+    }
+
+}
119 test/src/com/tyhurst/stanfordparser/LexicalizedParserTest.java
@@ -0,0 +1,119 @@
+package com.tyhurst.stanfordparser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.tyhurst.stanfordparser.util.EnglishParserModule;
+import com.tyhurst.stanfordparser.util.ParserModule;
+import com.tyhurst.stanfordparser.util.TreeUtil;
+
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TypedDependency;
+
+public class LexicalizedParserTest {
+
+ private static final String EXPECTED_LABEL_ROOT = "ROOT";
+ private static final String EXPECTED_LABEL_S = "S";
+ private static final String EXPECTED_LABEL_NP = "NP";
+ private static final String EXPECTED_LABEL_VP = "VP";
+
+ private static ParserModule parserModule;
+ private static LexicalizedParser parser;
+ private static TreeUtil treeUtil;
+
+ @BeforeClass
+ public static void setup() {
+ parserModule = new EnglishParserModule();
+ }
+
+ /**
+ * Expect:
+ * tree = (ROOT (S (NP (DT every) (JJ male) (NN student)) (VP (VBP read) (NP (DT some) (NN book)))))
+ */
+ @Test public void testSimpleSVO() {
+ Tree tree = getBestParse("every male student read some book");
+
+ assertEquals(EXPECTED_LABEL_ROOT, tree.label().value());
+ Tree actualSNode = tree.firstChild();
+ assertEquals(EXPECTED_LABEL_S, actualSNode.label().value());
+ Tree actualNPNode = actualSNode.getChild(0);
+ assertEquals(EXPECTED_LABEL_NP, actualNPNode.label().value());
+ Tree actualVPNode = actualSNode.getChild(1);
+ assertEquals(EXPECTED_LABEL_VP, actualVPNode.label().value());
+ }
+
+ /**
+ * Expect:
+ * tree = (ROOT (S (NP (DT every) (NN student)) (VP (VBP read) (NP (DT some) (NN book)))))
+ * dependencies = [det(student-3, every-1), amod(student-3, male-2), nsubj(read-4, student-3), det(book-6, some-5), dobj(read-4, book-6)]
+ */
+ @Test public void testSimpleSVODependencies() {
+ Tree tree = getBestParse("every male student read some book");
+ List<TypedDependency> dependencies = getTreeUtil().getTypedDependencies(tree);
+
+ assertTrue(dependencies.size() == 5);
+ TypedDependency detDependency = dependencies.get(0);
+ assertEquals("det", detDependency.reln().getShortName());
+ assertEquals("every-1", detDependency.dep().toString());
+ assertEquals("student-3", detDependency.gov().toString());
+ }
+
+ /**
+ * Expect:
+ * tree = (ROOT (S (NP (DT every) (JJ male) (NN student)) (VP (VP (VBD read) (NP (DT some) (NN book))) (CC and) (VP (VBD kissed) (NP (DT a) (NN girl))))))
+ * dependencies = [det(student-3, every-1), amod(student-3, male-2), nsubj(read-4, student-3), nsubj(kissed-8, student-3), det(book-6, some-5), dobj(read-4, book-6), conj_and(read-4, kissed-8), det(girl-10, a-9), dobj(kissed-8, girl-10)]
+ */
+ @Test public void testSVODependenciesWithConjunction() {
+ Tree tree = getBestParse("every male student read some book and kissed a girl");
+ List<TypedDependency> dependencies = getTreeUtil().getTypedDependencies(tree);
+
+ assertTrue(dependencies.size() == 9);
+ TypedDependency detDependency = dependencies.get(0);
+ assertEquals("det", detDependency.reln().getShortName());
+ assertEquals("every-1", detDependency.dep().toString());
+ assertEquals("student-3", detDependency.gov().toString());
+ }
+
+ private Tree getBestParse(String sentence) {
+ List<Word> tokens = tokenize(sentence);
+ LexicalizedParser parser = getParser();
+ parser.parse(tokens);
+ Tree tree = parser.getBestParse();
+ Tree stringLabeledTree = getTreeUtil().treeToStringLabeledTree(tree);
+ return stringLabeledTree;
+ }
+
+ private List<Word> tokenize(String sentence) {
+ return getParserModule().buildTokenizer(sentence).tokenize();
+ }
+
+ private LexicalizedParser getParser() {
+ if (parser == null) {
+ parser = getParserModule().buildParser();
+ }
+ parser.reset();
+ return parser;
+ }
+
+ private TreeUtil getTreeUtil() {
+ if (treeUtil == null) {
+ treeUtil = getParserModule().buildTreeUtil();
+ }
+ return treeUtil;
+ }
+
+ private static ParserModule getParserModule() {
+ if (parserModule == null) {
+ parserModule = new EnglishParserModule();
+ }
+ return parserModule;
+ }
+
+}

0 comments on commit 454d6cb

Please sign in to comment.
Something went wrong with that request. Please try again.