-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactored test: the grobid home is now taken either from GROBID_HOME, or from ../grobid-home or ../../grobid-home. Moved the test of the SAX parser into its own specific class. Added logger configuration (console). Added some training data for evaluation (to be corrected).
- Loading branch information
Showing
10 changed files
with
54,452 additions
and
102 deletions.
There are no files selected for viewing
2,121 changes: 2,121 additions & 0 deletions
2,121
grobid-ner/resources/dataset/ner/evaluation/Wikipedia_worldWarZ.training.txt
Large diffs are not rendered by default.
Oops, something went wrong.
11,314 changes: 11,314 additions & 0 deletions
11,314
grobid-ner/resources/dataset/ner/evaluation/inprogress/Wikipedia_brexit.1.training.txt
Large diffs are not rendered by default.
Oops, something went wrong.
7,581 changes: 7,581 additions & 0 deletions
7,581
grobid-ner/resources/dataset/ner/evaluation/todo/Wikipedia_brexit.2.training.txt
Large diffs are not rendered by default.
Oops, something went wrong.
33,283 changes: 33,283 additions & 0 deletions
33,283
grobid-ner/resources/dataset/ner/evaluation/todo/Wikipedia_holocaust.training.txt
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd" > | ||
<log4j:configuration> | ||
<!-- Logging setup: one console appender, attached to the root logger at DEBUG. -->
<!-- Appender named CONSOLE: an org.apache.log4j.ConsoleAppender formatted by a PatternLayout. -->
<appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender"> | ||
<layout class="org.apache.log4j.PatternLayout"> | ||
<!-- Conversion pattern applied to every log line (see the log4j PatternLayout documentation). -->
<!-- NOTE(review): the time is written as HH:mm.ss (a dot before the seconds); confirm this is intended. -->
<param name="ConversionPattern" value="%d{dd MMM yyyy HH:mm.ss} [%-5p] %-25c{1} - %m\n"/> | ||
</layout> | ||
</appender> | ||
|
||
<!-- Root logger: priority DEBUG, output sent to the CONSOLE appender declared above. -->
<root> | ||
<priority value="DEBUG"></priority> | ||
<appender-ref ref="CONSOLE"/> | ||
</root> | ||
</log4j:configuration> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 0 additions & 97 deletions
97
grobid-ner/src/test/java/org/grobid/trainer/AssemblerTest.java
This file was deleted.
Oops, something went wrong.
54 changes: 54 additions & 0 deletions
54
grobid-ner/src/test/java/org/grobid/trainer/sax/ReutersSaxHandlerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
package org.grobid.trainer.sax; | ||
|
||
import org.junit.Test; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import static org.hamcrest.CoreMatchers.is; | ||
import static org.junit.Assert.assertThat; | ||
|
||
/** | ||
* Created by lfoppiano on 25/08/16. | ||
*/ | ||
public class ReutersSaxHandlerTest { | ||
|
||
@Test | ||
public void testRetokenize_1() throws Exception { | ||
List<String> tokens = new ArrayList<String>(); | ||
tokens.add("around"); | ||
tokens.add(" "); | ||
tokens.add("10"); | ||
tokens.add(","); | ||
tokens.add("000"); | ||
|
||
List<String> tokens2 = ReutersSaxHandler.retokenize(tokens); | ||
|
||
assertThat(tokens2.size(), is(3)); | ||
assertThat(tokens2.get(0), is("around")); | ||
assertThat(tokens2.get(2), is("10,000")); | ||
} | ||
|
||
@Test | ||
public void testRetokenize_2() throws Exception { | ||
|
||
List<String> tokens = new ArrayList<String>(); | ||
tokens.add("10"); | ||
tokens.add(","); | ||
tokens.add("000"); | ||
tokens.add(","); | ||
tokens.add("000"); | ||
tokens.add("."); | ||
tokens.add("00"); | ||
tokens.add(" "); | ||
tokens.add("errors"); | ||
|
||
List<String> tokens2 = ReutersSaxHandler.retokenize(tokens); | ||
|
||
assertThat(tokens2.size(), is(3)); | ||
assertThat(tokens2.get(0), is("10,000,000.00")); | ||
assertThat(tokens2.get(2), is("errors")); | ||
} | ||
|
||
|
||
} |
46 changes: 46 additions & 0 deletions
46
grobid-ner/src/test/java/org/grobid/trainer/sax/SemDocSaxHandlerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package org.grobid.trainer.sax; | ||
|
||
import org.junit.Test; | ||
|
||
import javax.xml.parsers.SAXParser; | ||
import javax.xml.parsers.SAXParserFactory; | ||
import java.io.InputStream; | ||
|
||
import static org.hamcrest.CoreMatchers.is; | ||
import static org.junit.Assert.assertThat; | ||
|
||
/** | ||
* Created by lfoppiano on 25/08/16. | ||
*/ | ||
public class SemDocSaxHandlerTest { | ||
|
||
private SemDocSaxHandler target; | ||
|
||
@Test | ||
public void testAssembler() throws Exception { | ||
InputStream reutersFile = this.getClass().getResourceAsStream("/100100newsML.xml"); | ||
InputStream semdocFile = this.getClass().getResourceAsStream("/100100newsML.semdoc.xml"); | ||
|
||
ReutersSaxHandler reutersSax = new ReutersSaxHandler(); | ||
|
||
// get a factory | ||
SAXParserFactory spf = SAXParserFactory.newInstance(); | ||
spf.setValidating(false); | ||
spf.setFeature("http://xml.org/sax/features/namespaces", false); | ||
spf.setFeature("http://xml.org/sax/features/validation", false); | ||
spf.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false); | ||
spf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); | ||
|
||
//get a new instance of parser | ||
SAXParser p = spf.newSAXParser(); | ||
p.parse(reutersFile, reutersSax); | ||
|
||
target = new SemDocSaxHandler(reutersSax.getTextVector()); | ||
|
||
p = spf.newSAXParser(); | ||
p.parse(semdocFile, target); | ||
|
||
assertThat(target.getAnnotatedTextVector().size(), is(243)); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd" > | ||
<log4j:configuration> | ||
<!-- Logging setup: one console appender, attached to the root logger at DEBUG. -->
<!-- Appender named CONSOLE: an org.apache.log4j.ConsoleAppender formatted by a PatternLayout. -->
<appender name="CONSOLE" class="org.apache.log4j.ConsoleAppender"> | ||
<layout class="org.apache.log4j.PatternLayout"> | ||
<!-- Conversion pattern applied to every log line (see the log4j PatternLayout documentation). -->
<!-- NOTE(review): the time is written as HH:mm.ss (a dot before the seconds); confirm this is intended. -->
<param name="ConversionPattern" value="%d{dd MMM yyyy HH:mm.ss} [%-5p] %-25c{1} - %m\n"/> | ||
</layout> | ||
</appender> | ||
|
||
<!-- Root logger: priority DEBUG, output sent to the CONSOLE appender declared above. -->
<root> | ||
<priority value="DEBUG"></priority> | ||
<appender-ref ref="CONSOLE"/> | ||
</root> | ||
</log4j:configuration> |