Skip to content

Commit

Permalink
* Adapted trainer to pick files from xml/final
Browse files: browse the repository at this point in the history
* Forcing encoding to UTF_8 when reading files
* Updated grobid dependencies
  • Loading branch information
lfoppiano committed Aug 21, 2017
1 parent 0f805df commit 06b49b7
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 6 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,12 @@
<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-core</artifactId>
<version>0.4.2-SNAPSHOT</version>
<version>0.4.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.grobid</groupId>
<artifactId>grobid-trainer</artifactId>
<version>0.4.2-SNAPSHOT</version>
<version>0.4.3-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>net.arnx</groupId>
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/grobid/core/engines/QuantityParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import java.util.TimeZone;
import java.util.stream.Collectors;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.commons.lang3.StringUtils.*;
import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement;
import static org.grobid.core.engines.label.QuantitiesTaggingLabels.*;
Expand Down Expand Up @@ -476,15 +477,15 @@ public void createTraining(String inputFile,
if (root != null) {
//System.out.println(XmlBuilderUtils.toXml(root));
try {
FileUtils.writeStringToFile(new File(pathTEI), XmlBuilderUtils.toXml(root));
FileUtils.writeStringToFile(new File(pathTEI), XmlBuilderUtils.toXml(root), UTF_8);
} catch (IOException e) {
throw new GrobidException("Cannot create training data because output file can not be accessed: " + pathTEI);
}
}
}

private Element createTrainingText(File file, Element root) throws IOException {
String text = FileUtils.readFileToString(file);
String text = FileUtils.readFileToString(file, UTF_8);

Element textNode = teiElement("text");
// for the moment we suppose we have english only...
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/org/grobid/trainer/QuantityTrainer.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,11 @@ public int createCRFPPData(File sourcePathLabel,
System.out.println("sourcePathLabel: " + sourcePathLabel);
System.out.println("outputPath: " + outputPath);

File input = new File(sourcePathLabel.getAbsolutePath() + "/xml/final");

// then we convert the tei files into the usual CRF label format
// we process all tei files in the output directory
File[] refFiles = sourcePathLabel.listFiles(new FilenameFilter() {
File[] refFiles = input.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.toLowerCase().endsWith(".tei") || name.toLowerCase().endsWith(".tei.xml");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@


import org.grobid.core.analyzers.QuantityAnalyzer;
import org.grobid.core.engines.UnitParser;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.UnitUtilities;
Expand Down

0 comments on commit 06b49b7

Please sign in to comment.