Skip to content

Commit

Permalink
Merge pull request #752 from kermitt2/pdf_md5
Browse files Browse the repository at this point in the history
MD5 last commit missing due to my internet connection :(
  • Loading branch information
kermitt2 committed Apr 28, 2021
2 parents 0cff20c + 5ea4768 commit 3d7d7b1
Show file tree
Hide file tree
Showing 6 changed files with 32 additions and 12 deletions.
Expand Up @@ -385,6 +385,13 @@ else if (consolidate == 2)
return results;
}

/**
 * Processes the reference section of a PDF given as a plain file, without
 * requiring the caller to supply an MD5 string.
 *
 * <p>The file is wrapped in a {@code DocumentSource} via
 * {@code DocumentSource.fromPdf(File)} and the work is handed to the
 * {@code DocumentSource}-based overload of this method.
 *
 * @param input              the PDF file to read
 * @param referenceSegmenter forwarded unchanged to the delegate overload
 * @param consolidate        forwarded unchanged to the delegate overload
 * @return whatever the {@code DocumentSource}-based overload returns
 */
public List<BibDataSet> processingReferenceSection(File input,
        ReferenceSegmenter referenceSegmenter,
        int consolidate) {
    // Keep the explicitly typed local so the DocumentSource overload is selected.
    final DocumentSource pdfSource = DocumentSource.fromPdf(input);
    return processingReferenceSection(pdfSource, referenceSegmenter, consolidate);
}

public List<BibDataSet> processingReferenceSection(File input,
String md5Str,
ReferenceSegmenter referenceSegmenter,
Expand Down
5 changes: 5 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/engines/Engine.java
Expand Up @@ -524,6 +524,11 @@ public Document fullTextToTEIDoc(File inputFile,
return resultDoc;
}

/**
 * Convenience overload for converting a PDF file to a TEI document when the
 * caller has no value for the middle argument of the three-argument form.
 *
 * <p>Delegates to {@code fullTextToTEIDoc(inputFile, null, config)}; the
 * {@code null} presumably fills the MD5 string slot of that overload
 * (NOTE(review): confirm against the three-argument signature).
 *
 * @param inputFile the PDF file to process
 * @param config    analysis configuration, forwarded unchanged
 * @return the document produced by the three-argument overload
 * @throws Exception propagated from the delegate overload
 */
public Document fullTextToTEIDoc(File inputFile,
        GrobidAnalysisConfig config) throws Exception {
    // The bare null literal is intentional: it lets the compiler pick the
    // same three-argument overload as before.
    final Document teiDocument = fullTextToTEIDoc(inputFile, null, config);
    return teiDocument;
}

public Document fullTextToTEIDoc(DocumentSource documentSource,
GrobidAnalysisConfig config) throws Exception {
FullTextParser fullTextParser = parsers.getFullTextParser();
Expand Down
Expand Up @@ -101,6 +101,14 @@ public FullTextParser(EngineParsers parsers) {
tmpPath = GrobidProperties.getTempPath();
}

/**
 * Runs full-text processing on a PDF file without a caller-supplied MD5
 * string.
 *
 * <p>Builds a {@code DocumentSource} from the file using the start page and
 * end page taken from {@code config}, a flag that is {@code true} when
 * {@code config.getPdfAssetPath()} is non-null, and two fixed flags
 * ({@code true}, {@code false}) whose meaning is defined by
 * {@code DocumentSource.fromPdf} and is not visible here. The resulting
 * source is then passed to the {@code DocumentSource}-based overload.
 *
 * @param inputPdf the PDF file to process
 * @param config   analysis configuration supplying the page range and asset path
 * @return the document produced by the {@code DocumentSource}-based overload
 * @throws Exception propagated from source creation or from the delegate
 */
public Document processing(File inputPdf,
        GrobidAnalysisConfig config) throws Exception {
    final DocumentSource pdfSource = DocumentSource.fromPdf(
            inputPdf,
            config.getStartPage(),
            config.getEndPage(),
            config.getPdfAssetPath() != null,
            true,
            false);
    return processing(pdfSource, config);
}

public Document processing(File inputPdf,
String md5Str,
GrobidAnalysisConfig config) throws Exception {
Expand Down
Expand Up @@ -686,7 +686,7 @@ public void testPDF() throws Exception {
// System.out.println(resHeader.getAbstract());
//
Document d =
engine.fullTextToTEIDoc(input, null, GrobidAnalysisConfig.defaultInstance());
engine.fullTextToTEIDoc(input, GrobidAnalysisConfig.defaultInstance());

d.getBlocks();
System.out.println(d.getTei());
Expand All @@ -695,15 +695,15 @@ public void testPDF() throws Exception {

}

@Test
/*@Test
public void testEmailPDF() throws Exception {
Engine engine = GrobidFactory.getInstance().getEngine();
BiblioItem resHeader = new BiblioItem();
engine.getParsers().getHeaderParser().processing(new File("/Work/temp/1.pdf"), null, resHeader, GrobidAnalysisConfig.defaultInstance());
engine.getParsers().getHeaderParser().processing(new File("/Work/temp/1.pdf"), resHeader, GrobidAnalysisConfig.defaultInstance());
System.out.println(resHeader);
// System.out.println(engine.fullTextToTEI("/tmp/2.pdf", false, false));
}
}*/


/*@Test
Expand Down
Expand Up @@ -46,7 +46,7 @@ public static void tearDown(){
public void testFullTextParser_1() throws Exception {
File inputTmpFile = getInputDocument("/test/Wang-paperAVE2008.pdf");

Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
}

Expand All @@ -64,35 +64,35 @@ private File getInputDocument(String inputPath) throws IOException {
public void testFullTextParser_2() throws Exception {
File inputTmpFile = getInputDocument("/test/two_pages.pdf");

Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
}

@Test
public void testFullTextParser_3() throws Exception {
File inputTmpFile = getInputDocument("/test/MullenJSSv18i03.pdf");
Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
}

@Test
public void testFullTextParser_4() throws Exception {
File inputTmpFile = getInputDocument("/test/1001._0908.0054.pdf");
Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
}

@Test
public void testFullTextParser_5() throws Exception {
File inputTmpFile = getInputDocument("/test/submission_161.pdf");
Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
}

@Test
public void testFullTextParser_6() throws Exception {
File inputTmpFile = getInputDocument("/test/submission_363.pdf");
Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());
assertTei(tei);
}

Expand Down
Expand Up @@ -37,7 +37,7 @@ public static void tearDown(){
public void testJSONAnnotationStructure() throws Exception {
Engine engine = GrobidFactory.getInstance().getEngine();
File inputTmpFile = getInputDocument("/test/test_Grobid_1_05452615.pdf");
Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());

String refURL = "http://example.com/xyz";
List<String> refURLs = Arrays.asList(refURL);
Expand Down Expand Up @@ -91,7 +91,7 @@ public void testJSONAnnotationStructure() throws Exception {
public void testJSONAnnotationEscaping() throws Exception {
Engine engine = GrobidFactory.getInstance().getEngine();
File inputTmpFile = getInputDocument("/test/test_Grobid_1_05452615.pdf");
Document tei = engine.fullTextToTEIDoc(inputTmpFile, null, GrobidAnalysisConfig.defaultInstance());
Document tei = engine.fullTextToTEIDoc(inputTmpFile, GrobidAnalysisConfig.defaultInstance());

// check that this embedded backslash is escaped properly
String refURL = "http://example.com/xyz?a=ab\\c123";
Expand Down

0 comments on commit 3d7d7b1

Please sign in to comment.