-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added class to guess the grobid-home, added lemonde xml with obfuscat…
…ed content only for training parser purposes, Updated documentation.
- Loading branch information
Showing
9 changed files
with
154 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Generate training corpus | ||
|
||
### Datasets | ||
|
||
The Grobid NER has been trained on several different datasets: | ||
- Reuters NER [CONLL 2003](http://www.cnts.ua.ac.be/conll2003/ner/) manually annotated training data (10k words, 26 classes). This dataset is not public, so it is not shipped with the code. To obtain it, contact [NIST](http://trec.nist.gov/data/reuters/reuters.html). | ||
- Manually annotated extract from the Wikipedia article on World War 1 (approximately 10k words, 26 classes) | ||
|
||
The datasets distributed with this project are publicly available under the following licences: | ||
- [Wikipedia](http://www.wikipedia.org) data is available under the [Creative Commons Attribution-ShareAlike License](https://creativecommons.org/licenses/by-sa/3.0/). | ||
- [EHRI](https://portal.ehri-project.eu) data from the research portal is openly available, as stated in the EHRI [data policy](https://portal.ehri-project.eu/data-policy). | ||
|
||
The following datasets have been used as training data, but are not distributed with the project: | ||
- Reuters corpus, not publicly available. To obtain it, contact [NIST](http://trec.nist.gov/data/reuters/reuters.html). |
42 changes: 42 additions & 0 deletions
42
grobid-ner/src/main/java/org/grobid/core/utilities/GrobidHome.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package org.grobid.core.utilities; | ||
|
||
import org.grobid.core.exceptions.GrobidPropertyException; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import static org.apache.commons.lang3.StringUtils.isEmpty; | ||
|
||
/** | ||
* Created by lfoppiano on 31/08/16. | ||
*/ | ||
public class GrobidHome { | ||
|
||
private static Logger LOGGER = LoggerFactory.getLogger(GrobidHome.class); | ||
|
||
/** | ||
* Try to get the GROBID_HOME from the environment variable or by using some default locations: | ||
* - ../grobid-home | ||
* - ../../grobid-home (in case the whole repository is cloned directly under the grobid project) | ||
*/ | ||
public static void findGrobidHome() { | ||
String grobidHome = System.getenv("GROBID_HOME"); | ||
if (!isEmpty(grobidHome)) { | ||
GrobidProperties.set_GROBID_HOME_PATH(grobidHome); | ||
GrobidProperties.setGrobidPropertiesPath(grobidHome + "/config/grobid.properties"); | ||
} else { | ||
try { | ||
LOGGER.trace("Trying grobid home from the usual location at ../grobid-home "); | ||
GrobidProperties.set_GROBID_HOME_PATH("../grobid-home"); | ||
GrobidProperties.setGrobidPropertiesPath("../grobid-home/config/grobid.properties"); | ||
} catch (GrobidPropertyException gpe) { | ||
LOGGER.error("Grobid HOME not found, trying to fish it from ../../grobid-home "); | ||
try { | ||
GrobidProperties.set_GROBID_HOME_PATH("../../grobid-home"); | ||
GrobidProperties.setGrobidPropertiesPath("../../grobid-home/config/grobid.properties"); | ||
} catch (GrobidPropertyException gpe2) { | ||
LOGGER.error("Grobid HOME at ../../grobid-home not found, set the environment variable GROBID_HOME"); | ||
} | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<corpus> | ||
<subcorpus name="ftb6 1"> | ||
<document id="id248980"> | ||
<sentence id="E1"> | ||
zzbbzb, bbzb bz zbb bb'bbz bzzbbzz bbbbbb zz zzbbbbz bbb bbb bbbbzb zz bbb, bzbbzbzbb b'zzzbbz zb 10 | ||
zbbbb, b'zbbzbb bzb ébé bz zbbbz bzbb zzbbz bbbzzzbbbbé, bzzbbbzbbbzbb bzb bbzbzbbbbzb, b'ébzbb bzb bz | ||
bbbb bbzbbbzbz. | ||
</sentence> | ||
<sentence id="E2"> | ||
bbbbbbbb zbb-bb bbz bz bbbbzbbzbzbb z zézé zbbbb bbz bzb zzbzbbzbbzb bbbbbbbbzb bz bbbbbbzbb zbzbbz | ||
bbbbbbbb zbbzbbzbbbz zb zbbbbzèbzbb, zbzz bb zzb zbbzbzbz, bbz bz zézzbbz zzb bbbébêbb zzb bzbzzzb bzbbz | ||
bzb bz bzbbbbzb z'bb bbbbèbz bbb zzbbbbb à bbbbbbbbzb bzb zzbzb bbbbbbzb zb zb zébzbbzbzbb bébébzbbbé | ||
zzb bbbbbzbbzb zz bz "<ENAMEX type="Organization" sub_type="InstitutionalOrganization" eid="null" | ||
name="Sécurité sociale en France" uri="null">bécb</ENAMEX>". | ||
</sentence> | ||
<sentence id="E3"> | ||
zb zbbb, bb b'zbb bbzbbbé zzbzbb bz zbbbbbzbbbbz zzb bzbbébzbbzbbb zb zbbbb bézbzzb (bbb bbzbzbbzbb bz | ||
bêbz zbbbz zz bzbbébzbbzbbbbbé bbz bzb bbbzbzzbbbbzb zb bbbzz bzbzbbé), bbbb bbébzzbbéb bzb bzbbb | ||
bbbzbbbéb bbbzbbzb bbz bzb bzb zbbzbbzb zzb bézzzbbb à bz bzzbzbzbz zz bèbbzb zb bzb zbzbbzb zb | ||
zzzzbbzzbzb. | ||
</sentence> | ||
<sentence id="E4"> | ||
bz bbbbzbbzbzbb bz bébbbb zbbz à bzbbbbzb bz zzbbz zzbb bz zzbb zz bzbbzbzbbzb bbb bbb zbbbzbzbb zzbb bz | ||
bbzbbz zz bzbb bbzzbzzbbé à bébzb bz bbbbèbz zz bzbbé. | ||
</sentence> | ||
<sentence id="E5"> | ||
bbbbbz zb bbbb, zbzzbb b'zbbzbz à zz bbz bz zbbbzbb z'zbzb zbbbbz zzbb bzb bbbzbzbbb bbbbb bz zbbbzbbbbb | ||
bézbzzbz zz 1989 bébbzbéz bzb b'zbbbbzbzz-bzbzzbz zb bzb bézzzbbb... | ||
</sentence> | ||
<sentence id="E6"> | ||
M. <ENAMEX type="Person" gender="m" eid="1000000001656194" oldname="René Teulade" name="René Teulade" | ||
uri="http://fr.wikipedia.org/wiki/René Teulade">Teulade | ||
</ENAMEX> bzbb, à bbbbz bbbbz, zbbbbzébzb bbz "bz zbézbzbbbbé zb bbbbèbz zbbbzbbbbbbzb zbb zb bzb". | ||
</sentence> | ||
<sentence id="E12"> | ||
z'zzbbz, bzbzz bbz b'bbbbbbzbzz zz b. <ENAMEX type="Person" gender="m" eid="1000000001656194" | ||
oldname="René Teulade" name="René Teulade" | ||
uri="http://fr.wikipedia.org/wiki/René Teulade">Teulade | ||
</ENAMEX> zb zz bbb bbézézzbbzbb, b. <ENAMEX type="Person" gender="m" eid="1000000000009172" | ||
oldname="Jean-Louis Bianco" name="Jean-Louis Bianco" | ||
uri="http://fr.wikipedia.org/wiki/Jean-Louis Bianco">Jean-Louis | ||
Bbabcb</ENAMEX>, à bbzbzzb bz zzbbz zz bz bzbbbbzbbbzbbbb zb bbbbèbz zz bzbbé zbbbzbçzbb à bbbbzb bzb | ||
zbbbbb. | ||
</sentence> | ||
<sentence id="E13"> | ||
Sur lzs zouzz zzrnizrs mois, lzs zépznszs zz sznté n'ont progrzssé quz zz 5% zlors quz lzur zroissznzz | ||
vzrizit zntrz 6% zt 9% zzs trois zzrnièrzs znnézs. | ||
</sentence> | ||
<sentence id="E14"> | ||
zb bzzbbz bbzb, bzb bébzbzbbbbbbb zb bzbzbbbbbzbzbb ézbbbbbbbz zb zz bz bbbbéz zb zbôbzbz bbb bzb | ||
bzbbbézb zz zbbbbzbbbbb, bbb bbbb bz bbzzbbbz zzbb bzb bbbzbzbbb bbbbb bzb bb zézbbbzbb zz 30 bbbbbzbzb | ||
zz zbzbzb zz bz bbébbbzbbz zz bz "<ENAMEX type="Organization" sub_type="InstitutionalOrganization" | ||
eid="null" name="Sécurité sociale en France" uri="null"> | ||
Sécu</ENAMEX>", bbbbbzbzbb zbzbbz zzbzbbzbz bbz bzîbbbbz zbbzzbz zzb zébzbbzb. | ||
</sentence> | ||
<sentence id="E15"> | ||
Lz <ENAMEX type="Organization" eid="1000000001671259" sub_type="InstitutionalOrganization" | ||
oldname="Caisse nationale de l’assurance maladie des travailleurs salariés" | ||
name="Caisse nationale de l’assurance maladie des travailleurs salariés" | ||
uri="http://fr.wikipedia.org/wiki/Caisse nationale de l’assurance maladie des travailleurs salariés"> | ||
zzbbbz bzbbbbzbz z'zbbbbzbzz-bzbzzbz | ||
</ENAMEX> bbzbb z'zbbbzbbb zz bzbbbz zb bbbbb bb zbbbbbbbbz bzbzbzzbb zz bbzbbz bbbbb bz zébzb zz | ||
bzbzbbbbzbzbb zzb zbbbbéb. | ||
</sentence> | ||
<sentence id="E16"> | ||
zzbbz bzbbbz, bbb bbbbbzbb êbbz zbbbbbbéz zzbb bzb bbbzbzbbzb bzbzbbzb, bzbbzbbbzbb z'ézbbbbbbzb bbzbbbz | ||
4 bbbbbzbzb zz zbzbzb. | ||
</sentence> | ||
</document> | ||
<document id="id248982"> | ||
<sentence id="E49"> | ||
bzbbb bbz ébbzz zz b'<ENAMEX type="Organization" sub_type="InstitutionalOrganization" | ||
eid="1000000000002268" | ||
oldname="Organisation de coopération et de développement économiques" | ||
name="Organisation de coopération et de développement économiques" | ||
uri="http://fr.wikipedia.org/wiki/Organisation de coopération et de développement économiques"> | ||
OCDE</ENAMEX>, zb zbb zbb, zbbbz 1980 zb 1990, bz bbbzbzbbbbbé zbbzbb bbbbbzbbé zz 47% zzbb bz bzzbzbb | ||
bbzbbbbbzb zb bzbbzbzbb zz 2% zzbb bzb bzbbbzzb, bbbb bbz zbbbzbbzbbbb zzb bbbb à bz bbbzbzbbbb zzbb | ||
zbbb bbbbzbz zzbb b'bbzbbbbbz. | ||
</sentence> | ||
</document> | ||
</subcorpus> | ||
</corpus> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters