Skip to content

Commit

Permalink
fix usage of parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Feb 6, 2024
1 parent 5750ad7 commit 8282dad
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ else if (code.equals("it"))
return "Italian";
else if (code.equals("jp"))
return "Japanese";
- else if (code.equals("kr"))
+ else if (code.equals("kr") || code.equals("ko"))
return "Korean";
else if (code.equals("nl"))
return "Deutch";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,16 +182,22 @@ public boolean accept(File dir, String name) {
List<List<LayoutToken>> segmentedAccumulatedTokens = new ArrayList<>();
List<List<String>> segmentedAccumulatedLabels = new ArrayList<>();

- if (accumulatedTokens.size() > 1000) {
+ int maxSequence = 1000;
+ if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) {
+ List<String> newTexts = new ArrayList<>();
+ maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation");
+ }
+
+ if (accumulatedTokens.size() > maxSequence) {
// we have a problem of sequence length for Deep Learning algorithms
// we need to segment further. We ensure here that we don't segment
// near or inside patent or NPL references
int k = 0;
while(k<accumulatedTokens.size()) {
int origin = k;

- if (k+1000 < accumulatedTokens.size()) {
- k = k+1000;
+ if (k+maxSequence < accumulatedTokens.size()) {
+ k = k+maxSequence;
// adjust position to avoid reference label
while (accumulatedLabels.get(k-1).endsWith("refNPL>") || accumulatedLabels.get(k-1).endsWith("refPatent>")) {
k--;
Expand Down Expand Up @@ -355,16 +361,22 @@ public void createDataSet(String setName, String corpusPath, String outputPath,
List<List<LayoutToken>> segmentedAccumulatedTokens = new ArrayList<>();
List<List<String>> segmentedAccumulatedLabels = new ArrayList<>();

- if (accumulatedTokens.size() > 1000) {
+ int maxSequence = 1000;
+ if (GrobidProperties.getGrobidCRFEngineName("patent-citation").equals("delft")) {
+ List<String> newTexts = new ArrayList<>();
+ maxSequence = GrobidProperties.getDelftTrainingMaxSequenceLength("patent-citation");
+ }
+
+ if (accumulatedTokens.size() > maxSequence) {
// we have a problem of sequence length for Deep Learning algorithms
// we need to segment further. We ensure here that we don't segment
// near or inside patent or NPL references
int k = 0;
while(k<accumulatedTokens.size()) {
int origin = k;

- if (k+1000 < accumulatedTokens.size()) {
- k = k+1000;
+ if (k+maxSequence < accumulatedTokens.size()) {
+ k = k+maxSequence;
// adjust position to avoid reference label
while (accumulatedLabels.get(k-1).endsWith("refNPL>") || accumulatedLabels.get(k-1).endsWith("refPatent>")) {
k--;
Expand Down

0 comments on commit 8282dad

Please sign in to comment.