Skip to content

Commit

Permalink
#3905 - Allow sentence splitting to consider document structure
Browse files Browse the repository at this point in the history
- Fix bug causing test to fail
- Added additional zoning test
  • Loading branch information
reckart committed Mar 28, 2023
1 parent da704e0 commit 22dd28f
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 10 deletions.
4 changes: 4 additions & 0 deletions inception/inception-export/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
</dependency>

<dependency>
<groupId> org.springframework</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ public static void splitSentences(CAS aCas)
splitSentences(aCas, null);
}

public static void splitSentences(CAS aCas, Iterable<AnnotationFS> aZones)
public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZones)
{
if (aCas.getDocumentText() == null) {
return;
Expand Down Expand Up @@ -428,10 +428,10 @@ public static void splitSentences(CAS aCas, Iterable<AnnotationFS> aZones)
int last = bi.first();
int cur = bi.next();
while (cur != BreakIterator.DONE) {
int[] span = new int[] { last, cur };
int[] span = new int[] { last + begin, cur + begin };
trim(aCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(createSentence(aCas, span[0] + begin, span[1] + end));
aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
}
last = cur;
cur = bi.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@
*/
package de.tudarmstadt.ukp.inception.export;

import static java.util.Arrays.asList;
import static org.apache.uima.fit.util.CasUtil.toText;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.assertj.core.api.Assertions.assertThat;

import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.junit.jupiter.api.Test;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

Expand All @@ -38,21 +40,36 @@ public void testSplitSentences() throws Exception

DocumentImportExportServiceImpl.splitSentences(jcas.getCas());

assertEquals(asList("I am one.", "I am two."), toText(select(jcas, Sentence.class)));
assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("I am one.", "I am two.");
}

/**
 * Checks that sentence splitting honours zone annotations: with one zone over "Heading" and
 * another over "I am two.", each zone is expected to yield its own sentence.
 */
@Test
public void testSplitSentencesWithZones() throws Exception
{
    JCas document = JCasFactory.createText("Heading I am two.", "en");

    // Zone 1 covers "Heading" (offsets 0-7); the single space at offset 7 lies in no zone.
    Heading headingZone = new Heading(document, 0, 7);
    headingZone.addToIndexes();

    // Zone 2 covers "I am two." (offsets 8-17).
    Paragraph paragraphZone = new Paragraph(document, 8, 17);
    paragraphZone.addToIndexes();

    // The zones are looked up via Div; presumably Heading and Paragraph are Div subtypes
    // - confirm against the type system.
    DocumentImportExportServiceImpl.splitSentences(document.getCas(),
            document.select(Div.class));

    assertThat(toText(select(document, Sentence.class))) //
            .containsExactly("Heading", "I am two.");
}

/**
 * Checks tokenization of a document whose sentences are already annotated: the text
 * "i am one.i am two." carries two adjacent sentences (no whitespace between them), and after
 * tokenizing, the sentences are expected to be unchanged and each sentence split into word
 * and punctuation tokens.
 */
@Test
public void testTokenize() throws Exception
{
JCas jcas = JCasFactory.createText("i am one.i am two.", "en");
// First sentence: "i am one." (offsets 0-9).
new Sentence(jcas, 0, 9).addToIndexes();
// Stray empty statement; it has no effect.
;
// Second sentence: "i am two." (offsets 9-18), starting directly after the first.
new Sentence(jcas, 9, 18).addToIndexes();

DocumentImportExportServiceImpl.tokenize(jcas.getCas());

// NOTE(review): the assertEquals and assertThat checks below assert the same expectations
// twice; this looks like the pre- and post-change lines of a diff shown together - confirm
// which variant is meant to remain.
assertEquals(asList("i am one.", "i am two."), toText(select(jcas, Sentence.class)));
assertEquals(asList("i", "am", "one", ".", "i", "am", "two", "."),
toText(select(jcas, Token.class)));
// Sentence annotations are expected to have the same covered text as before tokenizing.
assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("i am one.", "i am two.");

// Tokens are expected in document order, with the final period as a separate token.
assertThat(toText(select(jcas, Token.class))) //
.containsExactly("i", "am", "one", ".", "i", "am", "two", ".");
}
}

0 comments on commit 22dd28f

Please sign in to comment.