#3905 - Allow sentence splitting to consider document structure

- Fix bug causing test to fail
- Added additional zoning test
inception-project · Mar 28, 2023 · 22dd28f
1 parent da704e0
commit 22dd28f
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 10 deletions.
diff --git a/inception/inception-export/pom.xml b/inception/inception-export/pom.xml
@@ -87,6 +87,10 @@
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
     </dependency>
+    <dependency>
+      <groupId>it.unimi.dsi</groupId>
+      <artifactId>fastutil</artifactId>
+    </dependency>
 
     <dependency>
       <groupId> org.springframework</groupId>

diff --git a/inception/inception-export/src/main/java/de/tudarmstadt/ukp/inception/export/DocumentImportExportServiceImpl.java b/inception/inception-export/src/main/java/de/tudarmstadt/ukp/inception/export/DocumentImportExportServiceImpl.java
@@ -398,7 +398,7 @@ public static void splitSentences(CAS aCas)
         splitSentences(aCas, null);
     }
 
-    public static void splitSentences(CAS aCas, Iterable<AnnotationFS> aZones)
+    public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZones)
     {
         if (aCas.getDocumentText() == null) {
             return;
@@ -428,10 +428,10 @@ public static void splitSentences(CAS aCas, Iterable<AnnotationFS> aZones)
             int last = bi.first();
             int cur = bi.next();
             while (cur != BreakIterator.DONE) {
-                int[] span = new int[] { last, cur };
+                int[] span = new int[] { last + begin, cur + begin };
                 trim(aCas.getDocumentText(), span);
                 if (!isEmpty(span[0], span[1])) {
-                    aCas.addFsToIndexes(createSentence(aCas, span[0] + begin, span[1] + end));
+                    aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
                 }
                 last = cur;
                 cur = bi.next();

diff --git a/inception/inception-export/src/test/java/de/tudarmstadt/ukp/inception/export/SegmentationTest.java b/inception/inception-export/src/test/java/de/tudarmstadt/ukp/inception/export/SegmentationTest.java
@@ -17,15 +17,17 @@
  */
 package de.tudarmstadt.ukp.inception.export;
 
-import static java.util.Arrays.asList;
 import static org.apache.uima.fit.util.CasUtil.toText;
 import static org.apache.uima.fit.util.JCasUtil.select;
-import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.assertj.core.api.Assertions.assertThat;
 
 import org.apache.uima.fit.factory.JCasFactory;
 import org.apache.uima.jcas.JCas;
 import org.junit.jupiter.api.Test;
 
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
 
@@ -38,21 +40,36 @@ public void testSplitSentences() throws Exception
 
         DocumentImportExportServiceImpl.splitSentences(jcas.getCas());
 
-        assertEquals(asList("I am one.", "I am two."), toText(select(jcas, Sentence.class)));
+        assertThat(toText(select(jcas, Sentence.class))) //
+                .containsExactly("I am one.", "I am two.");
+    }
+
+    @Test
+    public void testSplitSentencesWithZones() throws Exception
+    {
+        JCas jcas = JCasFactory.createText("Heading I am two.", "en");
+        new Heading(jcas, 0, 7).addToIndexes();
+        new Paragraph(jcas, 8, 17).addToIndexes();
+
+        DocumentImportExportServiceImpl.splitSentences(jcas.getCas(), jcas.select(Div.class));
+
+        assertThat(toText(select(jcas, Sentence.class))) //
+                .containsExactly("Heading", "I am two.");
     }
 
     @Test
     public void testTokenize() throws Exception
     {
         JCas jcas = JCasFactory.createText("i am one.i am two.", "en");
         new Sentence(jcas, 0, 9).addToIndexes();
-        ;
         new Sentence(jcas, 9, 18).addToIndexes();
 
         DocumentImportExportServiceImpl.tokenize(jcas.getCas());
 
-        assertEquals(asList("i am one.", "i am two."), toText(select(jcas, Sentence.class)));
-        assertEquals(asList("i", "am", "one", ".", "i", "am", "two", "."),
-                toText(select(jcas, Token.class)));
+        assertThat(toText(select(jcas, Sentence.class))) //
+                .containsExactly("i am one.", "i am two.");
+
+        assertThat(toText(select(jcas, Token.class))) //
+                .containsExactly("i", "am", "one", ".", "i", "am", "two", ".");
     }
 }