Fix dcm4che#976 : ISO 2022 text encoding issue

* Remove the G0 escape sequence (G0-DESIGNATE ASCII) from Codec.KS_X_1001 and Codec.GB2312
* Update the Chinese test pattern to match DICOM PS3.5 2021e
* Split the Korean test pattern into two parts, one for encoding and one for decoding

The old source code wrongly assumed that a codec's GR (G1) characters could be used even when only that codec's G0 escape sequence had been emitted, and vice versa. This caused problems when complex switching between character sets was needed, as in Japanese.
itr-tert · Jan 24, 2022 · 175cd6e · 175cd6e
1 parent d2d43d4
commit 175cd6e
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 19 deletions.
diff --git a/dcm4che-core/src/main/java/org/dcm4che3/data/SpecificCharacterSet.java b/dcm4che-core/src/main/java/org/dcm4che3/data/SpecificCharacterSet.java
@@ -53,6 +53,7 @@
 
 /**
  * @author Gunter Zeilinger (gunterze@protonmail.com)
+ * @author Itr Tert (itr.tert@gmail.com)
  */
 public class SpecificCharacterSet {
 
@@ -87,8 +88,8 @@ public String toText(String s) {
         TIS_620(true, 0x2842, 0x2d54, 1),
         JIS_X_208(false, 0x2442, 0, 1),
         JIS_X_212(false, 0x242844, 0, 2),
-        KS_X_1001(false, 0x2842, 0x242943, -1),
-        GB2312(false, 0x2842, 0x242941, -1),
+        KS_X_1001(false, 0, 0x242943, -1),
+        GB2312(false, 0, 0x242941, -1),
         UTF_8(true, 0, 0, -1),
         GB18030(false, 0, 0, -1);
 
@@ -241,6 +242,10 @@ public String toText(String s) {
         }
     }
 
+    private enum G0G1 {
+        G0, G1, Both
+    }
+
     private static final class Encoder {
         final Codec codec;
         final CharsetEncoder encoder;
@@ -251,20 +256,36 @@ public Encoder(Codec codec) {
         }
 
         public boolean encode(CharBuffer cb, ByteBuffer bb, int escSeq,
-                CodingErrorAction errorAction) {
+                G0G1 useRange, CodingErrorAction errorAction) {
             encoder.onMalformedInput(errorAction)
                     .onUnmappableCharacter(errorAction)
                     .reset();
             int cbmark = cb.position();
             int bbmark = bb.position();
             try {
                 escSeq(bb, escSeq);
+                int graphicCharStart = bb.position();
                 CoderResult cr = encoder.encode(cb, bb, true);
                 if (!cr.isUnderflow())
                     cr.throwException();
                 cr = encoder.flush(bb);
                 if (!cr.isUnderflow())
                     cr.throwException();
+
+                if (useRange == G0G1.G0) {
+                    for (int i = graphicCharStart, end = bb.position(); i < end; ++i) {
+                        if (0 > bb.get(i)) {
+                            throw new CharacterCodingException();
+                        }
+                    }
+                } else if (useRange == G0G1.G1) {
+                    for (int i = graphicCharStart, end = bb.position(); i < end; ++i) {
+                        if (0 <= bb.get(i)) {
+                            throw new CharacterCodingException();
+                        }
+                    }
+                }
+                // if useRange == G0G1.Both, then do nothing
             } catch (CharacterCodingException x) {
                 SafeBuffer.position(cb, cbmark);
                 SafeBuffer.position(bb, bbmark);
@@ -305,14 +326,14 @@ public byte[] encode(String val, String delimiters) {
             ByteBuffer bb = ByteBuffer.wrap(buf);
             // try to encode whole string value with character set specified
             // by value1 of (0008,0005) Specific Character Set
-            if (!enc1.encode(cb, bb, 0, CodingErrorAction.REPORT)) {
+            if (!enc1.encode(cb, bb, 0, G0G1.Both, CodingErrorAction.REPORT)) {
                 // split whole string value according VR specific delimiters
                 // and try to encode each component separately
                 Encoder[] encs = new Encoder[codecs.length];
                 encs[0] = enc1;
                 encs[1] = encoder(cachedEncoder2, codecs[1]);
                 StringTokenizer comps = new StringTokenizer(val, delimiters, true);
-                buf = new byte[2 * strlen + 4 * (comps.countTokens() + 1)];
+                buf = new byte[(2 + 4) * strlen];
                 bb = ByteBuffer.wrap(buf);
                 int[] cur = { 0, 0 };
                 while (comps.hasMoreTokens()) {
@@ -332,25 +353,26 @@ public byte[] encode(String val, String delimiters) {
 
         private void encodeComponent(Encoder[] encs, CharBuffer cb, ByteBuffer bb, int[] cur) {
             // try to encode component with current active character of G1
-            if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, CodingErrorAction.REPORT))
+            if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, G0G1.G1, CodingErrorAction.REPORT))
                 return;
 
             // try to encode component with current active character set of G0, if different to G1
             if ((codecs[cur[1]].getEscSeq1() == 0 || codecs[cur[1]].getEscSeq0() != codecs[cur[0]].getEscSeq0())
-                    && encs[cur[0]].encode(cb, bb, 0, CodingErrorAction.REPORT))
+                    && encs[cur[0]].encode(cb, bb, 0, G0G1.G0, CodingErrorAction.REPORT))
                 return;
 
             int next = encs.length;
             while (--next >= 0) {
                 if (encs[next] == null)
                     encs[next] = new Encoder(codecs[next]);
                 if (codecs[next].getEscSeq1() != 0) {
-                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), CodingErrorAction.REPORT)) {
+                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), G0G1.G1, CodingErrorAction.REPORT)) {
                         cur[1] = next;
                         break;
                     }
-                } else {
-                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), CodingErrorAction.REPORT)) {
+                }
+                if (codecs[next].getEscSeq0() != 0) {
+                    if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), G0G1.G0, CodingErrorAction.REPORT)) {
                         cur[0] = next;
                         break;
                     }

diff --git a/dcm4che-core/src/test/java/org/dcm4che3/data/SpecificCharacterSetTest.java b/dcm4che-core/src/test/java/org/dcm4che3/data/SpecificCharacterSetTest.java
@@ -70,8 +70,8 @@ public class SpecificCharacterSetTest {
             "Zhang^XiaoDong=张^小东=";
     private static final String CHINESE_LONG_TEXT_GB2312 =
             "1.第一行文字。\r\n" +
-            "2.第一行文字。\r\n" +
-            "3.第一行文字。\r\n";
+            "2.第二行文字。\r\n" +
+            "3.第三行文字。\r\n";
     private static final String CHINESE_PERSON_NAME_UTF8 =
             "Wang^XiaoDong=王^小東=";
     private static final String CHINESE_PERSON_NAME_GB18030 =
@@ -144,6 +144,25 @@ public class SpecificCharacterSetTest {
             (byte) 0xb1, (byte) 0xe6, (byte) 0xb5, (byte) 0xbf };
 
     private static final byte[] KOREAN_LONG_TEXT_BYTES = {
+            (byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20, (byte) 0x31,
+            (byte) 0x73, (byte) 0x74, (byte) 0x20, (byte) 0x6c, (byte) 0x69,
+            (byte) 0x6e, (byte) 0x65, (byte) 0x20, (byte) 0x69, (byte) 0x6e,
+            (byte) 0x63, (byte) 0x6c, (byte) 0x75, (byte) 0x64, (byte) 0x65,
+            (byte) 0x73, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x29,
+            (byte) 0x43, (byte) 0xb1, (byte) 0xe6, (byte) 0xb5, (byte) 0xbf,
+            (byte) 0x2e, (byte) 0x0d, (byte) 0x0a, (byte) 0x54, (byte) 0x68,
+            (byte) 0x65, (byte) 0x20, (byte) 0x32, (byte) 0x6e, (byte) 0x64,
+            (byte) 0x20, (byte) 0x6c, (byte) 0x69, (byte) 0x6e, (byte) 0x65,
+            (byte) 0x20, (byte) 0x69, (byte) 0x6e, (byte) 0x63, (byte) 0x6c,
+            (byte) 0x75, (byte) 0x64, (byte) 0x65, (byte) 0x73, (byte) 0x20,
+            (byte) 0x1b, (byte) 0x24, (byte) 0x29, (byte) 0x43, (byte) 0xb1,
+            (byte) 0xe6, (byte) 0xb5, (byte) 0xbf, (byte) 0x2c, (byte) 0x20,
+            (byte) 0x74, (byte) 0x6f, (byte) 0x6f, (byte) 0x2e, (byte) 0x0d,
+            (byte) 0x0a, (byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20,
+            (byte) 0x33, (byte) 0x72, (byte) 0x64, (byte) 0x20, (byte) 0x6c,
+            (byte) 0x69, (byte) 0x6e, (byte) 0x65, (byte) 0x2e };
+
+    private static final byte[] KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES = {
             (byte) 0x1b, (byte) 0x24, (byte) 0x29, (byte) 0x43, (byte) 0x54,
             (byte) 0x68, (byte) 0x65, (byte) 0x20, (byte) 0x31, (byte) 0x73,
             (byte) 0x74, (byte) 0x20, (byte) 0x6c, (byte) 0x69, (byte) 0x6e,
@@ -172,16 +191,16 @@ public class SpecificCharacterSetTest {
             (byte) 0x3D };
 
     private static final byte[] CHINESE_LONG_TEXT_GB2312_BYTES = {
-            (byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x31,
-            (byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
+            (byte) 0x31, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29,
+            (byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
             (byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7,
             (byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A,
-            (byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x32,
-            (byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
+            (byte) 0x32, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29,
+            (byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xB6, (byte) 0xFE,
             (byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7,
             (byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A,
-            (byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x33,
-            (byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
+            (byte) 0x33, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29,
+            (byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xC8, (byte) 0xFD,
             (byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7,
             (byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A };
 
@@ -395,7 +414,7 @@ public void testEncodeKoreanLongText() {
     @Test
     public void testDecodeKoreanLongText() {
         assertEquals(KOREAN_LONG_TEXT,
-                ksx1001().decode(KOREAN_LONG_TEXT_BYTES));
+                ksx1001().decode(KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES));
     }
 
     @Test