From 175cd6e58771a77549d5bbd15f3bc8bfd0fa419a Mon Sep 17 00:00:00 2001 From: itr tert Date: Mon, 24 Jan 2022 20:56:04 +0900 Subject: [PATCH] Fix #976 : ISO 2022 text encoding issue * Remove the G0 escape sequence (G0-DESIGNATE ASCII) from Codec.KS_X_1001 and Codec.GB2312 * Update the Chinese test pattern with DICOM PS3.5 2021e * Divide the Korean test pattern into two parts, encode and decode The old source code wrongly assumed that a codec's GR (G1) characters could be used when only the G0 escape sequence of that codec had been emitted, and vice versa. This caused problems when complex switching between character sets was needed, as with Japanese. --- .../dcm4che3/data/SpecificCharacterSet.java | 42 ++++++++++++++----- .../data/SpecificCharacterSetTest.java | 37 ++++++++++++---- 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/dcm4che-core/src/main/java/org/dcm4che3/data/SpecificCharacterSet.java b/dcm4che-core/src/main/java/org/dcm4che3/data/SpecificCharacterSet.java index 1b139bcb26..dc9fa74b5f 100644 --- a/dcm4che-core/src/main/java/org/dcm4che3/data/SpecificCharacterSet.java +++ b/dcm4che-core/src/main/java/org/dcm4che3/data/SpecificCharacterSet.java @@ -53,6 +53,7 @@ /** * @author Gunter Zeilinger (gunterze@protonmail.com) + * @author Itr Tert (itr.tert@gmail.com) */ public class SpecificCharacterSet { @@ -87,8 +88,8 @@ public String toText(String s) { TIS_620(true, 0x2842, 0x2d54, 1), JIS_X_208(false, 0x2442, 0, 1), JIS_X_212(false, 0x242844, 0, 2), - KS_X_1001(false, 0x2842, 0x242943, -1), - GB2312(false, 0x2842, 0x242941, -1), + KS_X_1001(false, 0, 0x242943, -1), + GB2312(false, 0, 0x242941, -1), UTF_8(true, 0, 0, -1), GB18030(false, 0, 0, -1); @@ -241,6 +242,10 @@ public String toText(String s) { } } + private enum G0G1 { + G0, G1, Both + } + private static final class Encoder { final Codec codec; final CharsetEncoder encoder; @@ -251,7 +256,7 @@ public Encoder(Codec codec) { } public boolean encode(CharBuffer cb, ByteBuffer bb, int escSeq, - CodingErrorAction errorAction) { + G0G1 useRange, 
CodingErrorAction errorAction) { encoder.onMalformedInput(errorAction) .onUnmappableCharacter(errorAction) .reset(); @@ -259,12 +264,28 @@ public boolean encode(CharBuffer cb, ByteBuffer bb, int escSeq, int bbmark = bb.position(); try { escSeq(bb, escSeq); + int graphicCharStart = bb.position(); CoderResult cr = encoder.encode(cb, bb, true); if (!cr.isUnderflow()) cr.throwException(); cr = encoder.flush(bb); if (!cr.isUnderflow()) cr.throwException(); + + if (useRange == G0G1.G0) { + for (int i = graphicCharStart, end = bb.position(); i < end; ++i) { + if (0 > bb.get(i)) { + throw new CharacterCodingException(); + } + } + } else if (useRange == G0G1.G1) { + for (int i = graphicCharStart, end = bb.position(); i < end; ++i) { + if (0 <= bb.get(i)) { + throw new CharacterCodingException(); + } + } + } + // if useRange == G0G1.Both, then do nothing } catch (CharacterCodingException x) { SafeBuffer.position(cb, cbmark); SafeBuffer.position(bb, bbmark); @@ -305,14 +326,14 @@ public byte[] encode(String val, String delimiters) { ByteBuffer bb = ByteBuffer.wrap(buf); // try to encode whole string value with character set specified // by value1 of (0008,0005) Specific Character Set - if (!enc1.encode(cb, bb, 0, CodingErrorAction.REPORT)) { + if (!enc1.encode(cb, bb, 0, G0G1.Both, CodingErrorAction.REPORT)) { // split whole string value according VR specific delimiters // and try to encode each component separately Encoder[] encs = new Encoder[codecs.length]; encs[0] = enc1; encs[1] = encoder(cachedEncoder2, codecs[1]); StringTokenizer comps = new StringTokenizer(val, delimiters, true); - buf = new byte[2 * strlen + 4 * (comps.countTokens() + 1)]; + buf = new byte[(2 + 4) * strlen]; bb = ByteBuffer.wrap(buf); int[] cur = { 0, 0 }; while (comps.hasMoreTokens()) { @@ -332,12 +353,12 @@ public byte[] encode(String val, String delimiters) { private void encodeComponent(Encoder[] encs, CharBuffer cb, ByteBuffer bb, int[] cur) { // try to encode component with current active 
character of G1 - if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, CodingErrorAction.REPORT)) + if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, G0G1.G1, CodingErrorAction.REPORT)) return; // try to encode component with current active character set of G0, if different to G1 if ((codecs[cur[1]].getEscSeq1() == 0 || codecs[cur[1]].getEscSeq0() != codecs[cur[0]].getEscSeq0()) - && encs[cur[0]].encode(cb, bb, 0, CodingErrorAction.REPORT)) + && encs[cur[0]].encode(cb, bb, 0, G0G1.G0, CodingErrorAction.REPORT)) return; int next = encs.length; @@ -345,12 +366,13 @@ private void encodeComponent(Encoder[] encs, CharBuffer cb, ByteBuffer bb, int[] if (encs[next] == null) encs[next] = new Encoder(codecs[next]); if (codecs[next].getEscSeq1() != 0) { - if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), CodingErrorAction.REPORT)) { + if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), G0G1.G1, CodingErrorAction.REPORT)) { cur[1] = next; break; } - } else { - if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), CodingErrorAction.REPORT)) { + } + if (codecs[next].getEscSeq0() != 0) { + if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), G0G1.G0, CodingErrorAction.REPORT)) { cur[0] = next; break; } diff --git a/dcm4che-core/src/test/java/org/dcm4che3/data/SpecificCharacterSetTest.java b/dcm4che-core/src/test/java/org/dcm4che3/data/SpecificCharacterSetTest.java index 9dcb3d58e3..63b43353d0 100644 --- a/dcm4che-core/src/test/java/org/dcm4che3/data/SpecificCharacterSetTest.java +++ b/dcm4che-core/src/test/java/org/dcm4che3/data/SpecificCharacterSetTest.java @@ -70,8 +70,8 @@ public class SpecificCharacterSetTest { "Zhang^XiaoDong=张^小东="; private static final String CHINESE_LONG_TEXT_GB2312 = "1.第一行文字。\r\n" + - "2.第一行文字。\r\n" + - "3.第一行文字。\r\n"; + "2.第二行文字。\r\n" + + "3.第三行文字。\r\n"; private static final String CHINESE_PERSON_NAME_UTF8 = "Wang^XiaoDong=王^小東="; private static final String CHINESE_PERSON_NAME_GB18030 = @@ 
-144,6 +144,25 @@ public class SpecificCharacterSetTest { (byte) 0xb1, (byte) 0xe6, (byte) 0xb5, (byte) 0xbf }; private static final byte[] KOREAN_LONG_TEXT_BYTES = { + (byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20, (byte) 0x31, + (byte) 0x73, (byte) 0x74, (byte) 0x20, (byte) 0x6c, (byte) 0x69, + (byte) 0x6e, (byte) 0x65, (byte) 0x20, (byte) 0x69, (byte) 0x6e, + (byte) 0x63, (byte) 0x6c, (byte) 0x75, (byte) 0x64, (byte) 0x65, + (byte) 0x73, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x29, + (byte) 0x43, (byte) 0xb1, (byte) 0xe6, (byte) 0xb5, (byte) 0xbf, + (byte) 0x2e, (byte) 0x0d, (byte) 0x0a, (byte) 0x54, (byte) 0x68, + (byte) 0x65, (byte) 0x20, (byte) 0x32, (byte) 0x6e, (byte) 0x64, + (byte) 0x20, (byte) 0x6c, (byte) 0x69, (byte) 0x6e, (byte) 0x65, + (byte) 0x20, (byte) 0x69, (byte) 0x6e, (byte) 0x63, (byte) 0x6c, + (byte) 0x75, (byte) 0x64, (byte) 0x65, (byte) 0x73, (byte) 0x20, + (byte) 0x1b, (byte) 0x24, (byte) 0x29, (byte) 0x43, (byte) 0xb1, + (byte) 0xe6, (byte) 0xb5, (byte) 0xbf, (byte) 0x2c, (byte) 0x20, + (byte) 0x74, (byte) 0x6f, (byte) 0x6f, (byte) 0x2e, (byte) 0x0d, + (byte) 0x0a, (byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20, + (byte) 0x33, (byte) 0x72, (byte) 0x64, (byte) 0x20, (byte) 0x6c, + (byte) 0x69, (byte) 0x6e, (byte) 0x65, (byte) 0x2e }; + + private static final byte[] KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES = { (byte) 0x1b, (byte) 0x24, (byte) 0x29, (byte) 0x43, (byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20, (byte) 0x31, (byte) 0x73, (byte) 0x74, (byte) 0x20, (byte) 0x6c, (byte) 0x69, (byte) 0x6e, @@ -172,16 +191,16 @@ public class SpecificCharacterSetTest { (byte) 0x3D }; private static final byte[] CHINESE_LONG_TEXT_GB2312_BYTES = { - (byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x31, - (byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB, + (byte) 0x31, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29, + (byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB, (byte) 0xD0, (byte) 
0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7, (byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A, - (byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x32, - (byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB, + (byte) 0x32, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29, + (byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xB6, (byte) 0xFE, (byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7, (byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A, - (byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x33, - (byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB, + (byte) 0x33, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29, + (byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xC8, (byte) 0xFD, (byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7, (byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A }; @@ -395,7 +414,7 @@ public void testEncodeKoreanLongText() { @Test public void testDecodeKoreanLongText() { assertEquals(KOREAN_LONG_TEXT, - ksx1001().decode(KOREAN_LONG_TEXT_BYTES)); + ksx1001().decode(KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES)); } @Test