Skip to content

Commit

Permalink
Fix dcm4che#976 : ISO 2022 text encoding issue
Browse files Browse the repository at this point in the history
 * Remove escape sequence(G0-DESIGNATE ASCII) of Codec.KS_X_1001 and
     Codec.GB2312
 * Update the Chinese test pattern with DICOM PS3.5 2021e
 * Divide the Korean test pattern into two parts, encode and decode

The old source code incorrectly assumed that a codec's GR (G1) characters
could be used even while that codec's G0 escape sequence was in effect, and
vice versa. This caused problems when complex switching between character
sets was needed, as with Japanese.
  • Loading branch information
itr-tert committed Jan 24, 2022
1 parent d2d43d4 commit 175cd6e
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@

/**
* @author Gunter Zeilinger (gunterze@protonmail.com)
* @author Itr Tert (itr.tert@gmail.com)
*/
public class SpecificCharacterSet {

Expand Down Expand Up @@ -87,8 +88,8 @@ public String toText(String s) {
TIS_620(true, 0x2842, 0x2d54, 1),
JIS_X_208(false, 0x2442, 0, 1),
JIS_X_212(false, 0x242844, 0, 2),
KS_X_1001(false, 0x2842, 0x242943, -1),
GB2312(false, 0x2842, 0x242941, -1),
KS_X_1001(false, 0, 0x242943, -1),
GB2312(false, 0, 0x242941, -1),
UTF_8(true, 0, 0, -1),
GB18030(false, 0, 0, -1);

Expand Down Expand Up @@ -241,6 +242,10 @@ public String toText(String s) {
}
}

/**
 * Restricts which byte range the output of {@code Encoder.encode} may occupy.
 * <ul>
 * <li>{@code G0} - every encoded byte must be non-negative as a signed
 * {@code byte} (high bit clear); otherwise encoding fails with a
 * {@code CharacterCodingException}.</li>
 * <li>{@code G1} - every encoded byte must be negative as a signed
 * {@code byte} (high bit set); otherwise encoding fails the same way.</li>
 * <li>{@code Both} - no range check is applied to the encoded bytes.</li>
 * </ul>
 * The check covers only the bytes written after the escape sequence, not the
 * escape sequence itself.
 */
private enum G0G1 {
G0, G1, Both
}

private static final class Encoder {
final Codec codec;
final CharsetEncoder encoder;
Expand All @@ -251,20 +256,36 @@ public Encoder(Codec codec) {
}

public boolean encode(CharBuffer cb, ByteBuffer bb, int escSeq,
CodingErrorAction errorAction) {
G0G1 useRange, CodingErrorAction errorAction) {
encoder.onMalformedInput(errorAction)
.onUnmappableCharacter(errorAction)
.reset();
int cbmark = cb.position();
int bbmark = bb.position();
try {
escSeq(bb, escSeq);
int graphicCharStart = bb.position();
CoderResult cr = encoder.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = encoder.flush(bb);
if (!cr.isUnderflow())
cr.throwException();

if (useRange == G0G1.G0) {
for (int i = graphicCharStart, end = bb.position(); i < end; ++i) {
if (0 > bb.get(i)) {
throw new CharacterCodingException();
}
}
} else if (useRange == G0G1.G1) {
for (int i = graphicCharStart, end = bb.position(); i < end; ++i) {
if (0 <= bb.get(i)) {
throw new CharacterCodingException();
}
}
}
// if useRange == G0G1.Both, then do nothing
} catch (CharacterCodingException x) {
SafeBuffer.position(cb, cbmark);
SafeBuffer.position(bb, bbmark);
Expand Down Expand Up @@ -305,14 +326,14 @@ public byte[] encode(String val, String delimiters) {
ByteBuffer bb = ByteBuffer.wrap(buf);
// try to encode whole string value with character set specified
// by value1 of (0008,0005) Specific Character Set
if (!enc1.encode(cb, bb, 0, CodingErrorAction.REPORT)) {
if (!enc1.encode(cb, bb, 0, G0G1.Both, CodingErrorAction.REPORT)) {
// split whole string value according VR specific delimiters
// and try to encode each component separately
Encoder[] encs = new Encoder[codecs.length];
encs[0] = enc1;
encs[1] = encoder(cachedEncoder2, codecs[1]);
StringTokenizer comps = new StringTokenizer(val, delimiters, true);
buf = new byte[2 * strlen + 4 * (comps.countTokens() + 1)];
buf = new byte[(2 + 4) * strlen];
bb = ByteBuffer.wrap(buf);
int[] cur = { 0, 0 };
while (comps.hasMoreTokens()) {
Expand All @@ -332,25 +353,26 @@ public byte[] encode(String val, String delimiters) {

private void encodeComponent(Encoder[] encs, CharBuffer cb, ByteBuffer bb, int[] cur) {
// try to encode component with current active character of G1
if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, CodingErrorAction.REPORT))
if (codecs[cur[1]].getEscSeq1() != 0 && encs[cur[1]].encode(cb, bb, 0, G0G1.G1, CodingErrorAction.REPORT))
return;

// try to encode component with current active character set of G0, if different to G1
if ((codecs[cur[1]].getEscSeq1() == 0 || codecs[cur[1]].getEscSeq0() != codecs[cur[0]].getEscSeq0())
&& encs[cur[0]].encode(cb, bb, 0, CodingErrorAction.REPORT))
&& encs[cur[0]].encode(cb, bb, 0, G0G1.G0, CodingErrorAction.REPORT))
return;

int next = encs.length;
while (--next >= 0) {
if (encs[next] == null)
encs[next] = new Encoder(codecs[next]);
if (codecs[next].getEscSeq1() != 0) {
if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), CodingErrorAction.REPORT)) {
if (encs[next].encode(cb, bb, codecs[next].getEscSeq1(), G0G1.G1, CodingErrorAction.REPORT)) {
cur[1] = next;
break;
}
} else {
if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), CodingErrorAction.REPORT)) {
}
if (codecs[next].getEscSeq0() != 0) {
if (encs[next].encode(cb, bb, codecs[next].getEscSeq0(), G0G1.G0, CodingErrorAction.REPORT)) {
cur[0] = next;
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ public class SpecificCharacterSetTest {
"Zhang^XiaoDong=张^小东=";
private static final String CHINESE_LONG_TEXT_GB2312 =
"1.第一行文字。\r\n" +
"2.第一行文字\r\n" +
"3.第一行文字\r\n";
"2.第二行文字\r\n" +
"3.第三行文字\r\n";
private static final String CHINESE_PERSON_NAME_UTF8 =
"Wang^XiaoDong=王^小東=";
private static final String CHINESE_PERSON_NAME_GB18030 =
Expand Down Expand Up @@ -144,6 +144,25 @@ public class SpecificCharacterSetTest {
(byte) 0xb1, (byte) 0xe6, (byte) 0xb5, (byte) 0xbf };

// Korean long-text sample in which the escape sequence ESC $ ) C
// (0x1b 0x24 0x29 0x43) is written immediately before each run of
// high-bit-set double-byte characters rather than at the start of the value.
// The single-byte portions are plain ASCII and read:
//   "The 1st line includes " <ESC $ ) C> <2 double-byte chars> ".\r\n"
//   "The 2nd line includes " <ESC $ ) C> <2 double-byte chars> ", too.\r\n"
//   "The 3rd line."
// NOTE(review): presumably the expected output of the encode test, while the
// decode test uses KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES - confirm.
private static final byte[] KOREAN_LONG_TEXT_BYTES = {
(byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20, (byte) 0x31,
(byte) 0x73, (byte) 0x74, (byte) 0x20, (byte) 0x6c, (byte) 0x69,
(byte) 0x6e, (byte) 0x65, (byte) 0x20, (byte) 0x69, (byte) 0x6e,
(byte) 0x63, (byte) 0x6c, (byte) 0x75, (byte) 0x64, (byte) 0x65,
(byte) 0x73, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x29,
(byte) 0x43, (byte) 0xb1, (byte) 0xe6, (byte) 0xb5, (byte) 0xbf,
(byte) 0x2e, (byte) 0x0d, (byte) 0x0a, (byte) 0x54, (byte) 0x68,
(byte) 0x65, (byte) 0x20, (byte) 0x32, (byte) 0x6e, (byte) 0x64,
(byte) 0x20, (byte) 0x6c, (byte) 0x69, (byte) 0x6e, (byte) 0x65,
(byte) 0x20, (byte) 0x69, (byte) 0x6e, (byte) 0x63, (byte) 0x6c,
(byte) 0x75, (byte) 0x64, (byte) 0x65, (byte) 0x73, (byte) 0x20,
(byte) 0x1b, (byte) 0x24, (byte) 0x29, (byte) 0x43, (byte) 0xb1,
(byte) 0xe6, (byte) 0xb5, (byte) 0xbf, (byte) 0x2c, (byte) 0x20,
(byte) 0x74, (byte) 0x6f, (byte) 0x6f, (byte) 0x2e, (byte) 0x0d,
(byte) 0x0a, (byte) 0x54, (byte) 0x68, (byte) 0x65, (byte) 0x20,
(byte) 0x33, (byte) 0x72, (byte) 0x64, (byte) 0x20, (byte) 0x6c,
(byte) 0x69, (byte) 0x6e, (byte) 0x65, (byte) 0x2e };

private static final byte[] KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES = {
(byte) 0x1b, (byte) 0x24, (byte) 0x29, (byte) 0x43, (byte) 0x54,
(byte) 0x68, (byte) 0x65, (byte) 0x20, (byte) 0x31, (byte) 0x73,
(byte) 0x74, (byte) 0x20, (byte) 0x6c, (byte) 0x69, (byte) 0x6e,
Expand Down Expand Up @@ -172,16 +191,16 @@ public class SpecificCharacterSetTest {
(byte) 0x3D };

private static final byte[] CHINESE_LONG_TEXT_GB2312_BYTES = {
(byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x31,
(byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
(byte) 0x31, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29,
(byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
(byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7,
(byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A,
(byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x32,
(byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
(byte) 0x32, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29,
(byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xB6, (byte) 0xFE,
(byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7,
(byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A,
(byte) 0x1B, (byte) 0x24, (byte) 0x29, (byte) 0x41, (byte) 0x33,
(byte) 0x2E, (byte) 0xB5, (byte) 0xDA, (byte) 0xD2, (byte) 0xBB,
(byte) 0x33, (byte) 0x2e, (byte) 0x1B, (byte) 0x24, (byte) 0x29,
(byte) 0x41, (byte) 0xB5, (byte) 0xDA, (byte) 0xC8, (byte) 0xFD,
(byte) 0xD0, (byte) 0xD0, (byte) 0xCE, (byte) 0xC4, (byte) 0xD7,
(byte) 0xD6, (byte) 0xA1, (byte) 0xA3, (byte) 0x0D, (byte) 0x0A };

Expand Down Expand Up @@ -395,7 +414,7 @@ public void testEncodeKoreanLongText() {
@Test
public void testDecodeKoreanLongText() {
assertEquals(KOREAN_LONG_TEXT,
ksx1001().decode(KOREAN_LONG_TEXT_BYTES));
ksx1001().decode(KOREAN_LONG_TEXT_NO_EXPLICIT_ESCSEQ_BYTES));
}

@Test
Expand Down

0 comments on commit 175cd6e

Please sign in to comment.