From b052bb57d1d50d70196db4636c175fb7ba6e3571 Mon Sep 17 00:00:00 2001 From: hatf0 Date: Mon, 15 Nov 2021 15:19:22 -0500 Subject: [PATCH 1/4] Add simple base64 de/encoding support --- source/mir/base64.d | 317 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 source/mir/base64.d diff --git a/source/mir/base64.d b/source/mir/base64.d new file mode 100644 index 00000000..08933e46 --- /dev/null +++ b/source/mir/base64.d @@ -0,0 +1,317 @@ +/++ +$(H1 @nogc Simple Base64 parsing) + +License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0) +Authors: Harrison Ford +Copyright: 2021 Harrison Ford, Kaleidic Associates Advisory Limited, Symmetry Investments ++/ +module mir.base64; +import mir.ndslice.topology; +import core.bitop : bswap; + +package static immutable base64DecodeInvalidCharMsg = "Invalid character encountered."; +package static immutable base64DecodeInvalidLenMsg = "Cannot decode a buffer with given length (not a multiple of 4, missing padding?)"; +version(D_Exceptions) { + package static immutable base64DecodeInvalidCharException = new Exception(base64DecodeInvalidCharMsg); + package static immutable base64DecodeInvalidLenException = new Exception(base64DecodeInvalidLenMsg); +} + +// NOTE: I do not know if this would work on big-endian systems. +// Needs further testing to figure out if it *does* work on them. + +// Technique borrowed from http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html#branchless-code-for-lookup-table +private char lookup_encoding(ubyte i) @safe @nogc pure { + assert(i < 64); + + ubyte shift; + + if (i < 26) + { + // range A-Z + shift = 'A'; + } + else if (i >= 26 && i < 52) + { + // range a-z + shift = 'a' - 26; + } + else if (i >= 52 && i < 62) + { + // range 0-9 + shift = cast(ubyte)('0' - 52); + } + else if (i == 62) + { + // character plus + shift = cast(ubyte)('+' - 62); + } + else if (i == 63) + { + // character slash + shift = cast(ubyte)('/' - 63); + } + + return cast(char)(i + shift); +} + +// Do the inverse of above (convert an ASCII value into the Base64 character set) +private ubyte lookup_decoding(char i) @safe @nogc pure +{ + // Branching bad, but this isn't performance sensitive + if (i <= 'Z' && i >= 'A') { + return cast(ubyte)(i - 'A'); + } + else if (i <= 'z' && i >= 'a') { + return cast(ubyte)(i - 'a' + 26); + } + else if (i <= '9' && i >= '0') { + return cast(ubyte)(i - '0' + 52); + } + else if (i == '+') { + return 62; + } + else if (i == '/') { + return 63; + } + // Just return 0 for padding, + // as it typically means nothing. + else if (i == '=') { + return 0; + } + else { + version(D_Exceptions) { + throw base64DecodeInvalidCharException; + } else { + assert(0, base64DecodeInvalidCharMsg); + } + } + +} + +/++ +Decode a Base64 encoded value, returning the buffer. ++/ +ubyte[] decodeBase64(scope ubyte[] buf) @safe pure +{ + import mir.appender : scopedBuffer; + auto app = scopedBuffer!ubyte; + decodeBase64(buf, app); + return app.data.dup; +} + +/++ +Decode a Base64 encoded value, placing the result onto an Appender. ++/ +void decodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @nogc pure +{ + // We expect data should be well-formed (with padding), + // so we should throw if it is not well-formed. + if (input.length % 4 != 0) + { + version(D_Exceptions) { + throw base64DecodeInvalidLenException; + } else { + assert(0, base64DecodeInvalidLenMsg); + } + } + foreach(group; input.bytegroup!(4, uint).map!bswap) + { + // We only expect valid ASCII values for these, + // hence the 0x7f. + const(ubyte) a = lookup_decoding((group >> 24) & 0x7f); + const(ubyte) b = lookup_decoding((group >> 16) & 0x7f); + const(ubyte) c = lookup_decoding((group >> 8) & 0x7f); + const(ubyte) d = lookup_decoding((group) & 0x7f); + + // We do the inverse of how we encoded it... + uint transformed_group = (a << 26) | (b << 20) | (c << 14) | (d << 8); + + const(ubyte) t_a = (transformed_group >> 24) & 0xff; + const(ubyte) t_b = (transformed_group >> 16) & 0xff; + const(ubyte) t_c = (transformed_group >> 8) & 0xff; + const(ubyte) t_d = (transformed_group) & 0xff; + + // We should *always* have enough for at least + // one, but we don't need to have enough for the rest.. + appender.put(t_a); + + // Only emit transformed groups if we have enough data for them. + if (t_b == 0 && t_c == 0 && t_d == 0) + { + return; + } + else if (t_c == 0 && t_d == 0) + { + appender.put(t_b); + } + else if (t_d == 0) + { + appender.put(t_b); + appender.put(t_c); + } + else + { + appender.put(t_b); + appender.put(t_c); + appender.put(t_d); + } + } +} + +/// Test decoding of data which has a length which can be +/// cleanly decoded. +unittest +{ + { + ubyte[] data = cast(ubyte[])"QUJD"; + assert(data.decodeBase64 == "ABC"); + } + + { + ubyte[] data = cast(ubyte[])"QQ=="; + assert(data.decodeBase64 == "A"); + } + + { + ubyte[] data = cast(ubyte[])"YSBiIGMgZCBlIGYgZyBoIGkgaiBrIGwgbSBuIG8gcCBxIHIgcyB0IHUgdiB3IHggeSB6"; + assert(data.decodeBase64 == "a b c d e f g h i j k l m n o p q r s t u v w x y z"); + } + + { + ubyte[] data = cast(ubyte[])"LCAuIDsgLyBbICcgXSBcID0gLSAwIDkgOCA3IDYgNSA0IDMgMiAxIGAgfiAhIEAgIyAkICUgXiAmICogKCApIF8gKyB8IDogPCA+ID8="; + assert(data.decodeBase64 == ", . ; / [ ' ] \\ = - 0 9 8 7 6 5 4 3 2 1 ` ~ ! @ # $ % ^ & * ( ) _ + | : < > ?"); + } +} + +/++ +Encode a ubyte array as Base64, returning the encoded value. ++/ +ubyte[] encodeBase64(scope ubyte[] buf) @safe pure +{ + import mir.appender : scopedBuffer; + auto app = scopedBuffer!ubyte; + encodeBase64(buf, app); + return app.data.dup; +} + +/++ +Encode a ubyte array as Base64, placing the result onto an Appender. ++/ +void encodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @nogc pure +{ + // Slice our input array so that n % 3 == 0 (we have a multiple of 3) + // If we have less then 3, then this is effectively a no-op (will result in a 0-length slice) + ubyte[] window = input[0 .. input.length - (input.length % 3)]; + foreach(group; window.bytegroup!(3, uint).map!bswap) { + const(ubyte) a = (group >> 26) & 0x3f; + const(ubyte) b = (group >> 20) & 0x3f; + const(ubyte) c = (group >> 14) & 0x3f; + const(ubyte) d = (group >> 8) & 0x3f; + + appender.put(a.lookup_encoding); + appender.put(b.lookup_encoding); + appender.put(c.lookup_encoding); + appender.put(d.lookup_encoding); + } + + // If it's a clean multiple of 3, then it requires no padding. + // If not, then we need to add padding. + if (input.length % 3 != 0) + { + window = input[window.length .. input.length]; + + uint group = (window[0] << 24); + + if (window.length == 1) { + const(ubyte) a = (group >> 26) & 0x3f; + const(ubyte) b = (group >> 20) & 0x3f; + appender.put(a.lookup_encoding); + appender.put(b.lookup_encoding); + appender.put('='); + appender.put('='); + } + else { + // Just in case math fails or something + assert(window.length == 2); + + group |= (window[1] << 16); + const(ubyte) a = (group >> 26) & 0x3f; + const(ubyte) b = (group >> 20) & 0x3f; + const(ubyte) c = (group >> 14) & 0x3f; + appender.put(a.lookup_encoding); + appender.put(b.lookup_encoding); + appender.put(c.lookup_encoding); + appender.put('='); + } + } +} + +/// Test encoding of data which has a length that can be cleanly +/// encoded. +unittest +{ + // 3 bytes + { + ubyte[] data = cast(ubyte[])"ABC"; + assert(data.encodeBase64 == cast(ubyte[])"QUJD"); + } + + // 6 bytes + { + ubyte[] data = cast(ubyte[])"ABCDEF"; + assert(data.encodeBase64 == cast(ubyte[])"QUJDREVG"); + } + + // 9 bytes + { + ubyte[] data = cast(ubyte[])"ABCDEFGHI"; + assert(data.encodeBase64 == cast(ubyte[])"QUJDREVGR0hJ"); + } + + // 12 bytes + { + ubyte[] data = cast(ubyte[])"ABCDEFGHIJKL"; + assert(data.encodeBase64 == cast(ubyte[])"QUJDREVGR0hJSktM"); + } +} + +/// Test encoding of data which has a length which CANNOT be cleanly encoded. +/// This typically means that there's padding. +unittest +{ + // 1 byte + { + ubyte[] data = cast(ubyte[])"A"; + assert(data.encodeBase64 == cast(ubyte[])"QQ=="); + } + // 2 bytes + { + ubyte[] data = cast(ubyte[])"AB"; + assert(data.encodeBase64 == cast(ubyte[])"QUI="); + } + // 4 bytes + { + ubyte[] data = [0xDE, 0xAD, 0xBA, 0xBE]; + assert(data.encodeBase64 == cast(ubyte[])"3q26vg=="); + } + // 37 bytes + { + ubyte[] data = cast(ubyte[])"A Very Very Very Very Large Test Blob"; + assert(data.encodeBase64 == cast(ubyte[])"QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); + } +} + +/// Make sure we can decode what we encode. +unittest +{ + // Test an example string + { + enum ubyte[] data = cast(ubyte[])"abc123!?$*&()'-=@~"; + assert(data.encodeBase64.decodeBase64 == data); + } + // Test an example from Ion data + { + enum ubyte[] data = cast(ubyte[])"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + assert(data.encodeBase64.decodeBase64 == data); + } +} \ No newline at end of file From 74bc1d4a05ecf5823e491a9bebf0d51c2ef5969b Mon Sep 17 00:00:00 2001 From: hatf0 Date: Wed, 17 Nov 2021 10:24:31 -0500 Subject: [PATCH 2/4] Add fixes, improve coverage --- source/mir/base64.d | 274 ++++++++++++++++++++++++++++++-------------- 1 file changed, 191 insertions(+), 83 deletions(-) diff --git a/source/mir/base64.d b/source/mir/base64.d index 08933e46..c645ee2f 100644 --- a/source/mir/base64.d +++ b/source/mir/base64.d @@ -7,7 +7,6 @@ Copyright: 2021 Harrison Ford, Kaleidic Associates Advisory Limited, Symmetry In +/ module mir.base64; import mir.ndslice.topology; -import core.bitop : bswap; package static immutable base64DecodeInvalidCharMsg = "Invalid character encountered."; package static immutable base64DecodeInvalidLenMsg = "Cannot decode a buffer with given length (not a multiple of 4, missing padding?)"; @@ -20,7 +19,7 @@ version(D_Exceptions) { // Needs further testing to figure out if it *does* work on them. // Technique borrowed from http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html#branchless-code-for-lookup-table -private char lookup_encoding(ubyte i) @safe @nogc pure { +private char lookup_encoding(char PlusChar = '+', char SlashChar = '/')(ubyte i) @safe @nogc pure { assert(i < 64); ubyte shift; @@ -43,19 +42,19 @@ private char lookup_encoding(ubyte i) @safe @nogc pure { else if (i == 62) { // character plus - shift = cast(ubyte)('+' - 62); + shift = cast(ubyte)(PlusChar - 62); } else if (i == 63) { // character slash - shift = cast(ubyte)('/' - 63); + shift = cast(ubyte)(SlashChar - 63); } return cast(char)(i + shift); } // Do the inverse of above (convert an ASCII value into the Base64 character set) -private ubyte lookup_decoding(char i) @safe @nogc pure +private ubyte lookup_decoding(char PlusChar = '+', char SlashChar = '/')(char i) @safe @nogc pure { // Branching bad, but this isn't performance sensitive if (i <= 'Z' && i >= 'A') { @@ -67,10 +66,10 @@ private ubyte lookup_decoding(char i) @safe @nogc pure else if (i <= '9' && i >= '0') { return cast(ubyte)(i - '0' + 52); } - else if (i == '+') { + else if (i == PlusChar) { return 62; } - else if (i == '/') { + else if (i == SlashChar) { return 63; } // Just return 0 for padding, @@ -91,22 +90,25 @@ private ubyte lookup_decoding(char i) @safe @nogc pure /++ Decode a Base64 encoded value, returning the buffer. +/ -ubyte[] decodeBase64(scope ubyte[] buf) @safe pure +ubyte[] decodeBase64(char PlusChar = '+', char SlashChar = '/')(scope const(char)[] data) @safe pure { import mir.appender : scopedBuffer; auto app = scopedBuffer!ubyte; - decodeBase64(buf, app); + decodeBase64!(PlusChar, SlashChar)(data, app); return app.data.dup; } /++ Decode a Base64 encoded value, placing the result onto an Appender. +/ -void decodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @nogc pure +void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope const(char)[] data, + scope return ref Appender appender) @safe pure { + import mir.ndslice.slice : sliced; + import mir.ndslice.chunks : chunks; // We expect data should be well-formed (with padding), // so we should throw if it is not well-formed. - if (input.length % 4 != 0) + if (data.length % 4 != 0) { version(D_Exceptions) { throw base64DecodeInvalidLenException; @@ -114,47 +116,93 @@ void decodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @n assert(0, base64DecodeInvalidLenMsg); } } - foreach(group; input.bytegroup!(4, uint).map!bswap) + + ubyte[3] decodedByteGroup; + ubyte sz = 0; + auto groups = data.sliced.chunks(4); + for (size_t i = 0; i < groups.length; i++) { - // We only expect valid ASCII values for these, - // hence the 0x7f. - const(ubyte) a = lookup_decoding((group >> 24) & 0x7f); - const(ubyte) b = lookup_decoding((group >> 16) & 0x7f); - const(ubyte) c = lookup_decoding((group >> 8) & 0x7f); - const(ubyte) d = lookup_decoding((group) & 0x7f); - - // We do the inverse of how we encoded it... - uint transformed_group = (a << 26) | (b << 20) | (c << 14) | (d << 8); - - const(ubyte) t_a = (transformed_group >> 24) & 0xff; - const(ubyte) t_b = (transformed_group >> 16) & 0xff; - const(ubyte) t_c = (transformed_group >> 8) & 0xff; - const(ubyte) t_d = (transformed_group) & 0xff; - - // We should *always* have enough for at least - // one, but we don't need to have enough for the rest.. - appender.put(t_a); - - // Only emit transformed groups if we have enough data for them. - if (t_b == 0 && t_c == 0 && t_d == 0) + auto group = groups[i]; + + ubyte[4] decodedBytes; + decodedBytes[0] = lookup_decoding!(PlusChar, SlashChar)(group[0]); + decodedBytes[1] = lookup_decoding!(PlusChar, SlashChar)(group[1]); + + uint transformed_group = (decodedBytes[0] << 26) | (decodedBytes[1] << 20); + + // According to RFC4648 Section 3.3, we don't have to accept extra padding characters, + // and we can safely throw (and stay within spec). + // x=== is also invalid, so we can just throw on that here. + if (group[0] == '=' || group[1] == '=') { - return; + version(D_Exceptions) + throw base64DecodeInvalidCharException; + else + assert(0, base64DecodeInvalidCharMsg); } - else if (t_c == 0 && t_d == 0) + + // xx=(=)? + if (group[2] == '=') { - appender.put(t_b); - } - else if (t_d == 0) + // If we are not at the end of a string, according to RFC4648, + // we can safely treat a padding character as "non-alphabet data", + // and as such, we should throw. See RFC4648 Section 3.3 for more information + if (i != (groups.length - 1)) + { + version(D_Exceptions) + throw base64DecodeInvalidCharException; + else + assert(0, base64DecodeInvalidCharMsg); + } + + if (group[3] == '=') + { + // xx== + sz = 1; + } + // xx=x (invalid) + // Padding should not be in the middle of a chunk + else + { + version(D_Exceptions) + throw base64DecodeInvalidCharException; + else + assert(0, base64DecodeInvalidCharMsg); + } + } + // xxx= + else if (group[3] == '=') { - appender.put(t_b); - appender.put(t_c); + // If we are not at the end of a string, according to RFC4648, + // we can safely treat a padding character as "non-alphabet data", + // and as such, we should throw. See RFC4648 Section 3.3 for more information + if (i != (groups.length - 1)) + { + version(D_Exceptions) + throw base64DecodeInvalidCharException; + else + assert(0, base64DecodeInvalidCharMsg); + } + + decodedBytes[2] = lookup_decoding!(PlusChar, SlashChar)(group[2]); + transformed_group |= (decodedBytes[2] << 14); + sz = 2; } - else + // xxxx + else { - appender.put(t_b); - appender.put(t_c); - appender.put(t_d); + decodedBytes[2] = lookup_decoding!(PlusChar, SlashChar)(group[2]); + decodedBytes[3] = lookup_decoding!(PlusChar, SlashChar)(group[3]); + transformed_group |= ((decodedBytes[2] << 14) | (decodedBytes[3] << 8)); + sz = 3; } + + decodedByteGroup[0] = (transformed_group >> 24) & 0xff; + decodedByteGroup[1] = (transformed_group >> 16) & 0xff; + decodedByteGroup[2] = (transformed_group >> 8) & 0xff; + + // Only emit the transformed bytes that we got data for. + appender.put(decodedByteGroup[0 .. sz]); } } @@ -163,55 +211,113 @@ void decodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @n unittest { { - ubyte[] data = cast(ubyte[])"QUJD"; + enum data = "QUJD"; assert(data.decodeBase64 == "ABC"); } { - ubyte[] data = cast(ubyte[])"QQ=="; + enum data = "QQ=="; assert(data.decodeBase64 == "A"); } { - ubyte[] data = cast(ubyte[])"YSBiIGMgZCBlIGYgZyBoIGkgaiBrIGwgbSBuIG8gcCBxIHIgcyB0IHUgdiB3IHggeSB6"; + enum data = "YSBiIGMgZCBlIGYgZyBoIGkgaiBrIGwgbSBuIG8gcCBxIHIgcyB0IHUgdiB3IHggeSB6"; assert(data.decodeBase64 == "a b c d e f g h i j k l m n o p q r s t u v w x y z"); } { - ubyte[] data = cast(ubyte[])"LCAuIDsgLyBbICcgXSBcID0gLSAwIDkgOCA3IDYgNSA0IDMgMiAxIGAgfiAhIEAgIyAkICUgXiAmICogKCApIF8gKyB8IDogPCA+ID8="; + enum data = "LCAuIDsgLyBbICcgXSBcID0gLSAwIDkgOCA3IDYgNSA0IDMgMiAxIGAgfiAhIEAgIyAkICUgXiAmICogKCApIF8gKyB8IDogPCA+ID8="; assert(data.decodeBase64 == ", . ; / [ ' ] \\ = - 0 9 8 7 6 5 4 3 2 1 ` ~ ! @ # $ % ^ & * ( ) _ + | : < > ?"); } + + { + enum data = "AAA="; + assert(data.decodeBase64 == "\x00\x00"); + } + + { + enum data = "AAAABBCC"; + assert(data.decodeBase64 == "\x00\x00\x00\x04\x10\x82"); + } + + { + enum data = "AA=="; + assert(data.decodeBase64 == "\x00"); + } + + { + enum data = "AA/="; + assert(data.decodeBase64 == "\x00\x0f"); + } +} + +/// Test decoding invalid data +unittest +{ + void testFail(const(char)[] input) + { + bool thrown = false; + try { + ubyte[] decoded = input.decodeBase64; + } catch (Throwable t) { + thrown = true; + } + + assert(thrown); + } + + testFail("===A"); + testFail("A="); + testFail("AA="); + testFail("A=AA"); + testFail("AA=A"); + testFail("AA=A===="); + testFail("=AAA"); + testFail("AAA=QUJD"); + // This fails because we don't allow extra padding (than what is necessary) + testFail("AA======"); + // This fails because we don't allow padding before the end of the string (otherwise we'd have a side-channel) + testFail("QU==QUJD"); + testFail("QU======QUJD"); + // Invalid data that's out of the alphabet + testFail("!@##@@!@"); } /++ Encode a ubyte array as Base64, returning the encoded value. +/ -ubyte[] encodeBase64(scope ubyte[] buf) @safe pure +const(char)[] encodeBase64(char PlusChar = '+', char SlashChar = '/')(scope const(ubyte)[] buf) @safe pure { import mir.appender : scopedBuffer; - auto app = scopedBuffer!ubyte; - encodeBase64(buf, app); + // XXX: is a stringBuf more appropriate here? + auto app = scopedBuffer!char; + encodeBase64!(PlusChar, SlashChar)(buf, app); return app.data.dup; } /++ Encode a ubyte array as Base64, placing the result onto an Appender. +/ -void encodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @nogc pure +void encodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope const(ubyte)[] input, + scope return ref Appender appender) @safe pure { + import mir.ndslice.topology : bytegroup, map; + import core.bitop : bswap; // Slice our input array so that n % 3 == 0 (we have a multiple of 3) // If we have less then 3, then this is effectively a no-op (will result in a 0-length slice) - ubyte[] window = input[0 .. input.length - (input.length % 3)]; + char[4] encodedByteGroup; + const(ubyte)[] window = input[0 .. input.length - (input.length % 3)]; foreach(group; window.bytegroup!(3, uint).map!bswap) { const(ubyte) a = (group >> 26) & 0x3f; const(ubyte) b = (group >> 20) & 0x3f; const(ubyte) c = (group >> 14) & 0x3f; const(ubyte) d = (group >> 8) & 0x3f; - appender.put(a.lookup_encoding); - appender.put(b.lookup_encoding); - appender.put(c.lookup_encoding); - appender.put(d.lookup_encoding); + encodedByteGroup[0] = a.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[1] = b.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[2] = c.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[3] = d.lookup_encoding!(PlusChar, SlashChar); + appender.put(encodedByteGroup[]); } // If it's a clean multiple of 3, then it requires no padding. @@ -225,10 +331,11 @@ void encodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @n if (window.length == 1) { const(ubyte) a = (group >> 26) & 0x3f; const(ubyte) b = (group >> 20) & 0x3f; - appender.put(a.lookup_encoding); - appender.put(b.lookup_encoding); - appender.put('='); - appender.put('='); + encodedByteGroup[0] = a.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[1] = b.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[2] = '='; + encodedByteGroup[3] = '='; + appender.put(encodedByteGroup[]); } else { // Just in case math fails or something @@ -238,10 +345,11 @@ void encodeBase64(Appender)(scope ubyte[] input, ref Appender appender) @safe @n const(ubyte) a = (group >> 26) & 0x3f; const(ubyte) b = (group >> 20) & 0x3f; const(ubyte) c = (group >> 14) & 0x3f; - appender.put(a.lookup_encoding); - appender.put(b.lookup_encoding); - appender.put(c.lookup_encoding); - appender.put('='); + encodedByteGroup[0] = a.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[1] = b.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[2] = c.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[3] = '='; + appender.put(encodedByteGroup[]); } } } @@ -252,26 +360,26 @@ unittest { // 3 bytes { - ubyte[] data = cast(ubyte[])"ABC"; - assert(data.encodeBase64 == cast(ubyte[])"QUJD"); + enum data = cast(ubyte[])"ABC"; + assert(data.encodeBase64 == "QUJD"); } // 6 bytes { - ubyte[] data = cast(ubyte[])"ABCDEF"; - assert(data.encodeBase64 == cast(ubyte[])"QUJDREVG"); + enum data = cast(ubyte[])"ABCDEF"; + assert(data.encodeBase64 == "QUJDREVG"); } // 9 bytes { - ubyte[] data = cast(ubyte[])"ABCDEFGHI"; - assert(data.encodeBase64 == cast(ubyte[])"QUJDREVGR0hJ"); + enum data = cast(ubyte[])"ABCDEFGHI"; + assert(data.encodeBase64 == "QUJDREVGR0hJ"); } // 12 bytes { - ubyte[] data = cast(ubyte[])"ABCDEFGHIJKL"; - assert(data.encodeBase64 == cast(ubyte[])"QUJDREVGR0hJSktM"); + enum data = cast(ubyte[])"ABCDEFGHIJKL"; + assert(data.encodeBase64 == "QUJDREVGR0hJSktM"); } } @@ -281,23 +389,23 @@ unittest { // 1 byte { - ubyte[] data = cast(ubyte[])"A"; - assert(data.encodeBase64 == cast(ubyte[])"QQ=="); + enum data = cast(ubyte[])"A"; + assert(data.encodeBase64 == "QQ=="); } // 2 bytes { - ubyte[] data = cast(ubyte[])"AB"; - assert(data.encodeBase64 == cast(ubyte[])"QUI="); + enum data = cast(ubyte[])"AB"; + assert(data.encodeBase64 == "QUI="); } // 4 bytes { - ubyte[] data = [0xDE, 0xAD, 0xBA, 0xBE]; - assert(data.encodeBase64 == cast(ubyte[])"3q26vg=="); + enum data = [0xDE, 0xAD, 0xBA, 0xBE]; + assert(data.encodeBase64 == "3q26vg=="); } // 37 bytes { - ubyte[] data = cast(ubyte[])"A Very Very Very Very Large Test Blob"; - assert(data.encodeBase64 == cast(ubyte[])"QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); + enum data = cast(ubyte[])"A Very Very Very Very Large Test Blob"; + assert(data.encodeBase64 == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); } } @@ -306,12 +414,12 @@ unittest { // Test an example string { - enum ubyte[] data = cast(ubyte[])"abc123!?$*&()'-=@~"; + enum data = cast(ubyte[])"abc123!?$*&()'-=@~"; assert(data.encodeBase64.decodeBase64 == data); } // Test an example from Ion data { - enum ubyte[] data = cast(ubyte[])"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + enum data = cast(ubyte[])"a b c d e f g h i j k l m n o p q r s t u v w x y z"; assert(data.encodeBase64.decodeBase64 == data); } } \ No newline at end of file From 217898ee187ddce13a245cb5fd9e42633e3c3384 Mon Sep 17 00:00:00 2001 From: hatf0 Date: Wed, 17 Nov 2021 10:32:41 -0500 Subject: [PATCH 3/4] Fix build --- source/mir/base64.d | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/source/mir/base64.d b/source/mir/base64.d index c645ee2f..f305e514 100644 --- a/source/mir/base64.d +++ b/source/mir/base64.d @@ -104,8 +104,6 @@ Decode a Base64 encoded value, placing the result onto an Appender. void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope const(char)[] data, scope return ref Appender appender) @safe pure { - import mir.ndslice.slice : sliced; - import mir.ndslice.chunks : chunks; // We expect data should be well-formed (with padding), // so we should throw if it is not well-formed. if (data.length % 4 != 0) @@ -119,10 +117,12 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con ubyte[3] decodedByteGroup; ubyte sz = 0; - auto groups = data.sliced.chunks(4); - for (size_t i = 0; i < groups.length; i++) + + // We can't use mir.ndslice.chunk.chunks here, as it violates + // the scope requirements. + for (size_t i = 0; i < data.length; i += 4) { - auto group = groups[i]; + auto group = data[i .. (i + 4)]; ubyte[4] decodedBytes; decodedBytes[0] = lookup_decoding!(PlusChar, SlashChar)(group[0]); @@ -147,7 +147,7 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con // If we are not at the end of a string, according to RFC4648, // we can safely treat a padding character as "non-alphabet data", // and as such, we should throw. See RFC4648 Section 3.3 for more information - if (i != (groups.length - 1)) + if ((i / 4) != ((data.length / 4) - 1)) { version(D_Exceptions) throw base64DecodeInvalidCharException; @@ -176,7 +176,7 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con // If we are not at the end of a string, according to RFC4648, // we can safely treat a padding character as "non-alphabet data", // and as such, we should throw. See RFC4648 Section 3.3 for more information - if (i != (groups.length - 1)) + if ((i / 4) != ((data.length / 4) - 1)) { version(D_Exceptions) throw base64DecodeInvalidCharException; From 511ab90a173b6b2f553d74b0d4e0ee5293ae4bd4 Mon Sep 17 00:00:00 2001 From: hatf0 Date: Fri, 19 Nov 2021 10:59:06 -0500 Subject: [PATCH 4/4] Remove Kaledic from copyright, add fixes requested --- source/mir/base64.d | 133 +++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 52 deletions(-) diff --git a/source/mir/base64.d b/source/mir/base64.d index f305e514..9b754f93 100644 --- a/source/mir/base64.d +++ b/source/mir/base64.d @@ -3,7 +3,7 @@ $(H1 @nogc Simple Base64 parsing) License: $(HTTP www.apache.org/licenses/LICENSE-2.0, Apache-2.0) Authors: Harrison Ford -Copyright: 2021 Harrison Ford, Kaleidic Associates Advisory Limited, Symmetry Investments +Copyright: 2021 Harrison Ford, Symmetry Investments +/ module mir.base64; import mir.ndslice.topology; @@ -19,7 +19,7 @@ version(D_Exceptions) { // Needs further testing to figure out if it *does* work on them. // Technique borrowed from http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html#branchless-code-for-lookup-table -private char lookup_encoding(char PlusChar = '+', char SlashChar = '/')(ubyte i) @safe @nogc pure { +private char lookup_encoding(ubyte i, char plusChar = '+', char slashChar = '/') @safe @nogc pure { assert(i < 64); ubyte shift; @@ -42,19 +42,19 @@ private char lookup_encoding(char PlusChar = '+', char SlashChar = '/')(ubyte i) else if (i == 62) { // character plus - shift = cast(ubyte)(PlusChar - 62); + shift = cast(ubyte)(plusChar - 62); } else if (i == 63) { // character slash - shift = cast(ubyte)(SlashChar - 63); + shift = cast(ubyte)(slashChar - 63); } return cast(char)(i + shift); } // Do the inverse of above (convert an ASCII value into the Base64 character set) -private ubyte lookup_decoding(char PlusChar = '+', char SlashChar = '/')(char i) @safe @nogc pure +private ubyte lookup_decoding(char i, char plusChar = '+', char slashChar = '/') @safe @nogc pure { // Branching bad, but this isn't performance sensitive if (i <= 'Z' && i >= 'A') { @@ -66,10 +66,10 @@ private ubyte lookup_decoding(char PlusChar = '+', char SlashChar = '/')(char i) else if (i <= '9' && i >= '0') { return cast(ubyte)(i - '0' + 52); } - else if (i == PlusChar) { + else if (i == plusChar) { return 62; } - else if (i == SlashChar) { + else if (i == slashChar) { return 63; } // Just return 0 for padding, @@ -90,19 +90,21 @@ private ubyte lookup_decoding(char PlusChar = '+', char SlashChar = '/')(char i) /++ Decode a Base64 encoded value, returning the buffer. +/ -ubyte[] decodeBase64(char PlusChar = '+', char SlashChar = '/')(scope const(char)[] data) @safe pure +ubyte[] decodeBase64(scope const(char)[] data, char plusChar = '+', char slashChar = '/') @safe pure { import mir.appender : scopedBuffer; auto app = scopedBuffer!ubyte; - decodeBase64!(PlusChar, SlashChar)(data, app); + decodeBase64(data, app, plusChar, slashChar); return app.data.dup; } /++ Decode a Base64 encoded value, placing the result onto an Appender. +/ -void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope const(char)[] data, - scope return ref Appender appender) @safe pure +void decodeBase64(Appender)(scope const(char)[] data, + scope return ref Appender appender, + char plusChar = '+', + char slashChar = '/') @safe pure { // We expect data should be well-formed (with padding), // so we should throw if it is not well-formed. @@ -117,7 +119,7 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con ubyte[3] decodedByteGroup; ubyte sz = 0; - + // We can't use mir.ndslice.chunk.chunks here, as it violates // the scope requirements. for (size_t i = 0; i < data.length; i += 4) @@ -125,8 +127,8 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con auto group = data[i .. (i + 4)]; ubyte[4] decodedBytes; - decodedBytes[0] = lookup_decoding!(PlusChar, SlashChar)(group[0]); - decodedBytes[1] = lookup_decoding!(PlusChar, SlashChar)(group[1]); + decodedBytes[0] = lookup_decoding(group[0], plusChar, slashChar); + decodedBytes[1] = lookup_decoding(group[1], plusChar, slashChar); uint transformed_group = (decodedBytes[0] << 26) | (decodedBytes[1] << 20); @@ -184,15 +186,15 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con assert(0, base64DecodeInvalidCharMsg); } - decodedBytes[2] = lookup_decoding!(PlusChar, SlashChar)(group[2]); + decodedBytes[2] = lookup_decoding(group[2], plusChar, slashChar); transformed_group |= (decodedBytes[2] << 14); sz = 2; } // xxxx else { - decodedBytes[2] = lookup_decoding!(PlusChar, SlashChar)(group[2]); - decodedBytes[3] = lookup_decoding!(PlusChar, SlashChar)(group[3]); + decodedBytes[2] = lookup_decoding(group[2], plusChar, slashChar); + decodedBytes[3] = lookup_decoding(group[3], plusChar, slashChar); transformed_group |= ((decodedBytes[2] << 14) | (decodedBytes[3] << 8)); sz = 3; } @@ -208,7 +210,7 @@ void decodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con /// Test decoding of data which has a length which can be /// cleanly decoded. -unittest +@safe pure unittest { { enum data = "QUJD"; @@ -252,14 +254,14 @@ unittest } /// Test decoding invalid data -unittest +@safe pure unittest { - void testFail(const(char)[] input) + void testFail(const(char)[] input) @safe pure { bool thrown = false; try { ubyte[] decoded = input.decodeBase64; - } catch (Throwable t) { + } catch (Exception t) { thrown = true; } @@ -286,23 +288,24 @@ unittest /++ Encode a ubyte array as Base64, returning the encoded value. +/ -const(char)[] encodeBase64(char PlusChar = '+', char SlashChar = '/')(scope const(ubyte)[] buf) @safe pure +const(char)[] encodeBase64(scope const(ubyte)[] buf, char plusChar = '+', char slashChar = '/') @safe pure { import mir.appender : scopedBuffer; - // XXX: is a stringBuf more appropriate here? auto app = scopedBuffer!char; - encodeBase64!(PlusChar, SlashChar)(buf, app); + encodeBase64(buf, app, plusChar, slashChar); return app.data.dup; } /++ Encode a ubyte array as Base64, placing the result onto an Appender. +/ -void encodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope const(ubyte)[] input, - scope return ref Appender appender) @safe pure +void encodeBase64(Appender)(scope const(ubyte)[] input, + scope return ref Appender appender, + char plusChar = '+', + char slashChar = '/') @safe pure { - import mir.ndslice.topology : bytegroup, map; import core.bitop : bswap; + import mir.ndslice.topology : bytegroup, map; // Slice our input array so that n % 3 == 0 (we have a multiple of 3) // If we have less then 3, then this is effectively a no-op (will result in a 0-length slice) char[4] encodedByteGroup; @@ -313,10 +316,10 @@ void encodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con const(ubyte) c = (group >> 14) & 0x3f; const(ubyte) d = (group >> 8) & 0x3f; - encodedByteGroup[0] = a.lookup_encoding!(PlusChar, SlashChar); - encodedByteGroup[1] = b.lookup_encoding!(PlusChar, SlashChar); - encodedByteGroup[2] = c.lookup_encoding!(PlusChar, SlashChar); - encodedByteGroup[3] = d.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar); + encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar); + encodedByteGroup[2] = c.lookup_encoding(plusChar, slashChar); + encodedByteGroup[3] = d.lookup_encoding(plusChar, slashChar); appender.put(encodedByteGroup[]); } @@ -331,72 +334,77 @@ void encodeBase64(char PlusChar = '+', char SlashChar = '/', Appender)(scope con if (window.length == 1) { const(ubyte) a = (group >> 26) & 0x3f; const(ubyte) b = (group >> 20) & 0x3f; - encodedByteGroup[0] = a.lookup_encoding!(PlusChar, SlashChar); - encodedByteGroup[1] = b.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar); + encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar); encodedByteGroup[2] = '='; encodedByteGroup[3] = '='; - appender.put(encodedByteGroup[]); } else { - // Just in case math fails or something + // Just in case assert(window.length == 2); group |= (window[1] << 16); const(ubyte) a = (group >> 26) & 0x3f; const(ubyte) b = (group >> 20) & 0x3f; const(ubyte) c = (group >> 14) & 0x3f; - encodedByteGroup[0] = a.lookup_encoding!(PlusChar, SlashChar); - encodedByteGroup[1] = b.lookup_encoding!(PlusChar, SlashChar); - encodedByteGroup[2] = c.lookup_encoding!(PlusChar, SlashChar); + encodedByteGroup[0] = a.lookup_encoding(plusChar, slashChar); + encodedByteGroup[1] = b.lookup_encoding(plusChar, slashChar); + encodedByteGroup[2] = c.lookup_encoding(plusChar, slashChar); encodedByteGroup[3] = '='; - appender.put(encodedByteGroup[]); } + + appender.put(encodedByteGroup[]); } } /// Test encoding of data which has a length that can be cleanly /// encoded. -unittest +@safe pure unittest { // 3 bytes { - enum data = cast(ubyte[])"ABC"; + enum data = cast(immutable(ubyte)[])"ABC"; assert(data.encodeBase64 == "QUJD"); } // 6 bytes { - enum data = cast(ubyte[])"ABCDEF"; + enum data = cast(immutable(ubyte)[])"ABCDEF"; assert(data.encodeBase64 == "QUJDREVG"); } // 9 bytes { - enum data = cast(ubyte[])"ABCDEFGHI"; + enum data = cast(immutable(ubyte)[])"ABCDEFGHI"; assert(data.encodeBase64 == "QUJDREVGR0hJ"); } // 12 bytes { - enum data = cast(ubyte[])"ABCDEFGHIJKL"; + enum data = cast(immutable(ubyte)[])"ABCDEFGHIJKL"; assert(data.encodeBase64 == "QUJDREVGR0hJSktM"); } } /// Test encoding of data which has a length which CANNOT be cleanly encoded. /// This typically means that there's padding. -unittest +@safe pure unittest { // 1 byte { - enum data = cast(ubyte[])"A"; + enum data = cast(immutable(ubyte)[])"A"; assert(data.encodeBase64 == "QQ=="); } // 2 bytes { - enum data = cast(ubyte[])"AB"; + enum data = cast(immutable(ubyte)[])"AB"; assert(data.encodeBase64 == "QUI="); } + // 2 bytes + { + enum data = [0xFF, 0xFF]; + assert(data.encodeBase64 == "//8="); + } // 4 bytes { enum data = [0xDE, 0xAD, 0xBA, 0xBE]; @@ -404,22 +412,43 @@ unittest } // 37 bytes { - enum data = cast(ubyte[])"A Very Very Very Very Large Test Blob"; + enum data = cast(immutable(ubyte)[])"A Very Very Very Very Large Test Blob"; assert(data.encodeBase64 == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); } } +/// Test nogc encoding +@safe pure @nogc unittest +{ + import mir.appender : scopedBuffer; + + { + enum data = cast(immutable(ubyte)[])"A Very Very Very Very Large Test Blob"; + auto appender = scopedBuffer!char(); + data.encodeBase64(appender); + assert(appender.data == "QSBWZXJ5IFZlcnkgVmVyeSBWZXJ5IExhcmdlIFRlc3QgQmxvYg=="); + } + + { + enum data = cast(immutable(ubyte)[])"abc123!?$*&()'-=@~"; + auto appender = scopedBuffer!char(); + data.encodeBase64(appender); + assert(appender.data == "YWJjMTIzIT8kKiYoKSctPUB+"); + } +} + /// Make sure we can decode what we encode. -unittest +@safe pure unittest { // Test an example string { - enum data = cast(ubyte[])"abc123!?$*&()'-=@~"; + enum data = cast(immutable(ubyte)[])"abc123!?$*&()'-=@~"; assert(data.encodeBase64.decodeBase64 == data); } // Test an example from Ion data { - enum data = cast(ubyte[])"a b c d e f g h i j k l m n o p q r s t u v w x y z"; + enum data = cast(immutable(ubyte)[])"a b c d e f g h i j k l m n o p q r s t u v w x y z"; assert(data.encodeBase64.decodeBase64 == data); } -} \ No newline at end of file +} +