From fa61e526637ef9d4d9f826c65cc403dc5b605545 Mon Sep 17 00:00:00 2001 From: Pavel Bazika Date: Wed, 14 Jul 2021 14:30:17 +0200 Subject: [PATCH 1/5] ParseMediaType tolerates unencoded 8bit characters --- header.go | 132 ++++++++++++++++++++++++++++--------------------- header_test.go | 30 +++++++++++ 2 files changed, 106 insertions(+), 56 deletions(-) diff --git a/header.go b/header.go index f636881e..8d3a8dde 100644 --- a/header.go +++ b/header.go @@ -358,6 +358,7 @@ func consumeParam(s string) (consumed, rest string) { valueQuotedOriginally := false valueQuoteAdded := false valueQuoteNeeded := false + rfc2047Needed := false var r rune findValueStart: @@ -369,7 +370,8 @@ findValueStart: case '"': valueQuotedOriginally = true valueQuoteAdded = true - value.WriteRune(r) + valueQuoteNeeded = true + param.WriteRune(r) break findValueStart @@ -381,6 +383,10 @@ findValueStart: break findValueStart default: + if r > 127 { + rfc2047Needed = true + } + valueQuotedOriginally = false valueQuoteAdded = false value.WriteRune(r) @@ -389,6 +395,20 @@ findValueStart: } } + quoteIfUnquoted := func() { + if !valueQuoteNeeded { + if !valueQuoteAdded { + param.WriteByte('"') + + valueQuoteAdded = true + } + + valueQuoteNeeded = true + } + } + + hasRest := false + if len(s)-i < 1 { // parameter value starts at the end of the string, make empty // quoted string to play nice with mime.ParseMediaType @@ -397,81 +417,57 @@ findValueStart: } else { // The beginning of the value is not at the end of the string - quoteIfUnquoted := func() { - if !valueQuoteNeeded { - if !valueQuoteAdded { - param.WriteByte('"') - - valueQuoteAdded = true - } - - valueQuoteNeeded = true - } - } - for _, v := range []byte{'(', ')', '<', '>', '@', ',', ':', '/', '[', ']', '?', '='} { if s[0] == v { quoteIfUnquoted() + break } } s = s[i+1:] + escaped := false findValueEnd: - for len(s) > 0 { - switch s[0] { + for i, r = range s { + if escaped { + value.WriteRune(r) + escaped = false + continue + } + + switch r { case ';', ' ', '\t': if valueQuotedOriginally { // We're in a quoted string, so whitespace is allowed. - value.WriteByte(s[0]) - s = s[1:] + value.WriteRune(r) break } // Otherwise, we've reached the end of an unquoted value. - - param.WriteString(value.String()) - value.Reset() - - if valueQuoteNeeded { - param.WriteByte('"') - } - - param.WriteByte(s[0]) - s = s[1:] - + hasRest = true + s = s[i:] break findValueEnd case '"': if valueQuotedOriginally { // We're in a quoted value. This is the end of that value. - param.WriteString(value.String()) - value.Reset() - - param.WriteByte(s[0]) - s = s[1:] - + hasRest = true + s = s[i:] break findValueEnd } quoteIfUnquoted() value.WriteByte('\\') - value.WriteByte(s[0]) - s = s[1:] + value.WriteRune(r) case '\\': - if len(s) > 1 { - value.WriteByte(s[0]) - s = s[1:] - - // Backslash escapes the next char. Consume that next char. - value.WriteByte(s[0]) - + if i < len(s)-1 { + // If next char is present, escape it with backslash + value.WriteRune(r) + escaped = true quoteIfUnquoted() } - // Else there is no next char to consume. - s = s[1:] case '(', ')', '<', '>', '@', ',', ':', '/', '[', ']', '?', '=': quoteIfUnquoted() @@ -479,23 +475,47 @@ findValueStart: fallthrough default: - value.WriteByte(s[0]) - s = s[1:] + if r > 127 { + rfc2047Needed = true + } + value.WriteRune(r) } } } + if !hasRest { + // Whole string was processed + s = "" + } + if value.Len() > 0 { - // There is a value that ends with the string. Capture it. - param.WriteString(value.String()) - - if valueQuotedOriginally || valueQuoteNeeded { - // If valueQuotedOriginally is true and we got here, - // that means there was no closing quote. So we'll add one. - // Otherwise, we're here because it was an unquoted value - // with a special char in it, and we had to quote it. - param.WriteByte('"') + // Convert whole value to RFC2047 if it contains forbidden characters (ASCII > 127) + val := value.String() + if rfc2047Needed { + val = mime.BEncoding.Encode("UTF-8", val) + // RFC 2047 must be quoted + quoteIfUnquoted() + } + + // Write the value + param.WriteString(val) + } + + // Add final quote if required + if valueQuoteNeeded { + param.WriteByte('"') + } + + // Write last parsed char if any + if s != "" { + if s[0] != '"' { + // When last char is quote, valueQuotedOriginally is surely true and the quote was already written. + // Otherwise output the character (; for example) + param.WriteByte(s[0]) } + + // Focus the rest of the string + s = s[1:] } return param.String(), s diff --git a/header_test.go b/header_test.go index 6d232d9d..e7523b86 100644 --- a/header_test.go +++ b/header_test.go @@ -395,6 +395,36 @@ func TestFixUnquotedSpecials(t *testing.T) { input: `text/html;charset="`, want: `text/html;charset=""`, }, + { + // Check unquoted 8bit is encoded + input: `application/msword;name=管理.doc`, + want: `application/msword;name="=?UTF-8?b?566h77+977+955CGLmRvYw==?="`, + }, + { + // Check mix of ascii and unquoted 8bit is encoded + input: `application/msword;name=15管理.doc`, + want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9j?="`, + }, + { + // Check quoted 8bit is encoded + input: `application/msword;name="15管理.doc"`, + want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9j?="`, + }, + { + // Check quoted 8bit with missing closing quote is encoded + input: `application/msword;name="15管理.doc`, + want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9j?="`, + }, + { + // Trailing quote without starting quote is considered as part of param text for simplicity + input: `application/msword;name=15管理.doc"`, + want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9jXCI=?="`, + }, + { + // Invalid UTF-8 sequence does not cause any fatal error + input: "application/msword;name=\xe2\x28\xa1.doc", + want: `application/msword;name="=?UTF-8?b?77+9KO+/vS5kb2M=?="`, + }, } for _, tc := range testCases { t.Run(tc.input, func(t *testing.T) { From 22f227f129714b77fab0c0aeaa2f0a9780dc8415 Mon Sep 17 00:00:00 2001 From: Pavel Bazika Date: Thu, 15 Jul 2021 14:02:59 +0200 Subject: [PATCH 2/5] Improved rest of the string processing --- header.go | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/header.go b/header.go index 8d3a8dde..39c10e9a 100644 --- a/header.go +++ b/header.go @@ -407,8 +407,6 @@ findValueStart: } } - hasRest := false - if len(s)-i < 1 { // parameter value starts at the end of the string, make empty // quoted string to play nice with mime.ParseMediaType @@ -444,15 +442,13 @@ findValueStart: } // Otherwise, we've reached the end of an unquoted value. - hasRest = true - s = s[i:] + rest = s[i:] break findValueEnd case '"': if valueQuotedOriginally { // We're in a quoted value. This is the end of that value. - hasRest = true - s = s[i:] + rest = s[i:] break findValueEnd } @@ -483,11 +479,6 @@ findValueStart: } } - if !hasRest { - // Whole string was processed - s = "" - } - if value.Len() > 0 { // Convert whole value to RFC2047 if it contains forbidden characters (ASCII > 127) val := value.String() @@ -507,18 +498,18 @@ findValueStart: } // Write last parsed char if any - if s != "" { - if s[0] != '"' { + if rest != "" { + if rest[0] != '"' { // When last char is quote, valueQuotedOriginally is surely true and the quote was already written. // Otherwise output the character (; for example) - param.WriteByte(s[0]) + param.WriteByte(rest[0]) } // Focus the rest of the string - s = s[1:] + rest = rest[1:] } - return param.String(), s + return param.String(), rest } // fixUnquotedSpecials as defined in RFC 2045, section 5.1: From 9d57ade7cb24ceb29cf598da673f7f585c6d2087 Mon Sep 17 00:00:00 2001 From: Pavel Bazika Date: Thu, 15 Jul 2021 14:04:02 +0200 Subject: [PATCH 3/5] Constant used --- header.go | 2 +- header_test.go | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/header.go b/header.go index 39c10e9a..7ea35418 100644 --- a/header.go +++ b/header.go @@ -483,7 +483,7 @@ findValueStart: // Convert whole value to RFC2047 if it contains forbidden characters (ASCII > 127) val := value.String() if rfc2047Needed { - val = mime.BEncoding.Encode("UTF-8", val) + val = mime.BEncoding.Encode(utf8, val) // RFC 2047 must be quoted quoteIfUnquoted() } diff --git a/header_test.go b/header_test.go index e7523b86..0e83f7ae 100644 --- a/header_test.go +++ b/header_test.go @@ -403,27 +403,27 @@ func TestFixUnquotedSpecials(t *testing.T) { { // Check mix of ascii and unquoted 8bit is encoded input: `application/msword;name=15管理.doc`, - want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9j?="`, + want: `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9j?="`, }, { // Check quoted 8bit is encoded input: `application/msword;name="15管理.doc"`, - want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9j?="`, + want: `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9j?="`, }, { // Check quoted 8bit with missing closing quote is encoded input: `application/msword;name="15管理.doc`, - want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9j?="`, + want: `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9j?="`, }, { // Trailing quote without starting quote is considered as part of param text for simplicity input: `application/msword;name=15管理.doc"`, - want: `application/msword;name="=?UTF-8?b?MTXnrqHnkIYuZG9jXCI=?="`, + want: `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9jXCI=?="`, }, { // Invalid UTF-8 sequence does not cause any fatal error input: "application/msword;name=\xe2\x28\xa1.doc", - want: `application/msword;name="=?UTF-8?b?77+9KO+/vS5kb2M=?="`, + want: `application/msword;name="=?utf-8?b?77+9KO+/vS5kb2M=?="`, }, } for _, tc := range testCases { From ef3d5068fa960f6544ae672180fae9561594e6c1 Mon Sep 17 00:00:00 2001 From: Pavel Bazika Date: Thu, 15 Jul 2021 14:05:21 +0200 Subject: [PATCH 4/5] Fixed consumeParam for strings starting with 8bit character --- header.go | 4 +++- header_test.go | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/header.go b/header.go index 7ea35418..b83301e1 100644 --- a/header.go +++ b/header.go @@ -7,6 +7,7 @@ import ( "mime" "net/textproto" "strings" + _utf8 "unicode/utf8" "github.com/jhillyerd/enmime/internal/coding" "github.com/jhillyerd/enmime/internal/stringutil" @@ -422,7 +423,8 @@ findValueStart: } } - s = s[i+1:] + _, runeLength := _utf8.DecodeRuneInString(s[i:]) + s = s[i+runeLength:] escaped := false findValueEnd: diff --git a/header_test.go b/header_test.go index 0e83f7ae..3973dbc7 100644 --- a/header_test.go +++ b/header_test.go @@ -398,7 +398,7 @@ func TestFixUnquotedSpecials(t *testing.T) { { // Check unquoted 8bit is encoded input: `application/msword;name=管理.doc`, - want: `application/msword;name="=?UTF-8?b?566h77+977+955CGLmRvYw==?="`, + want: `application/msword;name="=?utf-8?b?566h55CGLmRvYw==?="`, }, { // Check mix of ascii and unquoted 8bit is encoded From dcee36d72142608e8601e8cd49cba4947acac467 Mon Sep 17 00:00:00 2001 From: Pavel Bazika Date: Thu, 15 Jul 2021 14:06:01 +0200 Subject: [PATCH 5/5] Added ReadEnvelope test with unencoded 8bit attachment filename --- envelope_test.go | 18 ++++++++++++++++ testdata/mail/mime-bad-8bit-filename.raw | 26 ++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 testdata/mail/mime-bad-8bit-filename.raw diff --git a/envelope_test.go b/envelope_test.go index 48d873c5..508b4ac8 100644 --- a/envelope_test.go +++ b/envelope_test.go @@ -938,6 +938,24 @@ func TestBadContentTransferEncodingInMime(t *testing.T) { } } +func TestBadMime8bitFilename(t *testing.T) { + msg := test.OpenTestData("mail", "mime-bad-8bit-filename.raw") + e, err := enmime.ReadEnvelope(msg) + + if err != nil { + t.Fatal("Failed to parse MIME:", err) + } + if strings.TrimSpace(e.Text) != "Text part" { + t.Fatal("Text part not parsed correctly") + } + if len(e.Attachments) != 1 { + t.Fatal("Wrong number of attachments") + } + if e.Attachments[0].FileName != "管理.doc" { + t.Fatal("Wrong attachment name") + } +} + func TestBlankMediaName(t *testing.T) { msg := test.OpenTestData("mail", "mime-blank-media-name.raw") e, err := enmime.ReadEnvelope(msg) diff --git a/testdata/mail/mime-bad-8bit-filename.raw b/testdata/mail/mime-bad-8bit-filename.raw new file mode 100644 index 00000000..d2bbeff1 --- /dev/null +++ b/testdata/mail/mime-bad-8bit-filename.raw @@ -0,0 +1,26 @@ +Date: Wed, 22 Feb 2021 13:29:24 +0800 +From: "Pavel Bazika" +To: , +Subject: Malformed test +Mime-Version: 1.0 +Content-Type: multipart/mixed; + boundary="=====003_Dragon323481247347_=====" + +This is a multi-part message in MIME format. + +--=====003_Dragon323481247347_===== +Content-Type: text/plain; + charset=us-ascii + +Text part + +--=====003_Dragon323481247347_===== +Content-Type: application/msword; + name=管理.doc +Content-Transfer-Encoding: base64 +Content-Disposition: attachment; + filename=管理.doc + +PGh0bWw+Cg== + +--=====003_Dragon323481247347_=====--