ParseMediaType tolerates unencoded 8bit characters (#201)

* ParseMediaType tolerates unencoded 8bit characters
* Added ReadEnvelope test with unencoded 8bit attachment filename

Co-authored-by: Pavel Bazika <pavel.bazika@icewarp.com>
jhillyerd · Jul 15, 2021 · 0c598a7 · 0c598a7
1 parent 6817b15
commit 0c598a7
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 58 deletions.
diff --git a/envelope_test.go b/envelope_test.go
@@ -938,6 +938,24 @@ func TestBadContentTransferEncodingInMime(t *testing.T) {
 	}
 }
 
+func TestBadMime8bitFilename(t *testing.T) {
+	msg := test.OpenTestData("mail", "mime-bad-8bit-filename.raw")
+	e, err := enmime.ReadEnvelope(msg)
+	// Regression check: raw (unencoded) 8bit bytes in name=/filename= parameters must not break parsing.
+	if err != nil {
+		t.Fatal("Failed to parse MIME:", err)
+	}
+	if strings.TrimSpace(e.Text) != "Text part" { // body of the fixture's text/plain part
+		t.Fatal("Text part not parsed correctly")
+	}
+	if len(e.Attachments) != 1 { // the fixture carries a single application/msword part
+		t.Fatal("Wrong number of attachments")
+	}
+	if e.Attachments[0].FileName != "管理.doc" { // name must come back as the original UTF-8 text
+		t.Fatal("Wrong attachment name")
+	}
+}
+
 func TestBlankMediaName(t *testing.T) {
 	msg := test.OpenTestData("mail", "mime-blank-media-name.raw")
 	e, err := enmime.ReadEnvelope(msg)

diff --git a/header.go b/header.go
@@ -7,6 +7,7 @@ import (
 	"mime"
 	"net/textproto"
 	"strings"
+	_utf8 "unicode/utf8"
 
 	"github.com/jhillyerd/enmime/internal/coding"
 	"github.com/jhillyerd/enmime/internal/stringutil"
@@ -358,6 +359,7 @@ func consumeParam(s string) (consumed, rest string) {
 	valueQuotedOriginally := false
 	valueQuoteAdded := false
 	valueQuoteNeeded := false
+	rfc2047Needed := false
 
 	var r rune
 findValueStart:
@@ -369,7 +371,8 @@ findValueStart:
 		case '"':
 			valueQuotedOriginally = true
 			valueQuoteAdded = true
-			value.WriteRune(r)
+			valueQuoteNeeded = true
+			param.WriteRune(r)
 
 			break findValueStart
 
@@ -381,6 +384,10 @@ findValueStart:
 			break findValueStart
 
 		default:
+			if r > 127 {
+				rfc2047Needed = true
+			}
+
 			valueQuotedOriginally = false
 			valueQuoteAdded = false
 			value.WriteRune(r)
@@ -389,6 +396,18 @@ findValueStart:
 		}
 	}
 
+	quoteIfUnquoted := func() {
+		if !valueQuoteNeeded {
+			if !valueQuoteAdded {
+				param.WriteByte('"')
+
+				valueQuoteAdded = true
+			}
+
+			valueQuoteNeeded = true
+		}
+	}
+
 	if len(s)-i < 1 {
 		// parameter value starts at the end of the string, make empty
 		// quoted string to play nice with mime.ParseMediaType
@@ -397,108 +416,102 @@ findValueStart:
 	} else {
 		// The beginning of the value is not at the end of the string
 
-		quoteIfUnquoted := func() {
-			if !valueQuoteNeeded {
-				if !valueQuoteAdded {
-					param.WriteByte('"')
-
-					valueQuoteAdded = true
-				}
-
-				valueQuoteNeeded = true
-			}
-		}
-
 		for _, v := range []byte{'(', ')', '<', '>', '@', ',', ':', '/', '[', ']', '?', '='} {
 			if s[0] == v {
 				quoteIfUnquoted()
+				break
 			}
 		}
 
-		s = s[i+1:]
+		_, runeLength := _utf8.DecodeRuneInString(s[i:])
+		s = s[i+runeLength:]
+		escaped := false
 
 	findValueEnd:
-		for len(s) > 0 {
-			switch s[0] {
+		for i, r = range s {
+			if escaped {
+				value.WriteRune(r)
+				escaped = false
+				continue
+			}
+
+			switch r {
 			case ';', ' ', '\t':
 				if valueQuotedOriginally {
 					// We're in a quoted string, so whitespace is allowed.
-					value.WriteByte(s[0])
-					s = s[1:]
+					value.WriteRune(r)
 					break
 				}
 
 				// Otherwise, we've reached the end of an unquoted value.
-
-				param.WriteString(value.String())
-				value.Reset()
-
-				if valueQuoteNeeded {
-					param.WriteByte('"')
-				}
-
-				param.WriteByte(s[0])
-				s = s[1:]
-
+				rest = s[i:]
 				break findValueEnd
 
 			case '"':
 				if valueQuotedOriginally {
 					// We're in a quoted value. This is the end of that value.
-					param.WriteString(value.String())
-					value.Reset()
-
-					param.WriteByte(s[0])
-					s = s[1:]
-
+					rest = s[i:]
 					break findValueEnd
 				}
 
 				quoteIfUnquoted()
 
 				value.WriteByte('\\')
-				value.WriteByte(s[0])
-				s = s[1:]
+				value.WriteRune(r)
 
 			case '\\':
-				if len(s) > 1 {
-					value.WriteByte(s[0])
-					s = s[1:]
-
-					// Backslash escapes the next char. Consume that next char.
-					value.WriteByte(s[0])
-
+				if i < len(s)-1 {
+					// If next char is present, escape it with backslash
+					value.WriteRune(r)
+					escaped = true
 					quoteIfUnquoted()
 				}
-				// Else there is no next char to consume.
-				s = s[1:]
 
 			case '(', ')', '<', '>', '@', ',', ':', '/', '[', ']', '?', '=':
 				quoteIfUnquoted()
 
 				fallthrough
 
 			default:
-				value.WriteByte(s[0])
-				s = s[1:]
+				if r > 127 {
+					rfc2047Needed = true
+				}
+				value.WriteRune(r)
 			}
 		}
 	}
 
 	if value.Len() > 0 {
-		// There is a value that ends with the string. Capture it.
-		param.WriteString(value.String())
-
-		if valueQuotedOriginally || valueQuoteNeeded {
-			// If valueQuotedOriginally is true and we got here,
-			// that means there was no closing quote. So we'll add one.
-			// Otherwise, we're here because it was an unquoted value
-			// with a special char in it, and we had to quote it.
-			param.WriteByte('"')
+		// Convert the whole value to an RFC 2047 encoded-word if it contains non-ASCII characters (runes > 127)
+		val := value.String()
+		if rfc2047Needed {
+			val = mime.BEncoding.Encode(utf8, val)
+			// RFC 2047 must be quoted
+			quoteIfUnquoted()
 		}
+
+		// Write the value
+		param.WriteString(val)
+	}
+
+	// Add final quote if required
+	if valueQuoteNeeded {
+		param.WriteByte('"')
+	}
+
+	// Write last parsed char if any
+	if rest != "" {
+		if rest[0] != '"' {
+			// When last char is quote, valueQuotedOriginally is surely true and the quote was already written.
+			// Otherwise output the character (; for example)
+			param.WriteByte(rest[0])
+		}
+
+		// Focus the rest of the string
+		rest = rest[1:]
 	}
 
-	return param.String(), s
+	return param.String(), rest
 }
 
 // fixUnquotedSpecials as defined in RFC 2045, section 5.1:

diff --git a/header_test.go b/header_test.go
@@ -395,6 +395,36 @@ func TestFixUnquotedSpecials(t *testing.T) {
 			input: `text/html;charset="`,
 			want:  `text/html;charset=""`,
 		},
+		{
+			// Check unquoted 8bit is encoded
+			input: `application/msword;name=管理.doc`,
+			want:  `application/msword;name="=?utf-8?b?566h55CGLmRvYw==?="`,
+		},
+		{
+			// Check mix of ascii and unquoted 8bit is encoded
+			input: `application/msword;name=15管理.doc`,
+			want:  `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9j?="`,
+		},
+		{
+			// Check quoted 8bit is encoded
+			input: `application/msword;name="15管理.doc"`,
+			want:  `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9j?="`,
+		},
+		{
+			// Check quoted 8bit with missing closing quote is encoded
+			input: `application/msword;name="15管理.doc`,
+			want:  `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9j?="`,
+		},
+		{
+			// Trailing quote without starting quote is considered as part of param text for simplicity
+			input: `application/msword;name=15管理.doc"`,
+			want:  `application/msword;name="=?utf-8?b?MTXnrqHnkIYuZG9jXCI=?="`,
+		},
+		{
+			// Invalid UTF-8 sequence does not cause any fatal error
+			input: "application/msword;name=\xe2\x28\xa1.doc",
+			want:  `application/msword;name="=?utf-8?b?77+9KO+/vS5kb2M=?="`,
+		},
 	}
 	for _, tc := range testCases {
 		t.Run(tc.input, func(t *testing.T) {

diff --git a/testdata/mail/mime-bad-8bit-filename.raw b/testdata/mail/mime-bad-8bit-filename.raw
@@ -0,0 +1,26 @@
+Date: Wed, 22 Feb 2021 13:29:24 +0800
+From: "Pavel Bazika" <pavel.bazika@some.domain>
+To: <test@dome.domain>,
+Subject: Malformed test
+Mime-Version: 1.0
+Content-Type: multipart/mixed;
+	boundary="=====003_Dragon323481247347_====="
+
+This is a multi-part message in MIME format.
+
+--=====003_Dragon323481247347_=====
+Content-Type: text/plain;
+	charset=us-ascii
+
+Text part
+
+--=====003_Dragon323481247347_=====
+Content-Type: application/msword;
+	name=管理.doc
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment;
+	filename=管理.doc
+
+PGh0bWw+Cg==
+
+--=====003_Dragon323481247347_=====--