When FormatOptions.AllowMixedHeaderCharsets is disabled, always use the user-specified charset

Previously, headers could still be encoded as us-ascii and/or iso-8859-1 whenever the entire header fit within one of those charsets. Fixes issue #493.
jstedfast · Jul 6, 2019 · 3ad8de7 · 3ad8de7
1 parent b50fc8e
commit 3ad8de7
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 39 deletions.
diff --git a/MimeKit/FormatOptions.cs b/MimeKit/FormatOptions.cs
@@ -223,8 +223,8 @@ internal IMimeFilter CreateNewLineFilter (bool ensureNewLine = false)
 		/// Gets or sets whether the formatter should allow mixed charsets in the headers.
 		/// </summary>
 		/// <remarks>
-		/// <para>When this option is enabled, the MIME formatter will try to use US-ASCII and/or
-		/// ISO-8859-1 to encode headers when appropriate rather than being forced to use the
+		/// <para>When this option is enabled, the MIME formatter will try to use us-ascii and/or
+		/// iso-8859-1 to encode headers when appropriate rather than being forced to use the
 		/// specified charset for all encoded-word tokens in order to maximize readability.</para>
 		/// <para>Unfortunately, mail clients like Outlook and Thunderbird do not treat
 		/// encoded-word tokens individually and assume that all tokens are encoded using the
@@ -234,7 +234,7 @@ internal IMimeFilter CreateNewLineFilter (bool ensureNewLine = false)
 		/// <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=317263">
 		/// https://bugzilla.mozilla.org/show_bug.cgi?id=317263</a>.</para>
 		/// </remarks>
-		/// <value><c>true</c> if the formatter should be allowed to use ISO-8859-1 when encoding headers; otherwise, <c>false</c>.</value>
+		/// <value><c>true</c> if the formatter should be allowed to use us-ascii and/or iso-8859-1 when encoding headers; otherwise, <c>false</c>.</value>
 		public bool AllowMixedHeaderCharsets {
 			get { return allowMixedHeaderCharsets; }
 			set {

diff --git a/MimeKit/Utils/Rfc2047.cs b/MimeKit/Utils/Rfc2047.cs
@@ -1045,11 +1045,17 @@ enum WordType {
 			EncodedWord
 		}
 
+		enum WordEncoding {
+			Ascii,
+			Latin1,
+			UserSpecified
+		}
+
 		class Word {
 			public WordType Type;
 			public int StartIndex;
 			public int CharCount;
-			public int Encoding;  // 0 => ASCII, 1 => iso-8859-1, 2 => custom
+			public WordEncoding Encoding;
 			public int ByteCount;
 			public int EncodeCount;
 			public int QuotedPairs;
@@ -1111,10 +1117,10 @@ static bool ExceedsMaxLineLength (FormatOptions options, Encoding charset, Word
 			switch (word.Type) {
 			case WordType.EncodedWord:
 				switch (word.Encoding) {
-				case 1:
+				case WordEncoding.Latin1:
 					length = EstimateEncodedWordLength ("iso-8859-1", word.ByteCount, word.EncodeCount);
 					break;
-				case 0:
+				case WordEncoding.Ascii:
 					length = EstimateEncodedWordLength ("us-ascii", word.ByteCount, word.EncodeCount);
 					break;
 				default:
@@ -1159,7 +1165,7 @@ static IList<Word> GetRfc822Words (FormatOptions options, Encoding charset, stri
 
 					if (c < 127) {
 						if (IsCtrl (c)) {
-							word.Encoding = Math.Max (word.Encoding, 1);
+							word.Encoding = (WordEncoding) Math.Max ((int) word.Encoding, (int) WordEncoding.Latin1);
 							word.Type = WordType.EncodedWord;
 							word.EncodeCount++;
 						} else if (phrase && !IsAtom (c)) {
@@ -1176,7 +1182,7 @@ static IList<Word> GetRfc822Words (FormatOptions options, Encoding charset, stri
 						nchars = 1;
 					} else if (c < 256) {
 						// iso-8859-1
-						word.Encoding = Math.Max (word.Encoding, 1);
+						word.Encoding = (WordEncoding) Math.Max ((int) word.Encoding, (int) WordEncoding.Latin1);
 						word.Type = WordType.EncodedWord;
 						word.EncodeCount++;
 						word.ByteCount++;
@@ -1198,11 +1204,11 @@ static IList<Word> GetRfc822Words (FormatOptions options, Encoding charset, stri
 							n = 3;
 						}
 
+						word.Encoding = WordEncoding.UserSpecified;
 						word.Type = WordType.EncodedWord;
 						word.CharCount += nchars;
 						word.EncodeCount += n;
 						word.ByteCount += n;
-						word.Encoding = 2;
 					}
 
 					if (ExceedsMaxLineLength (options, charset, word)) {
@@ -1288,11 +1294,11 @@ static bool ShouldMergeWords (FormatOptions options, Encoding charset, IList<Wor
 				if (next.Type == WordType.QuotedString)
 					return false;
 
-				switch (Math.Max (word.Encoding, next.Encoding)) {
-				case 1:
+				switch ((WordEncoding) Math.Max ((int) word.Encoding, (int) next.Encoding)) {
+				case WordEncoding.Latin1:
 					length = EstimateEncodedWordLength ("iso-8859-1", length, encoded);
 					break;
-				case 0:
+				case WordEncoding.Ascii:
 					length = EstimateEncodedWordLength ("us-ascii", length, encoded);
 					break;
 				default:
@@ -1311,8 +1317,9 @@ static IList<Word> Merge (FormatOptions options, Encoding charset, IList<Word> w
 			if (words.Count < 2)
 				return words;
 
-			int lwspCount, encoding, encoded, quoted, byteCount, length;
+			int lwspCount, encoded, quoted, byteCount, length;
 			var merged = new List<Word> ();
+			WordEncoding encoding;
 			Word word, next;
 
 			word = words[0];
@@ -1323,18 +1330,18 @@ static IList<Word> Merge (FormatOptions options, Encoding charset, IList<Word> w
 				next = words[i];
 
 				if (word.Type != WordType.Atom && word.Type == next.Type) {
+					encoding = (WordEncoding) Math.Max ((int) word.Encoding, (int) next.Encoding);
 					lwspCount = next.StartIndex - (word.StartIndex + word.CharCount);
 					byteCount = word.ByteCount + lwspCount + next.ByteCount;
-					encoding = Math.Max (word.Encoding, next.Encoding);
 					encoded = word.EncodeCount + next.EncodeCount;
 					quoted = word.QuotedPairs + next.QuotedPairs;
 
 					if (word.Type == WordType.EncodedWord) {
 						switch (encoding) {
-						case 1:
+						case WordEncoding.Latin1:
 							length = EstimateEncodedWordLength ("iso-8859-1", byteCount, encoded);
 							break;
-						case 0:
+						case WordEncoding.Ascii:
 							length = EstimateEncodedWordLength ("us-ascii", byteCount, encoded);
 							break;
 						default:
@@ -1376,7 +1383,7 @@ static IList<Word> Merge (FormatOptions options, Encoding charset, IList<Word> w
 					word.Type = (WordType) Math.Max ((int) word.Type, (int) next.Type);
 					word.CharCount = (next.StartIndex + next.CharCount) - word.StartIndex;
 					word.ByteCount = word.ByteCount + lwspCount + next.ByteCount;
-					word.Encoding = Math.Max (word.Encoding, next.Encoding);
+					word.Encoding = (WordEncoding) Math.Max ((int) word.Encoding, (int) next.Encoding);
 					word.EncodeCount = word.EncodeCount + next.EncodeCount;
 					word.QuotedPairs = word.QuotedPairs + next.QuotedPairs;
 				} else {
@@ -1398,23 +1405,9 @@ static byte[] Encode (FormatOptions options, Encoding charset, string text, bool
 			byte[] encoded;
 
 			if (!options.AllowMixedHeaderCharsets) {
-				int maxEncoding = 0;
-
 				for (int i = 0; i < words.Count; i++) {
-					if (words[i].Type != WordType.EncodedWord || words[i].Encoding == maxEncoding)
-						continue;
-
-					if (words[i].Encoding > maxEncoding) {
-						maxEncoding = words[i].Encoding;
-						for (int j = 0; j < i; j++) {
-							if (words[j].Type != WordType.EncodedWord)
-								continue;
-
-							words[j].Encoding = maxEncoding;
-						}
-					} else {
-						words[i].Encoding = maxEncoding;
-					}
+					if (words[i].Type == WordType.EncodedWord)
+						words[i].Encoding = WordEncoding.UserSpecified;
 				}
 			}
 
@@ -1447,10 +1440,10 @@ static byte[] Encode (FormatOptions options, Encoding charset, string text, bool
 					}
 
 					switch (word.Encoding) {
-					case 0: // us-ascii
+					case WordEncoding.Ascii:
 						AppendEncodedWord (str, Encoding.ASCII, text, start, length, mode);
 						break;
-					case 1: // iso-8859-1
+					case WordEncoding.Latin1:
 						AppendEncodedWord (str, CharsetUtils.Latin1, text, start, length, mode);
 						break;
 					default: // custom charset

diff --git a/UnitTests/InternetAddressListTests.cs b/UnitTests/InternetAddressListTests.cs
@@ -430,7 +430,8 @@ public void TestMailboxWithDotsInTheName ()
 		[Test]
 		public void TestMailboxWith8bitName ()
 		{
-			const string encoded = "Patrik =?iso-8859-1?b?RqVkbHRzdHKldm0=?= <paf@nada.kth.se>";
+			//const string encoded = "Patrik =?iso-8859-1?b?RqVkbHRzdHKldm0=?= <paf@nada.kth.se>";
+			const string encoded = "Patrik =?utf-8?b?RsKlZGx0c3RywqV2bQ==?= <paf@nada.kth.se>";
 			const string text = "Patrik F¥dltstr¥vm <paf@nada.kth.se>";
 			var expected = new InternetAddressList ();
 
@@ -504,10 +505,14 @@ public void TestEncodingMailboxWithReallyLongWord ()
 			const string expected = "=?us-ascii?q?reeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaallllllllllll?=\n =?us-ascii?q?llllllllllllllllllllllllllllllllllllllllllly?= long word\n\t<really.long.word@example.com>";
 			const string name = "reeeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaallllllllllllllllllllllllllllllllllllllllllllllllllllllly long word";
 			var mailbox = new MailboxAddress (name, "really.long.word@example.com");
+			var options = FormatOptions.Default.Clone ();
 			var list = new InternetAddressList ();
 			list.Add (mailbox);
 
-			var actual = list.ToString (UnixFormatOptions, true);
+			options.NewLineFormat = NewLineFormat.Unix;
+			options.AllowMixedHeaderCharsets = true;
+
+			var actual = list.ToString (options, true);
 
 			Assert.AreEqual (expected, actual, "Encoding really long mailbox did not match expected result: {0}", expected);
 			Assert.IsTrue (InternetAddressList.TryParse (actual, out list), "Failed to parse really long mailbox");
@@ -569,17 +574,19 @@ public void TestEncodingSimpleAddressList ()
 		public void TestEncodingLongNameMixedQuotingAndEncoding ()
 		{
 			const string name = "Dr. xxxxxxxxxx xxxxx | xxxxxx.xxxxxxx für xxxxxxxxxxxxx xxxx";
-			const string encodedName = "\"Dr. xxxxxxxxxx xxxxx | xxxxxx.xxxxxxx\" =?iso-8859-1?b?Zvxy?= xxxxxxxxxxxxx xxxx";
+			const string encodedNameLatin1 = "\"Dr. xxxxxxxxxx xxxxx | xxxxxx.xxxxxxx\" =?iso-8859-1?b?Zvxy?= xxxxxxxxxxxxx xxxx";
+			const string encodedNameUnicode = "\"Dr. xxxxxxxxxx xxxxx | xxxxxx.xxxxxxx\" =?utf-8?b?ZsO8cg==?= xxxxxxxxxxxxx xxxx";
 			const string encodedMailbox = "\"Dr. xxxxxxxxxx xxxxx | xxxxxx.xxxxxxx\" =?iso-8859-1?b?Zvxy?= xxxxxxxxxxxxx\n xxxx <x.xxxxx@xxxxxxx-xxxxxx.xx>";
 			const string address = "x.xxxxx@xxxxxxx-xxxxxx.xx";
 			var options = FormatOptions.Default.Clone ();
 
 			options.NewLineFormat = NewLineFormat.Unix;
+			options.AllowMixedHeaderCharsets = true;
 
 			var buffer = Rfc2047.EncodePhrase (options, Encoding.UTF8, name);
 			var result = Encoding.UTF8.GetString (buffer);
 
-			Assert.AreEqual (encodedName, result);
+			Assert.AreEqual (encodedNameLatin1, result);
 
 			var mailbox = new MailboxAddress (name, address);
 			var list = new InternetAddressList ();
@@ -589,6 +596,15 @@ public void TestEncodingLongNameMixedQuotingAndEncoding ()
 			result = list.ToString (options, true);
 
 			Assert.AreEqual (encodedMailbox, result);
+
+			// Now disable smart encoding
+
+			options.AllowMixedHeaderCharsets = false;
+
+			buffer = Rfc2047.EncodePhrase (options, Encoding.UTF8, name);
+			result = Encoding.UTF8.GetString (buffer);
+
+			Assert.AreEqual (encodedNameUnicode, result);
 		}
 
 		[Test]