Skip to content
This repository has been archived by the owner on Oct 16, 2020. It is now read-only.

Commit

Permalink
fix MimeTypeDetection again
Browse files Browse the repository at this point in the history
  • Loading branch information
siegfriedpammer committed Apr 16, 2012
1 parent 2b81915 commit abce2ee
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 61 deletions.
157 changes: 107 additions & 50 deletions src/Main/Base/Project/Src/Services/MimeTypeDetection.cs
Expand Up @@ -3,22 +3,19 @@

using System;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Xml;

namespace ICSharpCode.SharpDevelop
{
public static class MimeTypeDetection
{
const int BUFFER_SIZE = 4 * 1024;

// Known BOMs
public static readonly byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF };
public static readonly byte[] UTF16BE = new byte[] { 0xFE, 0xFF };
public static readonly byte[] UTF16LE = new byte[] { 0xFF, 0xFE };
public static readonly byte[] UTF32BE = new byte[] { 0x00, 0x00, 0xFE, 0xFF };
public static readonly byte[] UTF32LE = new byte[] { 0xFF, 0xFE, 0x00, 0x00 };
public const string Binary = "application/octet-stream";
public const string Text = "text/plain";
public const string Xml = "text/xml";

[DllImport("urlmon.dll", CharSet = CharSet.Unicode, ExactSpelling = true, SetLastError = false)]
static extern unsafe int FindMimeFromData(
Expand All @@ -31,47 +28,115 @@ public static class MimeTypeDetection
out IntPtr ppwzMimeOut,
int dwReserved);


static byte[] DetectAndRemoveBOM(byte[] buffer, out int len)
/// <summary>
/// Determines a MIME type for the content of <paramref name="stream"/>.
/// A stream shorter than two bytes is reported as Text. Otherwise the first two bytes
/// are checked for a UTF-16/UTF-32 LE or UTF-8 byte order mark; without a BOM the content
/// is validated as UTF-8 by IsUTF8. Content that can be decoded as text is then probed
/// with an XmlTextReader to tell Xml from Text. Content that is not valid UTF-8 is
/// handed to the byte-array overload of FindMimeType.
/// </summary>
/// <param name="stream">
/// Stream to inspect. Length and Position are used, so it has to be seekable.
/// NOTE(review): there is no null check and Position is not reset before the first
/// ReadByte, so reading starts wherever the caller left the stream - confirm callers
/// always pass a stream positioned at 0.
/// </param>
/// <returns>One of the Binary, Text or Xml constants, or whatever the byte-array overload returns.</returns>
public static string FindMimeType(Stream stream)
{
// NOTE(review): the lines from here down to "return buffer;" use `buffer` and `len`,
// which are not declared in this method; they match the parameter list of the
// DetectAndRemoveBOM header directly above and appear to be removed lines that the
// diff view shows interleaved with the new method body.
len = UTF8.Length;
if (buffer.StartsWith(UTF8))
return buffer.Skip(UTF8.Length).ToArray();
len = UTF32BE.Length;
if (buffer.StartsWith(UTF32BE))
return buffer.Skip(UTF32BE.Length).ToArray();
len = UTF32LE.Length;
if (buffer.StartsWith(UTF32LE))
return buffer.Skip(UTF32LE.Length).ToArray();
len = UTF16LE.Length;
if (buffer.StartsWith(UTF16LE))
return buffer.Skip(UTF16LE.Length).ToArray();
len = UTF16BE.Length;
if (buffer.StartsWith(UTF16BE))
return buffer.Skip(UTF16BE.Length).ToArray();
len = 0;
return buffer;
StreamReader reader;
if (stream.Length >= 2) {
// Combine the first two bytes into one value so the BOM prefixes can be matched in a switch.
int firstByte = stream.ReadByte();
int secondByte = stream.ReadByte();
switch ((firstByte << 8) | secondByte) {
case 0xfffe: // UTF-16 LE BOM / UTF-32 LE BOM
case 0xfeff: // UTF-16 BE BOM
// Rewind over the two bytes just read so the StreamReader sees the BOM
// and can pick the encoding from it.
stream.Position -= 2;
reader = new StreamReader(stream, detectEncodingFromByteOrderMarks: true);
break;
case 0xefbb: // start of UTF-8 BOM
if (stream.ReadByte() == 0xbf) {
// All three BOM bytes are consumed; the reader starts right after them.
reader = new StreamReader(stream, Encoding.UTF8);
break;
} else {
// 0xEF 0xBB not followed by 0xBF: not a UTF-8 BOM, reported as binary.
return Binary;
}
default:
// No BOM recognised: check whether the bytes form valid UTF-8 (pure ASCII passes too).
if (IsUTF8(stream, (byte)firstByte, (byte)secondByte)) {
// IsUTF8 read from the stream; start over from the beginning for decoding.
stream.Position = 0;
reader = new StreamReader(stream, Encoding.UTF8);
break;
} else {
// NOTE(review): unlike the branch above, Position is not reset here, so the
// Read below starts where IsUTF8 stopped (just after the offending byte) and the
// buffer passed on does not contain the beginning of the stream. This looks
// unintended - a `stream.Position = 0;` before the Read is probably missing.
byte[] buffer = new byte[BUFFER_SIZE];
int length = stream.Read(buffer, 0, BUFFER_SIZE);
return FindMimeType(buffer, 0, length);
}
}
} else {
// Fewer than two bytes: treated as plain text.
return Text;
}
// Now we got a StreamReader with the correct encoding
// Check for XML now
try {
XmlTextReader xmlReader = new XmlTextReader(reader);
// No resolver is assigned, so the reader is not given a way to load external resources.
xmlReader.XmlResolver = null;
// If the reader can advance to the first content node the stream is treated as XML;
// an XmlException is taken to mean plain text.
xmlReader.MoveToContent();
return Xml;
} catch (XmlException) {
return Text;
}
}

static bool StartsWith(this byte[] buffer, byte[] start)
/// <summary>
/// Checks whether the content is plausibly UTF-8 by running a small state machine over
/// at most 500000 bytes. The first two bytes are supplied by the caller (it has already
/// read them); all further bytes are read from <paramref name="fs"/> with ReadByte.
/// Only the lead-byte range and the number of continuation bytes are checked.
/// </summary>
/// <param name="fs">Stream positioned after the first two bytes; its Length bounds the scan.</param>
/// <param name="firstByte">Byte at index 0 of the content.</param>
/// <param name="secondByte">Byte at index 1 of the content.</param>
/// <returns>
/// false as soon as an invalid byte or sequence is found, true otherwise. Because only the
/// Error state yields false, a multi-byte sequence that is still incomplete when the scan
/// ends (end of data or the 500000-byte limit) also yields true.
/// </returns>
static bool IsUTF8(Stream fs, byte firstByte, byte secondByte)
{
// NOTE(review): the next six lines use `buffer` and `start`, which are not parameters of
// IsUTF8; they match the StartsWith header directly above and appear to be removed lines
// that the diff view shows interleaved with the new method body.
if (buffer.Length < start.Length)
return false;
int i = 0;
while (i < start.Length && buffer[i] == start[i])
i++;
return i >= start.Length;
int max = (int)Math.Min(fs.Length, 500000); // look at max. 500 KB
// States of the scanner:
//   ASCII        - only bytes below 0x80 seen so far
//   Error        - invalid byte or sequence found
//   UTF8         - at least one complete multi-byte sequence seen, currently between sequences
//   UTF8Sequence - inside a multi-byte sequence, sequenceLength continuation bytes outstanding
const int ASCII = 0;
const int Error = 1;
const int UTF8 = 2;
const int UTF8Sequence = 3;
int state = ASCII;
int sequenceLength = 0;
byte b;
for (int i = 0; i < max; i++) {
if (i == 0) {
b = firstByte;
} else if (i == 1) {
b = secondByte;
} else {
// ReadByte returns -1 at end of stream; in an unchecked context the cast turns
// that into 0xFF, which ends up in the invalid-byte branch below.
// NOTE(review): max is derived from fs.Length, not from the bytes remaining after
// the current position - confirm the caller started reading at position 0.
b = (byte)fs.ReadByte();
}
if (b < 0x80) {
// normal ASCII character
// An ASCII byte is not allowed while continuation bytes are still expected.
if (state == UTF8Sequence) {
state = Error;
break;
}
} else if (b < 0xc0) {
// 10xxxxxx : continues UTF8 byte sequence
if (state == UTF8Sequence) {
--sequenceLength;
if (sequenceLength < 0) {
state = Error;
break;
} else if (sequenceLength == 0) {
// Sequence complete.
state = UTF8;
}
} else {
// Continuation byte without a preceding lead byte.
state = Error;
break;
}
} else if (b >= 0xc2 && b < 0xf5) {
// beginning of byte sequence
if (state == UTF8 || state == ASCII) {
state = UTF8Sequence;
// The lead byte determines how many continuation bytes must follow.
if (b < 0xe0) {
sequenceLength = 1; // one more byte following
} else if (b < 0xf0) {
sequenceLength = 2; // two more bytes following
} else {
sequenceLength = 3; // three more bytes following
}
} else {
// A new lead byte while the previous sequence is still incomplete.
state = Error;
break;
}
} else {
// 0xc0, 0xc1, 0xf5 to 0xff are invalid in UTF-8 (see RFC 3629)
state = Error;
break;
}
}
return state != Error;
}

static unsafe string FindMimeType(byte[] buffer, int offset, int length)
{
int len;
buffer = DetectAndRemoveBOM(buffer, out len);
length -= len;
offset = (offset < len) ? 0 : offset - len;
if (length == 0)
return "text/plain";

fixed (byte *b = &buffer[offset]) {
const int FMFD_ENABLEMIMESNIFFING = 0x00000002;
IntPtr mimeout;
Expand All @@ -89,16 +154,8 @@ public static string FindMimeType(byte[] buffer)
{
if (buffer == null)
throw new ArgumentNullException("buffer");
return FindMimeType(buffer, 0, buffer.Length);
}

public static string FindMimeType(Stream stream)
{
if (stream == null)
throw new ArgumentNullException("stream");
byte[] buffer = new byte[BUFFER_SIZE];
stream.Position = 0;
return FindMimeType(buffer, 0, stream.Read(buffer, 0, buffer.Length));
using (MemoryStream stream = new MemoryStream(buffer))
return FindMimeType(stream);
}
}
}
4 changes: 4 additions & 0 deletions src/Main/Base/Test/ICSharpCode.SharpDevelop.Tests.csproj
Expand Up @@ -195,5 +195,9 @@
<Name>ICSharpCode.SharpDevelop.Dom</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="mime_utf-16_be_test.txt" />
<EmbeddedResource Include="mime_utf-16_le_test.txt" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSHARP.Targets" />
</Project>
47 changes: 36 additions & 11 deletions src/Main/Base/Test/MimeDetectionTests.cs
Expand Up @@ -2,7 +2,9 @@
// This code is distributed under the GNU LGPL (for details please see \doc\license.txt)

using System;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using NUnit.Framework;

Expand All @@ -11,33 +13,56 @@ namespace ICSharpCode.SharpDevelop.Tests
[TestFixture]
public class MimeTypeDetectionTests
{
// Known BOMs
static readonly byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF };
static readonly byte[] UTF16BE = new byte[] { 0xFE, 0xFF };
static readonly byte[] UTF16LE = new byte[] { 0xFF, 0xFE };
// static readonly byte[] UTF32BE = new byte[] { 0x00, 0x00, 0xFE, 0xFF };
static readonly byte[] UTF32LE = new byte[] { 0xFF, 0xFE, 0x00, 0x00 };

/// <summary>
/// Passes empty input and inputs consisting only of a byte order mark to TestMime,
/// each with "text/plain" as the expected MIME type.
/// </summary>
[Test]
public void TextPlain()
{
// NOTE(review): each BOM appears twice below, once as MimeTypeDetection.X and once as the
// fixture's own static field X. The MimeTypeDetection.X lines appear to be removed lines
// that the diff view shows interleaved with their replacements.
// always open empty files with text editor
TestMime(new byte[] {}, "text/plain");
// UTF-8
TestMime(MimeTypeDetection.UTF8, "text/plain");
TestMime(UTF8, "text/plain");
// UTF-16 Big Endian
TestMime(MimeTypeDetection.UTF16BE, "text/plain");
TestMime(UTF16BE, "text/plain");
// UTF-16 Little Endian
TestMime(MimeTypeDetection.UTF16LE, "text/plain");
TestMime(UTF16LE, "text/plain");
// UTF-32 Big Endian
// The local-field variant of this case is commented out, as is the UTF32BE field itself.
TestMime(MimeTypeDetection.UTF32BE, "text/plain");
// TestMime(UTF32BE, "text/plain");
// UTF-32 Little Endian
TestMime(MimeTypeDetection.UTF32LE, "text/plain");
TestMime(UTF32LE, "text/plain");
}

/// <summary>
/// Passes a small XML document, without a BOM and prefixed with several BOMs, to TestMime
/// with "text/xml" as the expected MIME type.
/// </summary>
[Test]
public void TextXml()
{
// NOTE(review): `xml` is declared twice and every BOM case appears in two variants. The
// first declaration and the MimeTypeDetection.X lines appear to be removed lines that the
// diff view shows interleaved with their replacements.
string xml = "<?xml version=\"1.0\" ?>";
string xml = "<?xml version=\"1.0\" ?><My File='Test' />";
// No BOM.
TestMime(Encoding.Default.GetBytes(xml), "text/xml");
TestMime(MimeTypeDetection.UTF8.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF16BE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF16LE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF32BE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
TestMime(MimeTypeDetection.UTF32LE.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
// In the variants below the UTF-16 and UTF-32 payloads are encoded with the encoding
// that corresponds to the BOM placed in front of them.
TestMime(UTF8.Concat(Encoding.Default.GetBytes(xml)).ToArray(), "text/xml");
TestMime(UTF16BE.Concat(Encoding.BigEndianUnicode.GetBytes(xml)).ToArray(), "text/xml");
TestMime(UTF16LE.Concat(Encoding.Unicode.GetBytes(xml)).ToArray(), "text/xml");
// TestMime(UTF32BE.Concat(new UTF32Encoding(true, true).GetBytes(xml)).ToArray(), "text/xml");
TestMime(UTF32LE.Concat(Encoding.UTF32.GetBytes(xml)).ToArray(), "text/xml");
}

/// <summary>
/// Loads the two embedded UTF-16 sample files (big endian first, then little endian)
/// and passes each to TestMime with "text/plain" as the expected MIME type.
/// </summary>
[Test]
public void TestFiles()
{
	string[] resourceNames = {
		"ICSharpCode.SharpDevelop.Tests.mime_utf-16_be_test.txt",
		"ICSharpCode.SharpDevelop.Tests.mime_utf-16_le_test.txt"
	};
	foreach (string resourceName in resourceNames) {
		byte[] content = LoadFile(resourceName);
		TestMime(content, "text/plain");
	}
}

/// <summary>
/// Reads an embedded resource of the executing assembly completely into a byte array.
/// </summary>
/// <param name="resourceName">Manifest resource name to load.</param>
/// <returns>The complete content of the resource.</returns>
/// <exception cref="ArgumentException">The executing assembly contains no resource with that name.</exception>
/// <exception cref="EndOfStreamException">The resource stream ended before its reported length was read.</exception>
byte[] LoadFile(string resourceName)
{
	using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName)) {
		// GetManifestResourceStream returns null for an unknown resource; fail with a
		// message naming the resource instead of a NullReferenceException.
		if (stream == null)
			throw new ArgumentException("Embedded resource not found: " + resourceName, "resourceName");
		byte[] bytes = new byte[stream.Length];
		// Stream.Read may return fewer bytes than requested, so keep reading until
		// the array is full.
		int offset = 0;
		while (offset < bytes.Length) {
			int read = stream.Read(bytes, offset, bytes.Length - offset);
			if (read <= 0)
				throw new EndOfStreamException("Unexpected end of embedded resource: " + resourceName);
			offset += read;
		}
		return bytes;
	}
}

void TestMime(byte[] bytes, string expectedMime)
Expand Down
Binary file added src/Main/Base/Test/mime_utf-16_be_test.txt
Binary file not shown.
Binary file added src/Main/Base/Test/mime_utf-16_le_test.txt
Binary file not shown.

0 comments on commit abce2ee

Please sign in to comment.