From 6863f3227fb36a4e04b6a6a2b14f9de9862fae85 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Mon, 10 Feb 2025 11:07:55 +0800 Subject: [PATCH 1/2] Improve performance & compliance of MIME parsing Common cases, text/html, text/xml and text/plain parse about 2x faster. Other cases are about 30% faster. Support quoted attributes, i.e. charset="utf-8" & valid escape sequences. This potentially requires allocation, thus Mime.parse now takes an allocator. Stricter validation around type/subtype based on RFC. More tests. Replace Mime.eql with isHTML(). Equality is complicated and was previously incorrect (it was case sensitive, it should not be). Since we currently only use isHTML-like behavior, built a (faster) method specifically for that. --- src/browser/browser.zig | 6 +- src/browser/mime.zig | 452 ++++++++++++++++++++++++++++++---------- src/str/parser.zig | 2 +- src/xhr/xhr.zig | 19 +- 4 files changed, 360 insertions(+), 119 deletions(-) diff --git a/src/browser/browser.zig b/src/browser/browser.zig index e361838e5..87407d7f8 100644 --- a/src/browser/browser.zig +++ b/src/browser/browser.zig @@ -24,7 +24,7 @@ const Types = @import("root").Types; const parser = @import("netsurf"); const Loader = @import("loader.zig").Loader; const Dump = @import("dump.zig"); -const Mime = @import("mime.zig"); +const Mime = @import("mime.zig").Mime; const jsruntime = @import("jsruntime"); const Loop = jsruntime.Loop; @@ -376,7 +376,9 @@ pub const Page = struct { log.debug("header content-type: {s}", .{ct.?}); const mime = try Mime.parse(ct.?); - if (mime.eql(Mime.HTML)) { + defer mime.deinit(); + + if (mime.isHTML()) { try self.loadHTMLDoc(req.reader(), mime.charset orelse "utf-8", auxData); } else { log.info("non-HTML document: {s}", .{ct.?}); diff --git a/src/browser/mime.zig b/src/browser/mime.zig index da8ac7109..3716939f4 100644 --- a/src/browser/mime.zig +++ b/src/browser/mime.zig @@ -17,143 +17,375 @@ // along with this program. If not, see . 
const std = @import("std"); -const testing = std.testing; +const Allocator = std.mem.Allocator; -const Reader = @import("../str/parser.zig").Reader; +pub const Mime = struct { + content_type: ContentType, + params: []const u8 = "", + charset: ?[]const u8 = null, + arena: std.heap.ArenaAllocator, -const Self = @This(); + pub const ContentTypeEnum = enum { + text_xml, + text_html, + text_plain, + other, + }; -const MimeError = error{ - Empty, - TooBig, - Invalid, - InvalidChar, -}; + pub const ContentType = union(ContentTypeEnum) { + text_xml: void, + text_html: void, + text_plain: void, + other: struct { type: []const u8, sub_type: []const u8 }, + }; -mtype: []const u8, -msubtype: []const u8, -params: []const u8 = "", + pub fn parse(allocator: Allocator, input: []const u8) !Mime { + if (input.len > 255) { + return error.TooBig; + } -charset: ?[]const u8 = null, -boundary: ?[]const u8 = null, + var arena = std.heap.ArenaAllocator.init(allocator); + errdefer arena.deinit(); -pub const Empty = Self{ .mtype = "", .msubtype = "" }; -pub const HTML = Self{ .mtype = "text", .msubtype = "html" }; -pub const Javascript = Self{ .mtype = "application", .msubtype = "javascript" }; + var trimmed = trim(input); -// https://mimesniff.spec.whatwg.org/#http-token-code-point -fn isHTTPCodePoint(c: u8) bool { - return switch (c) { - '!', '#', '$', '%', '&', '\'', '*', '+', '-', '.', '^' => return true, - '_', '`', '|', '~' => return true, - else => std.ascii.isAlphanumeric(c), - }; -} + const content_type, const type_len = try parseContentType(trimmed); + if (type_len >= trimmed.len) { + return .{ .arena = arena, .content_type = content_type }; + } + + const params = trimLeft(trimmed[type_len..]); + + var charset: ?[]const u8 = null; + + var it = std.mem.splitScalar(u8, params, ';'); + while (it.next()) |attr| { + const i = std.mem.indexOfScalarPos(u8, attr, 0, '=') orelse return error.Invalid; + const name = trimLeft(attr[0..i]); -fn valid(s: []const u8) bool { - const ln = s.len; - 
var i: usize = 0; - while (i < ln) { - if (!isHTTPCodePoint(s[i])) return false; - i += 1; + const value = trimRight(attr[i + 1 ..]); + if (value.len == 0) { + return error.Invalid; + } + + switch (name.len) { + 7 => if (isCaseEqual("charset", name)) { + charset = try parseValue(arena.allocator(), value); + }, + else => {}, + } + } + + return .{ + .arena = arena, + .params = params, + .charset = charset, + .content_type = content_type, + }; } - return true; -} -// https://mimesniff.spec.whatwg.org/#parsing-a-mime-type -pub fn parse(s: []const u8) Self.MimeError!Self { - const ln = s.len; - if (ln == 0) return MimeError.Empty; - // limit input size - if (ln > 255) return MimeError.TooBig; + pub fn deinit(self: *Mime) void { + self.arena.deinit(); + } - var res = Self{ .mtype = "", .msubtype = "" }; - var r = Reader{ .data = s }; + pub fn isHTML(self: *const Mime) bool { + return self.content_type == .text_html; + } - res.mtype = trim(r.until('/')); - if (res.mtype.len == 0) return MimeError.Invalid; - if (!valid(res.mtype)) return MimeError.InvalidChar; + fn parseContentType(value: []const u8) !struct { ContentType, usize } { + const separator = std.mem.indexOfScalarPos(u8, value, 0, '/') orelse { + return error.Invalid; + }; + const end = std.mem.indexOfScalarPos(u8, value, separator, ';') orelse blk: { + break :blk value.len; + }; - if (!r.skip()) return MimeError.Invalid; - res.msubtype = trim(r.until(';')); - if (res.msubtype.len == 0) return MimeError.Invalid; - if (!valid(res.msubtype)) return MimeError.InvalidChar; + const main_type = value[0..separator]; + const sub_type = trimRight(value[separator + 1 .. end]); - if (!r.skip()) return res; - res.params = trim(r.tail()); - if (res.params.len == 0) return MimeError.Invalid; + if (parseCommonContentType(main_type, sub_type)) |content_type| { + return .{ content_type, end + 1 }; + } - // parse well known parameters. - // don't check invalid parameter format. 
- var rp = Reader{ .data = res.params }; - while (true) { - const name = trim(rp.until('=')); - if (!rp.skip()) return res; - const value = trim(rp.until(';')); + if (main_type.len == 0) { + return error.Invalid; + } + if (validType(main_type) == false) { + return error.Invalid; + } - if (std.ascii.eqlIgnoreCase(name, "charset")) { - res.charset = value; + if (sub_type.len == 0) { + return error.Invalid; } - if (std.ascii.eqlIgnoreCase(name, "boundary")) { - res.boundary = value; + if (validType(sub_type) == false) { + return error.Invalid; } - if (!rp.skip()) return res; + const content_type = ContentType{ .other = .{ + .type = main_type, + .sub_type = sub_type, + } }; + + return .{ content_type, end + 1 }; } - return res; -} + fn parseCommonContentType(main_type: []const u8, sub_type: []const u8) ?ContentType { + switch (main_type.len) { + 4 => if (isCaseEqual("text", main_type)) { + switch (sub_type.len) { + 3 => if (isCaseEqual("xml", sub_type)) { + return .{ .text_xml = {} }; + }, + 4 => if (isCaseEqual("html", sub_type)) { + return .{ .text_html = {} }; + }, + 5 => if (isCaseEqual("plain", sub_type)) { + return .{ .text_plain = {} }; + }, + else => {}, + } + }, + else => {}, + } + return null; + } -fn trim(s: []const u8) []const u8 { - return std.mem.trim(u8, s, &std.ascii.whitespace); -} + const T_SPECIAL = blk: { + var v = [_]bool{false} ** 256; + for ("()<>@,;:\\\"/[]?=") |b| { + v[b] = true; + } + break :blk v; + }; + + fn parseValue(allocator: Allocator, value: []const u8) ![]const u8 { + if (value[0] != '"') { + return value; + } + + // 1 to skip the opening quote + var value_pos: usize = 1; + var unescaped_len: usize = 0; + const last = value.len - 1; + + while (value_pos < value.len) { + switch (value[value_pos]) { + '"' => break, + '\\' => { + if (value_pos == last) { + return error.Invalid; + } + const next = value[value_pos + 1]; + if (T_SPECIAL[next] == false) { + return error.Invalid; + } + value_pos += 2; + }, + else => value_pos += 1, + } + 
unescaped_len += 1; + } -test "parse valid" { - for ([_][]const u8{ - "text/html", - " \ttext/html", - "text \t/html", - "text/ \thtml", - "text/html \t", - }) |tc| { - const m = try Self.parse(tc); - try testing.expectEqualStrings("text", m.mtype); - try testing.expectEqualStrings("html", m.msubtype); + if (unescaped_len == 0) { + return error.Invalid; + } + + value_pos = 1; + const owned = try allocator.alloc(u8, unescaped_len); + for (0..unescaped_len) |i| { + switch (value[value_pos]) { + '"' => break, + '\\' => { + owned[i] = value[value_pos + 1]; + value_pos += 2; + }, + else => |c| { + owned[i] = c; + value_pos += 1; + }, + } + } + return owned; + } + + const VALID_CODEPOINTS = blk: { + var v: [256]bool = undefined; + for (0..256) |i| { + v[i] = std.ascii.isAlphanumeric(i); + } + for ("!#$%&\\*+-.^'_`|~") |b| { + v[b] = true; + } + break :blk v; + }; + + fn validType(value: []const u8) bool { + for (value) |b| { + if (VALID_CODEPOINTS[b] == false) { + return false; + } + } + return true; + } + + fn trim(s: []const u8) []const u8 { + return std.mem.trim(u8, s, &std.ascii.whitespace); } - const m2 = try Self.parse("text/javascript1.5"); - try testing.expectEqualStrings("text", m2.mtype); - try testing.expectEqualStrings("javascript1.5", m2.msubtype); - - const m3 = try Self.parse("text/html; charset=utf-8"); - try testing.expectEqualStrings("text", m3.mtype); - try testing.expectEqualStrings("html", m3.msubtype); - try testing.expectEqualStrings("charset=utf-8", m3.params); - try testing.expectEqualStrings("utf-8", m3.charset.?); - - const m4 = try Self.parse("text/html; boundary=----"); - try testing.expectEqualStrings("text", m4.mtype); - try testing.expectEqualStrings("html", m4.msubtype); - try testing.expectEqualStrings("boundary=----", m4.params); - try testing.expectEqualStrings("----", m4.boundary.?); -} -test "parse invalid" { - for ([_][]const u8{ + fn trimLeft(s: []const u8) []const u8 { + return std.mem.trimLeft(u8, s, &std.ascii.whitespace); + } + 
+ fn trimRight(s: []const u8) []const u8 { + return std.mem.trimRight(u8, s, &std.ascii.whitespace); + } + + fn isCaseEqual(comptime target: anytype, value: []const u8) bool { + // - 8 because we don't care about the sentinel + const bit_len = @bitSizeOf(@TypeOf(target.*)) - 8; + const byte_len = bit_len / 8; + + const T = @Type(.{ .Int = .{ + .bits = bit_len, + .signedness = .unsigned, + } }); + + const bit_target: T = @bitCast(@as(*const [byte_len]u8, target).*); + + if (@as(T, @bitCast(value[0..byte_len].*)) == bit_target) { + return true; + } + return std.ascii.eqlIgnoreCase(value, target); + } +}; + +const testing = std.testing; +test "Mime: invalid " { + const invalids = [_][]const u8{ "", - "te xt/html;", - "te@xt/html;", - "text/ht@ml;", - "text/html;", - "/text/html", - "/html", - }) |tc| { - _ = Self.parse(tc) catch continue; - try testing.expect(false); + "text", + "text /html", + "text/ html", + "text / html", + "text/html other", + "text/html; x", + "text/html; x=", + "text/html; x= ", + "text/html; = ", + "text/html;=", + "text/html; charset=\"\"", + "text/html; charset=\"", + "text/html; charset=\"\\", + "text/html; charset=\"\\a\"", // invalid to escape non special characters + }; + + for (invalids) |invalid| { + try testing.expectError(error.Invalid, Mime.parse(undefined, invalid)); } } -// Compare type and subtype. 
-pub fn eql(self: Self, b: Self) bool { - if (!std.mem.eql(u8, self.mtype, b.mtype)) return false; - return std.mem.eql(u8, self.msubtype, b.msubtype); +test "Mime: parse common" { + try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml"); + try expect(.{ .content_type = .{ .text_html = {} } }, "text/html"); + try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain"); + + try expect(.{ .content_type = .{ .text_xml = {} } }, "text/xml;"); + try expect(.{ .content_type = .{ .text_html = {} } }, "text/html;"); + try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain;"); + + try expect(.{ .content_type = .{ .text_xml = {} } }, " \ttext/xml"); + try expect(.{ .content_type = .{ .text_html = {} } }, "text/html "); + try expect(.{ .content_type = .{ .text_plain = {} } }, "text/plain \t\t"); + + try expect(.{ .content_type = .{ .text_xml = {} } }, "TEXT/xml"); + try expect(.{ .content_type = .{ .text_html = {} } }, "text/Html"); + try expect(.{ .content_type = .{ .text_plain = {} } }, "TEXT/PLAIN"); + + try expect(.{ .content_type = .{ .text_xml = {} } }, " TeXT/xml"); + try expect(.{ .content_type = .{ .text_html = {} } }, "teXt/HtML ;"); + try expect(.{ .content_type = .{ .text_plain = {} } }, "tExT/PlAiN;"); +} + +test "Mime: parse uncommon" { + const text_javascript = Expectation{ + .content_type = .{ .other = .{ .type = "text", .sub_type = "javascript" } }, + }; + try expect(text_javascript, "text/javascript"); + try expect(text_javascript, "text/javascript;"); + try expect(text_javascript, " text/javascript\t "); + try expect(text_javascript, " text/javascript\t ;"); + + try expect( + .{ .content_type = .{ .other = .{ .type = "Text", .sub_type = "Javascript" } } }, + "Text/Javascript", + ); +} + +test "Mime: parse charset" { + try expect(.{ + .content_type = .{ .text_xml = {} }, + .charset = "utf-8", + .params = "charset=utf-8", + }, "text/xml; charset=utf-8"); + + try expect(.{ + .content_type = .{ .text_xml = {} }, + .charset = 
"utf-8", + .params = "charset=\"utf-8\"", + }, "text/xml;charset=\"utf-8\""); + + try expect(.{ + .content_type = .{ .text_xml = {} }, + .charset = "\\ \" ", + .params = "charset=\"\\\\ \\\" \"", + }, "text/xml;charset=\"\\\\ \\\" \" "); +} + +test "Mime: isHTML" { + const isHTML = struct { + fn isHTML(expected: bool, input: []const u8) !void { + var mime = try Mime.parse(testing.allocator, input); + defer mime.deinit(); + try testing.expectEqual(expected, mime.isHTML()); + } + }.isHTML; + try isHTML(true, "text/html"); + try isHTML(true, "text/html;"); + try isHTML(true, "text/html; charset=utf-8"); + try isHTML(false, "text/htm"); // htm not html + try isHTML(false, "text/plain"); + try isHTML(false, "over/9000"); +} + +const Expectation = struct { + content_type: Mime.ContentType, + params: []const u8 = "", + charset: ?[]const u8 = null, +}; + +fn expect(expected: Expectation, input: []const u8) !void { + var actual = try Mime.parse(testing.allocator, input); + defer actual.deinit(); + + try testing.expectEqual( + std.meta.activeTag(expected.content_type), + std.meta.activeTag(actual.content_type), + ); + + switch (expected.content_type) { + .other => |e| { + const a = actual.content_type.other; + try testing.expectEqualStrings(e.type, a.type); + try testing.expectEqualStrings(e.sub_type, a.sub_type); + }, + else => {}, // already asserted above + } + + try testing.expectEqualStrings(expected.params, actual.params); + + if (expected.charset) |ec| { + try testing.expectEqualStrings(ec, actual.charset.?); + } else { + try testing.expectEqual(null, actual.charset); + } } diff --git a/src/str/parser.zig b/src/str/parser.zig index f663c4d57..55b6bb325 100644 --- a/src/str/parser.zig +++ b/src/str/parser.zig @@ -35,7 +35,7 @@ pub const Reader = struct { pub fn tail(self: *Reader) []const u8 { const pos = self.pos; const data = self.data; - if (pos > data.len) { + if (pos > data.len) { return ""; } self.pos = data.len; diff --git a/src/xhr/xhr.zig b/src/xhr/xhr.zig 
index 771315774..2ad2bc264 100644 --- a/src/xhr/xhr.zig +++ b/src/xhr/xhr.zig @@ -28,7 +28,7 @@ const DOMException = @import("../dom/exceptions.zig").DOMException; const ProgressEvent = @import("progress_event.zig").ProgressEvent; const XMLHttpRequestEventTarget = @import("event_target.zig").XMLHttpRequestEventTarget; -const Mime = @import("../browser/mime.zig"); +const Mime = @import("../browser/mime.zig").Mime; const Loop = jsruntime.Loop; const Client = @import("asyncio").Client; @@ -141,7 +141,7 @@ pub const XMLHttpRequest = struct { // https://lightpanda.slack.com/archives/C05TRU6RBM1/p1707819010681019 // response_override_mime_type: ?[]const u8 = null, - response_mime: Mime = undefined, + response_mime: ?Mime = null, response_obj: ?ResponseObj = null, send_flag: bool = false, @@ -313,8 +313,11 @@ pub const XMLHttpRequest = struct { if (self.response_obj) |v| v.deinit(); self.response_obj = null; - self.response_mime = Mime.Empty; self.response_type = .Empty; + if (self.response_mime) |*mime| { + mime.deinit(); + self.response_mime = null; + } // TODO should we clearRetainingCapacity instead? self.headers.clearAndFree(); @@ -336,6 +339,9 @@ pub const XMLHttpRequest = struct { self.reset(); self.headers.deinit(); self.response_headers.deinit(); + if (self.response_mime) |*mime| { + mime.deinit(); + } self.proto.deinit(alloc); } @@ -544,7 +550,7 @@ pub const XMLHttpRequest = struct { // extract a mime type from headers. const ct = self.response_headers.getFirstValue("Content-Type") orelse "text/xml"; - self.response_mime = Mime.parse(ct) catch |e| return self.onErr(e); + self.response_mime = Mime.parse(self.alloc, ct) catch |e| return self.onErr(e); // TODO handle override mime type @@ -820,13 +826,14 @@ pub const XMLHttpRequest = struct { // TODO parse XML. 
// https://xhr.spec.whatwg.org/#response-object fn setResponseObjDocument(self: *XMLHttpRequest, alloc: std.mem.Allocator) void { - const isHTML = self.response_mime.eql(Mime.HTML); + const response_mime = &self.response_mime.?; + const isHTML = response_mime.isHTML(); // TODO If finalMIME is not an HTML MIME type or an XML MIME type, then // return. if (!isHTML) return; - const ccharset = alloc.dupeZ(u8, self.response_mime.charset orelse "utf-8") catch { + const ccharset = alloc.dupeZ(u8, response_mime.charset orelse "utf-8") catch { self.response_obj = .{ .Failure = true }; return; }; From 4ab02fab1c5474b37ac0201e32e945c7f4e6a8f5 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Mon, 10 Feb 2025 11:18:16 +0800 Subject: [PATCH 2/2] Fix build. zig build test can pass, but zig build run won't even compile. // TODO: fix. --- src/browser/browser.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/browser/browser.zig b/src/browser/browser.zig index 87407d7f8..e7e2445c4 100644 --- a/src/browser/browser.zig +++ b/src/browser/browser.zig @@ -375,7 +375,7 @@ pub const Page = struct { defer alloc.free(ct.?); log.debug("header content-type: {s}", .{ct.?}); - const mime = try Mime.parse(ct.?); + var mime = try Mime.parse(alloc, ct.?); defer mime.deinit(); if (mime.isHTML()) {