Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 63 additions & 40 deletions src/browser/mime.zig
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@ const Allocator = std.mem.Allocator;
pub const Mime = struct {
content_type: ContentType,
params: []const u8 = "",
charset: ?[:0]const u8 = null,
// IANA limits charset names to a maximum of 40 characters.
// We reserve a 41st byte for the null terminator, because the HTML parser expects a null-terminated string.
charset: [41]u8 = default_charset,

pub const unknown = Mime{
.params = "",
.charset = null,
.content_type = .{ .unknown = {} },
};
/// String "UTF-8" continued by null characters.
pub const default_charset = .{ 'U', 'T', 'F', '-', '8' } ++ .{0} ** 36;

/// Mime with unknown Content-Type, empty params and the default charset (UTF-8).
pub const unknown = Mime{ .content_type = .{ .unknown = {} } };

pub const ContentTypeEnum = enum {
text_xml,
Expand All @@ -52,6 +54,34 @@ pub const Mime = struct {
other: struct { type: []const u8, sub_type: []const u8 },
};

/// Returns the charset as a null-terminated string.
///
/// The result points into `mime.charset`; it is not a copy, so it is only
/// valid for as long as `mime` itself is.
pub inline fn charsetString(mime: *const Mime) [:0]const u8 {
// NOTE(review): the cast reinterprets the fixed-size buffer as a
// sentinel-terminated slice. The slice length presumably does not reflect
// the logical charset length, so callers should rely on the terminator
// rather than on `.len` — confirm against the targeted Zig version.
return @ptrCast(&mime.charset);
}

/// Extracts the charset name from a raw `charset` attribute value.
///
/// If `value` starts with a double quote it must also end with one and
/// enclose at least one character; the surrounding quotes are stripped.
/// Otherwise `value` is returned unchanged. The returned slice aliases
/// `value`; nothing is copied or allocated.
///
/// Currently the name itself is not validated against the charset grammar.
/// See section 2.3 Naming Requirements:
/// https://datatracker.ietf.org/doc/rfc2978/
///
/// Errors:
/// - `error.Invalid`: `value` is empty, or it has an opening quote without a
///   matching closing quote or with nothing in between.
/// - `error.CharsetTooBig`: the name, once unquoted, is longer than 40 bytes.
fn parseCharset(value: []const u8) error{ CharsetTooBig, Invalid }![]const u8 {
    // Guard the `value[0]` access below; an empty value carries no charset.
    if (value.len == 0) return error.Invalid;

    // If the first char is a quote, require its pair and drop both.
    const name = if (value[0] == '"') unquoted: {
        if (value.len < 3 or value[value.len - 1] != '"') {
            return error.Invalid;
        }
        break :unquoted value[1 .. value.len - 1];
    } else value;

    // The 40-character limit applies to the name itself, so it is checked
    // after the quotes (which are not part of the name) have been removed.
    // https://datatracker.ietf.org/doc/rfc2978/
    if (name.len > 40) return error.CharsetTooBig;

    return name;
}

pub fn parse(input: []u8) !Mime {
if (input.len > 255) {
return error.TooBig;
Expand All @@ -69,7 +99,7 @@ pub const Mime = struct {

const params = trimLeft(normalized[type_len..]);

var charset: ?[:0]const u8 = null;
var charset: [41]u8 = undefined;

var it = std.mem.splitScalar(u8, params, ';');
while (it.next()) |attr| {
Expand All @@ -87,35 +117,14 @@ pub const Mime = struct {

switch (attribute_name) {
.charset => {
// We used to have a proper value parser, but we currently
// only care about the charset attribute, plus only about
// the UTF-8 value. It's a lot easier to do it this way,
// and it doesn't require an allocation to (a) unescape the
// value or (b) ensure the correct lifetime.
if (value.len == 0) {
break;
}
var attribute_value = value;
if (value[0] == '"') {
if (value.len < 3 or value[value.len - 1] != '"') {
return error.Invalid;
}
attribute_value = value[1 .. value.len - 1];
}

if (std.ascii.eqlIgnoreCase(attribute_value, "utf-8")) {
charset = "UTF-8";
} else if (std.ascii.eqlIgnoreCase(attribute_value, "iso-8859-1")) {
charset = "ISO-8859-1";
} else {
// we only care about null (which we default to UTF-8)
// or UTF-8. If this is actually set (i.e. not null)
// and isn't UTF-8, we'll just put a dummy value. If
// we want to capture the actual value, we'll need to
// dupe/allocate it. Since, for now, we don't need that
// we can avoid the allocation.
charset = "lightpanda:UNSUPPORTED";
}
const attribute_value = try parseCharset(value);
@memcpy(charset[0..attribute_value.len], attribute_value);
// Null-terminate right after attribute value.
charset[attribute_value.len] = 0;
},
}
}
Expand Down Expand Up @@ -363,21 +372,33 @@ test "Mime: parse charset" {

try expect(.{
.content_type = .{ .text_xml = {} },
.charset = "UTF-8",
.charset = "utf-8",
.params = "charset=utf-8",
}, "text/xml; charset=utf-8");

try expect(.{
.content_type = .{ .text_xml = {} },
.charset = "UTF-8",
.charset = "utf-8",
.params = "charset=\"utf-8\"",
}, "text/xml;charset=\"utf-8\"");
}, "text/xml;charset=\"UTF-8\"");

try expect(.{
.content_type = .{ .text_html = {} },
.charset = "iso-8859-1",
.params = "charset=\"iso-8859-1\"",
}, "text/html; charset=\"iso-8859-1\"");

try expect(.{
.content_type = .{ .text_html = {} },
.charset = "iso-8859-1",
.params = "charset=\"iso-8859-1\"",
}, "text/html; charset=\"ISO-8859-1\"");

try expect(.{
.content_type = .{ .text_xml = {} },
.charset = "lightpanda:UNSUPPORTED",
.params = "charset=\"\\\\ \\\" \"",
}, "text/xml;charset=\"\\\\ \\\" \" ");
.charset = "custom-non-standard-charset-value",
.params = "charset=\"custom-non-standard-charset-value\"",
}, "text/xml;charset=\"custom-non-standard-charset-value\"");
}

test "Mime: isHTML" {
Expand Down Expand Up @@ -490,8 +511,10 @@ fn expect(expected: Expectation, input: []const u8) !void {
try testing.expectEqual(expected.params, actual.params);

if (expected.charset) |ec| {
try testing.expectEqual(ec, actual.charset.?);
// We remove the null characters for testing purposes here.
try testing.expectEqual(ec, actual.charsetString()[0..ec.len]);
} else {
try testing.expectEqual(null, actual.charset);
const m: Mime = .unknown;
try testing.expectEqual(m.charsetString(), actual.charsetString());
}
}
4 changes: 2 additions & 2 deletions src/browser/page.zig
Original file line number Diff line number Diff line change
Expand Up @@ -672,14 +672,14 @@ pub const Page = struct {
log.debug(.http, "navigate first chunk", .{ .content_type = mime.content_type, .len = data.len });

self.mode = switch (mime.content_type) {
.text_html => .{ .html = try parser.Parser.init(mime.charset orelse "UTF-8") },
.text_html => .{ .html = try parser.Parser.init(mime.charsetString()) },

.application_json,
.text_javascript,
.text_css,
.text_plain,
=> blk: {
var p = try parser.Parser.init(mime.charset orelse "UTF-8");
var p = try parser.Parser.init(mime.charsetString());
try p.process("<html><head><meta charset=\"utf-8\"></head><body><pre>");
break :blk .{ .text = p };
},
Expand Down
2 changes: 1 addition & 1 deletion src/browser/xhr/xhr.zig
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ pub const XMLHttpRequest = struct {
}

var fbs = std.io.fixedBufferStream(self.response_bytes.items);
const doc = parser.documentHTMLParse(fbs.reader(), mime.charset orelse "UTF-8") catch {
const doc = parser.documentHTMLParse(fbs.reader(), mime.charsetString()) catch {
self.response_obj = .{ .Failure = {} };
return;
};
Expand Down