Skip to content

Commit

Permalink
Merge d586c3a into e55b299
Browse files Browse the repository at this point in the history
  • Loading branch information
kkebo authored Apr 1, 2024
2 parents e55b299 + d586c3a commit 23c940b
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 20 deletions.
14 changes: 14 additions & 0 deletions Sources/HTMLEntities/namedChars.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2231,3 +2231,17 @@ public let namedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = [
"yen": ("\u{A5}", "\0"),
"yuml": ("\u{FF}", "\0"),
]

// FIXME: This process should be done at compile-time, not runtime.
/// `namedChars` augmented with a sentinel entry `("\0", "\0")` for every
/// proper prefix of every entity name.
///
/// The char-ref tokenizer consumes one character at a time and looks the
/// growing buffer up in this table; a sentinel hit means "no entity is
/// spelled exactly like this, but a longer one may still match — keep
/// consuming", while a missing entry means "no entity starts with this".
public let processedNamedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = {
    var namedChars = namedChars
    for key in namedChars.keys {
        // Register every proper prefix (lengths 1 ..< key.count).
        // Note: `1..<key.count - 1` would skip the prefix of length
        // count-1, making names whose next-to-last prefix is not shared
        // with any other entity (e.g. "zwnj" of "zwnj;") unreachable by
        // the incremental matcher. `1..<key.count` is also safe for a
        // 1-character key (empty range instead of a trapping 1..<0).
        for i in 1..<key.count {
            let prefix = String(key.prefix(i))
            // Only insert the sentinel when the prefix is not itself a
            // real entity name; never overwrite an actual mapping.
            if namedChars[prefix] == nil {
                namedChars[prefix] = ("\0", "\0")
            }
        }
    }
    return namedChars
}()
50 changes: 42 additions & 8 deletions Sources/Tokenizer/CharRefTokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ struct CharRefTokenizer {
// Current position in the character-reference state machine.
private var state: CharRefState = .initial
// Accumulator for numeric character references (presumably built up in the
// .numeric states — TODO confirm; that code is outside this hunk).
private var num: Int = 0
// Set when the accumulated numeric value overflows the valid range —
// NOTE(review): assumed from the name; the writer is outside this hunk.
private var numTooBig: Bool = false
// Characters consumed so far while matching a named reference; prepended
// back to the input when the match is abandoned.
private var nameBuffer: String = ""
// Longest full entity-name match seen so far: the end index of the match
// within `nameBuffer` plus its one or two replacement scalars.
private var lastMatch: (endIndex: String.Index, c1: Unicode.Scalar, c2: Unicode.Scalar)?
// Whether the reference occurs inside an attribute value; attribute
// context suppresses semicolonless matches followed by '=' or alphanumerics.
private let isInAttr: Bool

/// Creates a tokenizer for a single character reference.
/// - Parameter isInAttr: `true` when the `&` was seen inside an attribute
///   value (single-, double-quoted, or unquoted), `false` in data/RCDATA.
init(inAttr isInAttr: Bool) {
self.isInAttr = isInAttr
}

mutating func tokenize(tokenizer: inout Tokenizer<some TokenSink>, input: inout Deque<Character>) -> [Unicode.Scalar]? {
repeat {
Expand All @@ -48,10 +55,36 @@ struct CharRefTokenizer {
case _: return .done(["&"])
}
case .named:
// TODO: If there is a match
guard false else {
// TODO: Flush code points consumed as a character reference
guard let c = tokenizer.peek(input) else {
tokenizer.processCharRef("&")
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
}
tokenizer.discardChar(&input)
self.nameBuffer.append(c)
if let (c1, c2) = processedNamedChars[self.nameBuffer] {
if c1 != "\0" {
self.lastMatch = (self.nameBuffer.endIndex, c1, c2)
}
return .progress
} else if let (endIndex, c1, c2) = self.lastMatch {
// swift-format-ignore: NeverForceUnwrap
let lastChar = self.nameBuffer[..<endIndex].last!.firstScalar
let nextChar = self.nameBuffer[endIndex].firstScalar
switch (isInAttr, lastChar, nextChar) {
case (_, ";", _): break
case (true, _, "="), (true, _, "0"..."9"), (true, _, "A"..."Z"), (true, _, "a"..."z"):
tokenizer.processCharRef("&")
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
case _: tokenizer.emitError(.missingSemicolon)
}
return if c2 != "\0" {
.done([c1, c2])
} else {
.done([c1])
}
} else {
self.state = .ambiguousAmpersand
return .progress
}
Expand All @@ -60,13 +93,14 @@ struct CharRefTokenizer {
switch c.firstScalar {
case "0"..."9", "A"..."Z", "a"..."z":
tokenizer.discardChar(&input)
tokenizer.processCharRef(c)
self.nameBuffer.append(c)
return .progress
case ";":
tokenizer.emitError(.unknownNamedCharRef)
return .doneNone
case _: return .doneNone
case ";": tokenizer.emitError(.unknownNamedCharRef)
case _: break
}
tokenizer.processCharRef("&")
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
case .numeric:
switch tokenizer.peek(input) {
case "X":
Expand Down
16 changes: 8 additions & 8 deletions Sources/Tokenizer/Tokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public import Collections
@freestanding(codeItem) private macro goEmitDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goEmitForceQuirksDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goEmitNewForceQuirksDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goConsumeCharRef() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goConsumeCharRef(inAttr: Bool) = #externalMacro(module: "TokenizerMacros", type: "GoMacro")

public struct Tokenizer<Sink: TokenSink>: ~Copyable {
public var sink: Sink
Expand Down Expand Up @@ -94,7 +94,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
switch self.state {
case .data: repeat {
switch self.getChar(from: &input) {
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .tagOpen)
case "\0": #go(error: .unexpectedNull, emit: "\0")
case nil: #go(emit: .eof)
Expand All @@ -103,7 +103,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
} while true
case .rcdata: repeat {
switch self.getChar(from: &input) {
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .rcdataLessThanSign)
case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}")
case nil: #go(emit: .eof)
Expand Down Expand Up @@ -508,7 +508,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueDoubleQuoted: repeat {
switch self.getChar(from: &input) {
case "\"": #go(to: .afterAttributeValueQuoted)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
Expand All @@ -517,7 +517,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueSingleQuoted: repeat {
switch self.getChar(from: &input) {
case "'": #go(to: .afterAttributeValueQuoted)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
Expand All @@ -526,7 +526,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueUnquoted: repeat {
switch self.getChar(from: &input) {
case "\t", "\n", "\u{0C}", " ": #go(to: .beforeAttributeName)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case ">": #go(emitTag: .data)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
Expand Down Expand Up @@ -1138,8 +1138,8 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
}

@inline(__always)
private mutating func consumeCharRef() {
self.charRefTokenizer = .init()
private mutating func consumeCharRef(inAttr isInAttr: Bool) {
self.charRefTokenizer = .init(inAttr: isInAttr)
}
}

Expand Down
2 changes: 1 addition & 1 deletion Sources/TokenizerMacros/Macros.swift
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ extension GoMacro: CodeItemMacro {
case "goEmitNewForceQuirksDOCTYPEAndEOF":
return ["self.createDOCTYPE()", "self.forceQuirks()", "self.emitDOCTYPE()", "self.emitEOF()", "return .suspend"]
case "goConsumeCharRef":
return ["self.consumeCharRef()", "return .continue"]
return ["self.consumeCharRef(\(node.arguments))", "return .continue"]
case let name:
preconditionFailure("not supported: \(name)")
}
Expand Down
3 changes: 0 additions & 3 deletions Tests/TokenizerTests/HTML5LibTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ func html5libTests(_ testCase: TestCase) throws {
// test2.test
case "Entity + newline": return
// entities.test
case "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Semicolonless named entity 'not' followed by 'i;' in body": return
case _: break
}
Expand Down

0 comments on commit 23c940b

Please sign in to comment.