Skip to content

Commit

Permalink
Merge d586c3a into e55b299
Browse files Browse the repository at this point in the history
  • Loading branch information
kkebo authored Apr 1, 2024
2 parents e55b299 + d586c3a commit 23c940b
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 20 deletions.
14 changes: 14 additions & 0 deletions Sources/HTMLEntities/namedChars.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2231,3 +2231,17 @@ public let namedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = [
"yen": ("\u{A5}", "\0"),
"yuml": ("\u{FF}", "\0"),
]

// FIXME: This process should be done at compile-time, not runtime.
/// `namedChars` augmented with a sentinel entry `("\0", "\0")` for every
/// proper prefix of every entity name.
///
/// The char-ref tokenizer consumes one character at a time and looks the
/// growing buffer up in this table; a sentinel hit means "no entity is
/// spelled exactly like this, but a longer one may still match — keep
/// consuming", while a missing entry means "no entity starts with this".
public let processedNamedChars: [String: (Unicode.Scalar, Unicode.Scalar)] = {
    var namedChars = namedChars
    for key in namedChars.keys {
        // Register every proper prefix (lengths 1 ..< key.count).
        // Note: `1..<key.count - 1` would skip the prefix of length
        // count-1, making names whose next-to-last prefix is not shared
        // with any other entity (e.g. "zwnj" of "zwnj;") unreachable by
        // the incremental matcher. `1..<key.count` is also safe for a
        // 1-character key (empty range instead of a trapping 1..<0).
        for i in 1..<key.count {
            let prefix = String(key.prefix(i))
            // Only insert the sentinel when the prefix is not itself a
            // real entity name; never overwrite an actual mapping.
            if namedChars[prefix] == nil {
                namedChars[prefix] = ("\0", "\0")
            }
        }
    }
    return namedChars
}()
50 changes: 42 additions & 8 deletions Sources/Tokenizer/CharRefTokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ struct CharRefTokenizer {
// Current position in the character-reference state machine.
private var state: CharRefState = .initial
// Accumulator for numeric character references (presumably built up in the
// .numeric states — TODO confirm; that code is outside this hunk).
private var num: Int = 0
// Set when the accumulated numeric value overflows the valid range —
// NOTE(review): assumed from the name; the writer is outside this hunk.
private var numTooBig: Bool = false
// Characters consumed so far while matching a named reference; prepended
// back to the input when the match is abandoned.
private var nameBuffer: String = ""
// Longest full entity-name match seen so far: the end index of the match
// within `nameBuffer` plus its one or two replacement scalars.
private var lastMatch: (endIndex: String.Index, c1: Unicode.Scalar, c2: Unicode.Scalar)?
// Whether the reference occurs inside an attribute value; attribute
// context suppresses semicolonless matches followed by '=' or alphanumerics.
private let isInAttr: Bool

/// Creates a tokenizer for a single character reference.
/// - Parameter isInAttr: `true` when the `&` was seen inside an attribute
///   value (single-, double-quoted, or unquoted), `false` in data/RCDATA.
init(inAttr isInAttr: Bool) {
self.isInAttr = isInAttr
}

mutating func tokenize(tokenizer: inout Tokenizer<some TokenSink>, input: inout Deque<Character>) -> [Unicode.Scalar]? {
repeat {
Expand All @@ -48,10 +55,36 @@ struct CharRefTokenizer {
case _: return .done(["&"])
}
case .named:
// TODO: If there is a match
guard false else {
// TODO: Flush code points consumed as a character reference
guard let c = tokenizer.peek(input) else {
tokenizer.processCharRef("&")
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
}
tokenizer.discardChar(&input)
self.nameBuffer.append(c)
if let (c1, c2) = processedNamedChars[self.nameBuffer] {
if c1 != "\0" {
self.lastMatch = (self.nameBuffer.endIndex, c1, c2)
}
return .progress
} else if let (endIndex, c1, c2) = self.lastMatch {
// swift-format-ignore: NeverForceUnwrap
let lastChar = self.nameBuffer[..<endIndex].last!.firstScalar
let nextChar = self.nameBuffer[endIndex].firstScalar
switch (isInAttr, lastChar, nextChar) {
case (_, ";", _): break
case (true, _, "="), (true, _, "0"..."9"), (true, _, "A"..."Z"), (true, _, "a"..."z"):
tokenizer.processCharRef("&")
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
case _: tokenizer.emitError(.missingSemicolon)
}
return if c2 != "\0" {
.done([c1, c2])
} else {
.done([c1])
}
} else {
self.state = .ambiguousAmpersand
return .progress
}
Expand All @@ -60,13 +93,14 @@ struct CharRefTokenizer {
switch c.firstScalar {
case "0"..."9", "A"..."Z", "a"..."z":
tokenizer.discardChar(&input)
tokenizer.processCharRef(c)
self.nameBuffer.append(c)
return .progress
case ";":
tokenizer.emitError(.unknownNamedCharRef)
return .doneNone
case _: return .doneNone
case ";": tokenizer.emitError(.unknownNamedCharRef)
case _: break
}
tokenizer.processCharRef("&")
input.prepend(contentsOf: self.nameBuffer)
return .doneNone
case .numeric:
switch tokenizer.peek(input) {
case "X":
Expand Down
16 changes: 8 additions & 8 deletions Sources/Tokenizer/Tokenizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public import Collections
@freestanding(codeItem) private macro goEmitDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goEmitForceQuirksDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goEmitNewForceQuirksDOCTYPEAndEOF() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goConsumeCharRef() = #externalMacro(module: "TokenizerMacros", type: "GoMacro")
@freestanding(codeItem) private macro goConsumeCharRef(inAttr: Bool) = #externalMacro(module: "TokenizerMacros", type: "GoMacro")

public struct Tokenizer<Sink: TokenSink>: ~Copyable {
public var sink: Sink
Expand Down Expand Up @@ -94,7 +94,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
switch self.state {
case .data: repeat {
switch self.getChar(from: &input) {
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .tagOpen)
case "\0": #go(error: .unexpectedNull, emit: "\0")
case nil: #go(emit: .eof)
Expand All @@ -103,7 +103,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
} while true
case .rcdata: repeat {
switch self.getChar(from: &input) {
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: false)
case "<": #go(to: .rcdataLessThanSign)
case "\0": #go(error: .unexpectedNull, emit: "\u{FFFD}")
case nil: #go(emit: .eof)
Expand Down Expand Up @@ -508,7 +508,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueDoubleQuoted: repeat {
switch self.getChar(from: &input) {
case "\"": #go(to: .afterAttributeValueQuoted)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
Expand All @@ -517,7 +517,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueSingleQuoted: repeat {
switch self.getChar(from: &input) {
case "'": #go(to: .afterAttributeValueQuoted)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
case let c?: #go(appendAttrValue: c)
Expand All @@ -526,7 +526,7 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
case .attributeValueUnquoted: repeat {
switch self.getChar(from: &input) {
case "\t", "\n", "\u{0C}", " ": #go(to: .beforeAttributeName)
case "&": #goConsumeCharRef
case "&": #goConsumeCharRef(inAttr: true)
case ">": #go(emitTag: .data)
case "\0": #go(error: .unexpectedNull, appendAttrValue: "\u{FFFD}")
case nil: #go(error: .eofInTag, emit: .eof)
Expand Down Expand Up @@ -1138,8 +1138,8 @@ public struct Tokenizer<Sink: TokenSink>: ~Copyable {
}

@inline(__always)
private mutating func consumeCharRef() {
self.charRefTokenizer = .init()
private mutating func consumeCharRef(inAttr isInAttr: Bool) {
self.charRefTokenizer = .init(inAttr: isInAttr)
}
}

Expand Down
2 changes: 1 addition & 1 deletion Sources/TokenizerMacros/Macros.swift
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ extension GoMacro: CodeItemMacro {
case "goEmitNewForceQuirksDOCTYPEAndEOF":
return ["self.createDOCTYPE()", "self.forceQuirks()", "self.emitDOCTYPE()", "self.emitEOF()", "return .suspend"]
case "goConsumeCharRef":
return ["self.consumeCharRef()", "return .continue"]
return ["self.consumeCharRef(\(node.arguments))", "return .continue"]
case let name:
preconditionFailure("not supported: \(name)")
}
Expand Down
3 changes: 0 additions & 3 deletions Tests/TokenizerTests/HTML5LibTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ func html5libTests(_ testCase: TestCase) throws {
// test2.test
case "Entity + newline": return
// entities.test
case "Undefined named entity in a double-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Undefined named entity in a single-quoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Undefined named entity in an unquoted attribute value ending in semicolon and whose name starts with a known entity name.": return
case "Semicolonless named entity 'not' followed by 'i;' in body": return
case _: break
}
Expand Down

0 comments on commit 23c940b

Please sign in to comment.