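;; (Web-page residue from the site this file was copied from, kept as
;; comments so that the file remains readable by the Lisp reader.)
;; Skip to content
;; Find file
;; Fetching contributors…
;; Cannot retrieve contributors at this time
;; 3437 lines (3296 sloc) 150 KB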
;;; html5-tok.el --- HTML Tokenizer
;; Copyright (C) 2010 Edward O'Connor
;; Author: Edward O'Connor <hober0@gmail.com>
;; Keywords: wp, hypermedia, comm, languages
;; Permission is hereby granted, free of charge, to any person obtaining
;; a copy of this software and associated documentation files (the
;; "Software"), to deal in the Software without restriction, including
;; without limitation the rights to use, copy, modify, merge, publish,
;; distribute, sublicense, and/or sell copies of the Software, and to
;; permit persons to whom the Software is furnished to do so, subject to
;; the following conditions:
;; The above copyright notice and this permission notice shall be
;; included in all copies or substantial portions of the Software.
;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
;; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
;; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
;; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
;; SOFTWARE.
;;; Commentary:
;;
;;; History:
;;
;;; Code:
(require 'html5-ncr)
;; Token currently under construction.  `h5-emit' emits this token when
;; called without an argument, and resets it to nil once it is emitted.
(defvar *h5-curtok* nil)
;; Reset to nil by `h5-emit' whenever `*h5-curtok*' is emitted.
;; NOTE(review): presumably the attribute currently being built --
;; confirm against the attribute states.
(defvar *h5-curattr* nil)
;; Dynamically bound to nil by the character-reference states around
;; their call to `h5-consume-a-character-reference'.
(defvar *h5-additional-allowed-character* nil)
;; Buffer position recorded by `h5-tag-open-state'; read and then reset
;; to nil by `h5-inside'.
(defvar *h5-last-tag-open* nil)
;; NOTE(review): no code in this part of the file reads this option --
;; confirm that it is honoured elsewhere.
(defvar html5-tok-coalesce-character-tokens nil
"If nil, the tokenizer emits each character token separately.
If non-nil, it collects character tokens and emits strings of
consecutive ones.")
(defun h5-inside (thing)
  "Tag the text since the last recorded tag-open position as THING.
The region from `*h5-last-tag-open*' to point receives the text
property `h5-inside' with value THING.  When no position has been
recorded nothing is marked.  In both cases `*h5-last-tag-open*' is
reset to nil, and nil is returned."
  (let ((start *h5-last-tag-open*))
    (if start
        (put-text-property start (point) 'h5-inside thing))
    (setq *h5-last-tag-open* nil)))
;; (defvar h5-charbuf nil)
;; (defvar h5-charbuf-nonempty nil)
;; (defun h5-clear-charbuf ()
;; ""
;; (let (contents)
;; (unless (bufferp h5-charbuf)
;; (setq h5-charbuf (get-buffer-create " *h5-chars*")))
;; (with-current-buffer h5-charbuf
;; (setq contents
;; (buffer-substring-no-properties (point-min) (point-max)))
;; (delete-region (point-min) (point-max)))
;; contents))
;; (defun h5-append-to-charbuf (val)
;; ""
;; (with-current-buffer h5-charbuf
;; (insert val)))
(defun h5-parse-error (&optional reason)
  "Record a parse error at point and emit a `:parse-error' token.
The character before point receives the text property
`h5-parse-error', whose value is REASON, or t when REASON is nil.
The token is emitted with `h5-emit', which exits non-locally, so
forms written after a call to this function are not evaluated."
  (let* ((here (point))
         ;; Clamp the start of the marked region: with point at the
         ;; beginning of the accessible buffer, (1- here) would lie
         ;; outside it and `put-text-property' would signal
         ;; `args-out-of-range'.
         (start (max (point-min) (1- here))))
    (put-text-property start here 'h5-parse-error (or reason t)))
  (h5-emit :parse-error))
(defsubst h5-space-p (str)
  "Return non-nil if STR is a string containing HTML whitespace.
The recognised characters are tab, line feed, form feed and space.
The value is the index of the first such character.  Anything that
is not a string (for example the `:eof' marker) yields nil."
  (if (stringp str)
      (string-match "[\t\n\f ]" str)
    nil))
(defsubst h5-uppercase-p (str)
  "Return non-nil if STR is a string containing an ASCII capital letter.
The value is the index of the first letter in the range A-Z.
Anything that is not a string (for example the `:eof' marker)
yields nil.

The match is always case-sensitive: `case-fold-search' is bound to
nil here, so the result no longer depends on the caller's binding
\(with the default value t, \"a\" would otherwise count as uppercase)."
  (and (stringp str)
       (let ((case-fold-search nil))
         (string-match "[A-Z]" str))))
(defsubst h5-lowercase-p (str)
  "Return non-nil if STR is a string containing an ASCII small letter.
The value is the index of the first letter in the range a-z.
Anything that is not a string (for example the `:eof' marker)
yields nil.

The match is always case-sensitive: `case-fold-search' is bound to
nil here, so the result no longer depends on the caller's binding
\(with the default value t, \"A\" would otherwise count as lowercase)."
  (and (stringp str)
       (let ((case-fold-search nil))
         (string-match "[a-z]" str))))
(defun h5-consume-the-next-input-character ()
  "Advance point over one character and return that character.
At the end of the accessible buffer, leave point alone and return
the keyword `:eof' instead."
  (if (eobp)
      :eof
    (prog1 (char-after)
      (forward-char 1))))
;; <h4><dfn>Tokenization</dfn></h4>
;;
;; Implementations must act as if they used the following state machine
;; to tokenize HTML.
;; The state the tokenizer is currently in; set by `h5-switch-state'.
(defvar *h5-curstate* nil)
;; The state that was current before the latest `h5-switch-state' call;
;; `h5-switch-state' returns to it when asked for `:previous'.
(defvar *h5-prevstate* nil)
;; Buffer position at which the current state was entered.
;; `h5-switch-state' uses it as the start of the region it marks with
;; the `h5-state' text property.
(defvar *h5-statestart* 1)
(defun h5-switch-state (state)
  "Make STATE the tokenizer's current state.
STATE may be the keyword `:previous', meaning the state that was
current before the most recent switch.  If a state is being left,
the text from where it was entered up to point is marked with the
text property `h5-state', whose value is that state.  Afterwards
`*h5-prevstate*' holds the state just left and `*h5-statestart*'
holds point.  Return point."
  (let ((leaving *h5-curstate*)
        (target (if (eq state :previous) *h5-prevstate* state)))
    ;; (message "STATE [[[%s]]]->[[[%s]]] at %d" leaving target (point))
    (if leaving
        (put-text-property *h5-statestart* (point) 'h5-state leaving))
    (setq *h5-prevstate* leaving
          *h5-curstate* target
          *h5-statestart* (point))))
(defun h5-current-state ()
"Return the tokenizer's current state, the value of `*h5-curstate*'."
*h5-curstate*)
;; Most states consume a single character, which may have
;; various side-effects, and either switches the state machine to a new
;; state to <em>reconsume</em> the same character, or switches it to a
;; new state (to consume the next character), or repeats the same state
;; (to consume the next character). Some states have more complicated
;; behavior and can consume several characters before switching to
;; another state. In some cases, the tokenizer state is also changed by
;; the tree construction stage.
;;
;; The exact behavior of certain states depends on the <span>insertion
;; mode</span> and the <span>stack of open elements</span>.
;; NOTE(review): these two variables are declared here but not read or
;; written in this part of the file; presumably the tree construction
;; stage maintains them -- confirm.
(defvar h5-insertion-mode nil)
(defvar h5-stack-of-open-elements nil)
;; Certain states also use a <dfn><var>temporary buffer</var></dfn> to
;; track progress.
;; Holds the Emacs buffer used as that temporary buffer.  It is created
;; by `h5-clear-tmpbuf'; the other tmpbuf helpers assume it already
;; exists.
(defvar h5-tmpbuf nil)
(defun h5-clear-tmpbuf ()
  "Set the tokenizer's temporary buffer to the empty string.
The buffer \" *h5-tmp*\" is created (or looked up) and stored in the
variable `h5-tmpbuf' when that variable does not hold a live
buffer; its accessible contents are then deleted."
  ;; `buffer-live-p' rather than `bufferp': a killed buffer object
  ;; still satisfies `bufferp', but selecting it below would signal
  ;; an error.
  (unless (buffer-live-p h5-tmpbuf)
    (setq h5-tmpbuf (get-buffer-create " *h5-tmp*")))
  (with-current-buffer h5-tmpbuf
    (delete-region (point-min) (point-max))))
(defun h5-append-to-tmpbuf (val)
  "Insert VAL at point in the temporary buffer `h5-tmpbuf'.
VAL is handed to `insert' unchanged.  The buffer must already
exist, which `h5-clear-tmpbuf' takes care of."
  (save-current-buffer
    (set-buffer h5-tmpbuf)
    (insert val)))
(defun h5-tmpbuf ()
  "Return the accessible contents of the temporary buffer as a string.
Text properties, if there are any, are kept."
  (save-current-buffer
    (set-buffer h5-tmpbuf)
    (buffer-string)))
(defun h5-emit-tmpbuf ()
  "Pass the temporary buffer's text to `h5-emit-string'.
The text is taken without text properties."
  (let ((pending (with-current-buffer h5-tmpbuf
                   (buffer-substring-no-properties (point-min)
                                                   (point-max)))))
    (h5-emit-string pending)))
;; The output of the tokenization step is a series of zero or more of
;; the following tokens: DOCTYPE, start tag, end tag, comment,
;; character, end-of-file.
;; Token produced for a DOCTYPE declaration.
;; Note that NAME, PUBLIC-ID and SYSTEM-ID default to the symbol
;; `missing' rather than to a string, even though their slots are
;; annotated with `:type string'.
(defstruct h5-doctype-token
;; DOCTYPE tokens have a name, a public identifier, a system
;; identifier, and a <i>force-quirks flag</i>.
;; When a DOCTYPE token is created, its name, public identifier, and
;; system identifier must be marked as missing (which is a distinct
;; state from the empty string),
(name 'missing :type string)
(public-id 'missing :type string)
(system-id 'missing :type string)
;; and the <i>force-quirks flag</i> must be set to <i>off</i> (its
;; other state is <i>on</i>).
;; Here nil stands for "off".
(force-quirks nil :type boolean))
;; Start and end tag tokens also have a list of attributes, each of
;; which has a name and a value.
;; A single attribute of a tag token.  NAME and VALUE both start out as
;; the empty string.
(defstruct h5-attr
(name "" :type string)
(value "" :type string)
;; internal to html5-tok.el
;; NOTE(review): presumably set when the attribute's name repeats an
;; earlier attribute of the same tag -- confirm against the attribute
;; name states.
(duplicate nil :type boolean))
;; Common base of start tag and end tag tokens; the two concrete token
;; types include this structure, so the `h5-tag-token-*' accessors work
;; on both.
(defstruct h5-tag-token
;; Start and end tag tokens have a tag name,
(name "" :type string)
;; When a start or end tag token is created, its <i>self-closing
;; flag</i> must be unset (its other state is that it be set),
;; Here nil stands for "unset".
(self-closing nil :type boolean)
;; and its attributes list must be empty.
(attributes nil :type list))
;; The two concrete tag token types.  Neither adds slots of its own;
;; they exist so that `h5-emit' can tell start tags from end tags with
;; `h5-start-tag-token-p' and `h5-end-tag-token-p'.
(defstruct (h5-start-tag-token (:include h5-tag-token)))
(defstruct (h5-end-tag-token (:include h5-tag-token)))
;; Comment and character tokens have data.
;; Token produced for a comment; DATA starts out as the empty string.
;; (Character tokens have no structure of their own: `h5-emit' is
;; called with plain character codes for them.)
(defstruct h5-comment-token
(data "" :type string))
;; (defun h5-emit-charbuf ()
;; ""
;; (let ((str (h5-clear-charbuf)))
;; (when (> (length str) 0)
;; (h5-emit str))))
(defun h5-emit (&optional token)
  "Emit TOKEN by throwing it to the catch tag `h5-emit'.
Without TOKEN, emit the token under construction, `*h5-curtok*'.
If there is nothing to emit, return nil without throwing.

Emitting `*h5-curtok*' resets it and `*h5-curattr*' to nil.  Tag,
comment and DOCTYPE tokens additionally mark the text they came
from through `h5-inside', and an emitted start tag is remembered
in `h5-last-start-tag-emitted'.

Because this function exits with `throw', forms that follow a call
to it are not evaluated whenever a token was emitted."
  (let ((tok (or token *h5-curtok*)))
    (when tok
      ;; Emitting the token under construction consumes it.
      (when (eq tok *h5-curtok*)
        (setq *h5-curtok* nil
              *h5-curattr* nil))
      ;; Per-type bookkeeping.  Characters, `:eof', `:parse-error' and
      ;; any other value need none.
      (cond ((h5-start-tag-token-p tok)
             (h5-inside 'start-tag)
             ;; A start tag emitted with its self-closing flag set is a
             ;; parse error unless the tree construction stage
             ;; acknowledges the flag; that is not checked here.
             (setq h5-last-start-tag-emitted tok))
            ((h5-end-tag-token-p tok)
             (h5-inside 'end-tag)
             ;; An end tag with attributes, or with its self-closing
             ;; flag set, is a parse error.  `h5-parse-error' itself
             ;; throws `:parse-error', so in either case the end tag
             ;; token is not thrown.
             (when (h5-tag-token-attributes tok)
               (h5-parse-error))
             (when (h5-tag-token-self-closing tok)
               (h5-parse-error)))
            ((h5-comment-token-p tok)
             (h5-inside 'comment))
            ((h5-doctype-token-p tok)
             (h5-inside 'doctype)))
      (throw 'h5-emit tok))))
(defun h5-emit-string (str)
  "Call `h5-emit' on each character of STR, in order.
Return nil when STR is empty.  Note that `h5-emit' exits
non-locally as soon as it is handed a token."
  (mapc #'h5-emit (string-to-list str))
  nil)
;; The most recent start tag token passed to `h5-emit', or nil if there
;; has not been one.  `h5-appropriate-end-tag-token' compares end tag
;; names against this token's name.
(defvar h5-last-start-tag-emitted nil)
;; An <dfn>appropriate end tag token</dfn> is an end tag token whose tag
;; name matches the tag name of the last start tag to have been emitted
;; from this tokenizer, if any.
(defun h5-appropriate-end-tag-token (tok)
  "Return non-nil if TOK is an appropriate end tag token.
That is the case when TOK's tag name equals the tag name of the
last start tag emitted, `h5-last-start-tag-emitted'.  If no start
tag has been emitted yet, no token is appropriate and the result
is nil."
  (let ((opener h5-last-start-tag-emitted))
    (and opener
         (equal (h5-tag-token-name opener)
                (h5-tag-token-name tok)))))
;; Before each step of the tokenizer, the user agent must first check
;; the <span>parser pause flag</span>. If it is true, then the tokenizer
;; must abort the processing of any nested invocations of the tokenizer,
;; yielding control back to the caller.
(defun h5-data-state ()
  "Tokenizer data state.
Consume one character.  \"&\" switches to the character reference
in data state and \"<\" to the tag open state.  Anything else is
emitted as it is: a character token, or `:eof' at the end of the
input."
  (let* ((c (h5-consume-the-next-input-character))
         ;; Characters that change state, with the state they lead to.
         (next (cdr (assq c '((?& . h5-character-reference-in-data-state)
                              (?< . h5-tag-open-state))))))
    (if next
        (h5-switch-state next)
      (h5-emit c))))
(defun h5-character-reference-in-data-state ()
  "Tokenizer character reference in data state.
Attempt to consume a character reference, with no additional
allowed character, then switch to the data state and emit either
the value that was returned or, if nothing was returned, a \"&\"
character token."
  (let ((attempt
         (let ((*h5-additional-allowed-character* nil))
           (h5-consume-a-character-reference))))
    ;; Switch state BEFORE emitting: `h5-emit' leaves this function
    ;; with `throw', so a switch placed after it would never run and
    ;; the tokenizer would stay in this state.
    (h5-switch-state 'h5-data-state)
    (h5-emit (or attempt ?&))))
(defun h5-RCDATA-state ()
  "Tokenizer RCDATA state.
Consume one character.  \"&\" switches to the character reference
in RCDATA state and \"<\" to the RCDATA less-than sign state.
Anything else is emitted as it is: a character token, or `:eof' at
the end of the input."
  (let* ((c (h5-consume-the-next-input-character))
         ;; Characters that change state, with the state they lead to.
         (next (cdr (assq c '((?& . h5-character-reference-in-RCDATA-state)
                              (?< . h5-RCDATA-less-than-sign-state))))))
    (if next
        (h5-switch-state next)
      (h5-emit c))))
(defun h5-character-reference-in-RCDATA-state ()
  "Tokenizer character reference in RCDATA state.
Attempt to consume a character reference, with no additional
allowed character, then switch to the RCDATA state and emit either
the value that was returned or, if nothing was returned, a \"&\"
character token."
  (let ((attempt
         (let ((*h5-additional-allowed-character* nil))
           (h5-consume-a-character-reference))))
    ;; Switch state BEFORE emitting: `h5-emit' leaves this function
    ;; with `throw', so a switch placed after it would never run and
    ;; the tokenizer would stay in this state.
    (h5-switch-state 'h5-RCDATA-state)
    (h5-emit (or attempt ?&))))
(defun h5-RAWTEXT-state ()
  "Tokenizer RAWTEXT state.
Consume one character.  \"<\" switches to the RAWTEXT less-than
sign state.  Anything else is emitted as it is: a character token,
or `:eof' at the end of the input."
  (let ((c (h5-consume-the-next-input-character)))
    (if (eq c ?<)
        (h5-switch-state 'h5-RAWTEXT-less-than-sign-state)
      (h5-emit c))))
(defun h5-script-data-state ()
  "Tokenizer script data state.
Consume one character.  \"<\" switches to the script data
less-than sign state.  Anything else is emitted as it is: a
character token, or `:eof' at the end of the input."
  (let ((c (h5-consume-the-next-input-character)))
    (if (eq c ?<)
        (h5-switch-state 'h5-script-data-less-than-sign-state)
      (h5-emit c))))
(defun h5-PLAINTEXT-state ()
  "Tokenizer PLAINTEXT state.
Consume one character and emit it as it is: a character token, or
`:eof' at the end of the input.  This state is never left."
  ;; The consumed value is either a character or the keyword `:eof',
  ;; and both are emitted unchanged, so no dispatch is needed.
  (h5-emit (h5-consume-the-next-input-character)))
(defun h5-tag-open-state ()
  "Tokenizer tag open state, entered after a \"<\" in the data state.
Record the position of that \"<\" in `*h5-last-tag-open*', then
consume one character and act on it:

  !          switch to the markup declaration open state;
  /          switch to the end tag open state;
  A-Z, a-z   create a start tag token named after the lowercased
             character and switch to the tag name state;
  ?          parse error; switch to the bogus comment state;
  otherwise  parse error (see the note in the last branch)."
  (setq *h5-last-tag-open* (1- (point)))
  (let* ((char (h5-consume-the-next-input-character))
         ;; The letter predicates match with regexps; keep them
         ;; case-sensitive.
         (case-fold-search nil)
         (char-str (if (numberp char) (string char) char)))
    (cond ((eq char ?!)
           (h5-switch-state 'h5-markup-declaration-open-state))
          ((eq char ?/)
           (h5-switch-state 'h5-end-tag-open-state))
          ;; ASCII letter: lowercasing is a no-op for a-z, so both
          ;; letter cases share one branch.
          ((or (h5-uppercase-p char-str)
               (h5-lowercase-p char-str))
           (setq *h5-curtok*
                 (make-h5-start-tag-token :name (downcase char-str)))
           (h5-switch-state 'h5-tag-name-state))
          ((eq char ??)
           ;; Switch state BEFORE reporting the error:
           ;; `h5-parse-error' exits non-locally, so the switch would
           ;; otherwise never happen.
           (h5-switch-state 'h5-bogus-comment-state)
           (h5-parse-error))
          (t
           ;; Parse error; emit a "<" character token and reconsume the
           ;; current character in the data state.
           ;; NOTE(review): `h5-parse-error' exits non-locally, so the
           ;; three forms after it are never evaluated.  This branch
           ;; needs two tokens, which a single throwing emit cannot
           ;; deliver; left unchanged until the emit protocol can
           ;; carry more than one token.
           (h5-parse-error)
           (h5-emit ?<)
           (backward-char 1)
           (h5-switch-state 'h5-data-state)))))
(defun h5-end-tag-open-state ()
  "Tokenizer end tag open state, entered after \"</\".
Consume one character and act on it:

  A-Z, a-z   create an end tag token named after the lowercased
             character and switch to the tag name state;
  >          parse error; switch to the data state;
  EOF        parse error (see the note in that branch);
  otherwise  parse error; switch to the bogus comment state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond (;; ASCII letter: lowercasing is a no-op for a-z, so both
           ;; letter cases share one branch.
           (or (h5-uppercase-p char-str)
               (h5-lowercase-p char-str))
           (setq *h5-curtok*
                 (make-h5-end-tag-token :name (downcase char-str)))
           (h5-switch-state 'h5-tag-name-state))
          ((eq char ?>)
           ;; Switch state BEFORE reporting the error:
           ;; `h5-parse-error' exits non-locally, so the switch would
           ;; otherwise never happen.
           (h5-switch-state 'h5-data-state)
           (h5-parse-error))
          ((eq char :eof)
           ;; Parse error; emit "<" and "/" character tokens and
           ;; reconsume the EOF character in the data state.
           ;; NOTE(review): `h5-parse-error' exits non-locally, so the
           ;; forms after it are never evaluated.  This branch needs
           ;; three tokens, which a single throwing emit cannot
           ;; deliver; left unchanged until the emit protocol can
           ;; carry more than one token.
           (h5-parse-error)
           (h5-emit ?<)
           (h5-emit ?/)
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          (t
           ;; Same ordering as for ">": switch first, then report.
           (h5-switch-state 'h5-bogus-comment-state)
           (h5-parse-error)))))
(defun h5-tag-name-state () ;; <dfn>Tag name state</dfn>
  "Tokenizer tag name state.
Consume one character and act on it:

  tab, LF, FF, space  switch to the before attribute name state;
  /                   switch to the self-closing start tag state;
  >                   switch to the data state and emit the current
                      tag token;
  EOF                 parse error; continue in the data state;
  otherwise           append the character to the current tag
                      token's name, lowercasing A-Z."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char))
         (tok *h5-curtok*))
    (cond ((h5-space-p char-str)
           (h5-switch-state 'h5-before-attribute-name-state))
          ((eq char ?/)
           (h5-switch-state 'h5-self-closing-start-tag-state))
          ((eq char ?>)
           ;; The switch comes first because `h5-emit' exits
           ;; non-locally.
           (h5-switch-state 'h5-data-state)
           (h5-emit))
          ((eq char :eof)
           ;; Reconsume EOF in the data state.  Consuming EOF did not
           ;; move point, so there is nothing to step back over
           ;; (stepping back would make the data state read the last
           ;; character of the tag name again).  As above, the switch
           ;; has to precede `h5-parse-error', which exits
           ;; non-locally.
           (h5-switch-state 'h5-data-state)
           (h5-parse-error))
          (t
           ;; Only ASCII capitals are lowercased; every other
           ;; character is appended unchanged.
           (setf (h5-tag-token-name tok)
                 (concat (h5-tag-token-name tok)
                         (if (h5-uppercase-p char-str)
                             (downcase char-str)
                           char-str)))))))
(defun h5-RCDATA-less-than-sign-state ()
  "Tokenizer RCDATA less-than sign state, entered after \"<\" in RCDATA.
Consume one character.  On \"/\", clear the temporary buffer and
switch to the RCDATA end tag open state.  On anything else,
reconsume that character in the RCDATA state and emit a \"<\"
character token."
  (let ((char (h5-consume-the-next-input-character)))
    (cond ((eq char ?/)
           (h5-clear-tmpbuf)
           (h5-switch-state 'h5-RCDATA-end-tag-open-state))
          (t
           ;; Reconsume: step back over the character just read.  At
           ;; EOF nothing was read (point did not move), so stepping
           ;; back would make the RCDATA state read the "<" again.
           (unless (eq char :eof)
             (backward-char 1))
           ;; Switch state BEFORE emitting: `h5-emit' exits
           ;; non-locally, so anything after it would not run.
           (h5-switch-state 'h5-RCDATA-state)
           (h5-emit ?<)))))
(defun h5-RCDATA-end-tag-open-state ()
  "Tokenizer RCDATA end tag open state, entered after \"</\" in RCDATA.
Consume one character.  For an ASCII letter, create an end tag
token named after the lowercased letter, append the letter as
read to the temporary buffer and switch to the RCDATA end tag name
state.  For anything else, fall back to emitting \"<\" and \"/\"
and reconsuming the character in the RCDATA state."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character))
         (text (if (numberp ch) (string ch) ch)))
    (if (or (h5-uppercase-p text)
            (h5-lowercase-p text))
        (progn
          ;; Lowercasing leaves a-z untouched, so one branch serves
          ;; both letter cases.
          (setq *h5-curtok*
                (make-h5-end-tag-token :name (downcase text)))
          (h5-append-to-tmpbuf text)
          (h5-switch-state 'h5-RCDATA-end-tag-name-state))
      ;; NOTE(review): `h5-emit' exits non-locally, so only the first
      ;; form below takes effect.
      (h5-emit ?<)
      (h5-emit ?/)
      (backward-char 1)
      (h5-switch-state 'h5-RCDATA-state))))
(defun h5-RCDATA-end-tag-name-state ()
  "Tokenizer RCDATA end tag name state.
Consume one character and act on it.  The first three cases apply
only when the current end tag token is an appropriate end tag
token (see `h5-appropriate-end-tag-token'); when it is not, they
are handled like \"otherwise\".

  tab, LF, FF, space  switch to the before attribute name state;
  /                   switch to the self-closing start tag state;
  >                   switch to the data state and emit the current
                      tag token;
  A-Z, a-z            append the lowercased letter to the tag name
                      and the letter as read to the temporary
                      buffer;
  otherwise           fall back to RCDATA (see the note in the body)."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char))
         (tok *h5-curtok*)
         (anything-else
          (lambda ()
            ;; Emit "<", "/" and every character of the temporary
            ;; buffer, then reconsume the current character in the
            ;; RCDATA state.
            ;; NOTE(review): `h5-emit' exits non-locally, so only the
            ;; first form takes effect.  This needs several tokens,
            ;; which a single throwing emit cannot deliver; left
            ;; unchanged until the emit protocol can carry more than
            ;; one token.
            (h5-emit ?<)
            (h5-emit ?/)
            (h5-emit-tmpbuf)
            (backward-char 1)
            (h5-switch-state 'h5-RCDATA-state))))
    (cond ((h5-space-p char-str)
           (if (h5-appropriate-end-tag-token tok)
               (h5-switch-state 'h5-before-attribute-name-state)
             (funcall anything-else)))
          ((eq char ?/)
           (if (h5-appropriate-end-tag-token tok)
               (h5-switch-state 'h5-self-closing-start-tag-state)
             (funcall anything-else)))
          ((eq char ?>)
           (if (h5-appropriate-end-tag-token tok)
               (progn
                 ;; Switch state BEFORE emitting, as
                 ;; `h5-tag-name-state' does: `h5-emit' exits
                 ;; non-locally, so a switch placed after it would
                 ;; never run.
                 (h5-switch-state 'h5-data-state)
                 (h5-emit tok))
             (funcall anything-else)))
          ;; ASCII letter: lowercasing is a no-op for a-z, so both
          ;; letter cases share one branch.
          ((or (h5-uppercase-p char-str)
               (h5-lowercase-p char-str))
           (setf (h5-tag-token-name tok)
                 (concat (h5-tag-token-name tok)
                         (downcase char-str)))
           (h5-append-to-tmpbuf char-str))
          (t
           (funcall anything-else)))))
(defun h5-RAWTEXT-less-than-sign-state ()
  "Tokenizer RAWTEXT less-than sign state, entered after \"<\" in RAWTEXT.
Consume one character.  On \"/\", clear the temporary buffer and
switch to the RAWTEXT end tag open state.  On anything else,
reconsume that character in the RAWTEXT state and emit a \"<\"
character token."
  (let ((char (h5-consume-the-next-input-character)))
    (cond ((eq char ?/)
           (h5-clear-tmpbuf)
           (h5-switch-state 'h5-RAWTEXT-end-tag-open-state))
          (t
           ;; Reconsume: step back over the character just read.  At
           ;; EOF nothing was read (point did not move), so stepping
           ;; back would make the RAWTEXT state read the "<" again.
           (unless (eq char :eof)
             (backward-char 1))
           ;; Switch state BEFORE emitting: `h5-emit' exits
           ;; non-locally, so anything after it would not run.
           (h5-switch-state 'h5-RAWTEXT-state)
           (h5-emit ?<)))))
(defun h5-RAWTEXT-end-tag-open-state ()
  "Tokenizer RAWTEXT end tag open state, entered after \"</\" in RAWTEXT.
Consume one character.  For an ASCII letter, create an end tag
token named after the lowercased letter, append the letter as
read to the temporary buffer and switch to the RAWTEXT end tag
name state.  For anything else, fall back to emitting \"<\" and
\"/\" and reconsuming the character in the RAWTEXT state."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character))
         (text (if (numberp ch) (string ch) ch)))
    (if (or (h5-uppercase-p text)
            (h5-lowercase-p text))
        (progn
          ;; Lowercasing leaves a-z untouched, so one branch serves
          ;; both letter cases.
          (setq *h5-curtok*
                (make-h5-end-tag-token :name (downcase text)))
          (h5-append-to-tmpbuf text)
          (h5-switch-state 'h5-RAWTEXT-end-tag-name-state))
      ;; NOTE(review): `h5-emit' exits non-locally, so only the first
      ;; form below takes effect.
      (h5-emit ?<)
      (h5-emit ?/)
      (backward-char 1)
      (h5-switch-state 'h5-RAWTEXT-state))))
(defun h5-RAWTEXT-end-tag-name-state ()
  "Tokenizer RAWTEXT end tag name state.
Consume one character and act on it.  The first three cases apply
only when the current end tag token is an appropriate end tag
token (see `h5-appropriate-end-tag-token'); when it is not, they
are handled like \"otherwise\".

  tab, LF, FF, space  switch to the before attribute name state;
  /                   switch to the self-closing start tag state;
  >                   switch to the data state and emit the current
                      tag token;
  A-Z, a-z            append the lowercased letter to the tag name
                      and the letter as read to the temporary
                      buffer;
  otherwise           fall back to RAWTEXT (see the note in the body)."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char))
         (tok *h5-curtok*)
         (anything-else
          (lambda ()
            ;; Emit "<", "/" and every character of the temporary
            ;; buffer, then reconsume the current character in the
            ;; RAWTEXT state.
            ;; NOTE(review): `h5-emit' exits non-locally, so only the
            ;; first form takes effect.  This needs several tokens,
            ;; which a single throwing emit cannot deliver; left
            ;; unchanged until the emit protocol can carry more than
            ;; one token.
            (h5-emit ?<)
            (h5-emit ?/)
            (h5-emit-tmpbuf)
            (backward-char 1)
            (h5-switch-state 'h5-RAWTEXT-state))))
    (cond ((h5-space-p char-str)
           (if (h5-appropriate-end-tag-token tok)
               (h5-switch-state 'h5-before-attribute-name-state)
             (funcall anything-else)))
          ((eq char ?/)
           (if (h5-appropriate-end-tag-token tok)
               (h5-switch-state 'h5-self-closing-start-tag-state)
             (funcall anything-else)))
          ((eq char ?>)
           (if (h5-appropriate-end-tag-token tok)
               (progn
                 ;; Switch state BEFORE emitting, as
                 ;; `h5-tag-name-state' does: `h5-emit' exits
                 ;; non-locally, so a switch placed after it would
                 ;; never run.  TOK is the current tag token, so
                 ;; this emits the same token as a bare (h5-emit).
                 (h5-switch-state 'h5-data-state)
                 (h5-emit tok))
             (funcall anything-else)))
          ;; ASCII letter: lowercasing is a no-op for a-z, so both
          ;; letter cases share one branch.
          ((or (h5-uppercase-p char-str)
               (h5-lowercase-p char-str))
           (setf (h5-tag-token-name tok)
                 (concat (h5-tag-token-name tok)
                         (downcase char-str)))
           (h5-append-to-tmpbuf char-str))
          (t
           (funcall anything-else)))))
(defun h5-script-data-less-than-sign-state ()
  "Tokenizer script data less-than sign state.
Entered after \"<\" in the script data state.  Consume one
character and act on it:

  /          clear the temporary buffer and switch to the script
             data end tag open state;
  !          switch to the script data escape start state and emit
             \"<\" (see the note in that branch);
  otherwise  reconsume the character in the script data state and
             emit a \"<\" character token."
  ;; No regexp predicate is used here, so the `case-fold-search'
  ;; binding and the string form of the character are not needed.
  (let ((char (h5-consume-the-next-input-character)))
    (cond ((eq char ?/)
           (h5-clear-tmpbuf)
           (h5-switch-state 'h5-script-data-end-tag-open-state))
          ((eq char ?!)
           (h5-switch-state 'h5-script-data-escape-start-state)
           (h5-emit ?<)
           ;; NOTE(review): `h5-emit' exits non-locally, so this
           ;; second emit is never evaluated and the "!" token is
           ;; lost.  Left unchanged until the emit protocol can carry
           ;; more than one token.
           (h5-emit ?!))
          (t
           ;; Reconsume: step back over the character just read.  At
           ;; EOF nothing was read (point did not move), so stepping
           ;; back would make the script data state read the "<"
           ;; again.
           (unless (eq char :eof)
             (backward-char 1))
           ;; Switch state BEFORE emitting: `h5-emit' exits
           ;; non-locally, so anything after it would not run.
           (h5-switch-state 'h5-script-data-state)
           (h5-emit ?<)))))
(defun h5-script-data-end-tag-open-state ()
  "Tokenizer script data end tag open state.
Entered after \"</\" in script data.  Consume one character.  For
an ASCII letter, create an end tag token named after the
lowercased letter, append the letter as read to the temporary
buffer and switch to the script data end tag name state.  For
anything else, fall back to emitting \"<\" and \"/\" and
reconsuming the character in the script data state."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character))
         (text (if (numberp ch) (string ch) ch)))
    (if (or (h5-uppercase-p text)
            (h5-lowercase-p text))
        (progn
          ;; Lowercasing leaves a-z untouched, so one branch serves
          ;; both letter cases.
          (setq *h5-curtok*
                (make-h5-end-tag-token :name (downcase text)))
          (h5-append-to-tmpbuf text)
          (h5-switch-state 'h5-script-data-end-tag-name-state))
      ;; NOTE(review): `h5-emit' exits non-locally, so only the first
      ;; form below takes effect.
      (h5-emit ?<)
      (h5-emit ?/)
      (backward-char 1)
      (h5-switch-state 'h5-script-data-state))))
(defun h5-script-data-end-tag-name-state ()
  "Handle one character in the script data end tag name state.
Letters extend the name of the end tag token in `*h5-curtok*' and
are appended to the temporary buffer.  Whitespace, `/' and `>'
finish the tag name, but only when `h5-appropriate-end-tag-token'
accepts the token.  In every other case `<', `/' and the
temporary buffer are emitted as character tokens and the
character is reconsumed in the script data state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (char-to-string char) char))
         (tok *h5-curtok*)
         ;; Shared fallback: flush "</" plus the buffered characters
         ;; as text, step point back one character and return to the
         ;; script data state.
         (give-up
          (lambda ()
            (h5-emit ?<)
            (h5-emit ?/)
            (h5-emit-tmpbuf)
            (backward-char 1)
            (h5-switch-state 'h5-script-data-state))))
    (cond
     ;; Whitespace: attributes may follow an appropriate end tag.
     ((h5-space-p char-str)
      (if (not (h5-appropriate-end-tag-token tok))
          (funcall give-up)
        (h5-switch-state 'h5-before-attribute-name-state)))
     ;; Solidus: possibly a self-closing form.
     ((eq char ?/)
      (if (not (h5-appropriate-end-tag-token tok))
          (funcall give-up)
        (h5-switch-state 'h5-self-closing-start-tag-state)))
     ;; Greater-than: the tag is complete; emit it and go back to the
     ;; data state.
     ((eq char ?>)
      (if (not (h5-appropriate-end-tag-token tok))
          (funcall give-up)
        (h5-emit)
        (h5-switch-state 'h5-data-state)))
     ;; Uppercase letter: the tag name gets the lowercase form, the
     ;; temporary buffer gets the character as read.
     ((h5-uppercase-p char-str)
      (let ((so-far (h5-tag-token-name tok)))
        (setf (h5-tag-token-name tok)
              (concat so-far (downcase char-str))))
      (h5-append-to-tmpbuf char-str))
     ;; Lowercase letter: appended unchanged to both.
     ((h5-lowercase-p char-str)
      (let ((so-far (h5-tag-token-name tok)))
        (setf (h5-tag-token-name tok)
              (concat so-far char-str)))
      (h5-append-to-tmpbuf char-str))
     (t (funcall give-up)))))
(defun h5-script-data-escape-start-state ()
  "Handle one character in the script data escape start state.
A `-' switches to the script data escape start dash state and is
emitted as a character token.  Any other input is reconsumed in
the script data state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character)))
    (cond ((eq char ?-)
           (h5-switch-state 'h5-script-data-escape-start-dash-state)
           (h5-emit ?-))
          ;; This clause must be a sibling of the `-' clause.  It was
          ;; previously nested inside it because of a missing closing
          ;; paren, so it was evaluated as a call to a function named
          ;; `t' and never ran as the fallback.
          (t
           ;; Step point back so the script data state sees the
           ;; character again.
           (backward-char 1)
           (h5-switch-state 'h5-script-data-state)))))
(defun h5-script-data-escape-start-dash-state ()
  "Handle one character in the script data escape start dash state.
A `-' switches to the script data escaped dash dash state and is
emitted as a character token.  Any other input is reconsumed in
the script data state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character)))
    (if (eq char ?-)
        (progn
          ;; Second dash of the escape opener.
          (h5-switch-state 'h5-script-data-escaped-dash-dash-state)
          (h5-emit ?-))
      ;; Step point back so the script data state sees the character
      ;; again.
      (backward-char 1)
      (h5-switch-state 'h5-script-data-state))))
(defun h5-script-data-escaped-state ()
  "Handle one character in the script data escaped state.
A `-' is emitted and switches to the script data escaped dash
state.  A `<' switches to the script data escaped less-than sign
state.  End of input is a parse error and is reconsumed in the
data state.  Any other character is emitted as a character
token."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character)))
    ;; The three special inputs are mutually exclusive `eq' tests, so
    ;; their order does not matter.
    (cond ((eq char :eof)
           (h5-parse-error)
           ;; Step point back so the data state handles the end of
           ;; input itself.
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          ((eq char ?<)
           (h5-switch-state 'h5-script-data-escaped-less-than-sign-state))
          ((eq char ?-)
           (h5-switch-state 'h5-script-data-escaped-dash-state)
           (h5-emit ?-))
          (t
           (h5-emit char)))))
(defun h5-script-data-escaped-dash-state ()
  "Handle one character in the script data escaped dash state.
A `-' is emitted and switches to the script data escaped dash
dash state.  A `<' switches to the script data escaped less-than
sign state.  End of input is a parse error and is reconsumed in
the data state.  Any other character is emitted after switching
back to the script data escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character)))
    ;; The three special inputs are mutually exclusive `eq' tests, so
    ;; their order does not matter.
    (cond ((eq char :eof)
           (h5-parse-error)
           ;; Step point back so the data state handles the end of
           ;; input itself.
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          ((eq char ?<)
           (h5-switch-state 'h5-script-data-escaped-less-than-sign-state))
          ((eq char ?-)
           (h5-switch-state 'h5-script-data-escaped-dash-dash-state)
           (h5-emit ?-))
          (t
           ;; The dash run is over: ordinary escaped script text.
           (h5-switch-state 'h5-script-data-escaped-state)
           (h5-emit char)))))
(defun h5-script-data-escaped-dash-dash-state ()
  "Handle one character in the script data escaped dash dash state.
A `-' is emitted without changing state.  A `<' switches to the
script data escaped less-than sign state.  A `>' is emitted and
switches to the script data state.  End of input is a parse error
and is reconsumed in the data state.  Any other character is
emitted after switching to the script data escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character)))
    ;; The four special inputs are mutually exclusive `eq' tests, so
    ;; their order does not matter.
    (cond ((eq char :eof)
           (h5-parse-error)
           ;; Step point back so the data state handles the end of
           ;; input itself.
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          ((eq char ?>)
           ;; "-->" closes the escaped section.
           (h5-switch-state 'h5-script-data-state)
           (h5-emit ?>))
          ((eq char ?<)
           (h5-switch-state 'h5-script-data-escaped-less-than-sign-state))
          ((eq char ?-)
           ;; Further dashes keep the tokenizer in this state.
           (h5-emit ?-))
          (t
           (h5-switch-state 'h5-script-data-escaped-state)
           (h5-emit char)))))
(defun h5-script-data-escaped-less-than-sign-state ()
  "Handle one character in the script data escaped less-than sign state.
A `/' clears the temporary buffer and switches to the script data
escaped end tag open state.  A letter clears the temporary
buffer, stores the letter there (lowercased when it is
uppercase), switches to the script data double escape start state
and emits `<' followed by the letter as character tokens.
Anything else emits `<' and is reconsumed in the script data
escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond ((eq char ?/)
           (h5-clear-tmpbuf)
           (h5-switch-state 'h5-script-data-escaped-end-tag-open-state))
          ;; The letter predicates and the temporary buffer receive
          ;; the string form of the character, as in the sibling
          ;; states.  `char-str' is a variable: the earlier
          ;; `(char-str)' tried to call it as a function.
          ((h5-uppercase-p char-str)
           (h5-clear-tmpbuf)
           (h5-append-to-tmpbuf (downcase char-str))
           (h5-switch-state 'h5-script-data-double-escape-start-state)
           (h5-emit ?<)
           ;; The emitted token keeps the original case.
           (h5-emit char))
          ((h5-lowercase-p char-str)
           (h5-clear-tmpbuf)
           (h5-append-to-tmpbuf char-str)
           (h5-switch-state 'h5-script-data-double-escape-start-state)
           (h5-emit ?<)
           (h5-emit char))
          (t
           (h5-emit ?<)
           ;; Step point back so the script data escaped state sees
           ;; the character again.
           (backward-char 1)
           (h5-switch-state 'h5-script-data-escaped-state)))))
(defun h5-script-data-escaped-end-tag-open-state ()
  "Handle one character in the script data escaped end tag open state.
A letter starts a new end tag token in `*h5-curtok*' (named after
the lowercase form of the letter), is appended to the temporary
buffer, and switches to the script data escaped end tag name
state.  Anything else emits `<' and `/' character tokens and is
reconsumed in the script data escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    ;; The letter predicates and `h5-append-to-tmpbuf' are given
    ;; `char-str' here, matching every sibling state; this function
    ;; used to pass the raw `char' instead.
    ;; NOTE(review): assumes those helpers expect a string, as the
    ;; other call sites suggest -- confirm against their definitions.
    (cond ((h5-uppercase-p char-str)
           (setq *h5-curtok*
                 (make-h5-end-tag-token :name (downcase char-str)))
           ;; The temporary buffer keeps the character as read.
           (h5-append-to-tmpbuf char-str)
           (h5-switch-state 'h5-script-data-escaped-end-tag-name-state))
          ((h5-lowercase-p char-str)
           (setq *h5-curtok*
                 (make-h5-end-tag-token :name char-str))
           (h5-append-to-tmpbuf char-str)
           (h5-switch-state 'h5-script-data-escaped-end-tag-name-state))
          (t
           (h5-emit ?<)
           (h5-emit ?/)
           ;; Step point back so the script data escaped state sees
           ;; the character again.
           (backward-char 1)
           (h5-switch-state 'h5-script-data-escaped-state)))))
(defun h5-script-data-escaped-end-tag-name-state ()
  "Handle one character in the script data escaped end tag name state.
Letters extend the name of the end tag token in `*h5-curtok*' and
are appended to the temporary buffer.  Whitespace, `/' and `>'
finish the tag name, but only when `h5-appropriate-end-tag-token'
accepts the token.  In every other case `<', `/' and the
temporary buffer are emitted as character tokens and the
character is reconsumed in the script data escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (char-to-string char) char))
         (tok *h5-curtok*)
         ;; Shared fallback: flush "</" plus the buffered characters
         ;; as text, step point back one character and return to the
         ;; script data escaped state.
         (give-up
          (lambda ()
            (h5-emit ?<)
            (h5-emit ?/)
            (h5-emit-tmpbuf)
            (backward-char 1)
            (h5-switch-state 'h5-script-data-escaped-state))))
    (cond
     ;; Whitespace: attributes may follow an appropriate end tag.
     ((h5-space-p char-str)
      (if (not (h5-appropriate-end-tag-token tok))
          (funcall give-up)
        (h5-switch-state 'h5-before-attribute-name-state)))
     ;; Solidus: possibly a self-closing form.
     ((eq char ?/)
      (if (not (h5-appropriate-end-tag-token tok))
          (funcall give-up)
        (h5-switch-state 'h5-self-closing-start-tag-state)))
     ;; Greater-than: the tag is complete; emit the token and go back
     ;; to the data state.
     ((eq char ?>)
      (if (not (h5-appropriate-end-tag-token tok))
          (funcall give-up)
        (h5-emit tok)
        (h5-switch-state 'h5-data-state)))
     ;; Uppercase letter: the tag name gets the lowercase form, the
     ;; temporary buffer gets the character as read.
     ((h5-uppercase-p char-str)
      (let ((so-far (h5-tag-token-name tok)))
        (setf (h5-tag-token-name tok)
              (concat so-far (downcase char-str))))
      (h5-append-to-tmpbuf char-str))
     ;; Lowercase letter: appended unchanged to both.
     ((h5-lowercase-p char-str)
      (let ((so-far (h5-tag-token-name tok)))
        (setf (h5-tag-token-name tok)
              (concat so-far char-str)))
      (h5-append-to-tmpbuf char-str))
     (t (funcall give-up)))))
(defun h5-script-data-double-escape-start-state ()
  "Handle one character in the script data double escape start state.
Whitespace, `/' or `>' ends the candidate name: the tokenizer
switches to the script data double escaped state when the
temporary buffer holds the string script, and to the script data
escaped state otherwise; in both cases the character is emitted.
Letters are appended to the temporary buffer (lowercased when
uppercase) and emitted.  Anything else is reconsumed in the
script data escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond ((or (h5-space-p char-str)
               (and (stringp char-str)
                    (string-match "[/>]" char-str)))
           (if (string-equal (h5-tmpbuf) "script")
               (h5-switch-state 'h5-script-data-double-escaped-state)
             (h5-switch-state 'h5-script-data-escaped-state))
           ;; The emit belongs after the `if', not in its else part:
           ;; the character is emitted whichever state was chosen.
           ;; It used to be dropped when the buffer was "script".
           (h5-emit char))
          ((h5-uppercase-p char-str)
           ;; The buffer collects the lowercase form; the emitted
           ;; token keeps the original case.
           (h5-append-to-tmpbuf (downcase char-str))
           (h5-emit char))
          ((h5-lowercase-p char-str)
           (h5-append-to-tmpbuf char-str)
           (h5-emit char))
          (t
           ;; Step point back so the script data escaped state sees
           ;; the character again.
           (backward-char 1)
           (h5-switch-state 'h5-script-data-escaped-state)))))
(defun h5-script-data-double-escaped-state ()
  "Handle one character in the script data double escaped state.
A `-' is emitted and switches to the script data double escaped
dash state.  A `<' is emitted and switches to the script data
double escaped less-than sign state.  End of input is a parse
error and is reconsumed in the data state.  Any other character
is emitted as a character token."
  (let ((char (h5-consume-the-next-input-character)))
    ;; The three special inputs are mutually exclusive `eq' tests, so
    ;; their order does not matter.
    (cond ((eq char :eof)
           (h5-parse-error)
           ;; Step point back so the data state handles the end of
           ;; input itself.
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          ((eq char ?<)
           (h5-switch-state
            'h5-script-data-double-escaped-less-than-sign-state)
           (h5-emit ?<))
          ((eq char ?-)
           (h5-switch-state 'h5-script-data-double-escaped-dash-state)
           (h5-emit ?-))
          (t
           (h5-emit char)))))
(defun h5-script-data-double-escaped-dash-state ()
  "Handle one character in the script data double escaped dash state.
A `-' is emitted and switches to the script data double escaped
dash dash state.  A `<' is emitted and switches to the script
data double escaped less-than sign state.  End of input is a
parse error and is reconsumed in the data state.  Any other
character is emitted after switching back to the script data
double escaped state."
  (let ((char (h5-consume-the-next-input-character)))
    ;; The three special inputs are mutually exclusive `eq' tests, so
    ;; their order does not matter.
    (cond ((eq char :eof)
           (h5-parse-error)
           ;; Step point back so the data state handles the end of
           ;; input itself.
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          ((eq char ?<)
           (h5-switch-state
            'h5-script-data-double-escaped-less-than-sign-state)
           (h5-emit ?<))
          ((eq char ?-)
           (h5-switch-state 'h5-script-data-double-escaped-dash-dash-state)
           (h5-emit ?-))
          (t
           ;; The dash run is over: ordinary double escaped text.
           (h5-switch-state 'h5-script-data-double-escaped-state)
           (h5-emit char)))))
(defun h5-script-data-double-escaped-dash-dash-state ()
  "Handle one character in the script data double escaped dash dash state.
A `-' is emitted without changing state.  A `<' is emitted and
switches to the script data double escaped less-than sign state.
A `>' is emitted and switches to the script data state.  End of
input is a parse error and is reconsumed in the data state.  Any
other character is emitted after switching to the script data
double escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character)))
    ;; The four special inputs are mutually exclusive `eq' tests, so
    ;; their order does not matter.
    (cond ((eq char :eof)
           (h5-parse-error)
           ;; Step point back so the data state handles the end of
           ;; input itself.
           (backward-char 1)
           (h5-switch-state 'h5-data-state))
          ((eq char ?>)
           ;; "-->" closes the escaped section.
           (h5-switch-state 'h5-script-data-state)
           (h5-emit ?>))
          ((eq char ?<)
           (h5-switch-state
            'h5-script-data-double-escaped-less-than-sign-state)
           (h5-emit ?<))
          ((eq char ?-)
           ;; Further dashes keep the tokenizer in this state.
           (h5-emit ?-))
          (t
           (h5-switch-state 'h5-script-data-double-escaped-state)
           (h5-emit char)))))
(defun h5-script-data-double-escaped-less-than-sign-state ()
  "Handle one character in the script data double escaped less-than sign state.
A `/' clears the temporary buffer, switches to the script data
double escape end state and is emitted as a character token.
Anything else is reconsumed in the script data double escaped
state."
  (let* ((char (h5-consume-the-next-input-character)))
    (cond ((eq char ?/)
           (h5-clear-tmpbuf)
           (h5-switch-state 'h5-script-data-double-escape-end-state)
           (h5-emit ?/))
          (t
           ;; Step point back so the double escaped state sees the
           ;; character again.  This call was misspelled
           ;; `backward-chaar', which is not a defined function.
           (backward-char 1)
           (h5-switch-state 'h5-script-data-double-escaped-state)))))
(defun h5-script-data-double-escape-end-state ()
  "Handle one character in the script data double escape end state.
Whitespace, `/' or `>' ends the candidate name: the tokenizer
switches to the script data escaped state when the temporary
buffer holds the string script, and to the script data double
escaped state otherwise; in both cases the character is emitted.
Letters are appended to the temporary buffer (lowercased when
uppercase) and emitted.  Anything else is reconsumed in the
script data double escaped state."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond ((or (h5-space-p char-str)
               (and (stringp char-str)
                    (string-match "[/>]" char-str)))
           (if (string-equal (h5-tmpbuf) "script")
               (h5-switch-state 'h5-script-data-escaped-state)
             (h5-switch-state 'h5-script-data-double-escaped-state))
           ;; The emit belongs after the `if', not in its else part:
           ;; the character is emitted whichever state was chosen.
           ;; It used to be dropped when the buffer was "script".
           (h5-emit char))
          ((h5-uppercase-p char-str)
           ;; The buffer collects the lowercase form; the emitted
           ;; token keeps the original case.
           (h5-append-to-tmpbuf (downcase char-str))
           (h5-emit char))
          ((h5-lowercase-p char-str)
           (h5-append-to-tmpbuf char-str)
           (h5-emit char))
          (t
           ;; Step point back so the double escaped state sees the
           ;; character again.
           (backward-char 1)
           (h5-switch-state 'h5-script-data-double-escaped-state)))))
(defun h5-before-attribute-name-state ()
  "Handle one character in the before attribute name state.
Whitespace is skipped.  `/' switches to the self-closing start
tag state; `>' switches to the data state and emits the current
tag token.  End of input is a parse error and is reconsumed in
the data state.  Any other character starts a new attribute in
`*h5-curattr*', named after the character (lowercased when
`h5-uppercase-p' accepts it) with an empty value, pushes it onto
the attributes of `*h5-curtok*' and switches to the attribute
name state.  A quotation mark, apostrophe, `<' or `=' in that
position is additionally a parse error."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (char-to-string char) char))
         ;; Create an attribute called ATTR-NAME on the current tag
         ;; token and move on to reading the rest of its name.
         (start-attribute
          (lambda (attr-name)
            (setq *h5-curattr* (make-h5-attr :name attr-name :value ""))
            (push *h5-curattr*
                  (h5-tag-token-attributes *h5-curtok*))
            (h5-switch-state 'h5-attribute-name-state))))
    (cond
     ;; Whitespace between attributes is ignored.
     ((h5-space-p char-str))
     ((eq char ?/)
      (h5-switch-state 'h5-self-closing-start-tag-state))
     ((eq char ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; Attribute names are stored in lowercase.
     ((h5-uppercase-p char-str)
      (funcall start-attribute (downcase char-str)))
     ;; These characters are reported as a parse error but still
     ;; begin an attribute name.
     ((and (stringp char-str)
           (string-match "[\"'<=]" char-str))
      (h5-parse-error)
      (funcall start-attribute char-str))
     ((eq char :eof)
      (h5-parse-error)
      ;; Step point back so the data state handles the end of input
      ;; itself.
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     (t
      (funcall start-attribute char-str)))))
(defun h5-attribute-name-state ()
  "Handle one character in the attribute name state.
Whitespace, `/', `=' and `>' end the attribute name: the finished
name is first checked against the other attributes of
`*h5-curtok*' and `*h5-curattr*' is flagged as a duplicate (with
a parse error) on a clash, then the tokenizer switches to the
after attribute name, self-closing start tag, before attribute
value or data state respectively; for `>' the current tag token
is also emitted.  End of input is a parse error and is reconsumed
in the data state.  Any other character is appended to the
attribute name (lowercased when `h5-uppercase-p' accepts it); a
quotation mark, apostrophe or `<' is additionally a parse error."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (char-to-string char) char))
         ;; Run when the name is complete: compare it with every
         ;; other attribute already on the token.
         (finish-name
          (lambda ()
            (let ((name (and *h5-curattr* (h5-attr-name *h5-curattr*)))
                  (attrs
                   (if (h5-start-tag-token-p *h5-curtok*)
                       (h5-start-tag-token-attributes *h5-curtok*)
                     (if (h5-end-tag-token-p *h5-curtok*)
                         (h5-end-tag-token-attributes *h5-curtok*)
                       ;; Only tag tokens are expected here.
                       (error "*h5-curtok* [[[%s]]] neither start nor end tag"
                              *h5-curtok*)))))
              (dolist (attr attrs)
                ;; Skip the attribute being built and any attribute
                ;; with a different name; what remains is a clash.
                (unless (or (eq attr *h5-curattr*)
                            (not (equal name (h5-attr-name attr))))
                  (h5-parse-error)
                  (setf (h5-attr-duplicate *h5-curattr*) t))))))
         ;; Append PIECE to the name of the attribute being built.
         (extend-name
          (lambda (piece)
            (setf (h5-attr-name *h5-curattr*)
                  (concat (h5-attr-name *h5-curattr*) piece)))))
    (cond
     ((h5-space-p char-str)
      (funcall finish-name)
      (h5-switch-state 'h5-after-attribute-name-state))
     ((eq char ?/)
      (funcall finish-name)
      (h5-switch-state 'h5-self-closing-start-tag-state))
     ((eq char ?=)
      (funcall finish-name)
      (h5-switch-state 'h5-before-attribute-value-state))
     ((eq char ?>)
      (funcall finish-name)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; Attribute names are stored in lowercase.
     ((h5-uppercase-p char-str)
      (funcall extend-name (downcase char-str)))
     ;; These characters are reported as a parse error but still
     ;; become part of the name.
     ((and (stringp char-str)
           (string-match "[\"'<]" char-str))
      (h5-parse-error)
      (funcall extend-name char-str))
     ((eq char :eof)
      (h5-parse-error)
      ;; Step point back so the data state handles the end of input
      ;; itself.
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     (t
      (funcall extend-name char-str)))))
(defun h5-after-attribute-name-state ()
  "Handle one character in the after attribute name state.
Whitespace is skipped.  `/' switches to the self-closing start
tag state, `=' to the before attribute value state, and `>' to
the data state, where the current tag token is emitted.  End of
input is a parse error and is reconsumed in the data state.  Any
other character starts a new attribute in `*h5-curattr*', named
after the character (lowercased when `h5-uppercase-p' accepts it)
with an empty value, pushes it onto the attributes of
`*h5-curtok*' and switches to the attribute name state.  A
quotation mark, apostrophe or `<' in that position is
additionally a parse error."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (char-to-string char) char))
         ;; Create an attribute called ATTR-NAME on the current tag
         ;; token and move on to reading the rest of its name.
         (start-attribute
          (lambda (attr-name)
            (setq *h5-curattr* (make-h5-attr :name attr-name :value ""))
            (push *h5-curattr*
                  (h5-tag-token-attributes *h5-curtok*))
            (h5-switch-state 'h5-attribute-name-state))))
    (cond
     ;; Whitespace after a name is ignored.
     ((h5-space-p char-str))
     ((eq char ?/)
      (h5-switch-state 'h5-self-closing-start-tag-state))
     ((eq char ?=)
      (h5-switch-state 'h5-before-attribute-value-state))
     ((eq char ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; Attribute names are stored in lowercase.
     ((h5-uppercase-p char-str)
      (funcall start-attribute (downcase char-str)))
     ;; These characters are reported as a parse error but still
     ;; begin an attribute name.
     ((and (stringp char-str)
           (string-match "[\"'<]" char-str))
      (h5-parse-error)
      (funcall start-attribute char-str))
     ((eq char :eof)
      (h5-parse-error)
      ;; Step point back so the data state handles the end of input
      ;; itself.
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     (t
      (funcall start-attribute char-str)))))
(defun h5-before-attribute-value-state ()
  "Handle one character in the before attribute value state.
Whitespace is skipped.  A quotation mark or apostrophe switches
to the double-quoted or single-quoted attribute value state.  An
`&' switches to the unquoted attribute value state and is
reconsumed there.  A `>' is a parse error; the tokenizer switches
to the data state and emits the current tag token.  End of input
is a parse error and is reconsumed in the data state.  Any other
character is appended to the value of `*h5-curattr*' and switches
to the unquoted attribute value state; `<', `=' or a grave accent
in that position is additionally a parse error."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (char-to-string char) char))
         ;; Take the current character as the first character of an
         ;; unquoted value.
         (begin-unquoted
          (lambda ()
            (setf (h5-attr-value *h5-curattr*)
                  (concat (h5-attr-value *h5-curattr*) char-str))
            (h5-switch-state 'h5-attribute-value-unquoted-state))))
    (cond
     ;; Whitespace before the value is ignored.
     ((h5-space-p char-str))
     ((eq char ?\")
      (h5-switch-state 'h5-attribute-value-double-quoted-state))
     ((eq char ?')
      (h5-switch-state 'h5-attribute-value-single-quoted-state))
     ((eq char ?&)
      ;; The ampersand itself belongs to the unquoted value, so step
      ;; point back and let that state read it.
      (h5-switch-state 'h5-attribute-value-unquoted-state)
      (backward-char 1))
     ((eq char ?>)
      ;; The tag ended where a value was expected.
      (h5-parse-error)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; These characters are reported as a parse error but still
     ;; begin an unquoted value.
     ((and (stringp char-str)
           (string-match "[<=`]" char-str))
      (h5-parse-error)
      (funcall begin-unquoted))
     ((eq char :eof)
      (h5-parse-error)
      ;; Step point back so the data state handles the end of input
      ;; itself.
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     (t
      (funcall begin-unquoted)))))
(defun h5-attribute-value-double-quoted-state ()
  "Handle one input character inside a double-quoted attribute value.
The closing quotation mark moves to the \"after attribute value
\(quoted)\" state.  An ampersand starts a character reference whose
additional allowed character is the quotation mark.  EOF is a parse
error that is reconsumed in the data state.  Every other character is
appended to the current attribute's value."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Closing quotation mark.
     ((eq ch ?\")
      (h5-switch-state 'h5-after-attribute-value-quoted-state))
     ;; Ampersand: character reference in attribute value state, with
     ;; the quotation mark as the additional allowed character.
     ((eq ch ?&)
      (h5-switch-state 'h5-character-reference-in-attribute-value-state)
      (setq *h5-additional-allowed-character* ?\"))
     ;; EOF: parse error; reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is part of the value.
     (t
      (setf (h5-attr-value *h5-curattr*)
            (concat (h5-attr-value *h5-curattr*) ch-str))))))
(defun h5-attribute-value-single-quoted-state ()
  "Handle one input character inside a single-quoted attribute value.
The closing apostrophe moves to the \"after attribute value
\(quoted)\" state.  An ampersand starts a character reference whose
additional allowed character is the apostrophe.  EOF is a parse error
that is reconsumed in the data state.  Every other character is
appended to the current attribute's value."
  (let* ((ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Closing apostrophe.
     ((eq ch ?')
      (h5-switch-state 'h5-after-attribute-value-quoted-state))
     ;; Ampersand: character reference in attribute value state, with
     ;; the apostrophe as the additional allowed character.
     ((eq ch ?&)
      (h5-switch-state 'h5-character-reference-in-attribute-value-state)
      (setq *h5-additional-allowed-character* ?'))
     ;; EOF: parse error; reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is part of the value.
     (t
      (setf (h5-attr-value *h5-curattr*)
            (concat (h5-attr-value *h5-curattr*) ch-str))))))
(defun h5-attribute-value-unquoted-state ()
  "Handle one input character inside an unquoted attribute value.
Whitespace ends the value and returns to the \"before attribute
name\" state.  An ampersand starts a character reference whose
additional allowed character is the greater-than sign.  A
greater-than sign emits the current tag token.  EOF is a parse error
that is reconsumed in the data state.  Every other character is
appended to the value; quotation mark, apostrophe, less-than, equals
and grave accent are additionally reported as parse errors."
  (let* ((ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Tab, LF, FF or space: the value is finished.
     ((h5-space-p ch-str)
      (h5-switch-state 'h5-before-attribute-name-state))
     ;; Ampersand: character reference in attribute value state, with
     ;; the greater-than sign as the additional allowed character.
     ((eq ch ?&)
      (h5-switch-state 'h5-character-reference-in-attribute-value-state)
      (setq *h5-additional-allowed-character* ?>))
     ;; Greater-than sign: back to the data state, emit the tag.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is part of the value.
     (t
      ;; These five characters are parse errors but are still
      ;; appended like any other character.
      (when (and (stringp ch-str)
                 (string-match "[\"'<=`]" ch-str))
        (h5-parse-error))
      (setf (h5-attr-value *h5-curattr*)
            (concat (h5-attr-value *h5-curattr*) ch-str))))))
(defun h5-character-reference-in-attribute-value-state ()
  "Resolve a character reference found inside an attribute value.
Calls `h5-consume-a-character-reference'.  If it returns nil, a
literal ampersand is appended to the current attribute's value;
otherwise the returned value is converted with `string' and
appended.  Finally switches back to the previous state."
  (let* ((ref (h5-consume-a-character-reference))
         ;; Nothing returned means the ampersand was just an ampersand.
         (suffix (if (null ref)
                     (string ?&)
                   (string ref))))
    (setf (h5-attr-value *h5-curattr*)
          (concat (h5-attr-value *h5-curattr*) suffix)))
  ;; Return to the attribute value state that switched into this one.
  (h5-switch-state :previous))
(defun h5-after-attribute-value-quoted-state ()
  "Handle one input character right after a quoted attribute value.
Whitespace moves to the \"before attribute name\" state, a solidus to
the \"self-closing start tag\" state, and a greater-than sign emits
the current tag token.  EOF is a parse error that is reconsumed in
the data state.  Any other character is a parse error that is
reconsumed in the \"before attribute name\" state."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Tab, LF, FF or space.
     ((h5-space-p ch-str)
      (h5-switch-state 'h5-before-attribute-name-state))
     ;; Solidus.
     ((eq ch ?/)
      (h5-switch-state 'h5-self-closing-start-tag-state))
     ;; Greater-than sign: back to the data state, emit the tag.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error; reconsume the character in the
     ;; before attribute name state.
     (t
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-before-attribute-name-state)))))
(defun h5-self-closing-start-tag-state ()
  "Handle the input character that follows a solidus inside a tag.
A greater-than sign sets the self-closing flag of the current tag
token and emits it.  EOF is a parse error that is reconsumed in the
data state.  Any other character is a parse error that is reconsumed
in the \"before attribute name\" state."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character)))
    (cond
     ;; Greater-than sign: mark the tag self-closing, return to the
     ;; data state and emit the tag token.
     ((eq ch ?>)
      (setf (h5-tag-token-self-closing *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error; reconsume the character in the
     ;; before attribute name state.
     (t
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-before-attribute-name-state)))))
(defun h5-bogus-comment-state ()
  "Consume a bogus comment and emit it as a comment token.
Characters are consumed up to and including the first greater-than
sign or EOF.  A comment token is then emitted whose data is the
buffer text from one position before the starting point up to point,
and the tokenizer returns to the data state.  If EOF ended the
comment, it is reconsumed."
  (let ((start (point))
        (ch nil))
    ;; Read at least one character; stop once `>' or EOF was consumed.
    (while (progn
             (setq ch (h5-consume-the-next-input-character))
             (not (memq ch '(?> :eof)))))
    ;; The comment data is meant to run from the character that caused
    ;; the switch into this state up to the character just before the
    ;; `>' or EOF.
    ;; NOTE(review): the substring ends at point, which presumably sits
    ;; after the consumed `>', and starts at (1- start) even when the
    ;; previous state consumed nothing -- confirm against
    ;; `h5-consume-the-next-input-character' and the callers.
    (h5-emit
     (make-h5-comment-token
      :data (buffer-substring (1- start) (point))))
    (h5-switch-state 'h5-data-state)
    ;; If the end of the file was reached, reconsume the EOF.
    (when (eq ch :eof)
      (backward-char 1))))
(defun h5-markup-declaration-open-state ()
  "Decide what kind of markup declaration follows \"<!\".
Two hyphens start a comment: they are consumed, an empty comment
token becomes the current token and the tokenizer enters the
\"comment start\" state.  A case-insensitive \"doctype\" is consumed
and leads to the DOCTYPE state.  CDATA sections are not implemented:
when `h5-insertion-mode' is `:in-foreign-content' this throws to the
tag `not-implemented' with point as the value.  Anything else is a
parse error and switches to the \"bogus comment\" state without
consuming input."
  (cond
   ;; "--": consume both hyphens and begin a comment with empty data.
   ((looking-at "--")
    (forward-char 2)
    (setq *h5-curtok* (make-h5-comment-token :data ""))
    (h5-switch-state 'h5-comment-start-state))
   ;; ASCII case-insensitive match for "DOCTYPE": consume the seven
   ;; characters and switch to the DOCTYPE state.
   ((let ((case-fold-search t))
      (looking-at "doctype"))
    (forward-char 7)
    (h5-switch-state 'h5-DOCTYPE-state))
   ;; In foreign content a "[CDATA[" section could start here (when
   ;; the current node is not in the HTML namespace).  That check and
   ;; the CDATA section state are not implemented yet.
   ((eq h5-insertion-mode :in-foreign-content)
    (throw 'not-implemented (point)))
   ;; Otherwise this is a parse error.  The next character consumed,
   ;; if any, will be the first character of the bogus comment.
   (t
    (h5-parse-error)
    (h5-switch-state 'h5-bogus-comment-state))))
(defun h5-comment-start-state ()
  "Handle the first input character after \"<!--\".
A hyphen moves to the \"comment start dash\" state.  A greater-than
sign is a parse error that emits the comment token.  EOF is a parse
error that emits the comment token and is reconsumed in the data
state.  Any other character is appended to the comment data and the
tokenizer enters the \"comment\" state."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Hyphen-minus.
     ((eq ch ?-)
      (h5-switch-state 'h5-comment-start-dash-state))
     ;; Greater-than sign: parse error; back to the data state and
     ;; emit the comment token.
     ((eq ch ?>)
      (h5-parse-error)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is the first character of the comment data.
     (t
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ch)))
      (h5-switch-state 'h5-comment-state)))))
(defun h5-comment-start-dash-state ()
  "Handle the input character after \"<!--\" followed by one hyphen.
A second hyphen moves to the \"comment end\" state.  A greater-than
sign is a parse error that emits the comment token.  EOF is a parse
error that emits the comment token and is reconsumed in the data
state.  Any other character is appended to the comment data,
preceded by the pending hyphen, and the tokenizer enters the
\"comment\" state."
  (let* ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Hyphen-minus.
     ((eq ch ?-)
      (h5-switch-state 'h5-comment-end-state))
     ;; Greater-than sign: parse error; back to the data state and
     ;; emit the comment token.
     ((eq ch ?>)
      (h5-parse-error)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: the hyphen was data after all; append it
     ;; together with the current character.
     (t
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ?- ch)))
      (h5-switch-state 'h5-comment-state)))))
(defun h5-comment-state ()
  "Handle one input character inside a comment.
A hyphen moves to the \"comment end dash\" state.  EOF is a parse
error that emits the comment token and is reconsumed in the data
state.  Every other character is appended to the comment data."
  (let* ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Hyphen-minus: possibly the start of the closing "-->".
     ((eq ch ?-)
      (h5-switch-state 'h5-comment-end-dash-state))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is comment data.
     (t
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ch)))))))
(defun h5-comment-end-dash-state ()
  "Handle the input character after a single hyphen inside a comment.
A second hyphen moves to the \"comment end\" state.  EOF is a parse
error that emits the comment token and is reconsumed in the data
state.  Any other character is appended to the comment data,
preceded by the pending hyphen, and the tokenizer returns to the
\"comment\" state."
  (let* ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Hyphen-minus.
     ((eq ch ?-)
      (h5-switch-state 'h5-comment-end-state))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: the hyphen was data after all; append it
     ;; together with the current character.
     (t
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ?- ch)))
      (h5-switch-state 'h5-comment-state)))))
(defun h5-comment-end-state ()
  "Handle the input character after two hyphens inside a comment.
A greater-than sign closes the comment and emits the comment token.
Whitespace, an exclamation mark, a further hyphen and any other
character are all parse errors, each with its own recovery (see the
clause comments).  EOF is a parse error that emits the comment token
and is reconsumed in the data state."
  (let* ((ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Greater-than sign: back to the data state, emit the comment.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; Tab, LF, FF or space: parse error; append the two hyphens and
     ;; the whitespace character, then wait in the comment end space
     ;; state.
     ((h5-space-p ch-str)
      (h5-parse-error)
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ?- ?- ch)))
      (h5-switch-state 'h5-comment-end-space-state))
     ;; Exclamation mark: parse error; comment end bang state.
     ((eq ch ?!)
      (h5-parse-error)
      (h5-switch-state 'h5-comment-end-bang-state))
     ;; Hyphen-minus: parse error; one hyphen becomes data and we stay
     ;; in this state.
     ((eq ch ?-)
      (h5-parse-error)
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ?-))))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.  (The spec does this for security
     ;; reasons: an unterminated comment must not expose its contents
     ;; as live markup.)
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error; the two hyphens and the current
     ;; character become data and the comment continues.
     (t
      (h5-parse-error)
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    (string ?- ?- ch)))
      (h5-switch-state 'h5-comment-state)))))
(defun h5-comment-end-bang-state ()
  "Handle the input character after \"--!\" inside a comment.
A hyphen appends \"--!\" to the comment data and moves to the
\"comment end dash\" state.  A greater-than sign emits the comment
token.  EOF is a parse error that emits the comment token and is
reconsumed in the data state.  Any other character is appended to
the comment data after \"--!\" and the comment continues."
  (let* ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Hyphen-minus: "--!" was data; this hyphen may begin the real
     ;; comment end.
     ((eq ch ?-)
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    "--!"))
      (h5-switch-state 'h5-comment-end-dash-state))
     ;; Greater-than sign: back to the data state, emit the comment.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: "--!" and the current character are data.
     (t
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*)
                    "--!" (string ch)))
      (h5-switch-state 'h5-comment-state)))))
(defun h5-comment-end-space-state ()
  "Handle the input character after \"--\" plus whitespace in a comment.
Further whitespace is appended to the comment data.  A hyphen moves
to the \"comment end dash\" state.  A greater-than sign emits the
comment token.  EOF is a parse error that emits the comment token
and is reconsumed in the data state.  Any other character is
appended to the comment data and the comment continues."
  (let* ((ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Tab, LF, FF or space: keep collecting whitespace as data.
     ((h5-space-p ch-str)
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*) ch-str)))
     ;; Hyphen-minus.
     ((eq ch ?-)
      (h5-switch-state 'h5-comment-end-dash-state))
     ;; Greater-than sign: back to the data state, emit the comment.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; emit the comment token, then reconsume the
     ;; EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is data; go back to the comment state.
     (t
      (setf (h5-comment-token-data *h5-curtok*)
            (concat (h5-comment-token-data *h5-curtok*) ch-str))
      (h5-switch-state 'h5-comment-state)))))
(defun h5-DOCTYPE-state ()
  "Handle the input character right after the \"DOCTYPE\" keyword.
Whitespace moves to the \"before DOCTYPE name\" state.  EOF is a
parse error: a new DOCTYPE token with the force-quirks flag set is
emitted and the EOF is reconsumed in the data state.  Any other
character is a parse error that is reconsumed in the \"before
DOCTYPE name\" state."
  (let* ((case-fold-search nil)
         (ch (h5-consume-the-next-input-character))
         (ch-str (if (numberp ch) (string ch) ch)))
    (cond
     ;; Tab, LF, FF or space.
     ((h5-space-p ch-str)
      (h5-switch-state 'h5-before-DOCTYPE-name-state))
     ;; EOF: parse error; create a DOCTYPE token with force-quirks
     ;; on, emit it, and reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setq *h5-curtok* (make-h5-doctype-token :force-quirks t))
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error; reconsume the character in the
     ;; before DOCTYPE name state.
     (t
      (h5-parse-error)
      (backward-char 1)
      (h5-switch-state 'h5-before-DOCTYPE-name-state)))))
(defun h5-before-DOCTYPE-name-state ()
  "Handle one input character before the name of a DOCTYPE.
Whitespace is ignored.  An uppercase ASCII letter starts a new
DOCTYPE token whose name is the lowercase version of that letter.  A
greater-than sign is a parse error that emits a new DOCTYPE token
with the force-quirks flag set.  EOF is a parse error that emits such
a token and is reconsumed in the data state.  Any other character
starts a new DOCTYPE token whose name is that character."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond
     ;; Tab, LF, FF or space: ignore the character.
     ((h5-space-p char-str))
     ;; Uppercase A-Z: create a DOCTYPE token named after the
     ;; lowercase version of the character.  Only this branch
     ;; downcases, so non-ASCII characters are left untouched.
     ((h5-uppercase-p char-str)
      (setq *h5-curtok*
            (make-h5-doctype-token :name (downcase char-str)))
      (h5-switch-state 'h5-DOCTYPE-name-state))
     ;; Greater-than sign: parse error; create a DOCTYPE token with
     ;; force-quirks on, return to the data state and emit it.
     ((eq char ?>)
      (h5-parse-error)
      (setq *h5-curtok* (make-h5-doctype-token :force-quirks t))
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; create a DOCTYPE token with force-quirks
     ;; on, emit it, and reconsume the EOF in the data state.
     ((eq char :eof)
      (h5-parse-error)
      (setq *h5-curtok* (make-h5-doctype-token :force-quirks t))
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: create a DOCTYPE token named after the
     ;; character as it is.
     (t
      (setq *h5-curtok* (make-h5-doctype-token :name char-str))
      (h5-switch-state 'h5-DOCTYPE-name-state)))))
(defun h5-DOCTYPE-name-state ()
  "Handle one input character inside the name of a DOCTYPE.
Whitespace ends the name and moves to the \"after DOCTYPE name\"
state.  A greater-than sign emits the current DOCTYPE token.  An
uppercase ASCII letter is appended to the name in lowercase.  EOF is
a parse error: the force-quirks flag is set, the token is emitted
and the EOF is reconsumed in the data state.  Any other character is
appended to the name unchanged."
  (let* ((case-fold-search nil)
         (char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond
     ;; Tab, LF, FF or space: the name is complete.
     ((h5-space-p char-str)
      (h5-switch-state 'h5-after-DOCTYPE-name-state))
     ;; Greater-than sign: back to the data state, emit the DOCTYPE.
     ((eq char ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; Uppercase A-Z: append the lowercase version of the character.
     ;; Only this branch downcases, so non-ASCII characters are left
     ;; untouched.
     ((h5-uppercase-p char-str)
      (setf (h5-doctype-token-name *h5-curtok*)
            (concat (h5-doctype-token-name *h5-curtok*)
                    (downcase char-str))))
     ;; EOF: parse error; set force-quirks, emit the DOCTYPE token,
     ;; and reconsume the EOF in the data state.
     ((eq char :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is appended to the name as it is.
     (t
      (setf (h5-doctype-token-name *h5-curtok*)
            (concat (h5-doctype-token-name *h5-curtok*)
                    char-str))))))
(defun h5-after-DOCTYPE-name-state ()
  "Handle one input character after the name of a DOCTYPE.
Whitespace is ignored.  A greater-than sign emits the current
DOCTYPE token.  EOF is a parse error: the force-quirks flag is set,
the token is emitted and the EOF is reconsumed in the data state.
If the text starting at the current character is a case-insensitive
match for \"PUBLIC\" or \"SYSTEM\", the keyword is consumed and the
matching \"after DOCTYPE ... keyword\" state is entered.  Anything
else is a parse error that sets force-quirks and switches to the
\"bogus DOCTYPE\" state."
  (let* ((char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond
     ;; Tab, LF, FF or space: ignore the character.
     ((h5-space-p char-str))
     ;; Greater-than sign: back to the data state, emit the DOCTYPE.
     ((eq char ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; set force-quirks, emit the DOCTYPE token,
     ;; and reconsume the EOF in the data state.
     ((eq char :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; "PUBLIC", matched case-insensitively from the current input
     ;; character.  Stepping back puts point on that character; point
     ;; stays there for the remaining clauses if this test fails.
     ((progn
        (backward-char 1)
        (let ((case-fold-search t))
          (looking-at "public")))
      (forward-char 6)
      (h5-switch-state 'h5-after-DOCTYPE-public-keyword-state))
     ;; "SYSTEM", matched case-insensitively from the same position.
     ((let ((case-fold-search t))
        (looking-at "system"))
      (forward-char 6)
      (h5-switch-state 'h5-after-DOCTYPE-system-keyword-state))
     ;; Otherwise: parse error; set force-quirks and treat the rest
     ;; as a bogus DOCTYPE.
     (t
      ;; Undo the step back made while testing for the keywords.
      (forward-char 1)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-after-DOCTYPE-public-keyword-state ()
  "Handle the input character right after the \"PUBLIC\" keyword.
Whitespace moves to the \"before DOCTYPE public identifier\" state.
A quotation mark or apostrophe is a parse error that sets the public
identifier to the empty string and enters the matching quoted
identifier state.  A greater-than sign is a parse error that sets
force-quirks and emits the DOCTYPE token.  EOF is a parse error that
sets force-quirks, emits the token and is reconsumed in the data
state.  Anything else is a parse error that sets force-quirks and
switches to the \"bogus DOCTYPE\" state."
  (let* ((char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond
     ;; Tab, LF, FF or space.
     ((h5-space-p char-str)
      (h5-switch-state 'h5-before-DOCTYPE-public-identifier-state))
     ;; Quotation mark with no preceding space: parse error; the
     ;; public identifier becomes the empty string (not missing).
     ((eq char ?\")
      (h5-parse-error)
      (setf (h5-doctype-token-public-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-public-identifier-double-quoted-state))
     ;; Apostrophe with no preceding space: parse error; the public
     ;; identifier becomes the empty string (not missing).
     ((eq char ?')
      (h5-parse-error)
      (setf (h5-doctype-token-public-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-public-identifier-single-quoted-state))
     ;; Greater-than sign: parse error; set force-quirks, return to
     ;; the data state and emit the DOCTYPE token.
     ((eq char ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; set force-quirks, emit the DOCTYPE token,
     ;; and reconsume the EOF (step back one character, as every
     ;; other state does) in the data state.
     ((eq char :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error; set force-quirks and treat the
     ;; rest as a bogus DOCTYPE.
     (t
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-before-DOCTYPE-public-identifier-state ()
  "Handle one input character before a DOCTYPE public identifier.
Whitespace is ignored.  A quotation mark or apostrophe sets the
public identifier to the empty string and enters the matching quoted
identifier state.  A greater-than sign is a parse error that sets
force-quirks and emits the DOCTYPE token.  EOF is a parse error that
sets force-quirks, emits the token and is reconsumed in the data
state.  Anything else is a parse error that sets force-quirks and
switches to the \"bogus DOCTYPE\" state."
  (let* ((char (h5-consume-the-next-input-character))
         (char-str (if (numberp char) (string char) char)))
    (cond
     ;; Tab, LF, FF or space: ignore the character.
     ((h5-space-p char-str))
     ;; Quotation mark: the public identifier starts out as the empty
     ;; string (not missing); read it in the double-quoted state.
     ((eq char ?\")
      (setf (h5-doctype-token-public-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-public-identifier-double-quoted-state))
     ;; Apostrophe: the public identifier starts out as the empty
     ;; string (not missing); read it in the single-quoted state so
     ;; that the closing apostrophe terminates it.
     ((eq char ?')
      (setf (h5-doctype-token-public-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-public-identifier-single-quoted-state))
     ;; Greater-than sign: parse error; set force-quirks, return to
     ;; the data state and emit the DOCTYPE token.
     ((eq char ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; set force-quirks, emit the DOCTYPE token,
     ;; and reconsume the EOF in the data state.
     ((eq char :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error; set force-quirks and treat the
     ;; rest as a bogus DOCTYPE.
     (t
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-DOCTYPE-public-identifier-double-quoted-state ()
  "Handle one input character inside a double-quoted public identifier.
The closing quotation mark moves to the \"after DOCTYPE public
identifier\" state.  A greater-than sign is a parse error that sets
force-quirks and emits the DOCTYPE token.  EOF is a parse error that
sets force-quirks, emits the token and is reconsumed in the data
state.  Every other character is appended to the public identifier."
  (let* ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Closing quotation mark.
     ((eq ch ?\")
      (h5-switch-state 'h5-after-DOCTYPE-public-identifier-state))
     ;; Greater-than sign: parse error; set force-quirks, return to
     ;; the data state and emit the DOCTYPE token.
     ((eq ch ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error; set force-quirks, emit the DOCTYPE token,
     ;; and reconsume the EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else is part of the public identifier.
     (t
      (setf (h5-doctype-token-public-id *h5-curtok*)
            (concat (h5-doctype-token-public-id *h5-curtok*)
                    (string ch)))))))
(defun h5-DOCTYPE-public-identifier-single-quoted-state ()
  "Tokenizer state: inside a single-quoted DOCTYPE public identifier.
Consumes one input character.  A closing U+0027 APOSTROPHE switches
to `h5-after-DOCTYPE-public-identifier-state'.  U+003E and EOF are
parse errors: the force-quirks flag of `*h5-curtok*' is set and
`h5-emit' is called.  Any other character is appended to the public
identifier of `*h5-curtok*'."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Closing apostrophe: the identifier is complete.
     ((eq ch ?')
      (h5-switch-state 'h5-after-DOCTYPE-public-identifier-state))
     ;; `>' inside the quotes: parse error, force quirks, go back to
     ;; the data state and emit the DOCTYPE token.
     ((eq ch ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF inside the quotes: parse error, force quirks, emit, then
     ;; step back so EOF is reconsumed in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else extends the public identifier.
     (t
      (let ((id-so-far (h5-doctype-token-public-id *h5-curtok*)))
        (setf (h5-doctype-token-public-id *h5-curtok*)
              (concat id-so-far (char-to-string ch))))))))
(defun h5-after-DOCTYPE-public-identifier-state ()
  "Tokenizer state: just after a DOCTYPE public identifier's closing quote.
Consumes one input character and dispatches on it:
- whitespace (as decided by `h5-space-p') switches to
  `h5-between-DOCTYPE-public-and-system-identifiers-state';
- U+003E switches to the data state and calls `h5-emit';
- a quotation mark or apostrophe is a parse error (no whitespace
  separated the identifiers); the system identifier of `*h5-curtok*'
  is reset to the empty string and the matching quoted
  system-identifier state is entered;
- EOF is a parse error that sets force-quirks, calls `h5-emit' and
  reconsumes EOF in the data state;
- anything else is a parse error that sets force-quirks and switches
  to `h5-bogus-DOCTYPE-state'."
  (let* ((char (h5-consume-the-next-input-character))
         ;; `h5-space-p' is handed a string for real characters; the
         ;; non-numeric value (e.g. :eof) is passed through unchanged.
         (char-str (if (numberp char) (string char) char)))
    (cond
     ;; TAB, LF, FF, SPACE.
     ((h5-space-p char-str)
      (h5-switch-state 'h5-between-DOCTYPE-public-and-system-identifiers-state))
     ;; U+003E GREATER-THAN SIGN: the DOCTYPE is finished.
     ((eq char ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; U+0022 QUOTATION MARK: system identifier starts immediately.
     ((eq char ?\")
      (h5-parse-error)
      ;; Empty string, as opposed to "missing".
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      ;; Was misspelled `h5-swith-state', an undefined function.
      (h5-switch-state 'h5-DOCTYPE-system-identifier-double-quoted-state))
     ;; U+0027 APOSTROPHE: same, single-quoted flavour.
     ((eq char ?')
      (h5-parse-error)
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      ;; Was misspelled `h5-swith-state', an undefined function.
      (h5-switch-state 'h5-DOCTYPE-system-identifier-single-quoted-state))
     ;; EOF.
     ((eq char :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      ;; Reconsume the EOF character in the data state.
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else.
     (t
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-between-DOCTYPE-public-and-system-identifiers-state ()
  "Tokenizer state: whitespace between DOCTYPE public and system identifiers.
Consumes one input character.  Whitespace (per `h5-space-p') is
ignored.  U+003E switches to the data state and calls `h5-emit'.  A
quotation mark or apostrophe resets the system identifier of
`*h5-curtok*' to the empty string and enters the matching quoted
system-identifier state.  EOF is a parse error that sets
force-quirks, calls `h5-emit' and reconsumes EOF in the data state.
Anything else is a parse error that sets force-quirks and switches
to `h5-bogus-DOCTYPE-state'."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; TAB, LF, FF, SPACE: ignore the character (clause has no body).
     ((h5-space-p (if (numberp ch) (char-to-string ch) ch)))
     ;; `>' ends the DOCTYPE normally.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; Opening double quote of the system identifier.
     ((eq ch ?\")
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-system-identifier-double-quoted-state))
     ;; Opening single quote of the system identifier.
     ((eq ch ?')
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-system-identifier-single-quoted-state))
     ;; EOF: parse error, force quirks, emit, reconsume in data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: parse error, force quirks, bogus DOCTYPE.
     (t
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-after-DOCTYPE-system-keyword-state ()
  "Tokenizer state: just after the SYSTEM keyword of a DOCTYPE.
Consumes one input character.  Whitespace (per `h5-space-p')
switches to `h5-before-DOCTYPE-system-identifier-state'.  A
quotation mark or apostrophe is a parse error; the system identifier
of `*h5-curtok*' is reset to the empty string and the matching
quoted system-identifier state is entered.  U+003E and EOF are parse
errors that set force-quirks and call `h5-emit'.  Anything else is a
parse error that sets force-quirks and switches to
`h5-bogus-DOCTYPE-state'."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; TAB, LF, FF, SPACE.
     ((h5-space-p (if (numberp ch) (char-to-string ch) ch))
      (h5-switch-state 'h5-before-DOCTYPE-system-identifier-state))
     ;; Double quote directly after the keyword.
     ((eq ch ?\")
      (h5-parse-error)
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-system-identifier-double-quoted-state))
     ;; Single quote directly after the keyword.
     ((eq ch ?')
      (h5-parse-error)
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-system-identifier-single-quoted-state))
     ;; `>' with no identifier: force quirks, data state, emit.
     ((eq ch ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: force quirks, emit, reconsume EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: force quirks, bogus DOCTYPE.
     (t
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-before-DOCTYPE-system-identifier-state ()
  "Tokenizer state: whitespace before a DOCTYPE system identifier.
Consumes one input character.  Whitespace (per `h5-space-p') is
ignored.  A quotation mark or apostrophe resets the system
identifier of `*h5-curtok*' to the empty string and enters the
matching quoted system-identifier state.  U+003E and EOF are parse
errors that set force-quirks and call `h5-emit'.  Anything else is a
parse error that sets force-quirks and switches to
`h5-bogus-DOCTYPE-state'."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; TAB, LF, FF, SPACE: ignore the character (clause has no body).
     ((h5-space-p (if (numberp ch) (char-to-string ch) ch)))
     ;; Opening double quote.
     ((eq ch ?\")
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-system-identifier-double-quoted-state))
     ;; Opening single quote.
     ((eq ch ?')
      (setf (h5-doctype-token-system-id *h5-curtok*) "")
      (h5-switch-state 'h5-DOCTYPE-system-identifier-single-quoted-state))
     ;; `>' with no identifier: force quirks, data state, emit.
     ((eq ch ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: force quirks, emit, reconsume EOF in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: force quirks, bogus DOCTYPE.
     (t
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-DOCTYPE-system-identifier-double-quoted-state ()
  "Tokenizer state: inside a double-quoted DOCTYPE system identifier.
Consumes one input character.  A closing U+0022 QUOTATION MARK
switches to `h5-after-DOCTYPE-system-identifier-state'.  U+003E and
EOF are parse errors: the force-quirks flag of `*h5-curtok*' is set
and `h5-emit' is called.  Any other character is appended to the
system identifier of `*h5-curtok*'."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Closing quote: the identifier is complete.
     ((eq ch ?\")
      (h5-switch-state 'h5-after-DOCTYPE-system-identifier-state))
     ;; `>' inside the quotes: parse error, force quirks, go back to
     ;; the data state and emit the DOCTYPE token.
     ((eq ch ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF inside the quotes: parse error, force quirks, emit, then
     ;; step back so EOF is reconsumed in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else extends the system identifier.
     (t
      (let ((id-so-far (h5-doctype-token-system-id *h5-curtok*)))
        (setf (h5-doctype-token-system-id *h5-curtok*)
              (concat id-so-far (char-to-string ch))))))))
(defun h5-DOCTYPE-system-identifier-single-quoted-state ()
  "Tokenizer state: inside a single-quoted DOCTYPE system identifier.
Consumes one input character.  A closing U+0027 APOSTROPHE switches
to `h5-after-DOCTYPE-system-identifier-state'.  U+003E and EOF are
parse errors: the force-quirks flag of `*h5-curtok*' is set and
`h5-emit' is called.  Any other character is appended to the system
identifier of `*h5-curtok*'."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; Closing apostrophe: the identifier is complete.
     ((eq ch ?')
      (h5-switch-state 'h5-after-DOCTYPE-system-identifier-state))
     ;; `>' inside the quotes: parse error, force quirks, go back to
     ;; the data state and emit the DOCTYPE token.
     ((eq ch ?>)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF inside the quotes: parse error, force quirks, emit, then
     ;; step back so EOF is reconsumed in the data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else extends the system identifier.
     (t
      (let ((id-so-far (h5-doctype-token-system-id *h5-curtok*)))
        (setf (h5-doctype-token-system-id *h5-curtok*)
              (concat id-so-far (char-to-string ch))))))))
(defun h5-after-DOCTYPE-system-identifier-state ()
  "Tokenizer state: just after a DOCTYPE system identifier's closing quote.
Consumes one input character.  Whitespace (per `h5-space-p') is
ignored.  U+003E switches to the data state and calls `h5-emit'.
EOF is a parse error that sets force-quirks, calls `h5-emit' and
reconsumes EOF in the data state.  Anything else is a parse error
that switches to `h5-bogus-DOCTYPE-state' without touching the
force-quirks flag."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; TAB, LF, FF, SPACE: ignore the character (clause has no body).
     ((h5-space-p (if (numberp ch) (char-to-string ch) ch)))
     ;; `>' ends the DOCTYPE normally.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: parse error, force quirks, emit, reconsume in data state.
     ((eq ch :eof)
      (h5-parse-error)
      (setf (h5-doctype-token-force-quirks *h5-curtok*) t)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Trailing junk: parse error only; force-quirks is deliberately
     ;; left as it is.
     (t
      (h5-parse-error)
      (h5-switch-state 'h5-bogus-DOCTYPE-state)))))
(defun h5-bogus-DOCTYPE-state ()
  "Tokenizer state: skipping the rest of a malformed DOCTYPE.
Consumes one input character.  U+003E switches to the data state and
calls `h5-emit'.  EOF calls `h5-emit' and reconsumes EOF in the data
state.  Every other character is ignored."
  (let ((ch (h5-consume-the-next-input-character)))
    (cond
     ;; `>' finally closes the DOCTYPE.
     ((eq ch ?>)
      (h5-switch-state 'h5-data-state)
      (h5-emit))
     ;; EOF: emit what we have, then step back so EOF is seen again
     ;; by the data state.
     ((eq ch :eof)
      (h5-emit)
      (backward-char 1)
      (h5-switch-state 'h5-data-state))
     ;; Anything else: ignore the character (clause has no body).
     (t))))
(defun h5-CDATA-section-state ()
  "Tokenizer state: inside a CDATA section.
Searches forward from point for the three-character terminator
U+005D U+005D U+003E.  The text between the starting point and the
terminator (or the end of the buffer when there is no terminator) is
passed to `h5-emit-string', and the tokenizer switches to
`h5-data-state'.  When the terminator is found point is left just
after it; otherwise point is moved back one character from the end
of the search so that EOF is reconsumed."
  ;; `let' evaluates its init forms in order, so BEGINNING is captured
  ;; before the search moves point.
  (let ((beginning (point))
        ;; With a non-nil, non-t NOERROR argument a failed search
        ;; moves point to the search bound and returns nil.
        (found (re-search-forward "]]>" nil 'advance)))
    ;; Emit everything consumed except the terminator itself.  FOUND
    ;; is the position after the terminator, so the text ends three
    ;; characters earlier.  (The original called `3-', which is not a
    ;; function in Emacs Lisp.)
    (h5-emit-string
     (buffer-substring beginning (if found (- found 3) (point))))
    ;; Switch to the data state.
    (h5-switch-state 'h5-data-state)
    ;; If the end of the file was reached, reconsume the EOF character.
    (unless found
      (backward-char 1))))
;; Tokenizing character references
;; This section defines how to <dfn>consume a character reference</dfn>.
;; This definition is used when parsing character references <span
;; title="character reference in data state">in text</span> and <span
;; title="character reference in attribute value state">in
;; attributes</span>.
(defvar h5-charref-overrides
  '(;; NUL and CR.
    (#x00 . #xFFFD) (#x0D . #x000D)
    ;; #x80 - #x8F.
    (#x80 . #x20AC) (#x81 . #x0081)
    (#x82 . #x201A) (#x83 . #x0192)
    (#x84 . #x201E) (#x85 . #x2026)
    (#x86 . #x2020) (#x87 . #x2021)
    (#x88 . #x02C6) (#x89 . #x2030)
    (#x8A . #x0160) (#x8B . #x2039)
    (#x8C . #x0152) (#x8D . #x008D)
    (#x8E . #x017D) (#x8F . #x008F)
    ;; #x90 - #x9F.
    (#x90 . #x0090) (#x91 . #x2018)
    (#x92 . #x2019) (#x93 . #x201C)
    (#x94 . #x201D) (#x95 . #x2022)
    (#x96 . #x2013) (#x97 . #x2014)
    (#x98 . #x02DC) (#x99 . #x2122)
    (#x9A . #x0161) (#x9B . #x203A)
    (#x9C . #x0153) (#x9D . #x009D)
    (#x9E . #x017E) (#x9F . #x0178))
  "Alist of replacement code points for numeric character references.
Each element is (NUMBER . REPLACEMENT).  It is looked up by
`h5-consume-a-character-reference': when the number written in a
numeric reference is a key of this list, a parse error is reported
and REPLACEMENT is returned instead of NUMBER.")
(defun h5-consume-a-character-reference ()
  "Try to consume a character reference starting at point.
Point is expected to be on the character immediately after the
U+0026 AMPERSAND.

Return value:
- for a numeric reference, the code point to emit, after applying
  `h5-charref-overrides' and replacing surrogates and values above
  #x10FFFF with #xFFFD;
- for a named reference, the value `h5-trie-member-p' yields for the
  longest identifier found in `html5-named-character-references';
- nil when nothing could be parsed, in which case point is left
  where it started.

Parse errors are reported through `h5-parse-error'.  The variable
`*h5-additional-allowed-character*', when non-nil, names one extra
character that stops the attempt immediately."
  (let* ((char (char-after))
         (char-str (if (numberp char) (string char) char))
         (char2 (char-after (1+ (point))))
         hexadecimal-flag
         range
         num
         num-start
         num-end)
    (cond
     ;; Not a character reference: whitespace, `<', `&', EOF, or the
     ;; additional allowed character.  Nothing is consumed.
     ((or (h5-space-p char-str)
          (and (stringp char-str)
               (string-match "[<&]" char-str))
          ;; EOF: no character at point at all.
          (null char)
          ;; NOTE(review): kept from the original; this also treats
          ;; the last character of the buffer as EOF.  Confirm against
          ;; how `h5-consume-the-next-input-character' represents EOF.
          (eq (1+ (point)) (point-max))
          (eq char *h5-additional-allowed-character*))
      ;; Explicit nil: without a body `cond' would return the truthy
      ;; value of the test, which a caller would take for a result.
      nil)

     ;; Numeric character reference.
     ((eq char ?#)
      ;; Consume the U+0023 NUMBER SIGN.
      (forward-char 1)
      (setq range
            (cond ((or (eq char2 ?x) (eq char2 ?X))
                   ;; Consume the X; the digits are hexadecimal.
                   (setq hexadecimal-flag t)
                   (forward-char 1)
                   ;; Hex digits only (was "[0-9A-Za-z]", which also
                   ;; swallowed G-Z and g-z).
                   "[0-9A-Fa-f]")
                  (t
                   "[0-9]")))
      ;; Consume as many digits as possible.
      (setq num-start (point)
            num-end nil)
      (while (looking-at range)
        (forward-char 1))
      (cond
       ;; No digits: unconsume `#' (and the X), report, return nil.
       ((eq num-start (point))
        (backward-char (if hexadecimal-flag 2 1))
        (h5-parse-error)
        nil)
       (t
        (setq num-end (point))
        ;; A terminating semicolon is consumed; its absence is a
        ;; parse error.
        (if (looking-at ";")
            (forward-char 1)
          (h5-parse-error))
        (setq num
              (string-to-number
               (buffer-substring num-start num-end)
               (if hexadecimal-flag 16 10)))
        (let ((override (assoc num h5-charref-overrides)))
          (cond
           ;; Number listed in the override table: parse error, return
           ;; the replacement code point.
           (override
            (h5-parse-error)
            (cdr override))
           ;; Surrogates and out-of-range values: parse error, return
           ;; U+FFFD REPLACEMENT CHARACTER.
           ((or (and (>= num #xD800) (<= num #xDFFF))
                (> num #x10FFFF))
            (h5-parse-error)
            #xFFFD)
           ;; Otherwise the number is the code point.
           (t num))))))

     ;; Named character reference.
     (t
      (let ((beginning (point))
            (has-subtrie t)
            candidate
            hit
            match
            match-end)
        ;; Walk forward while the text read so far is still a prefix
        ;; of some identifier, remembering the longest prefix that is
        ;; itself an identifier.  Stopping at end of buffer avoids an
        ;; `end-of-buffer' error from `forward-char'.
        (while (and has-subtrie (not (eobp)))
          (forward-char 1)
          (setq candidate (buffer-substring beginning (point))
                has-subtrie (h5-trie-subtrie
                             html5-named-character-references candidate)
                hit (h5-trie-member-p
                     html5-named-character-references candidate))
          (when hit
            (setq match hit
                  match-end (point))))
        ;; Leave point after the longest identifier, or back at the
        ;; start when there was none.
        (goto-char (or match-end beginning))
        (cond
         ;; No match: nothing is consumed (point is at BEGINNING).
         ((null match)
          ;; Alphanumerics followed by `;' look like a misspelled
          ;; reference: parse error.  Parentheses are literal in
          ;; Emacs regexps, so the pattern must not contain them.
          (when (looking-at "[0-9a-zA-Z]+;")
            (h5-parse-error))
          nil)
         ;; Inside an attribute value, a reference without `;' that
         ;; is followed by `=' or an alphanumeric is, for historical
         ;; reasons, not a reference: unconsume everything.
         ((and (eq (h5-current-state)
                   'h5-character-reference-in-attribute-value-state)
               (not (eq (char-before) ?\;))
               (let ((next (char-after)))
                 ;; NEXT is nil at end of buffer; `string' would
                 ;; signal on it.
                 (and next
                      (string-match "[=0-9A-Za-z]" (string next)))))
          (goto-char beginning)
          nil)
         ;; A reference was parsed; a missing `;' is a parse error.
         (t
          (unless (eq (char-before) ?\;)
            (h5-parse-error))
          match)))))))
(defun html5-tok-forward (&optional from initial-state)
  "Run the tokenizer state machine forward, starting at FROM.
When FROM is non-nil point is first moved there; otherwise
tokenizing starts at point.  The machine starts in INITIAL-STATE, or
in `h5-data-state' when INITIAL-STATE is nil, and the function named
by the current state is called over and over.  The loop is left only
by a throw to the `h5-emit' catch tag, whose value is returned
\(presumably thrown by the emitting helpers, which are defined
elsewhere), or by an error.  An error is signalled when the current
state does not name a defined function."
  (if from
      (goto-char from))
  ;; Fresh tokenizer variables for this run.
  (let ((*h5-curtok* nil)
        (*h5-curattr* nil)
        (*h5-curstate* nil)
        (*h5-prevstate* nil)
        (*h5-statestart* (or from (point))))
    ;; The state machine starts in the data state unless told otherwise.
    (h5-switch-state (or initial-state 'h5-data-state))
    (catch 'h5-emit
      (while t
        ;; A state that is not a function means the machine itself is
        ;; broken, not the input.
        (unless (fboundp (h5-current-state))
          (error "Unknown state %s" (h5-current-state)))
        (funcall (h5-current-state))))))
(provide 'html5-tok)
;;; html5-tok.el ends here
Something went wrong with that request. Please try again.