Permalink
Browse files

Initial implementation of the HTML5 tokenizer in Emacs Lisp.

  • Loading branch information...
1 parent 09e8907 commit f6e6f08643fc88848eac997bcbe72d2a9ff4bbdf @hober committed Sep 29, 2010
Showing with 5,774 additions and 2,143 deletions.
  1. +1 −0 .gitignore
  2. +15 −0 Makefile
  3. +116 −0 h5-maint.el
  4. +2,177 −2,140 html5-ncr.el
  5. +3,424 −0 html5-tok.el
  6. +40 −3 tools/build-ncr.py
  7. +1 −0 tools/extract-parsing-algorithm.py
View
@@ -1,3 +1,4 @@
+html5lib
syntax
webapps
*.elc
View
@@ -20,6 +20,8 @@
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+EMACS=emacs
+
dummy:
echo "USAGE: $(make) [html5-ncr.el]"
@@ -38,6 +40,16 @@ html5-ncr.el: webapps tools/build-ncr.py
html5-langs.el: tools/build-langs.py language-subtag-registry
python tools/build-langs.py language-subtag-registry > html5-langs.el
+test: html5lib/testdata/tokenizer/test1.test
+ @$(EMACS) -batch -l h5-maint.el -f h5-run-tests html5lib/testdata/tokenizer/*.test
+
+.el.elc:
+ @$(EMACS) -batch -f batch-byte-compile $*.el \
+ || (echo "Perhaps you should specifcy LOAD_PATH to make?" \
+ "(e.g. \"gmake LOAD_PATH=~/elisp\".)" \
+ && echo "Please see README for compilation instructions." \
+ && exit 1)
+
## External repositories
# (non-normative) RELAX NG schema for HTML5
@@ -49,6 +61,9 @@ webapps:
# Two- and three-letter language codes
language-subtag-registry:
curl -O http://www.iana.org/assignments/language-subtag-registry
+html5lib/testdata/tokenizer/test1.test: html5lib
+html5lib:
+ hg clone https://html5lib.googlecode.com/hg/ html5lib
update:
cd relaxng; svn up
View
@@ -0,0 +1,116 @@
+(add-to-list 'load-path default-directory)
+(mapc (lambda (dir) (add-to-list 'load-path dir))
+ (parse-colon-path (getenv "LOAD_PATH")))
+
+(require 'json)
+(require 'html5-tok)
+
+(defun h5-explore-text-properties ()
+ (when (< (point) (point-max))
+ (describe-text-properties (point))))
+
+(define-minor-mode h5-explore-text-properties-mode
+ "Explore the text properties in this buffer."
+ nil " xtp" nil
+ (if explore-text-properties-mode
+ (add-hook 'post-command-hook 'h5-explore-text-properties nil t)
+ (remove-hook 'post-command-hook 'h5-explore-text-properties t)))
+
+(defun h5-munge-state-name (state-name)
+ (intern (format "h5-%s"
+ (mapconcat 'identity (split-string state-name) "-"))))
+
+(defsubst h5-test-input (test)
+ (cdr (assoc 'input test)))
+(defsubst h5-test-initial-states (test)
+ (cdr (assoc 'initialStates test)))
+(defsubst h5-test-output (test)
+ (cdr (assoc 'output test)))
+(defsubst h5-test-description (test)
+ (cdr (assoc 'description test)))
+
+(defun h5-coalesce-chars (in-toks)
+ (let ((out-toks '())
+ (in-charrun nil)
+ (charrun '()))
+ (dolist (tok in-toks)
+ (cond
+ ((and (numberp tok) (not in-charrun))
+ (setq in-charrun t
+ charrun '())
+ (push tok charrun))
+ ((and (numberp tok) in-charrun)
+ (push tok charrun))
+ (in-charrun
+ (push (apply 'string (nreverse charrun)) out-toks)
+ (push tok out-toks)
+ (setq in-charrun nil
+ charrun '()))
+ (t
+ (push tok out-toks))))
+ (if in-charrun
+ (push (apply 'string (nreverse charrun)) out-toks))
+ (nreverse out-toks)))
+
+(defun h5-compare-token-to-expected (actual expected)
+ (cond
+ ((stringp actual)
+ (and (listp expected)
+ (eq (car expected) 'Character)
+ (string-equal actual (cadr expected))))
+ (t
+ (error "comparing apples and oranges?"))))
+
+(defun h5-compare-tokens-to-expected-output (actual expected)
+ (every
+ 'identity
+ (loop for actual-tok in actual
+ for expected-tok in expected
+ collect (h5-compare-token-to-expected actual-tok expected-tok))))
+
+(defun h5-run-test-1 (test initial-state)
+ (let ((tokens '())
+ (token t))
+ (with-temp-buffer
+ (insert (h5-test-input test))
+ (goto-char (point-min))
+ (while (not (eq token :eof))
+ (setq token (html5-tok-forward
+ nil
+ (if initial-state (h5-munge-state-name initial-state))))
+ (setq initial-state nil)
+ (unless (eq token :eof)
+ (push token tokens)))
+ (setq tokens (h5-coalesce-chars (nreverse tokens)))
+ (h5-compare-tokens-to-expected-output tokens (h5-test-output test)))))
+
+(defun h5-run-test (test)
+ (let ((initial-states (or (h5-test-initial-states test) (list "data state"))))
+ (condition-case e
+ (dolist (initial-state initial-states)
+ (h5-run-test-1 test initial-state))
+ (error
+ (princ (format "Blew up: %s!\n" e))
+ nil))))
+
+(defun h5-run-tests ()
+ (let ((total 0)
+ (passed 0))
+ (dolist (testfile command-line-args-left)
+ (princ (format "Running tests from %s:\n"
+ (file-name-nondirectory testfile)))
+ (let ((testdata
+ (let ((json-object-type 'alist)
+ (json-array-type 'list))
+ (json-read-file testfile))))
+ (dolist (test (cdr (assoc 'tests testdata)))
+ (incf total)
+ (princ (format "\t'%s': %s\n"
+ (h5-test-description test)
+ (if (h5-run-test test)
+ (prog1 "PASS" (incf passed))
+ "FAIL"))))))
+ (princ (format "Passed %d out of %d tests (%02.1f%%)"
+ passed total
+ (* 100 (/ (float passed) (float total))))))
+ (setq command-line-args-left nil))
Oops, something went wrong.

0 comments on commit f6e6f08

Please sign in to comment.