Permalink
Browse files

making the super awesome sax parser go

  • Loading branch information...
1 parent f2ffa56 commit 0200a465839003b8bf4c03d1f218aebd9f638217 @tenderlove tenderlove committed Sep 21, 2008
Showing with 260 additions and 58 deletions.
  1. +53 −3 Manifest.txt
  2. +77 −52 ext/nokogiri/xml_sax_parser.c
  3. +40 −3 lib/nokogiri/xml/sax/document.rb
  4. +21 −0 lib/nokogiri/xml/sax/parser.rb
  5. +69 −0 test/xml/sax/test_parser.rb
View
@@ -2,40 +2,90 @@ History.txt
Manifest.txt
README.txt
Rakefile
-ext/nokogiri/Makefile
+ext/nokogiri/conftest.dSYM/Contents/Info.plist
+ext/nokogiri/conftest.dSYM/Contents/Resources/DWARF/conftest
ext/nokogiri/extconf.rb
ext/nokogiri/html_document.c
ext/nokogiri/html_document.h
-ext/nokogiri/mkmf.log
ext/nokogiri/native.c
ext/nokogiri/native.h
ext/nokogiri/xml_document.c
ext/nokogiri/xml_document.h
ext/nokogiri/xml_node.c
ext/nokogiri/xml_node.h
+ext/nokogiri/xml_node_set.c
+ext/nokogiri/xml_node_set.h
+ext/nokogiri/xml_sax_parser.c
+ext/nokogiri/xml_sax_parser.h
+ext/nokogiri/xml_text.c
+ext/nokogiri/xml_text.h
ext/nokogiri/xml_xpath.c
ext/nokogiri/xml_xpath.h
ext/nokogiri/xslt_stylesheet.c
ext/nokogiri/xslt_stylesheet.h
lib/nokogiri.rb
-lib/nokogiri/generated_interface.rb
+lib/nokogiri/css.rb
+lib/nokogiri/css/generated_tokenizer.rb
+lib/nokogiri/css/node.rb
+lib/nokogiri/css/parser.rb
+lib/nokogiri/css/parser.y
+lib/nokogiri/css/tokenizer.rb
+lib/nokogiri/css/tokenizer.rex
+lib/nokogiri/css/xpath_visitor.rb
+lib/nokogiri/decorators.rb
+lib/nokogiri/decorators/hpricot.rb
+lib/nokogiri/decorators/hpricot/node.rb
+lib/nokogiri/decorators/hpricot/node_set.rb
+lib/nokogiri/decorators/hpricot/xpath_visitor.rb
+lib/nokogiri/hpricot.rb
lib/nokogiri/html.rb
+lib/nokogiri/html/builder.rb
lib/nokogiri/html/document.rb
lib/nokogiri/version.rb
lib/nokogiri/xml.rb
+lib/nokogiri/xml/builder.rb
lib/nokogiri/xml/document.rb
lib/nokogiri/xml/node.rb
lib/nokogiri/xml/node_set.rb
+lib/nokogiri/xml/sax.rb
+lib/nokogiri/xml/sax/document.rb
+lib/nokogiri/xml/sax/parser.rb
+lib/nokogiri/xml/text.rb
lib/nokogiri/xml/text_node.rb
lib/nokogiri/xml/xpath.rb
lib/nokogiri/xslt.rb
lib/nokogiri/xslt/stylesheet.rb
nokogiri.gemspec
+test/css/test_parser.rb
+test/css/test_tokenizer.rb
test/files/staff.xml
test/files/staff.xslt
test/files/tlm.html
test/helper.rb
+test/hpricot/files/basic.xhtml
+test/hpricot/files/boingboing.html
+test/hpricot/files/cy0.html
+test/hpricot/files/immob.html
+test/hpricot/files/pace_application.html
+test/hpricot/files/tenderlove.html
+test/hpricot/files/uswebgen.html
+test/hpricot/files/utf8.html
+test/hpricot/files/week9.html
+test/hpricot/files/why.xml
+test/hpricot/load_files.rb
+test/hpricot/test_alter.rb
+test/hpricot/test_builder.rb
+test/hpricot/test_parser.rb
+test/hpricot/test_paths.rb
+test/hpricot/test_preserved.rb
+test/hpricot/test_xml.rb
+test/html/test_builder.rb
test/html/test_document.rb
+test/test_convert_xpath.rb
test/test_nokogiri.rb
+test/test_xslt_transforms.rb
+test/xml/sax/test_parser.rb
test/xml/test_document.rb
test/xml/test_node.rb
+test/xml/test_node_set.rb
+test/xml/test_text.rb
@@ -1,5 +1,11 @@
#include <xml_sax_parser.h>
+/*
+ * call-seq:
+ * parse_memory(data)
+ *
+ * Parse the document stored in +data+
+ */
static VALUE parse_memory(VALUE self, VALUE data)
{
xmlSAXHandlerPtr handler;
@@ -12,114 +18,132 @@ static VALUE parse_memory(VALUE self, VALUE data)
return data;
}
-static void internal_subset( void * ctx,
- const xmlChar *name,
- const xmlChar *external_id,
- const xmlChar *system_id )
+/*
+ * call-seq:
+ *  native_parse_file(data)
+ *
+ * Parse the file whose path is given by +data+, delivering SAX events to the
+ * callbacks stored in this parser's handler struct. Returns +data+.
+ */
+static VALUE native_parse_file(VALUE self, VALUE data)
+{
+ xmlSAXHandlerPtr handler;
+ const char * filename;
+
+ /* StringValueCStr, unlike StringValuePtr, guarantees a NUL-terminated
+  * buffer and raises ArgumentError on embedded NUL bytes, so libxml2 is
+  * never handed a silently truncated path. */
+ filename = StringValueCStr(data);
+
+ Data_Get_Struct(self, xmlSAXHandler, handler);
+ /* self travels as the user data so each callback can recover the Ruby object */
+ xmlSAXUserParseFile(handler, (void *)self, filename);
+ return data;
+}
+
/*
 * SAX startDocument callback (installed on the handler in allocate).
 * +ctx+ is cast back to the Ruby parser object; start_document is then called
 * on whatever that object's document method returns.
 */
+static void start_document(void * ctx)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("internal_subset"), 3,
- rb_str_new2((char *)name),
- rb_str_new2((char *)external_id),
- rb_str_new2((char *)system_id));
+ rb_funcall(doc, rb_intern("start_document"), 0);
}
-/* Not using these yet...
-static int is_standalone(void * ctx)
/*
 * SAX endDocument callback (installed on the handler in allocate).
 * +ctx+ is cast back to the Ruby parser object; end_document is then called
 * on whatever that object's document method returns.
 */
+static void end_document(void * ctx)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- if(Qtrue == rb_funcall(doc, rb_intern("standalone?"), 0))
- return 1;
-
- return 0;
+ rb_funcall(doc, rb_intern("end_document"), 0);
}
-static int has_internal_subset(void * ctx)
/*
 * SAX startElement callback (installed on the handler in allocate).
 * Every entry of the NULL-terminated +atts+ array (which may itself be NULL)
 * is appended, in order, to one flat Ruby array; the document's start_element
 * method then receives the element name and that array.
 * NOTE(review): libxml2 appears to deliver atts as alternating name/value
 * entries; no pairing is done here -- confirm that is what callers expect.
 */
+static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- if(Qtrue == rb_funcall(doc, rb_intern("internal_subset?"), 0))
- return 1;
+ VALUE attributes = rb_ary_new();
+ const xmlChar * attr;
+ int i = 0;
+ if(atts) {
+ while((attr = atts[i]) != NULL) {
+ rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((const char *)attr));
+ i++;
+ }
+ }
- return 0;
+ rb_funcall( doc,
+ rb_intern("start_element"),
+ 2,
+ rb_str_new2((const char *)name),
+ attributes
+ );
}
-static int has_external_subset(void * ctx)
/*
 * SAX endElement callback (installed on the handler in allocate).
 * Copies the NUL-terminated element +name+ into a Ruby string and passes it
 * to the end_element method of the parser's document.
 */
+static void end_element(void * ctx, const xmlChar *name)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- if(Qtrue == rb_funcall(doc, rb_intern("external_subset?"), 0))
- return 1;
+ rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((const char *)name));
+}
- return 0;
+/*
+ * SAX characters callback (installed on the handler in allocate).
+ * +ch+ is not treated as NUL terminated: exactly +len+ bytes are copied into
+ * a new Ruby string, which is handed to the characters method of the
+ * parser's document.
+ */
+static void characters_func(void * ctx, const xmlChar * ch, int len)
+{
+ VALUE parser = (VALUE)ctx;
+ VALUE receiver = rb_funcall(parser, rb_intern("document"), 0);
+
+ rb_funcall( receiver,
+ rb_intern("characters"),
+ 1,
+ rb_str_new((const char *)ch, (long)len)
+ );
}
-*/
-static void start_document(void * ctx)
/*
 * SAX comment callback (installed on the handler in allocate).
 * Copies the NUL-terminated comment text +value+ into a Ruby string and
 * passes it to the comment method of the parser's document.
 */
+static void comment_func(void * ctx, const xmlChar * value)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("start_document"), 0);
+ VALUE str = rb_str_new2((const char *)value);
+ rb_funcall(doc, rb_intern("comment"), 1, str);
}
-static void end_document(void * ctx)
+/*
+ * SAX warning callback (installed on the handler in allocate).
+ * Formats the printf-style +msg+ with its variadic arguments and passes the
+ * resulting text to the warning method of the parser's document. If the
+ * message cannot be formatted, the raw format string is reported instead.
+ */
+static void warning_func(void * ctx, const char *msg, ...)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("end_document"), 0);
+ char * message = NULL;
+ VALUE rb_message;
+
+ va_list args;
+ va_start(args, msg);
+ /* On failure vasprintf leaves the output pointer undefined, so reset it. */
+ if(vasprintf(&message, msg, args) < 0) message = NULL;
+ va_end(args);
+
+ rb_message = rb_str_new2(message ? message : msg);
+
+ /* rb_str_new2 copied the bytes; release the C buffer before calling into
+  * Ruby so it cannot leak if the warning handler raises. */
+ free(message);
+ rb_funcall(doc, rb_intern("warning"), 1, rb_message);
}
-static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
+/*
+ * SAX error callback (installed on the handler in allocate).
+ * Formats the printf-style +msg+ with its variadic arguments and passes the
+ * resulting text to the error method of the parser's document. If the
+ * message cannot be formatted, the raw format string is reported instead.
+ */
+static void error_func(void * ctx, const char *msg, ...)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- VALUE attributes = rb_ary_new();
- xmlChar * attr;
- int i = 0;
- if(atts) {
- while(attr = atts[i]) {
- rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((char *)attr));
- i++;
- }
- }
+ char * message = NULL;
+ VALUE rb_message;
- rb_funcall( doc,
- rb_intern("start_element"),
- 2,
- rb_str_new2((char *)name),
- attributes
- );
+ va_list args;
+ va_start(args, msg);
+ /* On failure vasprintf leaves the output pointer undefined, so reset it. */
+ if(vasprintf(&message, msg, args) < 0) message = NULL;
+ va_end(args);
+
+ rb_message = rb_str_new2(message ? message : msg);
+
+ /* rb_str_new2 copied the bytes; release the C buffer before calling into
+  * Ruby so it cannot leak if the error handler raises. */
+ free(message);
+ rb_funcall(doc, rb_intern("error"), 1, rb_message);
}
-static void end_element(void * ctx, const xmlChar *name)
/*
 * SAX cdataBlock callback (installed on the handler in allocate).
 * Copies exactly +len+ bytes starting at +value+ into a Ruby string and
 * passes it to the cdata_block method of the parser's document.
 */
+static void cdata_block(void * ctx, const xmlChar * value, int len)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((char *)name));
+ VALUE string = rb_str_new((const char *)value, (long)len);
+ rb_funcall(doc, rb_intern("cdata_block"), 1, string);
}
/*
 * Free function registered through Data_Wrap_Struct in allocate: releases the
 * xmlSAXHandler struct that allocate obtained from calloc.
 */
static void deallocate(xmlSAXHandlerPtr handler)
{
- /* FIXME */
free(handler);
}
/*
 * Allocation function for the SAX parser class: creates a zero-filled
 * xmlSAXHandler, installs the callbacks defined in this file, and wraps the
 * struct in a Ruby object of +klass+ whose free function is deallocate.
 * Callbacks that are not assigned stay NULL because of calloc.
 */
static VALUE allocate(VALUE klass)
{
xmlSAXHandlerPtr handler = calloc(1, sizeof(xmlSAXHandler));
+ /* Raise NoMemoryError instead of writing through a NULL pointer below. */
+ if(NULL == handler) rb_memerror();
- handler->internalSubset = internal_subset;
- /*
- handler->isStandalone = is_standalone;
- handler->hasInternalSubset = has_internal_subset;
- handler->hasExternalSubset = has_external_subset;
- */
handler->startDocument = start_document;
handler->endDocument = end_document;
handler->startElement = start_element;
handler->endElement = end_element;
+ handler->characters = characters_func;
+ handler->comment = comment_func;
+ handler->warning = warning_func;
+ handler->error = error_func;
+ handler->cdataBlock = cdata_block;
return Data_Wrap_Struct(klass, NULL, deallocate, handler);
}
@@ -131,4 +155,5 @@ void init_xml_sax_parser()
rb_const_get(mNokogiriXmlSax, rb_intern("Parser"));
rb_define_alloc_func(klass, allocate);
rb_define_method(klass, "parse_memory", parse_memory, 1);
+ rb_define_private_method(klass, "native_parse_file", native_parse_file, 1);
}
@@ -2,20 +2,57 @@ module Nokogiri
module XML
module SAX
###
# Receiver for SAX parser events. Every callback here is an empty method,
# so an instance of this class ignores all events; the parser is given a
# document object and the native callbacks invoke these methods on it.
class Document
- def internal_subset name, external_id, system_id
- end
-
+ ###
+ # Called when document starts parsing
def start_document
end
+ ###
+ # Called when document ends parsing
def end_document
end
+ ###
+ # Called at the beginning of an element
+ # +name+ is the name of the tag; +attrs+ is the array of attribute
+ # strings collected by the native parser
def start_element name, attrs = []
end
+ ###
+ # Called at the end of an element
+ # +name+ is the tag name
def end_element name
end
+
+ ###
+ # Called with character data found between tags
+ # +string+ contains the character data
+ def characters string
+ end
+
+ ###
+ # Called when comments are encountered
+ # +string+ contains the comment data
+ def comment string
+ end
+
+ ###
+ # Called on document warnings
+ # +string+ contains the warning
+ def warning string
+ end
+
+ ###
+ # Called on document errors
+ # +string+ contains the error
+ def error string
+ end
+
+ ###
+ # Called when cdata blocks are found
+ # +string+ contains the cdata content
+ def cdata_block string
+ end
end
end
end
@@ -6,6 +6,27 @@ class Parser
###
# Create a parser that reports events to +doc+; when no document is
# given, a new SAX::Document is used.
def initialize(doc = SAX::Document.new)
@document = doc
end
+
+ ###
+ # Parse given +thing+ which may be a string containing xml, or an
+ # IO object.
+ def parse thing
+ parse_memory(thing.is_a?(IO) ? thing.read : thing)
+ end
+
+ ###
+ # Parse given +io+
+ def parse_io io
+ parse_memory io.read
+ end
+
+ ###
+ # Parse a file with +filename+
+ def parse_file filename
+ raise Errno::ENOENT unless File.exists?(filename)
+ raise Errno::EISDIR if File.directory?(filename)
+ native_parse_file filename
+ end
end
end
end
Oops, something went wrong.

0 comments on commit 0200a46

Please sign in to comment.