Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

making the super awesome sax parser go

  • Loading branch information...
commit 0200a465839003b8bf4c03d1f218aebd9f638217 1 parent f2ffa56
@tenderlove tenderlove authored
View
56 Manifest.txt
@@ -2,40 +2,90 @@ History.txt
Manifest.txt
README.txt
Rakefile
-ext/nokogiri/Makefile
+ext/nokogiri/conftest.dSYM/Contents/Info.plist
+ext/nokogiri/conftest.dSYM/Contents/Resources/DWARF/conftest
ext/nokogiri/extconf.rb
ext/nokogiri/html_document.c
ext/nokogiri/html_document.h
-ext/nokogiri/mkmf.log
ext/nokogiri/native.c
ext/nokogiri/native.h
ext/nokogiri/xml_document.c
ext/nokogiri/xml_document.h
ext/nokogiri/xml_node.c
ext/nokogiri/xml_node.h
+ext/nokogiri/xml_node_set.c
+ext/nokogiri/xml_node_set.h
+ext/nokogiri/xml_sax_parser.c
+ext/nokogiri/xml_sax_parser.h
+ext/nokogiri/xml_text.c
+ext/nokogiri/xml_text.h
ext/nokogiri/xml_xpath.c
ext/nokogiri/xml_xpath.h
ext/nokogiri/xslt_stylesheet.c
ext/nokogiri/xslt_stylesheet.h
lib/nokogiri.rb
-lib/nokogiri/generated_interface.rb
+lib/nokogiri/css.rb
+lib/nokogiri/css/generated_tokenizer.rb
+lib/nokogiri/css/node.rb
+lib/nokogiri/css/parser.rb
+lib/nokogiri/css/parser.y
+lib/nokogiri/css/tokenizer.rb
+lib/nokogiri/css/tokenizer.rex
+lib/nokogiri/css/xpath_visitor.rb
+lib/nokogiri/decorators.rb
+lib/nokogiri/decorators/hpricot.rb
+lib/nokogiri/decorators/hpricot/node.rb
+lib/nokogiri/decorators/hpricot/node_set.rb
+lib/nokogiri/decorators/hpricot/xpath_visitor.rb
+lib/nokogiri/hpricot.rb
lib/nokogiri/html.rb
+lib/nokogiri/html/builder.rb
lib/nokogiri/html/document.rb
lib/nokogiri/version.rb
lib/nokogiri/xml.rb
+lib/nokogiri/xml/builder.rb
lib/nokogiri/xml/document.rb
lib/nokogiri/xml/node.rb
lib/nokogiri/xml/node_set.rb
+lib/nokogiri/xml/sax.rb
+lib/nokogiri/xml/sax/document.rb
+lib/nokogiri/xml/sax/parser.rb
+lib/nokogiri/xml/text.rb
lib/nokogiri/xml/text_node.rb
lib/nokogiri/xml/xpath.rb
lib/nokogiri/xslt.rb
lib/nokogiri/xslt/stylesheet.rb
nokogiri.gemspec
+test/css/test_parser.rb
+test/css/test_tokenizer.rb
test/files/staff.xml
test/files/staff.xslt
test/files/tlm.html
test/helper.rb
+test/hpricot/files/basic.xhtml
+test/hpricot/files/boingboing.html
+test/hpricot/files/cy0.html
+test/hpricot/files/immob.html
+test/hpricot/files/pace_application.html
+test/hpricot/files/tenderlove.html
+test/hpricot/files/uswebgen.html
+test/hpricot/files/utf8.html
+test/hpricot/files/week9.html
+test/hpricot/files/why.xml
+test/hpricot/load_files.rb
+test/hpricot/test_alter.rb
+test/hpricot/test_builder.rb
+test/hpricot/test_parser.rb
+test/hpricot/test_paths.rb
+test/hpricot/test_preserved.rb
+test/hpricot/test_xml.rb
+test/html/test_builder.rb
test/html/test_document.rb
+test/test_convert_xpath.rb
test/test_nokogiri.rb
+test/test_xslt_transforms.rb
+test/xml/sax/test_parser.rb
test/xml/test_document.rb
test/xml/test_node.rb
+test/xml/test_node_set.rb
+test/xml/test_text.rb
View
129 ext/nokogiri/xml_sax_parser.c
@@ -1,5 +1,11 @@
#include <xml_sax_parser.h>
+/*
+ * call-seq:
+ * parse_memory(data)
+ *
+ * Parse the document stored in +data+
+ */
static VALUE parse_memory(VALUE self, VALUE data)
{
xmlSAXHandlerPtr handler;
@@ -12,97 +18,116 @@ static VALUE parse_memory(VALUE self, VALUE data)
return data;
}
-static void internal_subset( void * ctx,
- const xmlChar *name,
- const xmlChar *external_id,
- const xmlChar *system_id )
+static VALUE native_parse_file(VALUE self, VALUE data)
+{
+ xmlSAXHandlerPtr handler;
+ Data_Get_Struct(self, xmlSAXHandler, handler);
+ xmlSAXUserParseFile( handler,
+ (void *)self,
+ StringValuePtr(data)
+ );
+ return data;
+}
+
+static void start_document(void * ctx)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("internal_subset"), 3,
- rb_str_new2((char *)name),
- rb_str_new2((char *)external_id),
- rb_str_new2((char *)system_id));
+ rb_funcall(doc, rb_intern("start_document"), 0);
}
-/* Not using these yet...
-static int is_standalone(void * ctx)
+static void end_document(void * ctx)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- if(Qtrue == rb_funcall(doc, rb_intern("standalone?"), 0))
- return 1;
-
- return 0;
+ rb_funcall(doc, rb_intern("end_document"), 0);
}
-static int has_internal_subset(void * ctx)
+static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- if(Qtrue == rb_funcall(doc, rb_intern("internal_subset?"), 0))
- return 1;
+ VALUE attributes = rb_ary_new();
+ const xmlChar * attr;
+ int i = 0;
+ if(atts) {
+ while((attr = atts[i]) != NULL) {
+ rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((const char *)attr));
+ i++;
+ }
+ }
- return 0;
+ rb_funcall( doc,
+ rb_intern("start_element"),
+ 2,
+ rb_str_new2((const char *)name),
+ attributes
+ );
}
-static int has_external_subset(void * ctx)
+static void end_element(void * ctx, const xmlChar *name)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- if(Qtrue == rb_funcall(doc, rb_intern("external_subset?"), 0))
- return 1;
+ rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((const char *)name));
+}
- return 0;
+static void characters_func(void * ctx, const xmlChar * ch, int len)
+{
+ VALUE self = (VALUE)ctx;
+ VALUE doc = rb_funcall(self, rb_intern("document"), 0);
+ VALUE str = rb_str_new((const char *)ch, (long)len);
+ rb_funcall(doc, rb_intern("characters"), 1, str);
}
-*/
-static void start_document(void * ctx)
+static void comment_func(void * ctx, const xmlChar * value)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("start_document"), 0);
+ VALUE str = rb_str_new2((const char *)value);
+ rb_funcall(doc, rb_intern("comment"), 1, str);
}
-static void end_document(void * ctx)
+static void warning_func(void * ctx, const char *msg, ...)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("end_document"), 0);
+ char * message;
+
+ va_list args;
+ va_start(args, msg);
+ vasprintf(&message, msg, args);
+ va_end(args);
+
+ rb_funcall(doc, rb_intern("warning"), 1, rb_str_new2(message));
+ free(message);
}
-static void start_element(void * ctx, const xmlChar *name, const xmlChar **atts)
+static void error_func(void * ctx, const char *msg, ...)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- VALUE attributes = rb_ary_new();
- xmlChar * attr;
- int i = 0;
- if(atts) {
- while(attr = atts[i]) {
- rb_funcall(attributes, rb_intern("<<"), 1, rb_str_new2((char *)attr));
- i++;
- }
- }
+ char * message;
- rb_funcall( doc,
- rb_intern("start_element"),
- 2,
- rb_str_new2((char *)name),
- attributes
- );
+ va_list args;
+ va_start(args, msg);
+ vasprintf(&message, msg, args);
+ va_end(args);
+
+ rb_funcall(doc, rb_intern("error"), 1, rb_str_new2(message));
+ free(message);
}
-static void end_element(void * ctx, const xmlChar *name)
+static void cdata_block(void * ctx, const xmlChar * value, int len)
{
VALUE self = (VALUE)ctx;
VALUE doc = rb_funcall(self, rb_intern("document"), 0);
- rb_funcall(doc, rb_intern("end_element"), 1, rb_str_new2((char *)name));
+ VALUE string = rb_str_new((const char *)value, (long)len);
+ rb_funcall(doc, rb_intern("cdata_block"), 1, string);
}
static void deallocate(xmlSAXHandlerPtr handler)
{
- /* FIXME */
free(handler);
}
@@ -110,16 +135,15 @@ static VALUE allocate(VALUE klass)
{
xmlSAXHandlerPtr handler = calloc(1, sizeof(xmlSAXHandler));
- handler->internalSubset = internal_subset;
- /*
- handler->isStandalone = is_standalone;
- handler->hasInternalSubset = has_internal_subset;
- handler->hasExternalSubset = has_external_subset;
- */
handler->startDocument = start_document;
handler->endDocument = end_document;
handler->startElement = start_element;
handler->endElement = end_element;
+ handler->characters = characters_func;
+ handler->comment = comment_func;
+ handler->warning = warning_func;
+ handler->error = error_func;
+ handler->cdataBlock = cdata_block;
return Data_Wrap_Struct(klass, NULL, deallocate, handler);
}
@@ -131,4 +155,5 @@ void init_xml_sax_parser()
rb_const_get(mNokogiriXmlSax, rb_intern("Parser"));
rb_define_alloc_func(klass, allocate);
rb_define_method(klass, "parse_memory", parse_memory, 1);
+ rb_define_private_method(klass, "native_parse_file", native_parse_file, 1);
}
View
43 lib/nokogiri/xml/sax/document.rb
@@ -2,20 +2,57 @@ module Nokogiri
module XML
module SAX
class Document
- def internal_subset name, external_id, system_id
- end
-
+ ###
+ # Called when document starts parsing
def start_document
end
+ ###
+ # Called when document ends parsing
def end_document
end
+ ###
+ # Called at the beginning of an element
+ # +name+ is the name of the tag with +attrs+ as attributes
def start_element name, attrs = []
end
+ ###
+ # Called at the end of an element
+ # +name+ is the tag name
def end_element name
end
+
+ ###
+ # Called with the character data found between tags
+ # +string+ contains the character data
+ def characters string
+ end
+
+ ###
+ # Called when comments are encountered
+ # +string+ contains the comment data
+ def comment string
+ end
+
+ ###
+ # Called on document warnings
+ # +string+ contains the warning
+ def warning string
+ end
+
+ ###
+ # Called on document errors
+ # +string+ contains the error
+ def error string
+ end
+
+ ###
+ # Called when cdata blocks are found
+ # +string+ contains the cdata content
+ def cdata_block string
+ end
end
end
end
View
21 lib/nokogiri/xml/sax/parser.rb
@@ -6,6 +6,27 @@ class Parser
def initialize(doc = SAX::Document.new)
@document = doc
end
+
+ ###
+ # Parse the given +thing+, which may be a String containing XML or an
+ # IO object.
+ def parse thing
+ parse_memory(thing.is_a?(IO) ? thing.read : thing)
+ end
+
+ ###
+ # Parse the given +io+ by reading its entire contents
+ def parse_io io
+ parse_memory io.read
+ end
+
+ ###
+ # Parse a file with +filename+
+ def parse_file filename
+ raise Errno::ENOENT unless File.exists?(filename)
+ raise Errno::EISDIR if File.directory?(filename)
+ native_parse_file filename
+ end
end
end
end
View
69 test/xml/sax/test_parser.rb
@@ -7,6 +7,7 @@ class TestParser < Nokogiri::TestCase
class Doc < SAX::Document
attr_reader :start_elements, :start_document_called
attr_reader :end_elements, :end_document_called
+ attr_reader :data, :comments, :cdata_blocks
def start_document
@start_document_called = true
@@ -27,12 +28,80 @@ def end_element *args
(@end_elements ||= []) << args
super
end
+
+ def characters string
+ @data ||= []
+ @data += [string]
+ super
+ end
+
+ def comment string
+ @comments ||= []
+ @comments += [string]
+ super
+ end
+
+ def cdata_block string
+ @cdata_blocks ||= []
+ @cdata_blocks += [string]
+ super
+ end
end
def setup
@parser = XML::SAX::Parser.new(Doc.new)
end
+ def test_parse
+ File.open(XML_FILE, 'rb') { |f|
+ @parser.parse(f)
+ }
+ @parser.parse(File.read(XML_FILE))
+ end
+
+ def test_parse_io
+ File.open(XML_FILE, 'rb') { |f|
+ @parser.parse_io(f)
+ }
+ end
+
+ def test_parse_file
+ @parser.parse_file(XML_FILE)
+ assert_raises(Errno::ENOENT) {
+ @parser.parse_file('')
+ }
+ assert_raises(Errno::EISDIR) {
+ @parser.parse_file(File.expand_path(File.dirname(__FILE__)))
+ }
+ end
+
+ def test_ctag
+ @parser.parse_memory(<<-eoxml)
+ <p id="asdfasdf">
+ <![CDATA[ This is a comment ]]>
+ Paragraph 1
+ </p>
+ eoxml
+ assert_equal [' This is a comment '], @parser.document.cdata_blocks
+ end
+
+ def test_comment
+ @parser.parse_memory(<<-eoxml)
+ <p id="asdfasdf">
+ <!-- This is a comment -->
+ Paragraph 1
+ </p>
+ eoxml
+ assert_equal [' This is a comment '], @parser.document.comments
+ end
+
+ def test_characters
+ @parser.parse_memory(<<-eoxml)
+ <p id="asdfasdf">Paragraph 1</p>
+ eoxml
+ assert_equal ['Paragraph 1'], @parser.document.data
+ end
+
def test_end_document
@parser.parse_memory(<<-eoxml)
<p id="asdfasdf">Paragraph 1</p>
Please sign in to comment.
Something went wrong with that request. Please try again.