Permalink
Browse files

Merge pull request #167 from CodeFridge/feature/encoding_option2

Encoding and baseUrl can be specified for html
  • Loading branch information...
2 parents 74b5845 + a055efc commit 95c614c961e730fba8a523b8a8616c2149c1f35f @defunctzombie defunctzombie committed Sep 12, 2012
Showing with 88 additions and 15 deletions.
  1. +16 −8 lib/document.js
  2. +29 −7 src/xml_document.cc
  3. +10 −0 test/fixtures/parser.euc_jp.html
  4. +33 −0 test/html_parser.js
View
@@ -56,22 +56,22 @@ Document.prototype.childNodes = function() {
/// @return a string representation of the document
Document.prototype.toString = function() {
return this._toString();
-}
+};
/// @return the document version
Document.prototype.version = function() {
return this._version();
-}
+};
/// @return the document encoding
Document.prototype.encoding = function(encoding) {
return this._encoding(encoding);
-}
+};
/// @return whether the XmlDocument is valid
Document.prototype.validate = function(xsd) {
return this._validate(xsd);
-}
+};
/// @return array of namespaces in document
Document.prototype.namespaces = function() {
@@ -82,15 +82,23 @@ module.exports = Document;
/// parse a string into a html document
/// @param string html string to parse
+/// @param {encoding:string, baseUrl:string} opts optional parsing options
/// @return a Document
-module.exports.fromHtml = function(string) {
- return bindings.fromHtml(string);
-}
+module.exports.fromHtml = function(string, opts) {
+ opts = opts || {};
+
+ // if for some reason user did not specify an object for the options
+ if (typeof(opts) !== 'object') {
+ throw new Error('fromHtml options must be an object');
+ }
+
+ return bindings.fromHtml(string, opts);
+};
/// parse a string into a xml document
/// @param string xml string to parse
/// @return a Document
module.exports.fromXml = function(string) {
return bindings.fromXml(string);
-}
+};
View
@@ -135,22 +135,44 @@ XmlDocument::FromHtml(const v8::Arguments& args)
{
v8::HandleScope scope;
+ v8::Local<v8::Object> options = args[1]->ToObject();
+ v8::Local<v8::Value> baseUrlOpt = options->Get(
+ v8::String::NewSymbol("baseUrl"));
+ v8::Local<v8::Value> encodingOpt = options->Get(
+ v8::String::NewSymbol("encoding"));
+
+ // the base URL that will be used for this HTML parsed document
+ v8::String::Utf8Value baseUrl_(baseUrlOpt->ToString());
+ const char * baseUrl = *baseUrl_;
+ if (!baseUrlOpt->IsString()) {
+ baseUrl = NULL;
+ }
+
+ // the encoding to be used for this document
+ // (leave NULL for libxml to autodetect)
+ v8::String::Utf8Value encoding_(encodingOpt->ToString());
+ const char * encoding = *encoding_;
+
+ if (!encodingOpt->IsString()) {
+ encoding = NULL;
+ }
+
v8::Local<v8::Array> errors = v8::Array::New();
xmlResetLastError();
xmlSetStructuredErrorFunc(reinterpret_cast<void *>(*errors),
XmlSyntaxError::PushToArray);
htmlDocPtr doc;
if (!node::Buffer::HasInstance(args[0])) {
- // Parse a string
- v8::String::Utf8Value str(args[0]->ToString());
- doc = htmlReadMemory(*str, str.length(), NULL, NULL, 0);
+ // Parse a string
+ v8::String::Utf8Value str(args[0]->ToString());
+ doc = htmlReadMemory(*str, str.length(), baseUrl, encoding, 0);
}
else {
- // Parse a buffer
- v8::Local<v8::Object> buf = args[0]->ToObject();
- doc = htmlReadMemory(node::Buffer::Data(buf), node::Buffer::Length(buf),
- NULL, NULL, 0);
+ // Parse a buffer
+ v8::Local<v8::Object> buf = args[0]->ToObject();
+ doc = htmlReadMemory(node::Buffer::Data(buf), node::Buffer::Length(buf),
+ baseUrl, encoding, 0);
}
xmlSetStructuredErrorFunc(NULL, NULL);
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=euc-jp" />
+ <title>テスト</title>
+ </head>
+ <body>
+ <div>テスト</div>
+ </body>
+</html>
View
@@ -32,6 +32,39 @@ module.exports.parse = function(assert) {
assert.done();
};
+// Although libxml defaults to a utf-8 encoding, when no encoding is explicitly
+// specified it will guess the encoding based on any meta http-equiv tags available.
+// This test shows that the "guessed" encoding can be overridden.
+module.exports.parse_force_encoding = function(assert) {
+ var filename = __dirname + '/fixtures/parser.euc_jp.html';
+
+ function attempt_parse(encoding, opts) {
+ var str = fs.readFileSync(filename, encoding);
+
+ var doc = libxml.parseHtml(str, opts);
+ assert.equal('html', doc.root().name());
+
+ // make sure libxml rewrites the meta charset of this document
+
+ // calling toString on the document ensures that it is converted to the
+ // correct internal format and that the meta tag is replaced with the new one
+ doc.root().toString();
+ var fixedCharset = doc.find('/html/head/meta/@content')[0].value();
+ assert.ok( fixedCharset.indexOf(opts.encoding.toUpperCase() ) !== -1);
+
+ assert.equal('テスト', doc.get('head/title').text());
+ assert.equal('テスト', doc.get('body/div').text());
+ }
+
+ // Parse via a string
+ attempt_parse('utf-8', {encoding: 'utf-8'});
+
+ // Parse via a Buffer
+ attempt_parse(null, {encoding: 'utf-8'});
+
+ assert.done();
+};
+
module.exports.parse_synonym = function(assert) {
assert.strictEqual(libxml.parseHtml, libxml.parseHtmlString);
assert.done();

0 comments on commit 95c614c

Please sign in to comment.