Skip to content

Commit

Permalink
Close nodejs#1149 IDNA and Punycode support in url.parse
Browse files Browse the repository at this point in the history
Using @bnoordhuis's punycode lib.

Close nodejs#1174 also
  • Loading branch information
jeremys authored and isaacs committed Jul 6, 2011
1 parent 2dfed9f commit 786c714
Show file tree
Hide file tree
Showing 5 changed files with 357 additions and 12 deletions.
2 changes: 2 additions & 0 deletions LICENSE
Expand Up @@ -69,3 +69,5 @@ The externally maintained libraries used by Node are:

- lib/buffer_ieee754.js is copyright 2008 Fair Oaks Labs, Inc. and released
under the New BSD license.

- lib/punycode.js is copyright 2011 Ben Noordhuis and released under the MIT license.
218 changes: 218 additions & 0 deletions lib/punycode.js
@@ -0,0 +1,218 @@
// Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// Public API of this module: the encode() and decode() functions defined below.
exports.encode = encode;
exports.decode = decode;

// Algorithm parameters. The names and values correspond to the Punycode
// parameter set listed in RFC 3492, section 5.
var TMIN = 1; // smallest value threshold() returns
var TMAX = 26; // largest value threshold() returns
var BASE = 36; // number of digit values; the digit loops advance k by BASE
var SKEW = 38; // added to delta in the final quotient of adapt_bias()
var DAMP = 700; // initial bias scaler
var INITIAL_N = 128; // first code point considered by encode()/decode()
var INITIAL_BIAS = 72; // bias used for the first delta

// Bias adaptation step of Punycode (RFC 3492, section 6.1).
//
// delta    - the delta that has just been encoded or decoded.
// n_points - number of code points handled so far, including the current one.
// is_first - true for the first delta of a string; that delta is scaled down
//            by DAMP instead of being halved.
//
// Returns the bias to use for the next delta (a non-negative integer).
function adapt_bias(delta, n_points, is_first) {
  // Every division in the RFC's adapt() is an integer division. Truncate
  // immediately: a fractional delta would otherwise leak into the final
  // quotient and can change the result (delta=3, n_points=1 gave 2, not 1).
  delta = Math.floor(delta / (is_first ? DAMP : 2));
  delta += Math.floor(delta / n_points);

  var s = BASE - TMIN;
  var t = Math.floor((s * TMAX) / 2); // threshold=455

  // Shrink delta until it is at most the threshold, counting BASE per step.
  var k = 0;
  while (delta > t) {
    delta = Math.floor(delta / s);
    k += BASE;
  }

  return k + Math.floor(((s + 1) * delta) / (delta + SKEW));
}

// Returns the smallest entry of `codepoints` that is >= n.
// Entries at or above 0x110000 are never selected; if no entry qualifies,
// an Error is thrown.
function next_smallest_codepoint(codepoints, n) {
  var LIMIT = 0x110000; // unicode upper bound + 1
  var best = LIMIT;

  // Scan order does not matter for a minimum, so walk from the end.
  for (var idx = codepoints.length - 1; idx >= 0; --idx) {
    var candidate = codepoints[idx];
    if (candidate < n) {
      continue;
    }
    if (candidate < best) {
      best = candidate;
    }
  }

  // sanity check - should not happen
  if (best >= LIMIT) {
    throw new Error('Next smallest code point not found.');
  }

  return best;
}

// Maps a digit value to a character code: values below 26 are offset by 97
// (so 0 becomes 'a'), all other values are offset by 22 (so 26 becomes '0').
function encode_digit(d) {
  if (d < 26) {
    return d + 97;
  }
  return d + 22;
}

// Maps a character code back to its digit value:
//   '0'..'9' (48..57)  -> 26..35
//   'A'..'Z' (65..90)  -> 0..25
//   'a'..'z' (97..122) -> 0..25
// Any other code throws an Error.
function decode_digit(d) {
  var is_number = d >= 48 && d <= 57;
  var is_upper = d >= 65 && d <= 90;
  var is_lower = d >= 97 && d <= 122;

  if (!is_number && !is_upper && !is_lower) {
    throw new Error('Illegal digit #' + d);
  }

  return is_number ? d - 22 : (is_upper ? d - 65 : d - 97);
}

// Returns k - bias, except that TMIN is returned when k <= bias + TMIN and
// TMAX is returned when k >= bias + TMAX.
function threshold(k, bias) {
  var lower = bias + TMIN;
  var upper = bias + TMAX;

  return k <= lower ? TMIN : (k >= upper ? TMAX : k - bias);
}

// Encodes `delta` as a sequence of digit character codes, using threshold()
// with the given `bias` to pick the digit range at each position.
// Returns an array of character codes (as produced by encode_digit()).
function encode_int(bias, delta) {
  var digits = [];
  var remaining = delta;
  var k = BASE;

  while (true) {
    var t = threshold(k, bias);
    if (remaining < t) {
      break;
    }

    var span = BASE - t;
    var offset = remaining - t;
    digits.push(encode_digit(t + (offset % span)));
    remaining = ~~(offset / span);
    k += BASE;
  }

  // What is left is below the threshold and becomes the final digit.
  digits.push(encode_digit(remaining));

  return digits;
}

// Encodes a string as Punycode (RFC 3492, section 6.3).
//
// input - the string to encode. UTF-16 surrogate pairs are combined into a
//         single code point before encoding; an unpaired surrogate is passed
//         through as its own code unit value.
//
// Returns the encoded ASCII string. When the input contains code points
// below 128 they are copied first and followed by a '-' delimiter.
//
// Throws an Error if `input` is not a string, or if a delta grows beyond
// what the 32-bit truncation in encode_int() can represent.
function encode(input) {
  if (typeof input != 'string') {
    throw new Error('Argument must be a string.');
  }

  // Largest delta that survives the `~~` truncation used by encode_int().
  var MAX_DELTA = 0x7FFFFFFF;

  // Punycode works on code points, not UTF-16 code units: join each
  // high/low surrogate pair into the code point it represents.
  var codepoints = [];
  for (var pos = 0, end = input.length; pos < end; ++pos) {
    var unit = input.charCodeAt(pos);
    if (unit >= 0xD800 && unit <= 0xDBFF && pos + 1 < end) {
      var next = input.charCodeAt(pos + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        unit = 0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00);
        ++pos;
      }
    }
    codepoints.push(unit);
  }

  var len = codepoints.length;
  var output = [];
  var non_basic = [];
  var i, c;

  // Basic (< 128) code points are copied verbatim; the rest are encoded.
  for (i = 0; i < len; ++i) {
    c = codepoints[i];
    if (c < 128) {
      output.push(c);
    }
    else {
      non_basic.push(c);
    }
  }

  // b = number of basic code points, h = number of code points handled.
  var b = output.length;
  var h = b;

  if (b) {
    output.push(45); // delimiter '-'
  }

  var n = INITIAL_N;
  var bias = INITIAL_BIAS;
  var delta = 0;

  for (; h < len; ++n, ++delta) {
    // Jump straight to the next code point that actually occurs.
    var m = next_smallest_codepoint(non_basic, n);
    delta += (m - n) * (h + 1);
    if (delta > MAX_DELTA) {
      throw new Error('Delta overflow.');
    }
    n = m;

    for (i = 0; i < len; ++i) {
      c = codepoints[i];
      if (c < n) {
        if (++delta > MAX_DELTA) {
          throw new Error('Delta overflow.');
        }
      }
      else if (c == n) {
        output.push.apply(output, encode_int(bias, delta));
        bias = adapt_bias(delta, h + 1, b == h);
        delta = 0;
        h++;
      }
    }
  }

  return String.fromCharCode.apply(String, output);
}

// Decodes a Punycode string (RFC 3492, section 6.2).
//
// input - the encoded string. Everything before the last '-' is taken as
//         literal basic code points; everything after it is a sequence of
//         encoded deltas.
//
// Returns the decoded string. Code points above 0xFFFF are returned as
// UTF-16 surrogate pairs.
//
// Throws an Error if `input` is not a string, if it contains a character
// that is not a valid digit (this includes a truncated delta), or if a
// decoded code point lies beyond 0x10FFFF.
function decode(input) {
  if (typeof input != 'string') {
    throw new Error('Argument must be a string.');
  }

  // find basic code points/delta separator
  var b = 1 + input.lastIndexOf('-');

  var codes = input.split('').map(function(ch) {
    return ch.charCodeAt(0);
  });

  // start with a copy of the basic code points
  var output = codes.slice(0, b ? (b - 1) : 0);

  var n = INITIAL_N;
  var bias = INITIAL_BIAS;

  for (var i = 0, len = codes.length; b < len; ++i) {
    var org_i = i;

    // Read one variable-length integer and add it to i.
    for (var k = BASE, w = 1;; k += BASE) {
      var d = decode_digit(codes[b++]);

      i += d * w;

      var t = threshold(k, bias);
      if (d < t) {
        break;
      }

      w *= BASE - t;
    }

    var x = 1 + output.length;
    bias = adapt_bias(i - org_i, x, org_i == 0);

    // i wraps around the output length; every wrap advances n by one.
    n += Math.floor(i / x);
    if (n > 0x10FFFF) {
      throw new Error('Code point out of range.');
    }
    i %= x;

    output.splice(i, 0, n);
  }

  // String.fromCharCode() keeps only the low 16 bits of each argument, so
  // code points above 0xFFFF have to be split into surrogate pairs first.
  var units = [];
  for (var j = 0; j < output.length; ++j) {
    var cp = output[j];
    if (cp > 0xFFFF) {
      cp -= 0x10000;
      units.push(0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF));
    }
    else {
      units.push(cp);
    }
  }

  return String.fromCharCode.apply(String, units);
}
54 changes: 44 additions & 10 deletions lib/url.js
Expand Up @@ -19,6 +19,8 @@
// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
// USE OR OTHER DEALINGS IN THE SOFTWARE.

var punycode = require('punycode');

exports.parse = urlParse;
exports.resolve = urlResolve;
exports.resolveObject = urlResolveObject;
Expand Down Expand Up @@ -183,24 +185,56 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
var part = hostparts[i];
if (!part) continue;
if (!part.match(hostnamePartPattern)) {
var validParts = hostparts.slice(0, i);
var notHost = hostparts.slice(i + 1);
var bit = part.match(hostnamePartStart);
if (bit) {
validParts.push(bit[1]);
notHost.unshift(bit[2]);
var newpart = '';
for (var j = 0, k = part.length; j < k; j++) {
if (part.charCodeAt(j) > 127) {
// we replace non-ASCII char with a temporary placeholder
// we need this to make sure size of hostname is not
// broken by replacing non-ASCII by nothing
newpart += 'x';
} else {
newpart += part[j];
}
}
if (notHost.length) {
rest = '/' + notHost.join('.') + rest
// we test again with ASCII char only
if (!newpart.match(hostnamePartPattern)) {
var validParts = hostparts.slice(0, i);
var notHost = hostparts.slice(i + 1);
var bit = part.match(hostnamePartStart);
if (bit) {
validParts.push(bit[1]);
notHost.unshift(bit[2]);
}
if (notHost.length) {
rest = '/' + notHost.join('.') + rest;
}
out.hostname = validParts.join('.');
break;
}
out.hostname = validParts.join('.');
break;
}
}
}

// hostnames are always lower case.
out.hostname = out.hostname.toLowerCase();

// IDNA Support: Returns a puny coded representation of "domain".
// It only converts the part of the domain name that
// has non ASCII characters. I.e. it doesn't matter if
// you call it with a domain that already is in ASCII.
try {
var domainArray = out.hostname.split('.');
var newOut = [];
for (var i = 0; i < domainArray.length; ++i) {
var s = domainArray[i];
newOut.push(s.match(/[^A-Za-z0-9-]/) ?
'xn--' + punycode.encode(s) : s);
}
out.hostname = newOut.join('.');
} catch (e) {
// if encode fail for some reason, we just do the classic behavior.
}

out.host = ((out.auth) ? out.auth + '@' : '') +
(out.hostname || '') +
((out.port) ? ':' + out.port : '');
Expand Down
38 changes: 38 additions & 0 deletions test/simple/test-punycode.js
@@ -0,0 +1,38 @@
// Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// Checks punycode.encode() and punycode.decode() against a fixed set of
// string pairs; every encode() case below has a mirrored decode() case.
// NOTE(review): `punycode` and `assert` are assigned without `var`, which
// makes them implicit globals.
punycode = require('punycode');
assert = require('assert');

// encode(): plain string -> encoded form
assert.equal(punycode.encode('ü'), 'tda');
assert.equal(punycode.encode('Goethe'), 'Goethe-');
assert.equal(punycode.encode('Bücher'), 'Bcher-kva');
assert.equal(punycode.encode(
'Willst du die Blüthe des frühen, die Früchte des späteren Jahres'),
'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal');
assert.equal(punycode.encode('日本語'), 'wgv71a119e');

// decode(): the same pairs in the opposite direction
assert.equal(punycode.decode('tda'), 'ü');
assert.equal(punycode.decode('Goethe-'), 'Goethe');
assert.equal(punycode.decode('Bcher-kva'), 'Bücher');
assert.equal(punycode.decode(
'Willst du die Blthe des frhen, die Frchte des spteren Jahres-x9e96lkal'),
'Willst du die Blüthe des frühen, die Früchte des späteren Jahres');
assert.equal(punycode.decode('wgv71a119e'), '日本語');

0 comments on commit 786c714

Please sign in to comment.