Skip to content

Commit 29729ef

Browse files
committed
A new born
1 parent 89cc920 commit 29729ef

6 files changed

Lines changed: 74 additions & 67 deletions

File tree

README.md

Lines changed: 26 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,32 +1,38 @@
1-
# html-to-utf8
1+
# html-encode
2+
![Last version](https://img.shields.io/github/tag/Kikobeats/html-encode.svg?style=flat-square)
3+
[![Build Status](https://img.shields.io/travis/Kikobeats/html-encode/master.svg?style=flat-square)](https://travis-ci.org/Kikobeats/html-encode)
4+
[![Coverage Status](https://img.shields.io/coveralls/Kikobeats/html-encode.svg?style=flat-square)](https://coveralls.io/github/Kikobeats/html-encode)
5+
[![Dependency status](https://img.shields.io/david/Kikobeats/html-encode.svg?style=flat-square)](https://david-dm.org/Kikobeats/html-encode)
6+
[![Dev Dependencies Status](https://img.shields.io/david/dev/Kikobeats/html-encode.svg?style=flat-square)](https://david-dm.org/Kikobeats/html-encode#info=devDependencies)
7+
[![NPM Status](https://img.shields.io/npm/dm/html-encode.svg?style=flat-square)](https://www.npmjs.org/package/html-encode)
8+
[![Donate](https://img.shields.io/badge/donate-paypal-blue.svg?style=flat-square)](https://paypal.me/Kikobeats)
29

3-
A Node.js library for converting HTML documents of arbitrary encoding to UTF-8.
10+
> A Node.js library for converting HTML documents of arbitrary encoding into a target encoding (utf8, utf16, etc.).
411
5-
### Installation
12+
### Install
613

7-
```shell
8-
npm install html-to-utf8
14+
```bash
15+
$ npm install html-encode
916
```
1017

1118
### Usage
1219

13-
```javascript
14-
var request = require('request')
15-
var toUTF8 = require('html-to-utf8')
16-
17-
request({
18-
url: 'http://www.rakuten.co.jp',
19-
encoding: null // stop request from decoding response
20-
}, function (err, resp, buffer) {
21-
if (err) {
22-
console.error(err.stack)
23-
return
24-
}
25-
26-
var htmlInUTF8 = toUTF8(buffer, resp.headers['content-type'])
27-
})
20+
```js
21+
'use strict'
22+
23+
const got = require('got')
24+
const toUTF8 = require('html-encode')('utf-8')
25+
const url = process.argv[2]
26+
27+
;(async () => {
28+
const { body: buffer, headers } = await got(url, { encoding: null })
29+
const str = toUTF8(buffer, headers['content-type'])
30+
console.log(str)
31+
})()
2832
```
2933

34+
See more at [examples](/examples).
35+
3036
### License
3137

3238
The code is available under [MIT license](LICENSE).

examples/http.js

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
'use strict'
22

3-
var got = require('got')
4-
var toUTF8 = require('..')
3+
const got = require('got')
4+
const toUTF8 = require('..')('utf-8')
55

66
const url = process.argv[2]
77
;(async () => {
88
try {
99
const { body: buffer, headers } = await got(url, { encoding: null })
10-
var str = toUTF8(buffer, headers['content-type'])
10+
const str = toUTF8(buffer, headers['content-type'])
1111
console.log(str)
1212
} catch (err) {
1313
throw err

examples/stream.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
'use strict'
22

33
var got = require('got')
4-
var toUTF8 = require('..')
4+
var toUTF8 = require('..')('utf-8')
55

66
const url = process.argv[2]
77

index.js

Lines changed: 18 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,31 +1,28 @@
11
'use strict'
22

3-
var jschardet = require('jschardet')
4-
var isBuffer = require('is-buffer')
5-
var iconv = require('iconv-lite')
6-
var charset = require('charset')
3+
const jschardet = require('jschardet')
4+
const isBuffer = require('is-buffer')
5+
const iconv = require('iconv-lite')
6+
const charset = require('charset')
77

8-
var charsetRegex = /charset=["]*([^>"\s]+)/i
8+
const REGEX_CHARSET = /charset=["]*([^>"\s]+)/i
99

10-
module.exports = function ensureUTF8 (buffer, contentType) {
11-
if (!isBuffer(buffer)) throw new TypeError('content should be a buffer.')
12-
13-
var encoding = getEncoding(buffer, contentType)
14-
15-
return encoding === 'utf8'
16-
? buffer.toString('utf8')
17-
: iconv.decode(buffer, encoding).replace(charsetRegex, 'utf-8')
10+
const inferredEncoding = content => {
11+
const charset = jschardet.detect(content)
12+
return charset && charset.encoding
1813
}
1914

20-
function getEncoding (content, contentType) {
21-
return (
15+
module.exports = targetEncoding => {
16+
if (!targetEncoding) throw new TypeError('Need to provide a target encoding.')
17+
18+
const getEncoding = (content, contentType) =>
2219
charset({ 'content-type': contentType }, content) ||
2320
inferredEncoding(content) ||
24-
'utf8'
25-
)
26-
}
21+
targetEncoding
2722

28-
function inferredEncoding (content) {
29-
var charset = jschardet.detect(content)
30-
return charset && charset.encoding
23+
return (buffer, contentType) => {
24+
if (!isBuffer(buffer)) throw new TypeError('content should be a buffer.')
25+
const encoding = getEncoding(buffer, contentType)
26+
return iconv.decode(buffer, encoding).replace(REGEX_CHARSET, targetEncoding)
27+
}
3128
}

package.json

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,28 @@
11
{
22
"name": "html-to-utf8",
3-
"description": "Convert html documents of arbitrary encoding to UTF-8",
4-
"homepage": "https://github.com/spect88/html-to-utf8#readme",
5-
"version": "0.1.0",
3+
"description": "A Node.js library for converting HTML documents of arbitrary encoding into a target encoding (utf8, utf16, etc).",
4+
"homepage": "https://documentup.com/Kikobeats/html-encode",
5+
"version": "0.0.0",
66
"main": "index.js",
77
"author": {
8-
"email": "spect88@gmail.com",
9-
"name": "Tomasz Szczęśniak-Szlagowski"
8+
"name": "Kiko Beats",
9+
"url": "https://github.com/Kikobeats"
1010
},
1111
"repository": {
1212
"type": "git",
13-
"url": "git+https://github.com/spect88/html-to-utf8.git"
13+
"url": "git+https://github.com/kikobeats/html-encode.git"
1414
},
1515
"bugs": {
16-
"url": "https://github.com/spect88/html-to-utf8/issues"
16+
"url": "https://github.com/kikobeats/html-encode/issues"
1717
},
1818
"keywords": [
1919
"encoding",
20+
"encode",
21+
"arbitrary",
22+
"generic",
2023
"html",
21-
"utf8"
24+
"utf8",
25+
"utf16"
2226
],
2327
"dependencies": {
2428
"charset": "~1.0.1",

test/index.js

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,43 +1,43 @@
11
'use strict'
22

3-
var expect = require('chai').expect
4-
var path = require('path')
5-
var fs = require('fs')
3+
const { expect } = require('chai')
4+
const path = require('path')
5+
const fs = require('fs')
66

7-
var toUTF8 = require('../index')
7+
const toUTF8 = require('../index')('utf-8')
88

99
describe('Encoding Converter', function () {
1010
it('properly decodes Shift-JIS html documents', function () {
11-
var buffer = loadExample('51242_54045.html')
12-
var output = toUTF8(buffer, 'text/html')
11+
const buffer = loadExample('51242_54045.html')
12+
const output = toUTF8(buffer, 'text/html')
1313

1414
expect(output).to.contain('或る日の小せん')
1515
})
1616

1717
it('properly decodes Windows-1250 html documents', function () {
18-
var buffer = loadExample('rp.pl.html')
19-
var output = toUTF8(buffer, 'windows-1250')
18+
const buffer = loadExample('rp.pl.html')
19+
const output = toUTF8(buffer, 'windows-1250')
2020

2121
expect(output).to.contain('majątków')
2222
})
2323

2424
it('guesses encoding even without meta tags or content type', function () {
25-
var buffer = loadExample('shiftjis.no.meta.html')
26-
var output = toUTF8(buffer, 'text/html')
25+
const buffer = loadExample('shiftjis.no.meta.html')
26+
const output = toUTF8(buffer, 'text/html')
2727

2828
expect(output).to.contain('次常用國字標準字體表')
2929
})
3030

3131
it('works for documents which already are UTF-8', function () {
32-
var buffer = loadExample('utf8.with.meta.html')
33-
var output = toUTF8(buffer, 'text/html')
32+
const buffer = loadExample('utf8.with.meta.html')
33+
const output = toUTF8(buffer, 'text/html')
3434

3535
expect(output).to.contain('日本語')
3636
})
3737

3838
it('Replace charset from the original buffer', function () {
39-
var buffer = loadExample('51242_54045.html')
40-
var output = toUTF8(buffer, 'text/html')
39+
const buffer = loadExample('51242_54045.html')
40+
const output = toUTF8(buffer, 'text/html')
4141

4242
expect(output).to.contain(
4343
'<meta http-equiv="Content-Type" content="text/html;utf-8" />'

0 commit comments

Comments
 (0)