Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add character encoding detection and decoding logic #206

Merged
merged 10 commits into from
Feb 15, 2024
40 changes: 39 additions & 1 deletion dist/lib/request.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const undici_1 = require("undici");
const iconv_lite_1 = require("iconv-lite");
const cheerio_1 = require("cheerio");
const chardet_1 = __importDefault(require("chardet"));
/**
 * Checks whether the first element matched by `selector` carries a non-empty
 * value for `attribute`.
 *
 * @param {string} selector - CSS selector passed to `$`.
 * @param {string} attribute - attribute name to read from the matched element.
 * @param {Function} $ - loaded cheerio instance used to query the document.
 * @returns {boolean} `true` only when the attribute is a non-empty string.
 */
const doesElementExist = (selector, attribute, $) => {
    // Query once and reuse the value instead of re-running the selector.
    const value = $(selector).attr(attribute);
    return typeof value === 'string' && value.length > 0;
};
/**
 * Determines the character encoding of an HTML document.
 *
 * Lookup order:
 *   1. `<meta charset="...">`
 *   2. `<head><meta name="charset" content="...">`
 *   3. `<head><meta http-equiv="content-type" content="...; charset=...">`
 *   4. byte-level detection of `buffer` via chardet (only when `body` is non-empty)
 *   5. `'utf-8'`
 *
 * @param {string} body - the response decoded as text; only tested for emptiness.
 * @param {ArrayBuffer|Uint8Array} buffer - the raw response bytes handed to chardet.
 * @param {Function} $ - cheerio instance loaded with `body`.
 * @returns {string} the encoding name; always a non-empty string.
 */
function getCharset(body, buffer, $) {
    // `.attr()` reads only the first matched element, so the selector must
    // target the <meta> that actually carries a charset attribute; a plain
    // 'meta' selector misses the charset when another <meta> comes first.
    if (doesElementExist('meta[charset]', 'charset', $)) {
        return $('meta[charset]').attr('charset');
    }
    if (doesElementExist('head > meta[name="charset"]', 'content', $)) {
        return $('head > meta[name="charset"]').attr('content');
    }
    if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) {
        const content = $('head > meta[http-equiv="content-type"]').attr('content');
        const match = /charset=([^()<>@,;:"/[\]?.=\s]*)/i.exec(content);
        // The capture group may be empty (e.g. a quoted value such as
        // charset="utf-8"); treat that like "no charset given".
        return match?.[1] || 'UTF-8';
    }
    if (body) {
        // chardet returns null when it cannot make a guess.
        return chardet_1.default.detect(Buffer.from(buffer)) ?? 'utf-8';
    }
    return 'utf-8';
}
/**
* performs the fetch request and formats the body for ogs
*
Expand All @@ -17,7 +47,15 @@ async function requestAndResultsFormatter(options) {
headers: { Origin: options.url, Accept: 'text/html' },
...options.fetchOptions,
});
body = await response.text();
const bodyArrayBuffer = await response.arrayBuffer();
const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8');
const charset = getCharset(bodyText, bodyArrayBuffer, (0, cheerio_1.load)(bodyText));
if (charset.toLowerCase() === 'utf-8') {
body = bodyText;
}
else {
body = (0, iconv_lite_1.decode)(Buffer.from(bodyArrayBuffer), charset);
}
if (response && response.headers && response.headers.get('content-type') && !response.headers.get('content-type')?.includes('text/')) {
throw new Error('Page must return a header content-type with text/');
}
Expand Down
41 changes: 40 additions & 1 deletion lib/request.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,38 @@
import { fetch } from 'undici';
import { decode } from 'iconv-lite';
import { CheerioAPI, load } from 'cheerio';
import chardet from 'chardet';
import type { OpenGraphScraperOptions } from './types';

/**
 * Checks whether the first element matched by `selector` carries a non-empty
 * value for `attribute`.
 *
 * @param selector - CSS selector passed to `$`.
 * @param attribute - attribute name to read from the matched element.
 * @param $ - loaded cheerio instance used to query the document.
 * @returns `true` only when the attribute is a non-empty string.
 */
const doesElementExist = (selector: string, attribute: string, $: CheerioAPI): boolean => {
  // Query once and reuse the value instead of re-running the selector.
  const value = $(selector).attr(attribute);
  return typeof value === 'string' && value.length > 0;
};

/**
 * Determines the character encoding of an HTML document.
 *
 * Lookup order:
 *   1. `<meta charset="...">`
 *   2. `<head><meta name="charset" content="...">`
 *   3. `<head><meta http-equiv="content-type" content="...; charset=...">`
 *   4. byte-level detection of `buffer` via chardet (only when `body` is non-empty)
 *   5. `'utf-8'`
 *
 * @param body - the response decoded as text; only tested for emptiness.
 * @param buffer - the raw response bytes handed to chardet.
 * @param $ - cheerio instance loaded with `body`.
 * @returns the encoding name; always a non-empty string.
 */
function getCharset(body: string, buffer: Uint8Array, $: CheerioAPI) {
  // `.attr()` reads only the first matched element, so the selector must
  // target the <meta> that actually carries a charset attribute; a plain
  // 'meta' selector misses the charset when another <meta> comes first.
  if (doesElementExist('meta[charset]', 'charset', $)) {
    return $('meta[charset]').attr('charset');
  }
  if (doesElementExist('head > meta[name="charset"]', 'content', $)) {
    return $('head > meta[name="charset"]').attr('content');
  }
  if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) {
    const content = $('head > meta[http-equiv="content-type"]').attr('content') ?? '';
    const match = /charset=([^()<>@,;:"/[\]?.=\s]*)/i.exec(content);
    // The capture group may be empty (e.g. a quoted value such as
    // charset="utf-8"); treat that like "no charset given".
    return match?.[1] || 'UTF-8';
  }
  if (body) {
    // chardet returns null when it cannot make a guess.
    return chardet.detect(Buffer.from(buffer)) ?? 'utf-8';
  }

  return 'utf-8';
}

/**
* performs the fetch request and formats the body for ogs
*
Expand All @@ -21,7 +53,14 @@ export default async function requestAndResultsFormatter(options: OpenGraphScrap
},
);

body = await response.text();
const bodyArrayBuffer = await response.arrayBuffer();
const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8');
const charset = getCharset(bodyText, bodyArrayBuffer, load(bodyText));
if (charset.toLowerCase() === 'utf-8') {
body = bodyText;
} else {
body = decode(Buffer.from(bodyArrayBuffer), charset);
}

if (response && response.headers && response.headers.get('content-type') && !response.headers.get('content-type')?.includes('text/')) {
throw new Error('Page must return a header content-type with text/');
Expand Down
17 changes: 17 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"dependencies": {
"chardet": "^2.0.0",
"cheerio": "^1.0.0-rc.12",
"iconv-lite": "^0.6.3",
"undici": "^6.6.2",
"validator": "^13.11.0"
},
Expand Down
24 changes: 18 additions & 6 deletions tests/integration/encoding.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ describe('encoding', function () {
context('should return correct Open Graph Info + charset info', function () {
it('rakuten', function () {
return ogs({
url: 'https://jshemas.github.io/openGraphScraperPages/rakuten',
// FIXME: temporarily using my own page; restore the original URL below
// url: 'https://jshemas.github.io/openGraphScraperPages/rakuten',
url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/rakuten',
}).then(function ({ error, result, response }) {
console.log('error:', error);
console.log('result:', result);
Expand All @@ -29,7 +31,9 @@ describe('encoding', function () {
expect(result.twitterImage).to.be.eql([{
url: 'https://r.r10s.jp/com/img/home/top/ogp.png',
}]);
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rakuten');
// FIXME: temporarily using my own page; restore the original URL below
// expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rakuten');
expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/rakuten');
expect(result.charset).to.be.eql('euc-jp');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
Expand Down Expand Up @@ -436,7 +440,9 @@ describe('encoding', function () {
});
});
it('tmall', function () {
return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/tmall' })
// FIXME: temporarily using my own page; restore the original URL below
// return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/tmall' })
return ogs({ url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/tmall' })
.then(function (data) {
const { error, result, response } = data;
console.log('error:', error);
Expand All @@ -461,7 +467,9 @@ describe('encoding', function () {
]);
expect(result.ogUrl).to.be.eql('https://detail.tmall.com/item.htm?id=605258110430');
expect(result.charset).to.be.eql('gbk');
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/tmall');
// FIXME: temporarily using my own page; restore the original URL below
// expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/tmall');
expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/tmall');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'favicon',
Expand Down Expand Up @@ -578,15 +586,19 @@ describe('encoding', function () {
});
});
it('abehiroshi', function () {
return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/abehiroshi' })
// FIXME: temporarily using my own page; restore the original URL below
// return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/abehiroshi' })
return ogs({ url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/abehiroshi' })
.then(function (data) {
const { error, result, response } = data;
console.log('error:', error);
console.log('result:', result);
expect(error).to.be.eql(false);
expect(result.ogTitle).to.be.eql('阿部寛のホームページ');
expect(result.charset).to.be.eql('Shift_JIS');
expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/abehiroshi');
// FIXME: temporarily using my own page; restore the original URL below
// expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/abehiroshi');
expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/abehiroshi');
expect(result.success).to.be.eql(true);
expect(result).to.have.all.keys(
'charset',
Expand Down
34 changes: 34 additions & 0 deletions tests/unit/openGraphScraper.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import sinon from 'sinon';

import chardet from 'chardet';
import { MockAgent, setGlobalDispatcher } from 'undici';
import { encode } from 'iconv-lite';
import ogs from '../../index';

const basicHTML = `
Expand Down Expand Up @@ -734,4 +735,37 @@ describe('return ogs', function () {
});
});
});

// Checks that a page served as Shift_JIS bytes is still scraped into the
// expected (correctly decoded) strings.
context('when the character encoding is not UTF-8', function () {
it('using just a url', function () {
// Fixture that declares its own encoding through <meta charset>.
const html = `
<html>
<head>
<meta charset="shift_jis">
<meta property="og:description" content="OG説明">
<meta property="og:title" content="OGタイトル">
<meta property="foo" content="バー">
</head>
<body>
<h1>こんにちは</h1>
<img width="360" src="test.png" alt="テスト画像">
<img width="360" alt="テスト画像2">
</body>
</html>
`;
// Encode the fixture so the mocked response carries Shift_JIS bytes rather
// than the UTF-8 string itself.
const htmlBuffer = encode(html, 'shift_jis');
// NOTE(review): mockAgent is defined outside this block — presumably the
// undici MockAgent set up for the suite; confirm.
mockAgent.get('http://www.test.com')
.intercept({ path: '/' })
.reply(200, htmlBuffer);

return ogs({ url: 'www.test.com' })
.then(function (data) {
expect(data.result.success).to.be.eql(true);
// The title and the returned html must equal the original, un-encoded strings.
expect(data.result.ogTitle).to.be.eql('OGタイトル');
expect(data.result.requestUrl).to.be.eql('http://www.test.com');
expect(data.html).to.be.eql(html);
expect(data.response).to.be.a('response');
});
});
});
});
7 changes: 5 additions & 2 deletions tests/unit/static.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { expect } from 'chai';
import sinon from 'sinon';
import { encode } from 'iconv-lite';

import { MockAgent, setGlobalDispatcher } from 'undici';

Expand Down Expand Up @@ -275,10 +276,11 @@ describe('static check meta tags', function () {
<meta name="twitter:image:src" content="https://r.r10s.jp/com/img/home/top/ogp.png">
</head></html>`;
/* eslint-enable max-len */
const metaHTMLBuffer = encode(metaHTML, 'sjis');

mockAgent.get('http://www.test.com')
.intercept({ path: '/' })
.reply(200, metaHTML);
.reply(200, metaHTMLBuffer);

return ogs({ url: 'www.test.com' })
.then(function (data) {
Expand Down Expand Up @@ -322,10 +324,11 @@ describe('static check meta tags', function () {
<meta name="twitter:image:src" content="https://r.r10s.jp/com/img/home/top/ogp.png">
</head></html>`;
/* eslint-enable max-len */
const metaHTMLBuffer = encode(metaHTML, 'euc-jp');

mockAgent.get('http://www.test.com')
.intercept({ path: '/' })
.reply(200, metaHTML);
.reply(200, metaHTMLBuffer);

return ogs({ url: 'www.test.com' })
.then(function (data) {
Expand Down