jshemas · jshemas · Feb 15, 2024 · Feb 2, 2024 · Feb 2, 2024 · Feb 2, 2024
diff --git a/dist/lib/request.js b/dist/lib/request.js
@@ -1,6 +1,36 @@
 "use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
 Object.defineProperty(exports, "__esModule", { value: true });
 const undici_1 = require("undici");
+const iconv_lite_1 = require("iconv-lite");
+const cheerio_1 = require("cheerio");
+const chardet_1 = __importDefault(require("chardet"));
+/** yields the value itself when it is undefined or empty, otherwise true */
+const isNonEmpty = (value) => value && value.length > 0;
+/** checks that the first element matching the selector has a non-empty attribute */
+const doesElementExist = (selector, attribute, $) => isNonEmpty($(selector).attr(attribute));
+/**
+ * gets the charset of the html: <meta charset>, then <meta name="charset">, then the
+ * http-equiv content-type <meta>, then chardet's guess on the raw bytes, else utf-8
+ */
+function getCharset(body, buffer, $) {
+    // .attr() reads only the first matched element, so select on the attribute itself
+    if (doesElementExist('meta[charset]', 'charset', $)) {
+        return $('meta[charset]').attr('charset');
+    }
+    if (doesElementExist('head > meta[name="charset"]', 'content', $)) {
+        return $('head > meta[name="charset"]').attr('content');
+    }
+    if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) {
+        const content = $('head > meta[http-equiv="content-type"]').attr('content');
+        const match = /charset=([^()<>@,;:"/[\]?.=\s]*)/i.exec(content);
+        return (match && match[1]) || 'UTF-8';
+    }
+    // chardet's detect() can return null, which callers cannot lower-case
+    return (body && chardet_1.default.detect(Buffer.from(buffer))) || 'utf-8';
+}
 /**
  * performs the fetch request and formats the body for ogs
  *
@@ -17,7 +47,15 @@ async function requestAndResultsFormatter(options) {
             headers: { Origin: options.url, Accept: 'text/html' },
             ...options.fetchOptions,
         });
-        body = await response.text();
+        const bodyArrayBuffer = await response.arrayBuffer();
+        const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8');
+        const charset = getCharset(bodyText, bodyArrayBuffer, (0, cheerio_1.load)(bodyText));
+        if (charset.toLowerCase() === 'utf-8') {
+            body = bodyText;
+        }
+        else {
+            body = (0, iconv_lite_1.decode)(Buffer.from(bodyArrayBuffer), charset);
+        }
         if (response && response.headers && response.headers.get('content-type') && !response.headers.get('content-type')?.includes('text/')) {
             throw new Error('Page must return a header content-type with text/');
         }

diff --git a/lib/request.ts b/lib/request.ts
@@ -1,6 +1,38 @@
 import { fetch } from 'undici';
+import { decode } from 'iconv-lite';
+import { CheerioAPI, load } from 'cheerio';
+import chardet from 'chardet';
 import type { OpenGraphScraperOptions } from './types';
 
+/** checks that the first element matching the selector has a non-empty attribute */
+const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => {
+  const attributeValue = $(selector).attr(attribute);
+
+  return attributeValue && attributeValue.length > 0;
+};
+
+/**
+ * gets the charset of the html: <meta charset>, then <meta name="charset">, then the
+ * http-equiv content-type <meta>, then chardet's guess on the raw bytes, else utf-8
+ */
+function getCharset(body: string, buffer: Uint8Array, $: CheerioAPI) {
+  // .attr() reads only the first matched element, so select on the attribute itself
+  if (doesElementExist('meta[charset]', 'charset', $)) {
+    return $('meta[charset]').attr('charset');
+  }
+  if (doesElementExist('head > meta[name="charset"]', 'content', $)) {
+    return $('head > meta[name="charset"]').attr('content');
+  }
+  if (doesElementExist('head > meta[http-equiv="content-type"]', 'content', $)) {
+    const content = $('head > meta[http-equiv="content-type"]').attr('content') ?? '';
+    const match = /charset=([^()<>@,;:"/[\]?.=\s]*)/i.exec(content);
+    return (match && match[1]) || 'UTF-8';
+  }
+
+  // chardet's detect() can return null, which callers cannot lower-case
+  return (body && chardet.detect(Buffer.from(buffer))) || 'utf-8';
+}
+
 /**
  * performs the fetch request and formats the body for ogs
  *
@@ -21,7 +53,14 @@ export default async function requestAndResultsFormatter(options: OpenGraphScrap
       },
     );
 
-    body = await response.text();
+    const bodyArrayBuffer = await response.arrayBuffer();
+    const bodyText = Buffer.from(bodyArrayBuffer).toString('utf-8');
+    const charset = getCharset(bodyText, bodyArrayBuffer, load(bodyText));
+    if (charset.toLowerCase() === 'utf-8') {
+      body = bodyText;
+    } else {
+      body = decode(Buffer.from(bodyArrayBuffer), charset);
+    }
 
     if (response && response.headers && response.headers.get('content-type') && !response.headers.get('content-type')?.includes('text/')) {
       throw new Error('Page must return a header content-type with text/');

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -27,6 +27,7 @@
   "dependencies": {
     "chardet": "^2.0.0",
     "cheerio": "^1.0.0-rc.12",
+    "iconv-lite": "^0.6.3",
     "undici": "^6.6.2",
     "validator": "^13.11.0"
   },

diff --git a/tests/integration/encoding.spec.ts b/tests/integration/encoding.spec.ts
@@ -6,7 +6,9 @@ describe('encoding', function () {
   context('should return correct Open Graph Info + charset info', function () {
     it('rakuten', function () {
       return ogs({
-        url: 'https://jshemas.github.io/openGraphScraperPages/rakuten',
+        // FIXME: temporarily using my own copy of the page; the original URL is kept below
+        // url: 'https://jshemas.github.io/openGraphScraperPages/rakuten',
+        url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/rakuten',
       }).then(function ({ error, result, response }) {
         console.log('error:', error);
         console.log('result:', result);
@@ -29,7 +31,9 @@ describe('encoding', function () {
         expect(result.twitterImage).to.be.eql([{
           url: 'https://r.r10s.jp/com/img/home/top/ogp.png',
         }]);
-        expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rakuten');
+        // FIXME: temporarily using my own copy of the page; the original assertion is kept below
+        // expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/rakuten');
+        expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/rakuten');
         expect(result.charset).to.be.eql('euc-jp');
         expect(result.success).to.be.eql(true);
         expect(result).to.have.all.keys(
@@ -436,7 +440,9 @@ describe('encoding', function () {
         });
     });
     it('tmall', function () {
-      return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/tmall' })
+      // FIXME: temporarily using my own copy of the page; the original call is kept below
+      // return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/tmall' })
+      return ogs({ url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/tmall' })
         .then(function (data) {
           const { error, result, response } = data;
           console.log('error:', error);
@@ -461,7 +467,9 @@ describe('encoding', function () {
           ]);
           expect(result.ogUrl).to.be.eql('https://detail.tmall.com/item.htm?id=605258110430');
           expect(result.charset).to.be.eql('gbk');
-          expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/tmall');
+          // FIXME: temporarily using my own copy of the page; the original assertion is kept below
+          // expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/tmall');
+          expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/tmall');
           expect(result.success).to.be.eql(true);
           expect(result).to.have.all.keys(
             'favicon',
@@ -578,15 +586,19 @@ describe('encoding', function () {
         });
     });
     it('abehiroshi', function () {
-      return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/abehiroshi' })
+      // FIXME: temporarily using my own copy of the page; the original call is kept below
+      // return ogs({ url: 'https://jshemas.github.io/openGraphScraperPages/abehiroshi' })
+      return ogs({ url: 'https://cm-dyoshikawa.github.io/openGraphScraperPages/abehiroshi' })
         .then(function (data) {
           const { error, result, response } = data;
           console.log('error:', error);
           console.log('result:', result);
           expect(error).to.be.eql(false);
           expect(result.ogTitle).to.be.eql('阿部寛のホームページ');
           expect(result.charset).to.be.eql('Shift_JIS');
-          expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/abehiroshi');
+          // FIXME: temporarily using my own copy of the page; the original assertion is kept below
+          // expect(result.requestUrl).to.be.eql('https://jshemas.github.io/openGraphScraperPages/abehiroshi');
+          expect(result.requestUrl).to.be.eql('https://cm-dyoshikawa.github.io/openGraphScraperPages/abehiroshi');
           expect(result.success).to.be.eql(true);
           expect(result).to.have.all.keys(
             'charset',

diff --git a/tests/unit/openGraphScraper.spec.ts b/tests/unit/openGraphScraper.spec.ts
@@ -3,6 +3,7 @@ import sinon from 'sinon';
 
 import chardet from 'chardet';
 import { MockAgent, setGlobalDispatcher } from 'undici';
+import { encode } from 'iconv-lite';
 import ogs from '../../index';
 
 const basicHTML = `
@@ -734,4 +735,37 @@ describe('return ogs', function () {
         });
     });
   });
+
+  context('when the character encoding is not UTF-8', function () {
+    // serves a Shift_JIS encoded page and expects the decoded html and og tags back
+    it('using just a url', async function () {
+      const html = `
+      <html>
+        <head>
+          <meta charset="shift_jis">
+          <meta property="og:description" content="OG説明">
+          <meta property="og:title" content="OGタイトル">
+          <meta property="foo" content="バー">
+        </head>
+        <body>
+          <h1>こんにちは</h1>
+          <img width="360" src="test.png" alt="テスト画像">
+          <img width="360" alt="テスト画像2">
+        </body>
+      </html>
+      `;
+
+      const shiftJisBytes = encode(html, 'shift_jis');
+      const mockPool = mockAgent.get('http://www.test.com');
+      mockPool.intercept({ path: '/' }).reply(200, shiftJisBytes);
+
+      // the scraper should hand back the page decoded to the same text we encoded
+      const { result, html: decodedHtml, response } = await ogs({ url: 'www.test.com' });
+      expect(result.success).to.be.eql(true);
+      expect(result.ogTitle).to.be.eql('OGタイトル');
+      expect(result.requestUrl).to.be.eql('http://www.test.com');
+      expect(decodedHtml).to.be.eql(html);
+      expect(response).to.be.a('response');
+    });
+  });
 });
diff --git a/tests/unit/static.spec.ts b/tests/unit/static.spec.ts
@@ -1,5 +1,6 @@
 import { expect } from 'chai';
 import sinon from 'sinon';
+import { encode } from 'iconv-lite';
 
 import { MockAgent, setGlobalDispatcher } from 'undici';
 
@@ -275,10 +276,11 @@ describe('static check meta tags', function () {
       <meta name="twitter:image:src" content="https://r.r10s.jp/com/img/home/top/ogp.png">
     </head></html>`;
     /* eslint-enable max-len */
+    const metaHTMLBuffer = encode(metaHTML, 'sjis');
 
     mockAgent.get('http://www.test.com')
       .intercept({ path: '/' })
-      .reply(200, metaHTML);
+      .reply(200, metaHTMLBuffer);
 
     return ogs({ url: 'www.test.com' })
       .then(function (data) {
@@ -322,10 +324,11 @@ describe('static check meta tags', function () {
       <meta name="twitter:image:src" content="https://r.r10s.jp/com/img/home/top/ogp.png">
     </head></html>`;
     /* eslint-enable max-len */
+    const metaHTMLBuffer = encode(metaHTML, 'euc-jp');
 
     mockAgent.get('http://www.test.com')
       .intercept({ path: '/' })
-      .reply(200, metaHTML);
+      .reply(200, metaHTMLBuffer);
 
     return ogs({ url: 'www.test.com' })
       .then(function (data) {