From eb88fc2053699b6a78394ce1ece58fcd6efe257d Mon Sep 17 00:00:00 2001
From: xxcdd <42600601+xxcdd@users.noreply.github.com>
Date: Fri, 22 Mar 2024 16:06:57 +0800
Subject: [PATCH] getCoreContentText for any websites using mozilla/readability (#641)

* getCoreContentText for any websites using https://github.com/mozilla/readability

* improve use of @mozilla/readability

---------

Co-authored-by: josc146
---
 package-lock.json                   |  9 +++++++++
 package.json                        |  1 +
 src/utils/get-core-content-text.mjs | 42 ++++++++++++++++++++++++++++++----------
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 3872d5a6..22ac4c04 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -6,6 +6,7 @@
     "": {
       "name": "chatgptbox",
       "dependencies": {
+        "@mozilla/readability": "^0.5.0",
         "@nem035/gpt-3-encoder": "^1.1.7",
         "@picocss/pico": "^1.5.9",
         "@primer/octicons-react": "^18.3.0",
@@ -2077,6 +2078,14 @@
         "@jridgewell/sourcemap-codec": "^1.4.14"
       }
     },
+    "node_modules/@mozilla/readability": {
+      "version": "0.5.0",
+      "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz",
+      "integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==",
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
     "node_modules/@nem035/gpt-3-encoder": {
       "version": "1.1.7",
       "resolved": "https://registry.npmjs.org/@nem035/gpt-3-encoder/-/gpt-3-encoder-1.1.7.tgz",
diff --git a/package.json b/package.json
index 52212e15..9956b537 100644
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
     "lint"
   ],
   "dependencies": {
+    "@mozilla/readability": "^0.5.0",
     "@nem035/gpt-3-encoder": "^1.1.7",
     "@picocss/pico": "^1.5.9",
     "@primer/octicons-react": "^18.3.0",
diff --git a/src/utils/get-core-content-text.mjs b/src/utils/get-core-content-text.mjs
index bef4dc8f..54c004cb 100644
--- a/src/utils/get-core-content-text.mjs
+++ b/src/utils/get-core-content-text.mjs
@@ -1,9 +1,5 @@
 import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'
-
-function getArea(e) {
-  const rect = e.getBoundingClientRect()
-  return rect.width * rect.height
-}
+import { Readability, isProbablyReaderable } from '@mozilla/readability'
 
 const adapters = {
   'scholar.google': ['#gs_res_ccl_mid'],
@@ -17,6 +13,11 @@ const adapters = {
   'new.qq.com': ['.content-article'],
 }
 
+function getArea(e) {
+  const rect = e.getBoundingClientRect()
+  return rect.width * rect.height
+}
+
 function findLargestElement(e) {
   if (!e) {
     return null
@@ -42,22 +43,39 @@ function findLargestElement(e) {
   return largestElement
 }
 
-export function getCoreContentText() {
-  function getTextFrom(e) {
-    return e.innerText || e.textContent
-  }
+function getTextFrom(e) {
+  return e.innerText || e.textContent
+}
 
+function postProcessText(text) {
+  return text
+    .trim()
+    .replaceAll('  ', '')
+    .replaceAll('\t', '')
+    .replaceAll('\n\n', '')
+    .replaceAll(',,', '')
+}
+
+export function getCoreContentText() {
   for (const [siteName, selectors] of Object.entries(adapters)) {
     if (location.hostname.includes(siteName)) {
      const element = getPossibleElementByQuerySelector(selectors)
-      if (element) return getTextFrom(element)
+      if (element) return postProcessText(getTextFrom(element))
       break
     }
   }
 
   const element = document.querySelector('article')
   if (element) {
-    return getTextFrom(element)
+    return postProcessText(getTextFrom(element))
+  }
+
+  if (isProbablyReaderable(document)) {
+    let article = new Readability(document.cloneNode(true), {
+      keepClasses: true,
+    }).parse()
+    console.log('readerable')
+    return postProcessText(article.textContent)
   }
 
   const largestElement = findLargestElement(document.body)
@@ -79,5 +97,5 @@ export function getCoreContentText() {
     ret = getTextFrom(largestElement)
     console.log('use first')
   }
-  return ret.trim().replaceAll('  ', '').replaceAll('\n\n', '').replaceAll(',,', '')
+  return postProcessText(ret)
 }
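
For context, a minimal sketch of how the Readability fallback introduced by this patch is expected to behave. The helper name extractReadableText and the usage at the bottom are illustrative only and are not part of the patch; the sketch assumes it runs in a page context where a live document is available.

    // Illustrative sketch (not part of the patch) of the Readability fallback path.
    import { Readability, isProbablyReaderable } from '@mozilla/readability'

    function extractReadableText(doc) {
      // Cheap heuristic check before running the full parser.
      if (!isProbablyReaderable(doc)) return null

      // Readability mutates the DOM it is given, so parse a clone of the document.
      const article = new Readability(doc.cloneNode(true), { keepClasses: true }).parse()

      // parse() returns null when no article content could be extracted.
      return article ? article.textContent : null
    }

    // Example usage inside a content script:
    const text = extractReadableText(document)
    if (text) console.log(text.slice(0, 200))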