Skip to content

Commit

Permalink
getCoreContentText for any websites using mozilla/readability (#641)
Browse files Browse the repository at this point in the history
* getCoreContentText for any websites using https://github.com/mozilla/readability

* improve use of @mozilla/readability

---------

Co-authored-by: josc146 <josStorer@outlook.com>
  • Loading branch information
xxcdd and josStorer committed Mar 22, 2024
1 parent a6fa0ed commit eb88fc2
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 12 deletions.
9 changes: 9 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Expand Up @@ -19,6 +19,7 @@
"lint"
],
"dependencies": {
"@mozilla/readability": "^0.5.0",
"@nem035/gpt-3-encoder": "^1.1.7",
"@picocss/pico": "^1.5.9",
"@primer/octicons-react": "^18.3.0",
Expand Down
42 changes: 30 additions & 12 deletions src/utils/get-core-content-text.mjs
@@ -1,9 +1,5 @@
import { getPossibleElementByQuerySelector } from './get-possible-element-by-query-selector.mjs'

/**
 * Compute the on-screen area of an element's bounding box.
 *
 * @param {Element} e - Object exposing `getBoundingClientRect()`.
 * @returns {number} Bounding-rect width multiplied by its height.
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}
import { Readability, isProbablyReaderable } from '@mozilla/readability'

const adapters = {
'scholar.google': ['#gs_res_ccl_mid'],
Expand All @@ -17,6 +13,11 @@ const adapters = {
'new.qq.com': ['.content-article'],
}

/**
 * Compute the on-screen area of an element's bounding box.
 *
 * @param {Element} e - Object exposing `getBoundingClientRect()`.
 * @returns {number} Bounding-rect width multiplied by its height.
 */
function getArea(e) {
  const { width, height } = e.getBoundingClientRect()
  return width * height
}

function findLargestElement(e) {
if (!e) {
return null
Expand All @@ -42,22 +43,39 @@ function findLargestElement(e) {
return largestElement
}

export function getCoreContentText() {
function getTextFrom(e) {
return e.innerText || e.textContent
}
/**
 * Read the text of a node, preferring `innerText` and falling back to
 * `textContent` when `innerText` is falsy (empty string, undefined, ...).
 *
 * @param {Node} e - Object exposing `innerText` and/or `textContent`.
 * @returns {*} `e.innerText` when truthy, otherwise `e.textContent`.
 */
function getTextFrom(e) {
  const rendered = e.innerText
  if (rendered) {
    return rendered
  }
  return e.textContent
}

/**
 * Normalize extracted page text before it is handed on.
 *
 * Whitespace and punctuation runs are collapsed rather than deleted, so
 * neighbouring words, lines and list items stay separated:
 *  - leading/trailing whitespace is trimmed;
 *  - any run of line breaks (with interleaved spaces/tabs) becomes one `\n`;
 *  - any run of spaces/tabs becomes one space;
 *  - any run of commas becomes one comma.
 *
 * @param {string|null|undefined} text - Raw text; nullish yields `''`.
 * @returns {string} The compacted text.
 */
function postProcessText(text) {
  // Nothing to process: return an empty string instead of throwing on `.trim()`.
  if (text == null) return ''

  return (
    String(text)
      .trim()
      // Collapse blank lines first so spaces stranded between line breaks go with them.
      .replace(/[ \t]*\n[ \t\n]*/g, '\n')
      // Squeeze, do not delete, horizontal whitespace so words are not glued together.
      .replace(/[ \t]+/g, ' ')
      .replace(/,{2,}/g, ',')
  )
}

export function getCoreContentText() {
for (const [siteName, selectors] of Object.entries(adapters)) {
if (location.hostname.includes(siteName)) {
const element = getPossibleElementByQuerySelector(selectors)
if (element) return getTextFrom(element)
if (element) return postProcessText(getTextFrom(element))
break
}
}

const element = document.querySelector('article')
if (element) {
return getTextFrom(element)
return postProcessText(getTextFrom(element))
}

if (isProbablyReaderable(document)) {
let article = new Readability(document.cloneNode(true), {
keepClasses: true,
}).parse()
console.log('readerable')
return postProcessText(article.textContent)
}

const largestElement = findLargestElement(document.body)
Expand All @@ -79,5 +97,5 @@ export function getCoreContentText() {
ret = getTextFrom(largestElement)
console.log('use first')
}
return ret.trim().replaceAll(' ', '').replaceAll('\n\n', '').replaceAll(',,', '')
return postProcessText(ret)
}

0 comments on commit eb88fc2

Please sign in to comment.