Skip to content

Commit fd2b8a6

Browse files
committed
fix: improve readability tests
1 parent 0780418 commit fd2b8a6

File tree

2 files changed

+80
-53
lines changed

2 files changed

+80
-53
lines changed

src/markdown.ts

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,38 @@
11
import type { ElementNode, HandlerContext, MdreamRuntimeState, NodeEvent, TextNode } from './types'
2+
import { collectNodeContent } from './buffer-region'
3+
import {
4+
DEFAULT_BLOCK_SPACING,
5+
ELEMENT_NODE,
6+
NO_SPACING,
7+
NodeEventEnter,
8+
TAG_BLOCKQUOTE,
9+
TAG_LI,
10+
TAG_PRE,
11+
TEXT_NODE,
12+
} from './const'
213

314
/**
415
* Determines if spacing is needed between two characters
516
*/
617
function needsSpacing(lastChar: string, firstChar: string): boolean {
718
const noSpaceLastChars = new Set(['\n', ' ', '[', '>', '_', '*', '`', '|', '#', '<', '('])
819
const noSpaceFirstChars = new Set([' ', '\n', '\t', '_', '*', '`', '|', '>', '#'])
9-
20+
1021
return !noSpaceLastChars.has(lastChar) && !noSpaceFirstChars.has(firstChar)
1122
}
1223

1324
/**
1425
* Determines if spacing should be added before text content
1526
*/
1627
function shouldAddSpacingBeforeText(lastChar: string, lastNode: any, textNode: TextNode): boolean {
17-
return lastChar &&
18-
lastChar !== '\n' &&
19-
lastChar !== ' ' &&
20-
lastChar !== '[' &&
21-
lastChar !== '>' &&
22-
!lastNode?.tagHandler?.isInline &&
23-
textNode.value[0] !== ' '
28+
return lastChar
29+
&& lastChar !== '\n'
30+
&& lastChar !== ' '
31+
&& lastChar !== '['
32+
&& lastChar !== '>'
33+
&& !lastNode?.tagHandler?.isInline
34+
&& textNode.value[0] !== ' '
2435
}
25-
import { collectNodeContent } from './buffer-region'
26-
import {
27-
DEFAULT_BLOCK_SPACING,
28-
ELEMENT_NODE,
29-
NO_SPACING,
30-
NodeEventEnter,
31-
TAG_BLOCKQUOTE,
32-
TAG_LI,
33-
TAG_PRE,
34-
TEXT_NODE,
35-
} from './const'
3636

3737
/**
3838
* Process text node with plugin hooks
@@ -97,12 +97,12 @@ export function processHtmlEventToMarkdown(
9797
if (textNode.value === ' ' && lastChar === '\n') {
9898
return
9999
}
100-
100+
101101
// Add spacing before text if needed
102102
if (shouldAddSpacingBeforeText(lastChar, lastNode, textNode)) {
103103
textNode.value = ` ${textNode.value}`
104104
}
105-
105+
106106
collectNodeContent(textNode, textNode.value, state)
107107
}
108108
state.lastTextNode = textNode

src/plugins/readability.ts

Lines changed: 59 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ import { createPlugin } from '../pluggable/plugin'
193193
- Tag score: +15
194194
- Class "main-content" matches positive pattern: +10
195195
- Total score: +25 → Include
196-
*/
196+
*/
197197

198198
export interface ReadabilityOptions {
199199
/**
@@ -206,9 +206,9 @@ export interface ReadabilityOptions {
206206
// Regular expressions for scoring based on scoring.md
207207
const REGEXPS = {
208208
// Positive patterns that suggest high-quality content
209-
positive: /article|body|content|entry|main|page|post|text|blog|story/i,
209+
positive: /article|body|content|entry|main|page|post|text|blog|story|recipe|ingredient|instruction|description|docs?|documentation|guide|tutorial|reference|manual/i,
210210
// Negative patterns that suggest low-quality content
211-
negative: /ad|banner|combx|comment|disqus|extra|foot|header|menu|meta|nav|promo|related|scroll|share|sidebar|sponsor|social|tags|widget|sitemap|copyright/i,
211+
negative: /ad|banner|combx|comment|disqus|extra|foot|header|menu|meta|nav|promo|related|scroll|share|sidebar|sponsor|social|tags|widget|sitemap|copyright|login|register|subscribe|newsletter|signup|category|author|date|publish|cta|button|apply|trial|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i,
212212
// Used for counting commas to determine complexity
213213
commas: /,/g,
214214
// Used for analyzing paragraph endings
@@ -232,8 +232,8 @@ const TagScores = {
232232
[TAG_BLOCKQUOTE]: 5, // Quoted content, usually important
233233

234234
// Code and pre-formatted content
235-
[TAG_PRE]: 5, // Preformatted text/code, high value
236-
[TAG_CODE]: 5, // Code content, high value
235+
[TAG_PRE]: 8, // Preformatted text/code, high value for documentation
236+
[TAG_CODE]: 6, // Code content, high value for documentation
237237

238238
// Media elements
239239
[TAG_IMG]: 3, // Images are typically content
@@ -254,9 +254,9 @@ const TagScores = {
254254
[TAG_TD]: 0, // Table cell, neutral
255255

256256
// List elements
257-
[TAG_UL]: -1, // Slightly penalize lists as they're often navigation
258-
[TAG_OL]: 0, // Ordered lists are more likely to be content
259-
[TAG_LI]: -2, // Increase penalty for list items to avoid nav lists
257+
[TAG_UL]: -8, // Higher penalty as lists are often navigation
258+
[TAG_OL]: -5, // Ordered lists still often navigation
259+
[TAG_LI]: -6, // Higher penalty for list items to avoid nav lists
260260
[TAG_DL]: 0, // Definition lists, neutral
261261
[TAG_DT]: 0, // Definition lists, neutral
262262
[TAG_DD]: 0, // Definition lists, neutral
@@ -270,10 +270,10 @@ const TagScores = {
270270
[TAG_H6]: 0, // Minor headers, neutral
271271

272272
// Navigation and structural elements (negative)
273-
[TAG_HEADER]: -15, // Page header, often not content
274-
[TAG_FOOTER]: -15, // Footer, rarely content
275-
[TAG_NAV]: -20, // Navigation, not content
276-
[TAG_ASIDE]: -15, // Sidebar, usually not main content
273+
[TAG_HEADER]: -15, // Page header, often not content but may contain article headers
274+
[TAG_FOOTER]: -25, // Footer, rarely content
275+
[TAG_NAV]: -30, // Navigation, not content
276+
[TAG_ASIDE]: -25, // Sidebar, usually not main content
277277

278278
// Form elements (negative)
279279
[TAG_FORM]: -8, // User input, not content
@@ -289,7 +289,7 @@ const TagScores = {
289289
[TAG_OBJECT]: -3, // Embedded content, often ads
290290

291291
// Links
292-
[TAG_A]: -3, // Link, more negative to avoid navigation-heavy areas
292+
[TAG_A]: -8, // Higher penalty to avoid navigation-heavy areas
293293

294294
// Text formatting
295295
[TAG_STRONG]: 1, // Emphasized text, slightly positive
@@ -323,33 +323,41 @@ function scoreClassAndId(node: ElementNode) {
323323
const className = node.attributes.class as string
324324

325325
// Check for specific strong negative patterns first
326-
if (/nav|menu|header|footer|sidebar/i.test(className)) {
327-
scoreAdjustment -= 25
326+
if (/nav|menu|header|footer|sidebar|ad-|advertisement|banner|promo|cta|button|apply|trial|engagement|sharing|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i.test(className)) {
327+
scoreAdjustment -= 35
328328
}
329329
// Then check for other negative patterns
330330
else if (REGEXPS.negative.test(className)) {
331-
scoreAdjustment -= 10 // -10 per scoring.md
331+
scoreAdjustment -= 15 // Increased penalty
332332
}
333333
// Only apply positive patterns if no negative patterns matched
334334
else if (REGEXPS.positive.test(className)) {
335335
scoreAdjustment += 10 // +10 per scoring.md
336+
// Special boost for documentation content
337+
if (/docs?|documentation|guide|tutorial|reference|manual|article/i.test(className)) {
338+
scoreAdjustment += 5 // Extra boost for docs
339+
}
336340
}
337341
}
338342

339343
if (node.attributes?.id) {
340344
const id = node.attributes.id as string
341345

342346
// Check for specific strong negative patterns first
343-
if (/nav|menu|header|footer|sidebar/i.test(id)) {
344-
scoreAdjustment -= 25
347+
if (/nav|menu|header|footer|sidebar|ad-|advertisement|banner|promo|cta|button|apply|trial|engagement|sharing|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i.test(id)) {
348+
scoreAdjustment -= 35
345349
}
346350
// Then check for other negative patterns
347351
else if (REGEXPS.negative.test(id)) {
348-
scoreAdjustment -= 10 // -10 per scoring.md
352+
scoreAdjustment -= 15 // Increased penalty
349353
}
350354
// Only apply positive patterns if no negative patterns matched
351355
else if (REGEXPS.positive.test(id)) {
352356
scoreAdjustment += 10 // +10 per scoring.md
357+
// Special boost for documentation content
358+
if (/docs?|documentation|guide|tutorial|reference|manual|article/i.test(id)) {
359+
scoreAdjustment += 5 // Extra boost for docs
360+
}
353361
}
354362
}
355363

@@ -399,9 +407,9 @@ export function readabilityPlugin() {
399407

400408
// Check for strong negative patterns that should override parent context
401409
const hasStrongNegativePattern = (
402-
(node.name && /nav|header|footer|aside/i.test(node.name))
403-
|| (node.attributes?.class && /nav|menu|header|footer|sidebar|hidden|copyright/i.test(node.attributes.class as string))
404-
|| (node.attributes?.id && /nav|menu|header|footer|sidebar|hidden|copyright/i.test(node.attributes.id as string))
410+
(node.name && /nav|header|footer|aside|form|fieldset|button/i.test(node.name))
411+
|| (node.attributes?.class && /nav|menu|header|footer|sidebar|hidden|copyright|ad-|advertisement|banner|promo|related|comment|login|register|subscribe|newsletter|category|meta|tag|cta|button|apply|trial|engagement|sharing|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i.test(node.attributes.class as string))
412+
|| (node.attributes?.id && /nav|menu|header|footer|sidebar|hidden|copyright|ad-|advertisement|banner|promo|related|comment|login|register|subscribe|newsletter|category|meta|tag|cta|button|apply|trial|engagement|sharing|likes|views|metrics|stats|breadcrumb|pagination|filter|sort|search/i.test(node.attributes.id as string))
405413
|| (node.attributes?.style && /display:\s*none|visibility:\s*hidden/i.test(node.attributes.style as string))
406414
|| (node.attributes && Object.keys(node.attributes).some(attr => attr.startsWith('aria-') && node.attributes![attr] === 'true' && /hidden|invisible/i.test(attr)))
407415
)
@@ -501,31 +509,50 @@ export function readabilityPlugin() {
501509
const linkDensity = linkTextLength / textLength
502510

503511
// Apply more aggressive link density penalty
504-
if (linkDensity > 0.5) {
512+
if (linkDensity > 0.4) {
505513
// For very high link density, apply severe penalty and mark as navigation-like
506-
if (linkDensity > 0.7) {
507-
node.context.score = node.context.score * 0.05 // 95% reduction
508-
// If we have very high link density, mark as navigation-like content
509-
if (linkTextLength > 100) {
514+
if (linkDensity > 0.6) {
515+
node.context.score = node.context.score * 0.02 // 98% reduction
516+
// If we have high link density, mark as navigation-like content
517+
if (linkTextLength > 50) {
510518
node.context.isHighLinkDensity = true
511519
}
512520
}
513521
else {
514522
// Scale score down based on link density
515-
node.context.score *= (1 - linkDensity * 1.5) // More aggressive scaling
523+
node.context.score *= (1 - linkDensity * 2.0) // Even more aggressive scaling
516524
}
517525
}
518-
else if (linkDensity > 0.25) { // Lower threshold for moderate link density
526+
else if (linkDensity > 0.2) { // Lower threshold for moderate link density
519527
// Even moderate link density should reduce score significantly
520-
node.context.score *= (1 - (linkDensity * 0.75))
528+
node.context.score *= (1 - (linkDensity * 1.0))
529+
}
530+
}
531+
532+
// Special penalty for elements with many links based on link density and text length
533+
if (linkTextLength > 0 && textLength > 0) {
534+
const linkRatio = linkTextLength / textLength
535+
const hasDocumentationMarkers = (
536+
(node.attributes?.class && /docs?|documentation|guide|tutorial|reference|manual|article|content/i.test(node.attributes.class as string))
537+
|| (node.attributes?.id && /docs?|documentation|guide|tutorial|reference|manual|article|content/i.test(node.attributes.id as string))
538+
|| (node.name && /main|article|section/i.test(node.name))
539+
)
540+
541+
if (linkRatio > 0.3 && linkTextLength > 30 && !hasDocumentationMarkers) {
542+
// This looks like navigation-heavy content (but not documentation with inline links)
543+
node.context.score -= 10
521544
}
522545
}
523546

524547
// Only exclude content with low scores to reduce fragmentation
525548
const finalScore = node.context.score
526549

527-
if (finalScore <= -10) {
528-
// Exclude content with low scores to filter out poor quality content
550+
if (finalScore <= -12) {
551+
// More aggressive exclusion threshold to filter out navigation and low-quality content
552+
createBufferRegion(node, state, false)
553+
}
554+
// Also exclude high link density content regardless of other scoring
555+
else if (node.context.isHighLinkDensity || (linkTextLength > 50 && textLength > 0 && (linkTextLength / textLength) > 0.5)) {
529556
createBufferRegion(node, state, false)
530557
}
531558
// Don't create inclusion regions dynamically - let content flow naturally

0 commit comments

Comments
 (0)