@@ -193,7 +193,7 @@ import { createPlugin } from '../pluggable/plugin'
193193 - Tag score: +15
194194 - Class "main-content" matches positive pattern: +10
195195 - Total score: +25 → Include
196- */
196+ */
197197
198198export interface ReadabilityOptions {
199199 /**
@@ -206,9 +206,9 @@ export interface ReadabilityOptions {
206206// Regular expressions for scoring based on scoring.md
207207const REGEXPS = {
208208 // Positive patterns that suggest high-quality content
209- positive : / a r t i c l e | b o d y | c o n t e n t | e n t r y | m a i n | p a g e | p o s t | t e x t | b l o g | s t o r y / i,
209+ positive : / a r t i c l e | b o d y | c o n t e n t | e n t r y | m a i n | p a g e | p o s t | t e x t | b l o g | s t o r y | r e c i p e | i n g r e d i e n t | i n s t r u c t i o n | d e s c r i p t i o n | d o c s ? | d o c u m e n t a t i o n | g u i d e | t u t o r i a l | r e f e r e n c e | m a n u a l / i,
210210 // Negative patterns that suggest low-quality content
211- negative : / a d | b a n n e r | c o m b x | c o m m e n t | d i s q u s | e x t r a | f o o t | h e a d e r | m e n u | m e t a | n a v | p r o m o | r e l a t e d | s c r o l l | s h a r e | s i d e b a r | s p o n s o r | s o c i a l | t a g s | w i d g e t | s i t e m a p | c o p y r i g h t / i,
211+ negative : / a d | b a n n e r | c o m b x | c o m m e n t | d i s q u s | e x t r a | f o o t | h e a d e r | m e n u | m e t a | n a v | p r o m o | r e l a t e d | s c r o l l | s h a r e | s i d e b a r | s p o n s o r | s o c i a l | t a g s | w i d g e t | s i t e m a p | c o p y r i g h t | l o g i n | r e g i s t e r | s u b s c r i b e | n e w s l e t t e r | s i g n u p | c a t e g o r y | a u t h o r | d a t e | p u b l i s h | c t a | b u t t o n | a p p l y | t r i a l | l i k e s | v i e w s | m e t r i c s | s t a t s | b r e a d c r u m b | p a g i n a t i o n | f i l t e r | s o r t | s e a r c h / i,
212212 // Used for counting commas to determine complexity
213213 commas : / , / g,
214214 // Used for analyzing paragraph endings
@@ -232,8 +232,8 @@ const TagScores = {
232232 [ TAG_BLOCKQUOTE ] : 5 , // Quoted content, usually important
233233
234234 // Code and pre-formatted content
235- [ TAG_PRE ] : 5 , // Preformatted text/code, high value
236- [ TAG_CODE ] : 5 , // Code content, high value
235+ [ TAG_PRE ] : 8 , // Preformatted text/code, high value for documentation
236+ [ TAG_CODE ] : 6 , // Code content, high value for documentation
237237
238238 // Media elements
239239 [ TAG_IMG ] : 3 , // Images are typically content
@@ -254,9 +254,9 @@ const TagScores = {
254254 [ TAG_TD ] : 0 , // Table cell, neutral
255255
256256 // List elements
257- [ TAG_UL ] : - 1 , // Slightly penalize lists as they're often navigation
258- [ TAG_OL ] : 0 , // Ordered lists are more likely to be content
259- [ TAG_LI ] : - 2 , // Increase penalty for list items to avoid nav lists
257+ [ TAG_UL ] : - 8 , // Higher penalty as lists are often navigation
258+ [ TAG_OL ] : - 5 , // Ordered lists still often navigation
259+ [ TAG_LI ] : - 6 , // Higher penalty for list items to avoid nav lists
260260 [ TAG_DL ] : 0 , // Definition lists, neutral
261261 [ TAG_DT ] : 0 , // Definition lists, neutral
262262 [ TAG_DD ] : 0 , // Definition lists, neutral
@@ -270,10 +270,10 @@ const TagScores = {
270270 [ TAG_H6 ] : 0 , // Minor headers, neutral
271271
272272 // Navigation and structural elements (negative)
273- [ TAG_HEADER ] : - 15 , // Page header, often not content
274- [ TAG_FOOTER ] : - 15 , // Footer, rarely content
275- [ TAG_NAV ] : - 20 , // Navigation, not content
276- [ TAG_ASIDE ] : - 15 , // Sidebar, usually not main content
273+ [ TAG_HEADER ] : - 15 , // Page header, often not content but may contain article headers
274+ [ TAG_FOOTER ] : - 25 , // Footer, rarely content
275+ [ TAG_NAV ] : - 30 , // Navigation, not content
276+ [ TAG_ASIDE ] : - 25 , // Sidebar, usually not main content
277277
278278 // Form elements (negative)
279279 [ TAG_FORM ] : - 8 , // User input, not content
@@ -289,7 +289,7 @@ const TagScores = {
289289 [ TAG_OBJECT ] : - 3 , // Embedded content, often ads
290290
291291 // Links
292- [ TAG_A ] : - 3 , // Link, more negative to avoid navigation-heavy areas
292+ [ TAG_A ] : - 8 , // Higher penalty to avoid navigation-heavy areas
293293
294294 // Text formatting
295295 [ TAG_STRONG ] : 1 , // Emphasized text, slightly positive
@@ -323,33 +323,41 @@ function scoreClassAndId(node: ElementNode) {
323323 const className = node . attributes . class as string
324324
325325 // Check for specific strong negative patterns first
326- if ( / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r / i. test ( className ) ) {
327- scoreAdjustment -= 25
326+ if ( / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r | a d - | a d v e r t i s e m e n t | b a n n e r | p r o m o | c t a | b u t t o n | a p p l y | t r i a l | e n g a g e m e n t | s h a r i n g | l i k e s | v i e w s | m e t r i c s | s t a t s | b r e a d c r u m b | p a g i n a t i o n | f i l t e r | s o r t | s e a r c h / i. test ( className ) ) {
327+ scoreAdjustment -= 35
328328 }
329329 // Then check for other negative patterns
330330 else if ( REGEXPS . negative . test ( className ) ) {
331- scoreAdjustment -= 10 // -10 per scoring.md
331+ scoreAdjustment -= 15 // Increased penalty
332332 }
333333 // Only apply positive patterns if no negative patterns matched
334334 else if ( REGEXPS . positive . test ( className ) ) {
335335 scoreAdjustment += 10 // +10 per scoring.md
336+ // Special boost for documentation content
337+ if ( / d o c s ? | d o c u m e n t a t i o n | g u i d e | t u t o r i a l | r e f e r e n c e | m a n u a l | a r t i c l e / i. test ( className ) ) {
338+ scoreAdjustment += 5 // Extra boost for docs
339+ }
336340 }
337341 }
338342
339343 if ( node . attributes ?. id ) {
340344 const id = node . attributes . id as string
341345
342346 // Check for specific strong negative patterns first
343- if ( / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r / i. test ( id ) ) {
344- scoreAdjustment -= 25
347+ if ( / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r | a d - | a d v e r t i s e m e n t | b a n n e r | p r o m o | c t a | b u t t o n | a p p l y | t r i a l | e n g a g e m e n t | s h a r i n g | l i k e s | v i e w s | m e t r i c s | s t a t s | b r e a d c r u m b | p a g i n a t i o n | f i l t e r | s o r t | s e a r c h / i. test ( id ) ) {
348+ scoreAdjustment -= 35
345349 }
346350 // Then check for other negative patterns
347351 else if ( REGEXPS . negative . test ( id ) ) {
348- scoreAdjustment -= 10 // -10 per scoring.md
352+ scoreAdjustment -= 15 // Increased penalty
349353 }
350354 // Only apply positive patterns if no negative patterns matched
351355 else if ( REGEXPS . positive . test ( id ) ) {
352356 scoreAdjustment += 10 // +10 per scoring.md
357+ // Special boost for documentation content
358+ if ( / d o c s ? | d o c u m e n t a t i o n | g u i d e | t u t o r i a l | r e f e r e n c e | m a n u a l | a r t i c l e / i. test ( id ) ) {
359+ scoreAdjustment += 5 // Extra boost for docs
360+ }
353361 }
354362 }
355363
@@ -399,9 +407,9 @@ export function readabilityPlugin() {
399407
400408 // Check for strong negative patterns that should override parent context
401409 const hasStrongNegativePattern = (
402- ( node . name && / n a v | h e a d e r | f o o t e r | a s i d e / i. test ( node . name ) )
403- || ( node . attributes ?. class && / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r | h i d d e n | c o p y r i g h t / i. test ( node . attributes . class as string ) )
404- || ( node . attributes ?. id && / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r | h i d d e n | c o p y r i g h t / i. test ( node . attributes . id as string ) )
410+ ( node . name && / n a v | h e a d e r | f o o t e r | a s i d e | f o r m | f i e l d s e t | b u t t o n / i. test ( node . name ) )
411+ || ( node . attributes ?. class && / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r | h i d d e n | c o p y r i g h t | a d - | a d v e r t i s e m e n t | b a n n e r | p r o m o | r e l a t e d | c o m m e n t | l o g i n | r e g i s t e r | s u b s c r i b e | n e w s l e t t e r | c a t e g o r y | m e t a | t a g | c t a | b u t t o n | a p p l y | t r i a l | e n g a g e m e n t | s h a r i n g | l i k e s | v i e w s | m e t r i c s | s t a t s | b r e a d c r u m b | p a g i n a t i o n | f i l t e r | s o r t | s e a r c h / i. test ( node . attributes . class as string ) )
412+ || ( node . attributes ?. id && / n a v | m e n u | h e a d e r | f o o t e r | s i d e b a r | h i d d e n | c o p y r i g h t | a d - | a d v e r t i s e m e n t | b a n n e r | p r o m o | r e l a t e d | c o m m e n t | l o g i n | r e g i s t e r | s u b s c r i b e | n e w s l e t t e r | c a t e g o r y | m e t a | t a g | c t a | b u t t o n | a p p l y | t r i a l | e n g a g e m e n t | s h a r i n g | l i k e s | v i e w s | m e t r i c s | s t a t s | b r e a d c r u m b | p a g i n a t i o n | f i l t e r | s o r t | s e a r c h / i. test ( node . attributes . id as string ) )
405413 || ( node . attributes ?. style && / d i s p l a y : \s * n o n e | v i s i b i l i t y : \s * h i d d e n / i. test ( node . attributes . style as string ) )
406414 || ( node . attributes && Object . keys ( node . attributes ) . some ( attr => attr . startsWith ( 'aria-' ) && node . attributes ! [ attr ] === 'true' && / h i d d e n | i n v i s i b l e / i. test ( attr ) ) )
407415 )
@@ -501,31 +509,50 @@ export function readabilityPlugin() {
501509 const linkDensity = linkTextLength / textLength
502510
503511 // Apply more aggressive link density penalty
504- if ( linkDensity > 0.5 ) {
512+ if ( linkDensity > 0.4 ) {
505513 // For very high link density, apply severe penalty and mark as navigation-like
506- if ( linkDensity > 0.7 ) {
507- node . context . score = node . context . score * 0.05 // 95 % reduction
508- // If we have very high link density, mark as navigation-like content
509- if ( linkTextLength > 100 ) {
514+ if ( linkDensity > 0.6 ) {
515+ node . context . score = node . context . score * 0.02 // 98 % reduction
516+ // If we have high link density, mark as navigation-like content
517+ if ( linkTextLength > 50 ) {
510518 node . context . isHighLinkDensity = true
511519 }
512520 }
513521 else {
514522 // Scale score down based on link density
515- node . context . score *= ( 1 - linkDensity * 1.5 ) // More aggressive scaling
523+ node . context . score *= ( 1 - linkDensity * 2.0 ) // Even more aggressive scaling
516524 }
517525 }
518- else if ( linkDensity > 0.25 ) { // Lower threshold for moderate link density
526+ else if ( linkDensity > 0.2 ) { // Lower threshold for moderate link density
519527 // Even moderate link density should reduce score significantly
520- node . context . score *= ( 1 - ( linkDensity * 0.75 ) )
528+ node . context . score *= ( 1 - ( linkDensity * 1.0 ) )
529+ }
530+ }
531+
532+ // Special penalty for elements with many links based on link density and text length
533+ if ( linkTextLength > 0 && textLength > 0 ) {
534+ const linkRatio = linkTextLength / textLength
535+ const hasDocumentationMarkers = (
536+ ( node . attributes ?. class && / d o c s ? | d o c u m e n t a t i o n | g u i d e | t u t o r i a l | r e f e r e n c e | m a n u a l | a r t i c l e | c o n t e n t / i. test ( node . attributes . class as string ) )
537+ || ( node . attributes ?. id && / d o c s ? | d o c u m e n t a t i o n | g u i d e | t u t o r i a l | r e f e r e n c e | m a n u a l | a r t i c l e | c o n t e n t / i. test ( node . attributes . id as string ) )
538+ || ( node . name && / m a i n | a r t i c l e | s e c t i o n / i. test ( node . name ) )
539+ )
540+
541+ if ( linkRatio > 0.3 && linkTextLength > 30 && ! hasDocumentationMarkers ) {
542+ // This looks like navigation-heavy content (but not documentation with inline links)
543+ node . context . score -= 10
521544 }
522545 }
523546
524547 // Only exclude content with low scores to reduce fragmentation
525548 const finalScore = node . context . score
526549
527- if ( finalScore <= - 10 ) {
528- // Exclude content with low scores to filter out poor quality content
550+ if ( finalScore <= - 12 ) {
551+ // More aggressive exclusion threshold to filter out navigation and low-quality content
552+ createBufferRegion ( node , state , false )
553+ }
554+ // Also exclude high link density content regardless of other scoring
555+ else if ( node . context . isHighLinkDensity || ( linkTextLength > 50 && textLength > 0 && ( linkTextLength / textLength ) > 0.5 ) ) {
529556 createBufferRegion ( node , state , false )
530557 }
531558 // Don't create inclusion regions dynamically - let content flow naturally
0 commit comments