Skip to content

Commit

Permalink
Update NYTimes Tech Beat
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Jun 2, 2018
1 parent 4e0ada4 commit 924acd1
Showing 1 changed file with 36 additions and 4 deletions.
40 changes: 36 additions & 4 deletions recipes/nytimes_tech.recipe
Expand Up @@ -19,7 +19,7 @@ def classes(classes):
class NYTimesTechnology(BasicNewsRecipe):
title = 'New York Times Technology Beat'
language = 'en'
description = 'The latest in technology from David Pogue'
description = 'The latest in technology - Gadgetwise'
publisher = 'The New York Times'
category = 'Technology'
oldest_article = 14
Expand All @@ -31,9 +31,41 @@ class NYTimesTechnology(BasicNewsRecipe):
(u'Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
]
keep_only_tags = [
dict(name='h1'),
classes('extended-byline story-body'),
dict(id='story'),
]
remove_tags = [
classes('visually-hidden newsletter-signup nocontent robots-nocontent hidden'),
dict(attrs={'aria-label':'tools'.split()}),
dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
dict(href='#site-content #site-index'.split()),
dict(attrs={'aria-hidden':'true'}),
dict(attrs={'data-videoid':True}),
dict(name='button meta link'.split()),
dict(id=lambda x: x and x.startswith('story-ad-')),
dict(name='head'),
dict(role='toolbar'),
dict(name='a', href=lambda x: x and '#story-continues-' in x),
dict(name='a', href=lambda x: x and '#whats-next' in x),
dict(id=lambda x: x and 'sharetools-' in x),
dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
dict(attrs={'class': lambda x: x and (
'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
]

def preprocess_html(self, soup):
    """Tidy a parsed article page and return it.

    Three independent clean-ups are applied to ``soup`` in place:

    1. a single space is inserted at the start of the dateline element,
    2. empty ``<li>`` tags whose class starts with ``css-`` are removed,
    3. the ``<h1 itemprop="headline">`` element is moved to the very
       start of ``<body>``.

    :param soup: parsed article document (a BeautifulSoup-style tree
        exposing ``find``/``findAll``).
    :return: the same ``soup`` object, after modification.
    """
    # Add a space to the dateline
    # NOTE(review): ``classes`` is a module-level helper; it is used here as
    # a source of keyword arguments for ``find`` - confirm its exact shape.
    dateline = soup.find(**classes('dateline'))
    if dateline is not None:
        dateline.insert(0, ' ')

    # Remove empty li tags
    for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
        if not li.contents and not li.string:
            li.extract()

    # Ensure the headline is first
    h1 = soup.find('h1', itemprop='headline')
    body = soup.find('body')
    # Only move the headline when there is a body to receive it; otherwise
    # leave the document untouched instead of failing on ``None``.
    if h1 is not None and body is not None:
        h1.extract()
        # Go through Tag.insert() rather than mutating ``body.contents``
        # directly, so the parser library can keep the parent/sibling links
        # of the re-attached headline consistent with the rest of the tree.
        body.insert(0, h1)
    return soup

0 comments on commit 924acd1

Please sign in to comment.