/
degentenaar.recipe
85 lines (70 loc) · 2.98 KB
/
degentenaar.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
www.nieuwsblad.be
'''
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
def new_tag(soup, name, attrs=()):
    """Create a new tag called *name* that belongs to *soup*.

    If *soup* exposes a ``new_tag`` factory it is used, with *attrs*
    converted to a dict.  Otherwise the ``Tag`` class imported at the top
    of this file is instantiated directly, receiving *attrs* unchanged
    (or ``None`` when *attrs* is empty).

    NOTE(review): the two paths presumably correspond to newer and older
    BeautifulSoup APIs shipped with calibre -- confirm.

    :param soup: parsed document the new tag is created for.
    :param name: tag name, e.g. ``'meta'``.
    :param attrs: iterable of ``(attribute, value)`` pairs.
    :return: the newly created tag object.
    """
    factory = getattr(soup, 'new_tag', None)
    if factory is None:
        # No factory on the soup object: build the Tag by hand.
        return Tag(soup, name, attrs=attrs or None)
    return factory(name, attrs=dict(attrs))
class DeGentenaarOnline(BasicNewsRecipe):
    """News recipe for De Gentenaar, built on feeds from feeds.nieuwsblad.be.

    The class-level attributes configure the ``BasicNewsRecipe`` base class;
    the methods adapt article URLs to the print layout and normalise the
    downloaded markup.
    """

    # Recipe metadata.
    title = 'De Gentenaar'
    __author__ = 'Darko Miletic'
    description = 'News from Belgium in Dutch'
    publisher = 'De Gentenaar'
    category = 'news, politics, Belgium'

    # Download settings consumed by the base class.
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    language = 'nl_BE'

    # Values written into the <html> element and the Content-Language
    # <meta> tag by preprocess_html().
    lang = 'nl-BE'
    direction = 'ltr'

    # Conversion options assembled from the metadata above.
    html2lrf_options = [
        '--comment', description, '--category', category, '--publisher', publisher
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + \
        category + \
        '"\noverride_css=" p {text-indent: 0cm; margin-top: 0em; margin-bottom: 0.5em} "'

    # Only the title, introduction and main-text spans are kept.
    keep_only_tags = [dict(name='span', attrs={
        'id': ['lblArticleTitle', 'lblArticleIntroduction', 'lblArticleMainText']})]

    remove_tags = [dict(name=['embed', 'object'])]

    # (section title, feed URL) pairs.  The 'Economie' entry used to be
    # listed twice with the same URL; it is now listed once.
    feeds = [
        (u'Snelnieuws', u'http://feeds.nieuwsblad.be/nieuws/snelnieuws'),
        (u'Binnenland', u'http://feeds.nieuwsblad.be/nieuws/binnenland'),
        (u'Buitenland', u'http://feeds.nieuwsblad.be/nieuwsblad/buitenland'),
        (u'Economie', u'http://feeds.nieuwsblad.be/economie/home'),
        (u'Algemeen', u'http://feeds.nieuwsblad.be/life/algemeen'),
        (u'Film', u'http://feeds.nieuwsblad.be/life/film'),
        (u'Boek', u'http://feeds.nieuwsblad.be/life/boeken'),
        (u'Muziek', u'http://feeds.nieuwsblad.be/life/muziek'),
        (u'Podium', u'http://feeds.nieuwsblad.be/life/podium'),
        (u'TV & radio', u'http://feeds.nieuwsblad.be/life/tv'),
    ]

    def print_version(self, url):
        """Return *url* with the detail-page path swapped for the print page.

        URLs that do not contain ``/Detail.aspx?articleid`` are returned
        unchanged.
        """
        return url.replace('/Detail.aspx?articleid', '/PrintArticle.aspx?ArticleID')

    def get_article_url(self, article):
        """Return the ``guid`` entry of *article*, or ``None`` if absent."""
        return article.get('guid', None)

    def preprocess_html(self, soup):
        """Clean up the parsed article *soup* and return it.

        Steps performed:

        * remove the ``onload`` attribute from ``<body>``;
        * strip every inline ``style`` attribute;
        * rename each ``<span>`` to ``<div>``, except the span with id
          ``lblArticleTitle`` which becomes ``<h3>``;
        * set ``lang``/``dir`` on ``<html>`` and insert Content-Language and
          Content-Type ``<meta>`` tags at the start of ``<head>``.

        Steps that need ``<body>``, ``<html>`` or ``<head>`` are skipped when
        that element is missing, so a fragmentary document is still returned
        instead of raising.
        """
        body = soup.body
        if body is not None:
            del body['onload']

        for styled in soup.findAll(style=True):
            del styled['style']

        for span in soup.findAll('span'):
            span.name = 'h3' if span.get('id') == 'lblArticleTitle' else 'div'

        html = soup.html
        if html is not None:
            html['lang'] = self.lang
            html['dir'] = self.direction

        head = soup.head
        if head is not None:
            mlang = new_tag(soup, 'meta', [
                ("http-equiv", "Content-Language"), ("content", self.lang)])
            mcharset = new_tag(soup, 'meta', [
                ("http-equiv", "Content-Type"), ("content", "text/html; charset=utf-8")])
            head.insert(0, mlang)
            head.insert(1, mcharset)
        return soup