#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
import json
from calibre import prepare_string_for_xml
from calibre.web.feeds.recipes import BasicNewsRecipe
# Article JSON parser {{{
def serialize_image(block):
    """Render an 'image' content block as an HTML <div> holding the
    <img> tag and, when present, its text caption."""
    yield '<div>'
    model = block['model']
    img = model['image']
    alt_text = prepare_string_for_xml(img.get('alt') or '', True)
    src = None
    # Prefer the original-resolution source when the payload provides one.
    for key in ('originalSrc', 'src'):
        if key in img:
            src = prepare_string_for_xml(img[key])
            break
    if src is None:
        raise ValueError('No src found in img block: {}'.format(img))
    yield '<img src="{}" alt="{}"/>'.format(src, alt_text)
    caption = model.get('caption')
    if caption and caption.get('type') == 'text':
        yield '<div>'
        yield from serialize_paragraph(caption)
        yield '</div>'
    yield '</div>'
def block_tag(name, generator):
    """Yield the items of *generator* wrapped in <name> ... </name> tags."""
    yield '<{}>'.format(name)
    yield from generator
    yield '</{}>'.format(name)
def serialize_paragraph(block):
    """Yield HTML for the inline children of a paragraph block:
    styled text fragments and hyperlinks; other child types are ignored."""
    for child in block['model']['blocks']:
        kind = child['type']
        if kind == 'fragment':
            model = child['model']
            css = []
            # Map the payload's text attributes onto inline CSS.
            for attr in model['attributes']:
                if attr == 'bold':
                    css.append('font-weight: bold')
                elif attr in ('italic', 'italics'):
                    css.append('font-style: italic')
            text = prepare_string_for_xml(model['text'])
            if css:
                yield '<span style="{}">{}</span>'.format('; '.join(css), text)
            else:
                yield text
        elif kind == 'urlLink':
            model = child['model']
            href = prepare_string_for_xml(model['locator'], True)
            yield '<a href="{}">{}</a>'.format(href, prepare_string_for_xml(model['text']))
def serialize_list(block):
    """Yield an <li> element for every listItem child of a list block."""
    for item in block['model']['blocks']:
        if item['type'] != 'listItem':
            continue
        yield from block_tag('li', serialize_paragraph(item))
def serialize_text(block):
    """Dispatch each child of a 'text' content block to the matching
    HTML serializer; unknown child types fail loudly."""
    for child in block['model']['blocks']:
        kind = child['type']
        if kind == 'paragraph':
            yield from block_tag('p', serialize_paragraph(child))
        elif kind == 'unorderedList':
            yield from block_tag('ul', serialize_list(child))
        elif kind == 'orderedList':
            yield from block_tag('ol', serialize_list(child))
        else:
            raise KeyError('Unknown block type: ' + child['type'])
def serialize_contributor(contributor):
    """Render a contributor byline: name ('title') as an <h3> and
    role ('subtitle') as a <div>, each only when present."""
    for key, tag in (('title', 'h3'), ('subtitle', 'div')):
        if key in contributor:
            yield '<{0}>{1}</{0}>'.format(tag, prepare_string_for_xml(contributor[key]))
def parse_article_json(root, abort_article):
    """Build the article HTML from the page's __INITIAL_DATA__ JSON.

    Calls abort_article() for pages that only carry a media experience
    (video articles); raises KeyError when no article payload exists.
    """
    data = root['data']
    article = None
    saw_media_experience = False
    for key, value in data.items():
        if key.startswith('article?'):
            article = value['data']
            break
        if key.startswith('media-experience?'):
            saw_media_experience = True
    if article is None:
        if saw_media_experience:
            abort_article('Skipping video article')
            return
        raise KeyError('No article found in data keys: {}'.format(data.keys()))
    lines = []
    headline = article.get('headline')
    if headline:
        lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(headline)))
    contributor = article.get('contributor')
    if contributor:
        lines.extend(serialize_contributor(contributor))
    # Only image and text blocks are rendered; other types are skipped.
    for block in article['content']['model']['blocks']:
        block_type = block.get('type')
        if block_type == 'image':
            lines.extend(serialize_image(block))
        elif block_type == 'text':
            lines.extend(serialize_text(block))
    return '<html><body id="main-content">' + '\n'.join(lines) + '</body></html>'
# }}}
class BBCNews(BasicNewsRecipe):
    # Select / de-select the feeds you want in your ebook.
    feeds = [
        ("News Home", "https://feeds.bbci.co.uk/news/rss.xml"),
        ("UK", "https://feeds.bbci.co.uk/news/uk/rss.xml"),
        ("World", "https://feeds.bbci.co.uk/news/world/rss.xml"),
        # ("England", "https://feeds.bbci.co.uk/news/england/rss.xml"),
        # ("Scotland", "https://feeds.bbci.co.uk/news/scotland/rss.xml"),
        # ("Wales", "https://feeds.bbci.co.uk/news/wales/rss.xml"),
        # ("N. Ireland", "https://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
        # ("Africa", "https://feeds.bbci.co.uk/news/world/africa/rss.xml"),
        # ("Asia", "https://feeds.bbci.co.uk/news/world/asia/rss.xml"),
        # ("Europe", "https://feeds.bbci.co.uk/news/world/europe/rss.xml"),
        # ("Latin America", "https://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
        # ("Middle East", "https://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
        ("US & Canada", "https://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml"),
        ("Politics", "https://feeds.bbci.co.uk/news/politics/rss.xml"),
        ("Science/Environment",
         "https://feeds.bbci.co.uk/news/science_and_environment/rss.xml"),
        ("Technology", "https://feeds.bbci.co.uk/news/technology/rss.xml"),
        ("Magazine", "https://feeds.bbci.co.uk/news/magazine/rss.xml"),
        ("Entertainment/Arts",
         "https://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"),
        # ("Health", "https://feeds.bbci.co.uk/news/health/rss.xml"),
        # ("Education/Family", "https://feeds.bbci.co.uk/news/education/rss.xml"),
        ("Business", "https://feeds.bbci.co.uk/news/business/rss.xml"),
        ("Special Reports", "https://feeds.bbci.co.uk/news/special_reports/rss.xml"),
        ("Also in the News", "https://feeds.bbci.co.uk/news/also_in_the_news/rss.xml"),
        # ("Newsbeat", "https://www.bbc.co.uk/newsbeat/rss.xml"),
        # ("Click", "https://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
        # ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "https://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
        # ("Blog: Robert Peston (Business Editor)", "https://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
        # ("Blog: Stephanie Flanders (Economics Editor)", "https://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
        ("Sport Front Page",
         "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml"),
        # ("Football", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
        # ("Cricket", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
        # ("Rugby Union", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
        # ("Rugby League", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
        # ("Tennis", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
        # ("Golf", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
        # ("Motorsport", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
        # ("Boxing", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
        # ("Athletics", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
        # ("Snooker", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
        # ("Horse Racing", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
        # ("Cycling", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
        # ("Disability Sport", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
        # ("Other Sport", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
        # ("Olympics 2012", "https://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
        # ("N. Ireland Politics", "https://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
        # ("Scotland Politics", "https://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
        # ("Scotland Business", "https://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
        # ("E. Scotland, Edinburgh & Fife", "https://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
        # ("W. Scotland & Glasgow", "https://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
        # ("Highlands & Islands", "https://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
        # ("NE. Scotland, Orkney & Shetland", "https://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
        # ("South Scotland", "https://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
        # ("Central Scotland & Tayside", "https://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
        # ("Wales Politics", "https://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
        # ("NW. Wales", "https://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
        # ("NE. Wales", "https://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
        # ("Mid. Wales", "https://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
        # ("SW. Wales", "https://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
        # ("SE. Wales", "https://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
        # ("Newyddion - News in Welsh", "https://feeds.bbci.co.uk/newyddion/rss.xml"),
        # ("Gwleidyddiaeth", "https://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
        # ("Gogledd-Ddwyrain", "https://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
        # ("Gogledd-Orllewin", "https://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
        # ("Canolbarth", "https://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
        # ("De-Ddwyrain", "https://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
        # ("De-Orllewin", "https://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
    ]
    # **** SELECT YOUR USER PREFERENCES ****
    # Title to use for the ebook.
    #
    title = 'BBC News'
    # A brief description for the ebook.
    #
    description = u'BBC web site ebook created using rss feeds.'
    # The max number of articles which may be downloaded from each feed.
    # I've never seen more than about 70 articles in a single feed in the
    # BBC feeds.
    #
    max_articles_per_feed = 100
    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, Eg. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5 but watch out ebook creation time will
    # increase as well. Setting this to 30 will get everything (AFAICT) as long
    # as max_articles_per_feed remains set high (except for 'Click' which is
    # v. low volume and its currently oldest article is 4th Feb 2011).
    #
    oldest_article = 1.5
    # Number of simultaneous downloads. 20 is consistantly working fine on the
    # BBC News feeds with no problems. Speeds things up from the defualt of 5.
    # If you have a lot of feeds and/or have increased oldest_article above 2
    # then you may wish to try increasing simultaneous_downloads to 25-30,
    # Or, of course, if you are in a hurry. [I've not tried beyond 20.]
    #
    simultaneous_downloads = 20
    # The format string for the date shown on the ebook's first page.
    # List of all values: https://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds' select/de-select by adding/removing the initial '#',
    # only one timefmt should be selected, here's a few to choose from.
    #
    # [Fri, 14 Nov 2011] (Calibre default)
    timefmt = ' [%a, %d %b %Y]'
    # timefmt = ' [%a, %d %b %Y %H:%M]'       # [Fri, 14 Nov 2011 18:30]
    # timefmt = ' [%a, %d %b %Y %I:%M %p]'    # [Fri, 14 Nov 2011 06:30 PM]
    # timefmt = ' [%d %b %Y]'                 # [14 Nov 2011]
    # timefmt = ' [%d %b %Y %H:%M]'           # [14 Nov 2011 18.30]
    # timefmt = ' [%Y-%m-%d]'                 # [2011-11-14]
    # timefmt = ' [%Y-%m-%d-%H-%M]'           # [2011-11-14-18-30]
    #
    # **** IMPORTANT ****
    #
    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    # DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    # I MEAN IT, YES I DO, ABSOLUTELY, AT YOU OWN RISK. :)
    #
    # **** IMPORTANT ****
    #
    # Author of this recipe.
    __author__ = 'Kovid Goyal'
    # Specify English as the language of the RSS feeds (ISO-639 code).
    language = 'en_GB'
    # Set publisher and publication type.
    publication_type = 'newspaper'
    encoding = 'utf-8'
    use_embedded_content = False
    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}
    resolve_internal_links = True

    def preprocess_raw_html(self, raw_html, url):
        """Extract the embedded ``window.__INITIAL_DATA__`` JSON from the
        article page and convert it to simplified HTML.

        Raises ValueError when either end of the JSON payload cannot be
        located, so a changed page layout fails loudly instead of producing
        a confusing JSONDecodeError or garbage output.
        """
        q = '>window.__INITIAL_DATA__={'
        idx = raw_html.find(q)
        if idx < 0:
            raise ValueError('Failed to find JSON')
        # Keep the opening '{' (hence the -1) and everything after it.
        data = raw_html[idx + len(q) - 1:]
        end = data.find('};</script>')
        if end < 0:
            # Previously an unterminated payload yielded data[:0] == '' and
            # json.loads('') raised an unhelpful error; report it clearly.
            raise ValueError('Failed to find end of JSON')
        data = data[:end + 1]  # include the closing '}'
        root = json.loads(data)
        return parse_article_json(root, self.abort_article)