-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Expand file tree
/
Copy pathnews.py
More file actions
1989 lines (1742 loc) · 82.6 KB
/
news.py
File metadata and controls
1989 lines (1742 loc) · 82.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Defines various abstract base classes that can be subclassed to create powerful news fetching recipes.
'''
__docformat__ = 'restructuredtext en'
import io
import os
import re
import sys
import time
import traceback
from collections import defaultdict
from contextlib import closing
from urllib.parse import urlparse, urlsplit
from calibre import __appname__, as_unicode, browser, force_unicode, iswindows, preferred_encoding, random_user_agent, strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup, CData, NavigableString, Tag
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.date import now as nowf
from calibre.utils.icu import numeric_sort_key
from calibre.utils.img import add_borders_to_image, image_to_data, save_cover_data_to
from calibre.utils.localization import _, canonicalize_lang, ngettext
from calibre.utils.logging import ThreadSafeWrapper
from calibre.utils.threadpool import NoResultsPending, ThreadPool, WorkRequest
from calibre.web import Recipe
from calibre.web.feeds import Feed, feed_from_xml, feeds_from_index, templates
from calibre.web.fetch.simple import AbortArticle, RecursiveFetcher
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.web.fetch.utils import prepare_masthead_image
def classes(classes):
    '''
    Build a tag specification dictionary that matches on the ``class`` attribute.

    :param classes: Class names separated by single spaces.
    :return: ``{'attrs': {'class': matcher}}``. ``matcher`` returns its argument unchanged
             when that argument is falsy (missing or empty class attribute), otherwise the
             frozenset of names shared between the attribute value and `classes`
             (empty, and therefore falsy, when nothing matches).
    '''
    wanted = frozenset(classes.split(' '))

    def matcher(value):
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matcher}}
def prefixed_classes(classes):
    '''
    Build a tag specification dictionary that matches any tag having at least
    one class name that starts with one of the given prefixes.

    :param classes: Whitespace separated class name prefixes.
    :return: ``{'attrs': {'class': matcher}}`` where ``matcher`` returns True or False.
    '''
    # split() rather than split(' '): repeated, leading or trailing spaces
    # would otherwise yield an empty prefix, and every string starts with ''
    # so the matcher would accept any non-empty class attribute.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        if not x:
            return False
        # str.startswith() accepts a tuple and tests all prefixes in one call
        return any(candidate.startswith(prefixes) for candidate in x.split())

    return {'attrs': {'class': matcher}}
class LoginFailed(ValueError):
    ''' :class:`ValueError` subclass. Judging by the name it signals a failed login; the places that raise it are not part of this section. '''
class DownloadDenied(ValueError):
    ''' :class:`ValueError` subclass. Judging by the name it signals a refused download; the places that raise it are not part of this section. '''
class BasicNewsRecipe(Recipe):
'''
Base class that contains logic needed in all recipes. By overriding
progressively more of the functionality in this class, you can make
progressively more customized/powerful recipes. For a tutorial introduction
to creating recipes, see :doc:`news`.
'''
#: The title to use for the e-book
title = _('Unknown News Source')
#: A couple of lines that describe the content this recipe downloads.
#: This will be used primarily in a GUI that presents a list of recipes.
description = ''
#: The author of this recipe
__author__ = __appname__
#: Minimum calibre version needed to use this recipe
requires_version = (0, 6, 0)
#: The language that the news is in. Must be an ISO-639 code either
#: two or three characters long
language = 'und'
#: Maximum number of articles to download from each feed. This is primarily
#: useful for feeds that don't have article dates. For most feeds, you should
#: use :attr:`BasicNewsRecipe.oldest_article`
max_articles_per_feed = 100
#: Oldest article to download from this news source. In days.
oldest_article = 7.0
#: Number of levels of links to follow on article webpages
recursions = 0
#: The default delay between consecutive downloads in seconds. The argument may be a
#: floating point number to indicate a more precise time. See :meth:`get_url_specific_delay`
#: to implement per URL delays.
delay = 0
#: Publication type
#: Set to newspaper, magazine or blog. If set to None, no publication type
#: metadata will be written to the opf file.
publication_type = 'unknown'
#: Number of simultaneous downloads. Set to 1 if the server is picky.
#: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
simultaneous_downloads = 5
#: Timeout for fetching files from server in seconds
timeout = 120.0
#: The format string for the date shown on the first page.
#: By default: Day_Name, Day_Number Month_Name Year
timefmt = ' [%a, %d %b %Y]'
#: List of feeds to download.
#: Can be either ``[url1, url2, ...]`` or ``[('title1', url1), ('title2', url2),...]``
feeds = None
#: Max number of characters in the short description
summary_length = 500
#: Convenient flag to disable loading of stylesheets for websites
#: that have overly complex stylesheets unsuitable for conversion
#: to e-book formats.
#: If True stylesheets are not downloaded and processed
no_stylesheets = False
#: Convenient flag to strip all JavaScript tags from the downloaded HTML
remove_javascript = True
#: If True the GUI will ask the user for a username and password
#: to use while downloading.
#: If set to "optional" the use of a username and password becomes optional
needs_subscription = False
#: If True the navigation bar is center aligned, otherwise it is left aligned
center_navbar = True
#: Specify an override encoding for sites that have an incorrect
#: charset specification. The most common being specifying ``latin1`` and
#: using ``cp1252``. If None, try to detect the encoding. If it is a
#: callable, the callable is called with two arguments: The recipe object
#: and the source to be decoded. It must return the decoded source.
encoding = None
#: Normally we try to guess if a feed has full articles embedded in it
#: based on the length of the embedded content. If `None`, then the
#: default guessing is used. If `True` then we always assume the feed has
#: embedded content and if `False` we always assume the feed does not have
#: embedded content.
use_embedded_content = None
#: Set to True and implement :meth:`get_obfuscated_article` to handle
#: websites that try to make it difficult to scrape content.
articles_are_obfuscated = False
#: Reverse the order of articles in each feed
reverse_article_order = False
#: Automatically extract all the text from downloaded article pages. Uses
#: the algorithms from the readability project. Setting this to True, means
#: that you do not have to worry about cleaning up the downloaded HTML
#: manually (though manual cleanup will always be superior).
auto_cleanup = False
#: Specify elements that the auto cleanup algorithm should never remove.
#: The syntax is an XPath expression. For example::
#:
#: auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
#: id="article-image"
#: auto_cleanup_keep = '//*[@class="important"]' will keep all elements
#: with class="important"
#: auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
#: will keep all divs with id="article-image" and spans
#: with class="important"
#:
auto_cleanup_keep = None
#: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files.
#: It will be inserted into `<style>` tags, just before the closing
#: `</head>` tag thereby overriding all :term:`CSS` except that which is
#: declared using the style attribute on individual :term:`HTML` tags.
#: Note that if you want to programmatically generate the extra_css, you should
#: override the :meth:`get_extra_css()` method instead.
#: For example::
#:
#: extra_css = '.heading { font: serif x-large }'
#:
extra_css = None
#: If True empty feeds are removed from the output.
#: This option has no effect if parse_index is overridden in
#: the sub class. It is meant only for recipes that return a list
#: of feeds using `feeds` or :meth:`get_feeds`. It is also used if you use
#: the ignore_duplicate_articles option.
remove_empty_feeds = False
#: List of regular expressions that determines which links to follow.
#: If empty, it is ignored. Used only if is_link_wanted is
#: not implemented. For example::
#:
#: match_regexps = [r'page=[0-9]+']
#:
#: will match all URLs that have `page=some number` in them.
#:
#: Only one of :attr:`BasicNewsRecipe.match_regexps` or
#: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
match_regexps = []
#: List of regular expressions that determines which links to ignore.
#: If empty it is ignored. Used only if is_link_wanted is not
#: implemented. For example::
#:
#: filter_regexps = [r'ads\.doubleclick\.net']
#:
#: will remove all URLs that have `ads.doubleclick.net` in them.
#:
#: Only one of :attr:`BasicNewsRecipe.match_regexps` or
#: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
filter_regexps = []
#: Recipe specific options to control the conversion of the downloaded
#: content into an e-book. These will override any user or plugin specified
#: values, so only use if absolutely necessary. For example::
#:
#: conversion_options = {
#: 'base_font_size' : 16,
#: 'linearize_tables' : True,
#: }
#:
conversion_options = {}
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form::
#:
#: {
#: name : 'tag name', #e.g. 'div'
#: attrs : a dictionary, #e.g. {'class': 'advertisement'}
#: }
#:
#: All keys are optional. For a full explanation of the search criteria, see
#: `Beautiful Soup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/#searching-the-tree>`__
#: A common example::
#:
#: remove_tags = [dict(name='div', class_='advert')]
#:
#: This will remove all `<div class="advert">` tags and all
#: their children from the downloaded :term:`HTML`.
remove_tags = []
#: Remove all tags that occur after the specified tag.
#: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
#: For example::
#:
#: remove_tags_after = [dict(id='content')]
#:
#: will remove all
#: tags after the first element with `id="content"`.
remove_tags_after = None
#: Remove all tags that occur before the specified tag.
#: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
#: For example::
#:
#: remove_tags_before = dict(id='content')
#:
#: will remove all
#: tags before the first element with `id="content"`.
remove_tags_before = None
#: List of attributes to remove from all tags.
#: For example::
#:
#: remove_attributes = ['style', 'font']
remove_attributes = []
#: Keep only the specified tags and their children.
#: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
#: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
#: the tags that match the entries in this list. For example::
#:
#: keep_only_tags = [dict(id=['content', 'heading'])]
#:
#: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
keep_only_tags = []
#: List of :term:`regexp` substitution rules to run on the downloaded :term:`HTML`.
#: Each element of the
#: list should be a two element tuple. The first element of the tuple should
#: be a compiled regular expression and the second a callable that takes
#: a single match object and returns a string to replace the match. For example::
#:
#: preprocess_regexps = [
#: (re.compile(r'<!--Article ends here-->.*</body>', re.DOTALL|re.IGNORECASE),
#: lambda match: '</body>'),
#: ]
#:
#: will remove everything from `<!--Article ends here-->` to `</body>`.
preprocess_regexps = []
#: The CSS that is used to style the templates, i.e., the navigation bars and
#: the Tables of Contents. Rather than overriding this variable, you should
#: use `extra_css` in your recipe to customize look and feel.
template_css = '''
.article_date {
color: gray; font-family: monospace;
}
.article_description {
text-indent: 0pt;
}
a.article {
font-weight: bold; text-align:left;
}
a.feed {
font-weight: bold;
}
.calibre_navbar {
font-family:monospace;
}
'''
#: By default, calibre will use a default image for the masthead (Kindle only).
#: Override this in your recipe to provide a URL to use as a masthead.
masthead_url = None
#: By default, the cover image returned by get_cover_url() will be used as
#: the cover for the periodical. Overriding this in your recipe instructs
#: calibre to render the downloaded cover into a frame whose width and height
#: are expressed as a percentage of the downloaded cover.
#: cover_margins = (10, 15, '#ffffff') pads the cover with a white margin
#: 10px on the left and right, 15px on the top and bottom.
#: Color names are defined `here <https://www.imagemagick.org/script/color.php>`_.
#: Note that for some reason, white does not always work in Windows. Use
#: #ffffff instead
cover_margins = (0, 0, '#ffffff')
#: Set to a non empty string to disable this recipe.
#: The string will be used as the disabled message
recipe_disabled = None
#: Ignore duplicates of articles that are present in more than one section.
#: A duplicate article is an article that has the same title and/or URL.
#: To ignore articles with the same title, set this to::
#:
#: ignore_duplicate_articles = {'title'}
#:
#: To use URLs instead, set it to::
#:
#: ignore_duplicate_articles = {'url'}
#:
#: To match on title or URL, set it to::
#:
#: ignore_duplicate_articles = {'title', 'url'}
ignore_duplicate_articles = None
# The following parameters control how the recipe attempts to minimize
# image sizes. Note that if compression is enabled PNG images are converted
# to JPEG.
#: Set this to False to ignore all scaling and compression parameters and
#: pass images through unmodified. If True and the other compression
#: parameters are left at their default values, images will be scaled to fit
#: in the screen dimensions set by the output profile and compressed to size at
#: most (w * h)/16 where w x h are the scaled image dimensions.
compress_news_images = False
#: The factor used when auto compressing JPEG images. If set to None,
#: auto compression is disabled. Otherwise, the images will be reduced in size to
#: (w * h)/compress_news_images_auto_size bytes if possible by reducing
#: the quality level, where w x h are the image dimensions in pixels.
#: The minimum JPEG quality will be 5/100 so it is possible this constraint
#: will not be met. This parameter can be overridden by the parameter
#: compress_news_images_max_size which provides a fixed maximum size for images.
#: Note that if you enable scale_news_images_to_device then the image will
#: first be scaled and then its quality lowered until its size is less than
#: (w * h)/factor where w and h are now the *scaled* image dimensions. In
#: other words, this compression happens after scaling.
compress_news_images_auto_size = 16
#: Set JPEG quality so images do not exceed the size given (in KBytes).
#: If set, this parameter overrides auto compression via compress_news_images_auto_size.
#: The minimum JPEG quality will be 5/100 so it is possible this constraint
#: will not be met.
compress_news_images_max_size = None
#: Rescale images to fit in the device screen dimensions set by the output profile.
#: Ignored if no output profile is set.
scale_news_images_to_device = True
#: Maximum dimensions (w,h) to scale images to. If scale_news_images_to_device is True
#: this is set to the device screen dimensions set by the output profile unless
#: there is no profile set, in which case it is left at whatever value it has been
#: assigned (default None).
scale_news_images = None
#: If set to True then links in downloaded articles that point to other downloaded articles are
#: changed to point to the downloaded copy of the article rather than its original web URL. If you
#: set this to True, you might also need to implement :meth:`canonicalize_internal_url` to work
#: with the URL scheme of your particular website.
resolve_internal_links = False
#: Specify options specific to this recipe. These will be available for the user to customize
#: in the Advanced tab of the Fetch News dialog or at the ebook-convert command line. The options
#: are specified as a dictionary mapping option name to metadata about the option. For example::
#:
#: recipe_specific_options = {
#: 'edition_date': {
#: 'short': 'The issue date to download',
#: 'long': 'Specify a date in the format YYYY-mm-dd to download the issue corresponding to that date',
#: 'default': 'current',
#: }
#: }
#:
#: When the recipe is run, self.recipe_specific_options will be a dict mapping option name to the option value
#: specified by the user. When the option is unspecified by the user, it will have the value specified by 'default'.
#: If no default is specified, the option will not be in the dict at all, when unspecified by the user.
recipe_specific_options = None
#: The simulated browser engine to use when downloading from servers. The default is to use the Python mechanize
#: browser engine, which supports logging in. However, if you don't need logging in, consider changing this
#: to either 'webengine' which uses an actual Chromium browser to do the network requests or 'qt' which
#: uses the Qt Networking backend. Both 'webengine' and 'qt' support HTTP/2, which mechanize does not and
#: are thus harder to fingerprint for bot protection services.
browser_type = 'mechanize'
#: Set to False if you do not want to use gzipped transfers with the mechanize browser.
#: Note that some old servers flake out with gzip.
handle_gzip = True
# See the built-in recipes for examples of these settings.
def short_title(self):
    ''' Return ``self.title`` after passing it through ``force_unicode`` with ``preferred_encoding``. '''
    title = self.title
    return force_unicode(title, preferred_encoding)
def is_link_wanted(self, url, tag):
    '''
    Decide whether a link should be followed: return True to follow it,
    False otherwise. The default implementation raises NotImplementedError,
    which causes the downloader to ignore this method.

    :param url: The URL to be followed
    :param tag: The tag from which the URL was derived
    '''
    raise NotImplementedError
def get_extra_css(self):
    '''
    Return the extra CSS for this recipe, which by default is simply
    `self.extra_css`. Override this method to generate the extra_css
    programmatically.
    '''
    css = self.extra_css
    return css
def get_cover_url(self):
    '''
    Return a :term:`URL` to the cover image for this issue or `None`.
    The default implementation returns the member `self.cover_url`, or
    `None` when no such member exists. To have your recipe download a cover
    for the e-book, either override this method in your subclass or set
    `self.cover_url` before this method is called.
    '''
    try:
        return self.cover_url
    except AttributeError:
        return None
def get_masthead_url(self):
    '''
    Return a :term:`URL` to the masthead image for this issue or `None`.
    The default implementation returns the member `self.masthead_url`, or
    `None` when no such member exists. To have your recipe download a
    masthead for the e-book, either override this method in your subclass or
    set `self.masthead_url` before this method is called.
    Masthead images are used in Kindle MOBI files.
    '''
    try:
        return self.masthead_url
    except AttributeError:
        return None
def get_feeds(self):
    '''
    Return the list of :term:`RSS` feeds to fetch for this profile. Each element of the
    list must be a 2-element tuple of the form (title, url). If title is None or an
    empty string, the title from the feed is used. Override this method in your
    subclass if your recipe needs to do some processing to figure out the list of
    feeds to download.

    The default implementation returns `self.feeds`, truncated to the first
    ``self.test[0]`` entries when `self.test` is set.

    :raises NotImplementedError: if `self.feeds` is empty or None.
    '''
    feeds = self.feeds
    if not feeds:
        raise NotImplementedError()
    if not self.test:
        return feeds
    limit = self.test[0]
    return feeds[:limit]
def get_url_specific_delay(self, url):
    '''
    Return the delay in seconds to wait before downloading `url`. The default
    implementation ignores `url` and returns `self.delay`. To determine the delay
    programmatically, override this method in your subclass and return
    self.delay for URLs you do not want to affect.

    :return: A floating point number, the delay in seconds.
    '''
    default_delay = self.delay
    return default_delay
@classmethod
def print_version(cls, url):
    '''
    Map the `url` of a webpage with article content to the :term:`URL` of the
    print version of that article. The default implementation does nothing
    useful: it raises NotImplementedError. For example::

        def print_version(self, url):
            return url + '?&pagewanted=print'
    '''
    raise NotImplementedError
@classmethod
def image_url_processor(cls, baseurl, url):
    '''
    Hook for processing image urls (for example removing size restrictions for
    dynamically generated images) that returns the processed URL. Returning None
    or an empty string skips fetching the image. The default implementation
    returns `url` unchanged.
    '''
    processed_url = url
    return processed_url
def preprocess_image(self, img_data, image_url):
    '''
    Hook for processing downloaded image data. It is called on the raw data
    before any resizing is done and must return the processed raw data, or
    None to skip the image. The default implementation returns `img_data`
    unchanged.
    '''
    processed = img_data
    return processed
def get_browser(self, *args, **kwargs):
    '''
    Create the browser instance used to fetch documents from the web. By default
    this is a `mechanize <https://mechanize.readthedocs.io/en/latest/>`_
    browser instance that supports cookies, ignores robots.txt, handles
    refreshes and has a random common user agent.

    When `self.browser_type` is not ``'mechanize'`` the browser class registered
    for ``'qt'`` or ``'webengine'`` is instantiated instead (any other value
    raises KeyError).

    To customize the browser override this method in your sub-class as::

        def get_browser(self, *a, **kw):
            br = super().get_browser(*a, **kw)
            # Add some headers
            br.addheaders += [
                ('My-Header', 'one'),
                ('My-Header2', 'two'),
            ]
            # Set some cookies
            br.set_cookie('name', 'value')
            br.set_cookie('name2', 'value2', domain='.mydomain.com')
            # Make a POST request with some data
            br.open('https://someurl.com', {'username': 'def', 'password': 'pwd'}).read()
            # Do a login via a simple web form (only supported with mechanize browsers)
            if self.username is not None and self.password is not None:
                br.open('https://www.nytimes.com/auth/login')
                br.select_form(name='login')
                br['USERID'] = self.username
                br['PASSWORD'] = self.password
                br.submit()
            return br
    '''
    if 'user_agent' not in kwargs:
        # More and more news sites are serving JPEG XR images to IE
        remembered = getattr(self, 'last_used_user_agent', None)
        chosen = remembered or self.calibre_most_common_ua or random_user_agent(allow_ie=False)
        # Remember the choice so later calls reuse the same user agent
        kwargs['user_agent'] = chosen
        self.last_used_user_agent = chosen
    user_agent = kwargs['user_agent']
    self.log('Using user agent:', user_agent)

    if self.browser_type == 'mechanize':
        br = browser(*args, **kwargs)
        br.addheaders += [('Accept', '*/*')]
        if self.handle_gzip:
            br.set_handle_gzip(True)
        return br

    # Only imported when a non-mechanize engine is requested
    from calibre.scraper.qt import Browser, WebEngineBrowser
    engines = {'qt': Browser, 'webengine': WebEngineBrowser}
    engine_class = engines[self.browser_type]
    return engine_class(
        user_agent=user_agent,
        verify_ssl_certificates=kwargs.get('verify_ssl_certificates', False),
    )
def clone_browser(self, br):
    '''
    Return a clone of the browser `br`. Cloned browsers are used for
    multi-threaded downloads, since mechanize is not thread safe. The
    default cloning routines should capture most browser customization, but
    if you do something exotic in your recipe, you should override this
    method in your recipe and clone manually.

    Cloned browser instances use the same, thread-safe CookieJar by
    default, unless you have customized cookie handling.

    If `br` has no callable ``clone_browser`` attribute, a new browser is
    obtained from :meth:`get_browser` instead.
    '''
    cloner = getattr(br, 'clone_browser', None)
    if not callable(cloner):
        # Uh-oh recipe using something exotic, call get_browser
        return self.get_browser()
    return br.clone_browser()
@property
def cloned_browser(self):
    '''
    A browser obtained either from ``BasicNewsRecipe.get_browser`` or by
    cloning `self.browser` with :meth:`clone_browser`. The former is used
    only when `self.get_browser` carries an ``is_base_class_implementation``
    attribute and `self.browser_type` is ``'mechanize'``.
    '''
    uses_default_browser = (
        hasattr(self.get_browser, 'is_base_class_implementation')
        and self.browser_type == 'mechanize'
    )
    if not uses_default_browser:
        return self.clone_browser(self.browser)
    # We are using the default get_browser, which means no need to clone
    return BasicNewsRecipe.get_browser(self)
def get_article_url(self, article):
    '''
    Extract the :term:`URL` that points to the content of an article.
    Override in a subclass to customize the extraction. It is called with
    `article`, an object representing a parsed article from a feed. See
    `feedparser <https://pythonhosted.org/feedparser/>`_.

    The default implementation first looks for the original link (for feeds
    syndicated via a service like FeedBurner or Pheedo), i.e. a key ending in
    ``_origlink`` whose value is an http(s) URL, and returns that if found.
    Otherwise it returns
    `article.link <https://pythonhosted.org/feedparser/reference-entry-link.html>`_,
    falling back to the ``href`` of the first entry in ``article.links``
    whose ``rel`` is ``alternate`` (or missing).
    '''
    for name in article.keys():
        if not name.endswith('_origlink'):
            continue
        candidate = article[name]
        if candidate and candidate.startswith(('http://', 'https://')):
            return candidate

    link = article.get('link', None)
    if link:
        return link

    alternates = getattr(article, 'links', None)
    if not alternates:
        return link
    for entry in alternates:
        if entry.get('rel', 'alternate') == 'alternate':
            return entry['href']
    return link
def skip_ad_pages(self, soup):
    '''
    Hook that allows the recipe to skip ad pages. It is called with the
    source of each downloaded :term:`HTML` file, before any of the cleanup
    attributes like remove_tags, keep_only_tags are applied. Note that
    preprocess_regexps will have already been applied. If the soup
    represents an ad page, return the HTML of the real page. Otherwise
    return None. The default implementation always returns None.

    `soup`: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__
    instance containing the downloaded :term:`HTML`.
    '''
    return None
def abort_article(self, msg=None):
    '''
    Abort the download of the current article by raising ``AbortArticle``.
    Call this method inside any of the preprocess methods. Useful to skip
    articles that contain inappropriate content, such as pure video articles.

    :param msg: Message for the exception; a translated default is used when it is empty or None.
    '''
    reason = msg or _('Article download aborted')
    raise AbortArticle(reason)
def preprocess_raw_html(self, raw_html, url):
    '''
    Hook called with the source of each downloaded :term:`HTML` file, before
    it is parsed into an object tree. Note that this method acts *before*
    preprocess_regexps. The default implementation returns `raw_html`
    unchanged.

    :param raw_html: A unicode string representing the raw HTML downloaded from the web.
    :param url: The URL from which the HTML was downloaded.
    :return: The processed raw_html; this must be a unicode object.
    '''
    processed = raw_html
    return processed
def preprocess_raw_html_(self, raw_html, url):
    '''
    Run :meth:`preprocess_raw_html` and then, when `self.auto_cleanup` is
    set, :meth:`extract_readable_article` on its result. If the automatic
    cleanup raises, the failure is logged and the result of
    :meth:`preprocess_raw_html` is returned instead.
    '''
    processed = self.preprocess_raw_html(raw_html, url)
    if not self.auto_cleanup:
        return processed
    try:
        return self.extract_readable_article(processed, url)
    except Exception:
        # Best effort: fall back to the HTML without automatic cleanup
        self.log.exception(f'Auto cleanup of URL: {url!r} failed')
    return processed
def preprocess_html(self, soup):
    '''
    Hook called with the source of each downloaded :term:`HTML` file, before
    it is parsed for links and images and after the cleanup as specified by
    remove_tags etc. It can be used to do arbitrarily powerful
    pre-processing on the :term:`HTML`. It should return `soup` after
    processing it; the default implementation returns it untouched.

    `soup`: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__
    instance containing the downloaded :term:`HTML`.
    '''
    processed = soup
    return processed
def postprocess_html(self, soup, first_fetch):
    '''
    Hook called with the source of each downloaded :term:`HTML` file, after
    it is parsed for links and images. It can be used to do arbitrarily
    powerful post-processing on the :term:`HTML`. It should return `soup`
    after processing it; the default implementation returns it untouched.

    :param soup: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc/>`__ instance containing the downloaded :term:`HTML`.
    :param first_fetch: True if this is the first page of an article.
    '''
    processed = soup
    return processed
def cleanup(self):
    '''
    Called after all articles have been downloaded. Use it to do any cleanup
    like logging out of subscription sites, etc. The default implementation
    does nothing.
    '''
    return None
def canonicalize_internal_url(self, url, is_link=True):
    '''
    Return a set of canonical representations of ``url``. The default
    implementation uses just the network location and path of the URL
    (with trailing slashes removed from the path), ignoring any query
    parameters, fragments, etc. The canonical representations must be unique
    across all URLs for this news source. If they are not, then internal
    links may be resolved incorrectly.

    :param url: The URL to canonicalize, as text or bytes.
    :param is_link: Is True if the URL is coming from an internal link in
                    an HTML file. False if the URL is the URL used to
                    download an article. Unused by the default implementation.
    :return: A frozenset containing one ``(netloc, path)`` tuple, or an empty
             frozenset if the URL could not be parsed (the failure is logged).
    '''
    try:
        parsed = urlparse(url)
    except Exception:
        self.log.error(f'Failed to parse url: {url!r}, ignoring')
        return frozenset()

    def as_text(component):
        # urlparse() returns bytes components for bytes input
        if isinstance(component, bytes):
            return component.decode('utf-8', 'replace')
        return component

    host = as_text(parsed.netloc)
    location = as_text(parsed.path or '')
    return frozenset({(host, location.rstrip('/'))})
def index_to_soup(self, url_or_raw, raw=False, as_tree=False, save_raw=None):
    '''
    Convenience method that takes an URL to the index page and returns
    a `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs4/doc>`__
    of it.

    :param url_or_raw: Either a URL (anything starting with ``scheme://``, which is
                       then downloaded) or the downloaded index page as a string
    :param raw: If True return the undecoded, unparsed page data
    :param as_tree: If True return the result of ``html5_parser.parse`` instead of a soup
    :param save_raw: If set, a path to which the decoded and cleaned page is written as UTF-8
    :raises RuntimeError: if downloading the URL yields no data
    '''
    url_pattern = br'\w+://' if isinstance(url_or_raw, bytes) else r'\w+://'
    if re.match(url_pattern, url_or_raw) is None:
        data = url_or_raw
    else:
        # We may be called in a thread (in the skip_ad_pages method), so
        # clone the browser to be safe. We cannot use self.cloned_browser
        # as it may or may not actually clone the browser, depending on if
        # the recipe implements get_browser() or not
        fetcher = self.clone_browser(self.browser)
        opener = getattr(fetcher, 'open_novisit', fetcher.open)
        with closing(opener(url_or_raw, timeout=self.timeout)) as response:
            data = response.read()
        if not data:
            raise RuntimeError(f'Could not fetch index from {url_or_raw}')

    if raw:
        return data

    # Apply the recipe's encoding override, if any, to undecoded data
    if not isinstance(data, str) and self.encoding:
        if callable(self.encoding):
            data = self.encoding(data)
        else:
            data = data.decode(self.encoding, 'replace')

    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(data, str):
        data = strip_encoding_declarations(data)
    else:
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    data = clean_xml_chars(data)

    if save_raw:
        with open(save_raw, 'wb') as out:
            out.write(data.encode('utf-8'))

    if as_tree:
        from html5_parser import parse
        return parse(data)
    return BeautifulSoup(data)
def extract_readable_article(self, html, url):
    '''
    Extract the main article content from ``html``, clean it up and return
    it as a complete HTML document serialized to a string.
    Based on the original readability algorithm by Arc90.

    The extracted title is used for the ``<title>`` of the wrapper document
    that is built when the extracted content is not already a full
    ``<html>`` tree. If no element in the body has the title as its text
    and the document contains no ``<h1>``/``<h2>`` elements, an ``<h2>``
    with the title is inserted at the top of the body.

    :param html: The HTML of the downloaded article.
    :param url: The URL of the article, passed on to the readability code.
    :return: The cleaned up article as a unicode string of HTML.
    '''
    from xml.sax.saxutils import escape

    from lxml.html import tostring

    from calibre.ebooks.readability import readability
    from calibre.utils.xml_parse import document_fromstring, fragment_fromstring

    doc = readability.Document(html, self.log, url=url,
                               keep_elements=self.auto_cleanup_keep)
    article_html = doc.summary()
    extracted_title = doc.title()
    # The title comes from the downloaded page, so it must be escaped
    # before being interpolated into markup; otherwise a title containing
    # '<' or '&' would corrupt (or inject elements into) the wrapper document.
    title_markup = escape(extracted_title) if extracted_title else ''

    try:
        frag = fragment_fromstring(article_html)
    except Exception:
        # Not parseable as a single fragment, parse as a whole document
        # and use its (last) body instead
        doc = document_fromstring(article_html)
        frag = doc.xpath('//body')[-1]

    if frag.tag == 'html':
        root = frag
    elif frag.tag == 'body':
        root = document_fromstring(
            f'<html><head><title>{title_markup}</title></head></html>')
        root.append(frag)
    else:
        root = document_fromstring(
            f'<html><head><title>{title_markup}</title></head><body/></html>')
        root.xpath('//body')[0].append(frag)

    body = root.xpath('//body')[0]
    has_title = any(x.text == extracted_title for x in body.iterdescendants())
    inline_titles = body.xpath('//h1|//h2')
    if not has_title and not inline_titles:
        # Nothing in the article looks like a heading, so add one
        heading = body.makeelement('h2')
        heading.text = extracted_title
        body.insert(0, heading)

    return tostring(root, encoding='unicode')
def sort_index_by(self, index, weights):
    '''
    Convenience method to sort the titles in `index` according to `weights`.
    `index` is sorted in place, in order of increasing weight. Returns `index`.

    `index`: A list of titles.

    `weights`: A dictionary that maps titles to weights. If any titles
    in index are not in weights, they are assumed to have a weight of 0.
    '''
    # Work on a private copy so the caller's mapping is never touched
    weight_of = dict(weights)

    def rank(title):
        return weight_of.get(title, 0)

    index.sort(key=rank)
    return index
def parse_index(self):
    '''
    Implement this method in recipes that parse a website instead of feeds
    to generate the list of articles. Typical uses are for news sources
    that have a "Print Edition" webpage that lists all the articles in the
    current print edition. If this function is implemented, it will be used
    in preference to :meth:`BasicNewsRecipe.parse_feeds`.

    It must return a list. Each element of the list must be a 2-element tuple
    of the form ``('feed title', list of articles)``.

    Each list of articles must contain dictionaries of the form::

        {
        'title'       : article title,
        'url'         : URL of print version,
        'date'        : The publication date of the article as a string,
        'description' : A summary of the article,
        'content'     : The full article (can be an empty string). Obsolete,
                        do not use. Instead, save the content to a temporary
                        file and pass a file:///path/to/temp/file.html as
                        the URL.
        }

    For an example, see the recipe for downloading `The Atlantic`.
    In addition, you can add 'author' for the author of the article.

    If you want to abort processing for some reason and have
    calibre show the user a simple message instead of an error, call
    :meth:`abort_recipe_processing`.

    The default implementation raises NotImplementedError.
    '''
    raise NotImplementedError
def abort_recipe_processing(self, msg):
    '''
    Abort the download of this recipe, causing the recipe download system
    to display a simple feedback message to the user.

    :param msg: The message with the details of why the download was aborted.

    Always raises ConversionUserFeedBack, whose title names this recipe.
    '''
    from calibre.ebooks.conversion import ConversionUserFeedBack
    headline = _('Failed to download %s') % self.title
    raise ConversionUserFeedBack(headline, msg)
def get_obfuscated_article(self, url):
    '''
    If you set `articles_are_obfuscated` this method is called with
    every article URL. It should return the path to a file on the filesystem
    that contains the article HTML. That file is processed by the recursive
    HTML fetching engine, so it can contain links to pages/images on the web.

    Alternately, you can return a dictionary of the form:
    {'data': <HTML data>, 'url': <the resolved URL of the article>}. This avoids
    needing to create temporary files. The `url` key in the dictionary is useful if
    the effective URL of the article is different from the URL passed into this method,
    for example, because of redirects. It can be omitted if the URL is unchanged.

    This method is typically useful for sites that try to make it difficult to
    access article content automatically.

    The default implementation raises NotImplementedError.
    '''
    raise NotImplementedError
def add_toc_thumbnail(self, article, src):
    '''
    Call this from populate_article_metadata with the src attribute of an
    <img> tag from the article that is appropriate for use as the thumbnail
    representing the article in the Table of Contents. Whether the
    thumbnail is actually used is device dependent (currently only used by
    the Kindles). Note that the referenced image must be one that was
    successfully downloaded, otherwise it will be ignored.

    :param article: The article object; nothing is done if it has no
                    ``toc_thumbnail`` attribute.
    :param src: Path of the image. Backslashes are treated as path
                separators. It must contain a
                ``feed_N/article_N/images/img...`` component sequence,
                otherwise a warning is logged and the image is ignored.

    On success ``article.toc_thumbnail`` is set to the part of ``src``
    starting at the ``feed_N`` component.
    '''
    if not src or not hasattr(article, 'toc_thumbnail'):
        return

    src = src.replace('\\', '/')
    match = re.search(r'feed_\d+/article_\d+/images/img', src, flags=re.IGNORECASE)
    if match is None:
        self.log.warn(f'Ignoring invalid TOC thumbnail image: {src!r}')
        return
    # Cut at the validated feed_N/article_N/... match rather than at the
    # first "feed" substring: a leading directory whose name merely contains
    # "feed" (e.g. /tmp/newsfeed/feed_0/...) must not end up in the result.
    # The leading "feed" is normalized to lower case.
    prefix = 'feed'
    article.toc_thumbnail = prefix + src[match.start() + len(prefix):]
def populate_article_metadata(self, article, soup, first):
    '''
    Hook that is called when each HTML page belonging to article is
    downloaded. Intended to be used to get article metadata like
    author/summary/etc. from the parsed HTML (soup). The default
    implementation does nothing.

    :param article: An object of class :class:`calibre.web.feeds.Article`.
        If you change the summary, remember to also change the text_summary
    :param soup: Parsed HTML belonging to this article
    :param first: True iff the parsed HTML is the first page of the article.
    '''
    return None
def postprocess_book(self, oeb, opts, log):
    '''
    Hook to run any needed post processing on the parsed downloaded e-book.
    The default implementation does nothing.

    :param oeb: An OEBBook object
    :param opts: Conversion options
    :param log: Presumably the logging object to report to; it is not used
        by the default implementation.
    '''
    return None
def __init__(self, options, log, progress_reporter):
'''
Initialize the recipe.
:param options: Parsed commandline options
:param log: Logging object
:param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
'''
self.log = ThreadSafeWrapper(log)
if not isinstance(self.title, str):
self.title = str(self.title, 'utf-8', 'replace')
self.debug = options.verbose > 1
self.output_dir = os.path.abspath(os.getcwd())
self.verbose = options.verbose
self.test = options.test
if self.test and not isinstance(self.test, tuple):
self.test = (2, 2)
self.username = options.username
self.password = options.password
self.lrf = options.lrf
self.output_profile = options.output_profile
self.touchscreen = getattr(self.output_profile, 'touchscreen', False)
if self.touchscreen:
self.template_css += self.output_profile.touchscreen_news_css
if self.test:
self.max_articles_per_feed = self.test[1]
self.simultaneous_downloads = min(4, self.simultaneous_downloads)
if self.debug:
self.verbose = True
self.report_progress = progress_reporter
if self.needs_subscription and (
self.username is None or self.password is None or (
not self.username and not self.password)):
if self.needs_subscription != 'optional':
raise ValueError(_('The "%s" recipe needs a username and password.')%self.title)
self.browser = self.get_browser()
self.image_map, self.image_counter = {}, 1
self.css_map = {}
web2disk_cmdline = ['web2disk',
'--timeout', str(self.timeout),
'--max-recursions', str(self.recursions),
'--delay', str(self.delay),
]
if self.verbose:
web2disk_cmdline.append('--verbose')
if self.no_stylesheets:
web2disk_cmdline.append('--dont-download-stylesheets')
for reg in self.match_regexps:
web2disk_cmdline.extend(['--match-regexp', reg])
for reg in self.filter_regexps:
web2disk_cmdline.extend(['--filter-regexp', reg])
if options.output_profile.short_name in ('default', 'tablet'):