-
-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
gazeta-prawna-calibre-v1.recipe
112 lines (94 loc) · 4.44 KB
/
gazeta-prawna-calibre-v1.recipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2020, Tomasz Jozwiak <tjozwiakgm@gmail.com>'
__author__ = u'Tomasz Jozwiak'
'''
gazetaprawna.pl
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class gazetaprawna(BasicNewsRecipe):
    """Calibre news recipe for the Polish daily Gazeta Prawna (gazetaprawna.pl).

    Articles are taken from the RSS feeds listed in ``feeds``.  The tag
    filters below keep the article container and discard what, judging by
    the CSS class names, are share buttons, paywall boxes, galleries, video
    players and the comment forum.
    """

    # --- Recipe identity / metadata -------------------------------------
    version = 2
    title = u'Gazeta Prawna'
    __author__ = u'Tomasz Jozwiak'
    publisher = u'Infor Biznes'
    category = 'newspaper'
    publication_type = 'newspaper'
    description = 'Polski dziennik gospodarczy'
    language = 'pl'
    encoding = 'utf-8'

    # --- Download behaviour ---------------------------------------------
    max_articles_per_feed = 30
    # Per calibre's recipe API this limit is expressed in days.
    oldest_article = 1
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    # The same story can show up in several feeds; de-duplicate on both keys.
    ignore_duplicate_articles = {'title', 'url'}
    # Feed entries only carry a summary, so the linked page is downloaded.
    use_embedded_content = False

    # Styling for the site-specific classes that survive the tag filters.
    extra_css = '''
        .psavBigImgTitle {font-size:50%;}
        .psavImgContent {font-size:50%;}
        .leadDiv {font-weight: bold;}
        .date {font-size:50%;}
        .articleGate {font-style: italic; font-weight: normal; font-size:50%;}
    '''

    # --- HTML clean-up filters ------------------------------------------
    remove_tags_before = [
        dict(name='div', attrs={'class': ['article']}),
        dict(name='div', attrs={'itemprop': ['breadcrumb']})
    ]
    remove_tags_after = [
        dict(name='div', attrs={'class': ['articleBody', 'artPayWall', 'contentGalBottom', 'komentarze-forum']}),
    ]
    remove_tags = [
        dict(name='span', attrs={'class': ['psav_bigphoto', 'psav_speclinkarea', 'psav_video_target']}),
        dict(name='div', attrs={'class': ['shareArticleButtons nowe2', 'artPayWall', 'contentGalBottom', 'contentGalTop', 'video-target', 'komentarze-forum']}),
        dict(name=['link', 'meta', 'style']),
        dict(name='div', attrs={'itemprop': ['breadcrumb']}),
        dict(name='section', attrs={'class': ['videoSection']})
    ]

    # --- Sources --------------------------------------------------------
    feeds = [
        (u'Z ostatniej chwili', u'http://rss.gazetaprawna.pl/GazetaPrawna'),
        (u'Biznes i prawo gospodarcze', u'http://rss.gazetaprawna.pl/GazetaPrawna-Biznes'),
        (u'Prawo i wymiar sprawiedliwo\u015bci', u'http://rss.gazetaprawna.pl/GazetaPrawna-Prawo'),
        (u'Praca i ubezpieczenia', u'http://rss.gazetaprawna.pl/GazetaPrawna-Praca'),
        (u'Podatki i rachunkowo\u015b\u0107', u'http://rss.gazetaprawna.pl/GazetaPrawna-Podatki'),
        (u'Finanse - waluty i notowania', u'http://rss.gazetaprawna.pl/GazetaPrawna-Finanse'),
    ]

    # Matches the opening CDATA marker and the closing one.  The trailing
    # ``>`` is optional so that both ``]]>`` and a bare ``]]`` are removed
    # and no stray ``>`` is left behind in the summary.
    _CDATA_MARKERS = re.compile(r'<!\[CDATA\[|\]\]>?')

    def parse_feeds(self):
        """Parse the RSS feeds and strip CDATA markers from article summaries.

        Delegates to ``BasicNewsRecipe.parse_feeds`` and then rewrites both
        ``text_summary`` and ``summary`` of every article with the cleaned
        plain-text summary.

        Returns:
            The feed list returned by ``BasicNewsRecipe.parse_feeds``, with
            the article summaries modified in place.
        """
        self.log(_('Gazeta Prawna overrode parse_feeds()'))
        parsed_feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in parsed_feeds:
            for article in feed:
                # ``or ''`` keeps the substitution safe for a missing summary.
                cleaned = self._CDATA_MARKERS.sub('', article.text_summary or '')
                article.text_summary = cleaned
                article.summary = cleaned
        return parsed_feeds

    def preprocess_html(self, soup):
        """Fix up the downloaded article markup before conversion.

        Args:
            soup: Parsed article page (BeautifulSoup tree).

        Returns:
            The same ``soup`` object, modified in place.
        """
        # Lead images keep their real URL in ``data-src`` (presumably lazy
        # loading); promote it to ``src`` so the image is actually fetched.
        for big_img in soup.findAll(name='div', attrs={'class': ['psavBigImg']}):
            for img_tag in big_img.findAll(name='img', attrs={'data-src': True}):
                img_tag['src'] = img_tag['data-src']
                del img_tag['data-src']

        # Append a space to every span holding more than one character so
        # adjacent spans do not run together once the markup is flattened.
        for span in soup.findAll(name='span'):
            if len(self.tag_to_string(span)) > 1:
                span.append(" ")

        # Tell the reader where to go for articles cut short by the gate.
        for locked in soup.findAll(name='div', attrs={'class': ['articleGate']}):
            locked.append(u"Przejd\u017a do artyku\u0142u na GazetaPrawna.pl aby zalogowa\u0107 si\u0119 lub wykupi\u0107 dost\u0119p")
        return soup

    def populate_article_metadata(self, article, soup, first):
        """Attach a TOC thumbnail, set the author and shorten long titles.

        Args:
            article: Article object whose ``author`` and ``title`` are updated.
            soup: Parsed article page used to locate the lead image.
            first: Unused here; part of the hook signature.
        """
        big_img = soup.find(name='div', attrs={'class': ['psavBigImg']})
        if big_img:
            img_tag = big_img.find(name='img', attrs={'src': True})
            if img_tag:
                self.add_toc_thumbnail(article, img_tag['src'])
                self.log(_('adding thumbnail: %s to Article') % (img_tag['src']))

        article.author = 'Gazeta prawna.pl'

        if len(article.title) > 80:
            # Cut to 80 characters, then drop the last (possibly partial)
            # word.  ``rsplit`` yields an empty list when the prefix is only
            # whitespace, so guard before indexing.
            pieces = article.title[:80].rsplit(None, 1)
            if pieces:
                article.title = pieces[0]
                self.log(_('The title cuting in %s to keep the thumbnail visible') % (article.url))

    def get_cover_url(self):
        """Return the URL of the current issue's cover image, if available.

        The cover is scraped from the e-edition page on egazety.pl.  When the
        expected ``<a class="image cover-preview"><img src=...>`` markup is
        not present, the method falls back to any ``cover_url`` already set
        on the recipe (or ``None``) instead of raising.

        Returns:
            The cover image URL, or the pre-existing ``cover_url`` / ``None``
            when no cover could be located.
        """
        soup = self.index_to_soup(
            'http://www.egazety.pl/infor/e-wydanie-dziennik-gazeta-prawna.html')
        anchor = soup.find("a", {"class": "image cover-preview"})
        img_tag = anchor.find('img') if anchor is not None else None
        src = img_tag.get('src') if img_tag is not None else None
        if not src:
            self.log('Gazeta Prawna: cover image not found on egazety.pl')
            return getattr(self, 'cover_url', None)
        self.cover_url = src
        return self.cover_url