Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Some work on recipes (mostly Polish)
- Loading branch information
fenuks
committed
Aug 14, 2017
1 parent
cf824e5
commit 94f2927
Showing
7 changed files
with
134 additions
and
174 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,72 +1,32 @@ | ||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai | ||
from calibre.web.feeds.news import BasicNewsRecipe | ||
from calibre.ebooks.BeautifulSoup import Comment | ||
import re | ||
|
||
|
||
class FilmOrgPl(BasicNewsRecipe): | ||
title = u'Film.org.pl' | ||
__author__ = 'fenuks' | ||
description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce." # noqa | ||
description = u"Recenzje, analizy, artykuły, rankingi - wszystko o filmie dla miłośników kina. Opisy efektów specjalnych, wersji reżyserskich, remake'ów, sequeli. No i forum filmowe. Jedne z największych w Polsce." | ||
category = 'film' | ||
language = 'pl' | ||
extra_css = '.alignright {float:right; margin-left:5px;} .alignleft {float:left; margin-right:5px;} .recenzja-title {font-size: 150%; margin-top: 5px; margin-bottom: 5px;}' # noqa | ||
cover_url = 'http://film.org.pl/wp-content/themes/KMF/images/logo_kmf10.png' | ||
cover_url = 'http://film.org.pl/wp-content/uploads/2015/02/film.org.pl_film.org_.pl_kmfviolet4.png' | ||
ignore_duplicate_articles = {'title', 'url'} | ||
oldest_article = 7 | ||
max_articles_per_feed = 100 | ||
no_stylesheets = True | ||
remove_javascript = True | ||
remove_empty_feeds = True | ||
use_embedded_content = False | ||
remove_attributes = ['style'] | ||
preprocess_regexps = [ | ||
(re.compile(ur'<h3>Przeczytaj także:</h3>.*', re.IGNORECASE | re.DOTALL), lambda m: '</body>'), (re.compile(ur'</?center>', re.IGNORECASE | re.DOTALL), lambda m: ''), # noqa | ||
(re.compile(ur'<div>Artykuł</div>', re.IGNORECASE), lambda m: ''), | ||
(re.compile(ur'<div>Ludzie filmu</div>', re.IGNORECASE), lambda m: ''), | ||
(re.compile(ur'(<br ?/?>\s*?){2,}', re.IGNORECASE | re.DOTALL), lambda m: '')] | ||
keep_only_tags = [dict(name=['h11', 'h16', 'h17']), | ||
dict(attrs={'class': 'editor'})] | ||
remove_tags_after = dict(id='comments') | ||
remove_tags = [dict(name=['link', 'meta', 'style']), dict(name='img', attrs={'alt': ['Ludzie filmu', u'Artykuł']}), dict(id='comments'), dict( | ||
attrs={'style': 'border: 0pt none ; margin: 0pt; padding: 0pt;'}), dict(name='p', attrs={'class': 'rating'}), dict(attrs={'layout': 'button_count'})] | ||
feeds = [ | ||
(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), | ||
(u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), | ||
(u'Analiza', u'http://film.org.pl/a/analiza/feed/'), | ||
(u'Ranking', u'http://film.org.pl/a/ranking/feed/'), | ||
(u'Blog', u'http://film.org.pl/kmf/blog/feed/'), | ||
(u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), | ||
(u'Seriale', u'http://film.org.pl/a/seriale/feed/'), | ||
(u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), | ||
(u'VHS', u'http://film.org.pl/a/vhs-a/feed/')] | ||
use_embedded_content = True | ||
|
||
def append_page(self, soup, appendtag): | ||
tag = soup.find('div', attrs={'class': 'pagelink'}) | ||
if tag: | ||
for nexturl in tag.findAll('a'): | ||
url = nexturl['href'] | ||
soup2 = self.index_to_soup(url) | ||
pagetext = soup2.find(attrs={'class': 'editor'}) | ||
comments = pagetext.findAll( | ||
text=lambda text: isinstance(text, Comment)) | ||
for comment in comments: | ||
comment.extract() | ||
pos = len(appendtag.contents) | ||
appendtag.insert(pos, pagetext) | ||
for r in appendtag.findAll(attrs={'class': 'pagelink'}): | ||
r.extract() | ||
for r in appendtag.findAll(attrs={'id': 'comments'}): | ||
r.extract() | ||
for r in appendtag.findAll(attrs={'style': 'border: 0pt none ; margin: 0pt; padding: 0pt;'}): | ||
r.extract() | ||
for r in appendtag.findAll(attrs={'layout': 'button_count'}): | ||
r.extract() | ||
remove_attributes = ['style', 'width', 'height'] | ||
remove_tags = [dict(attrs={'class': 'shortcode-box right'})] | ||
|
||
def preprocess_html(self, soup): | ||
for c in soup.findAll('h11'): | ||
c.name = 'h1' | ||
self.append_page(soup, soup.body) | ||
for r in soup.findAll('br'): | ||
r.extract() | ||
return soup | ||
feeds = [ | ||
(u'Recenzje', u'http://film.org.pl/r/recenzje/feed/'), | ||
(u'Artyku\u0142', u'http://film.org.pl/a/artykul/feed/'), | ||
(u'Analiza', u'http://film.org.pl/a/analiza/feed/'), | ||
(u'Ranking', u'http://film.org.pl/a/ranking/feed/'), | ||
(u'Blog', u'http://film.org.pl/kmf/blog/feed/'), | ||
(u'Ludzie', u'http://film.org.pl/a/ludzie/feed/'), | ||
(u'Seriale', u'http://film.org.pl/a/seriale/feed/'), | ||
(u'Oceanarium', u'http://film.org.pl/a/ocenarium/feed/'), | ||
(u'VHS', u'http://film.org.pl/a/vhs-a/feed/'), ] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,73 +1,52 @@ | ||
import re | ||
from calibre.web.feeds.news import BasicNewsRecipe | ||
from calibre.ebooks.BeautifulSoup import BeautifulSoup | ||
import re | ||
|
||
|
||
class FilmWebPl(BasicNewsRecipe): | ||
title = u'FilmWeb' | ||
__author__ = 'fenuks' | ||
description = 'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy. Największa baza filmów, seriali i aktorów, repertuar kin i tv, ...' # noqa | ||
cover_url = 'http://gfx.filmweb.pl/n/logo-filmweb-bevel.jpg' | ||
category = 'movies' | ||
language = 'pl' | ||
title = 'FilmWeb' | ||
__author__ = 'fenuks' | ||
description = u'Filmweb.pl - Filmy takie jak Ty Filmweb to największy i najczęściej odwiedzany polski serwis filmowy.' | ||
cover_url = 'http://1.fwcdn.pl/an/867323/63321_1.11.jpg' | ||
category = 'movies' | ||
language = 'pl' | ||
index = 'http://www.filmweb.pl' | ||
oldest_article = 8 | ||
max_articles_per_feed = 100 | ||
no_stylesheets = True | ||
remove_empty_feeds = True | ||
ignore_duplicate_articles = {'title', 'url'} | ||
remove_javascript = True | ||
preprocess_regexps = [(re.compile(u'\(kliknij\,\ aby powiększyć\)', re.IGNORECASE), lambda m: ''), (re.compile( | ||
ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />')] # (re.compile(ur' | ', re.IGNORECASE), lambda m: '')] | ||
extra_css = '.hdrBig {font-size:22px;} ul {list-style-type:none; padding: 0; margin: 0;}' | ||
remove_attributes = ['style', ] | ||
keep_only_tags = [dict(attrs={'class': ['hdr hdr-super', 'newsContent']})] | ||
feeds = [(u'News / Filmy w produkcji', 'http://www.filmweb.pl/feed/news/category/filminproduction'), | ||
(u'News / Festiwale, nagrody i przeglądy', | ||
u'http://www.filmweb.pl/feed/news/category/festival'), | ||
(u'News / Seriale', u'http://www.filmweb.pl/feed/news/category/serials'), | ||
(u'News / Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), | ||
(u'News / Multimedia', | ||
u'http://www.filmweb.pl/feed/news/category/multimedia'), | ||
(u'News / Dystrybucja dvd / blu-ray', | ||
u'http://www.filmweb.pl/feed/news/category/video'), | ||
(u'News / Dystrybucja kinowa', | ||
u'http://www.filmweb.pl/feed/news/category/cinema'), | ||
(u'News / off', u'http://www.filmweb.pl/feed/news/category/off'), | ||
(u'News / Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'), | ||
(u'News / Organizacje branżowe', | ||
u'http://www.filmweb.pl/feed/news/category/organizations'), | ||
(u'News / Internet', u'http://www.filmweb.pl/feed/news/category/internet'), | ||
(u'News / Różne', u'http://www.filmweb.pl/feed/news/category/other'), | ||
(u'News / Kino polskie', | ||
u'http://www.filmweb.pl/feed/news/category/polish.cinema'), | ||
(u'News / Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), | ||
(u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), | ||
(u'Recenzje użytkowników', | ||
u'http://www.filmweb.pl/feed/user-reviews/latest') | ||
] | ||
|
||
def skip_ad_pages(self, soup): | ||
skip_tag = soup.find('a', attrs={'class': 'welcomeScreenButton'}) | ||
if skip_tag is not None: | ||
return self.index_to_soup(skip_tag['href'], raw=True) | ||
|
||
def postprocess_html(self, soup, first_fetch): | ||
for r in soup.findAll(attrs={'class': 'singlephoto'}): | ||
r['style'] = 'float:left; margin-right: 10px;' | ||
return soup | ||
use_embedded_content = False | ||
extra_css = ('.hdrBig {font-size:22px;} ul {list-style-type:none;} ' | ||
'ul.inline > li {display: inline;} ' | ||
'ul.sep-line > li + li::before {content: " | "} ' | ||
'ul.inline {padding:0px;} .vertical-align {display: inline-block;}') | ||
preprocess_regexps = [(re.compile(r'<body.+?</head>', re.DOTALL), lambda match: ''), # fix malformed HTML with 2 body tags... | ||
(re.compile(u'(?:<sup>)?\(kliknij\,\ aby powiększyć\)(?:</sup>)?', re.IGNORECASE), lambda m: ''), | ||
(re.compile(ur'(<br ?/?>\s*?<br ?/?>\s*?)+', re.IGNORECASE), lambda m: '<br />') | ||
] | ||
remove_tags = [dict(attrs={'class':['infoParent', 'likeBar', | ||
'droptions-box pull-right', 'photoDesc', 'imageLicense', 'play big', 'shadow embed__icon--svg']})] | ||
remove_attributes = ['style',] | ||
keep_only_tags = [dict(attrs={'class': ['newsHdr hdrWithAuthor ', 'reviewHdr', 'newsContent newsPage', 'newsContent']})] | ||
# remove_tags_before = dict(attrs={'class': 'hdr hdr-mega'}) | ||
# remove_tags_after = dict(attrs={'class': 'newsContent'}) | ||
feeds = [(u'Filmy', u'http://www.filmweb.pl/feed/news/category/film'), | ||
(u'Seriale', u'http://www.filmweb.pl/feed/news/category/serial'), | ||
(u'Box office', u'http://www.filmweb.pl/feed/news/category/boxoffice'), | ||
(u'Telewizja', u'http://www.filmweb.pl/feed/news/category/tv'), | ||
(u'Festiwale, nagrody i przeglądy', u'http://www.filmweb.pl/feed/news/category/festival'), | ||
(u'Multimedia', u'http://www.filmweb.pl/feed/news/category/multimedia'), | ||
(u'Dystrybucja dvd/blu-ray', u'http://www.filmweb.pl/feed/news/category/dvd'), | ||
(u'Gry wideo', u'http://www.filmweb.pl/feed/news/category/game'), | ||
(u'Różne', u'http://www.filmweb.pl/feed/news/category/other'), | ||
(u'Recenzje redakcji', u'http://www.filmweb.pl/feed/reviews/latest'), | ||
(u'Recenzje użytkowników', u'http://www.filmweb.pl/feed/user-reviews/latest') | ||
] | ||
|
||
def preprocess_html(self, soup): | ||
for a in soup('a'): | ||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: # noqa | ||
a['href'] = self.index + a['href'] # noqa | ||
for i in soup.findAll('a', attrs={'class': 'fn'}): | ||
i.insert(len(i), BeautifulSoup('<br />')) | ||
for i in soup.findAll('sup'): | ||
if not i.string or i.string.startswith('(kliknij'): | ||
i.extract() | ||
for r in soup.findAll(id=re.compile('photo-\d+')): | ||
r.extract() | ||
for r in soup.findAll(style=re.compile('float: ?left')): | ||
r['class'] = 'singlephoto' | ||
if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: | ||
a['href'] = self.index + a['href'] | ||
|
||
return soup |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from calibre.web.feeds.news import BasicNewsRecipe | ||
import re | ||
|
||
|
||
class ParisReview(BasicNewsRecipe): | ||
title = 'The Paris Review Blog' | ||
__author__ = 'fenuks' | ||
description = u'The Paris Review is a literary magazine featuring original writing, art, and in-depth interviews with famous writers.' | ||
# cover_url = '' | ||
category = 'culture' | ||
language = 'en' | ||
encoding = 'utf-8' | ||
oldest_article = 8 | ||
max_articles_per_feed = 100 | ||
no_stylesheets = True | ||
remove_empty_feeds = True | ||
ignore_duplicate_articles = {'title', 'url'} | ||
remove_javascript = True | ||
use_embedded_content = True | ||
# extra_css = '' | ||
# preprocess_regexps = [] | ||
# remove_attributes = ['style',] | ||
# keep_only_tags = [] | ||
remove_tags = [] | ||
# remove_tags_before = dict() | ||
remove_tags_after = dict() | ||
feeds = [('Posts', 'http://feeds.feedburner.com/TheParisReviewBlog')] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from calibre.web.feeds.news import BasicNewsRecipe | ||
import re | ||
|
||
|
||
class PublicDomainReview(BasicNewsRecipe): | ||
title = 'The Public Domain Review' | ||
__author__ = 'fenuks' | ||
description = u'Online journal dedicated to showcasing the most interesting and unusual out-of-copyright works available on the web' | ||
cover_url = 'http://publicdomainreview.org/wp-content/themes/pdr/assets/img/pdr-logo.gif' | ||
category = 'culture' | ||
language = 'en' | ||
encoding = 'utf-8' | ||
oldest_article = 14 | ||
max_articles_per_feed = 100 | ||
no_stylesheets = True | ||
remove_empty_feeds = True | ||
ignore_duplicate_articles = {'title', 'url'} | ||
remove_javascript = True | ||
use_embedded_content = False | ||
# extra_css = '' | ||
# preprocess_regexps = [] | ||
# remove_attributes = ['style',] | ||
keep_only_tags = [dict(name='article', attrs={'class': re.compile(r'post-\d+')})] | ||
remove_tags = [dict(attrs={'class': 'synved-social-container synved-social-container-share'})] | ||
# remove_tags_before = dict() | ||
remove_tags_after = dict(name='div', attrs={'class': 'entry-content'}) | ||
feeds = [('Posts', 'http://publicdomainreview.org/feed/')] |