Update economist_espresso.recipe #2293

Merged · 2 commits · May 26, 2024
Changes from all commits
recipes/economist_espresso.recipe (279 changes: 239 additions & 40 deletions)

'''
https://www.economist.com/the-world-in-brief
'''

import json
import time
from urllib.parse import quote, urlencode

from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse
from lxml import etree


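# DOM-building helper: create a child element, set its text and attributes,
# append it to `parent` and return it.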
def E(parent, name, text='', **attrs):
ans = parent.makeelement(name, **attrs)
ans.text = text
parent.append(ans)
return ans


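# Recursively convert a node of the Economist's JSON content tree to HTML:
# 'tag' nodes become child elements, 'text' nodes become element text or the
# tail of the preceding sibling, with HTML entities decoded.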
def process_node(node, html_parent):
ntype = node.get('type')
if ntype == 'tag':
c = html_parent.makeelement(node['name'])
c.attrib.update({k: v or '' for k, v in node.get('attribs', {}).items()})
html_parent.append(c)
for nc in node.get('children', ()):
process_node(nc, c)
elif ntype == 'text':
text = node.get('data')
if text:
text = replace_entities(text)
if len(html_parent):
t = html_parent[-1]
t.tail = (t.tail or '') + text
else:
html_parent.text = (html_parent.text or '') + text


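# Chained dict lookup that never raises: a missing key yields {}, so e.g.
# safe_dict(data, 'image', 'main', 'url') is safe at every level.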
def safe_dict(data, *names):
ans = data
for x in names:
ans = ans.get(x) or {}
return ans


class JSONHasNoContent(ValueError):
pass


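# Build the article DOM from the Economist's GraphQL JSON payload: fly title,
# headline (with the canonical URL stashed in the <h1> title attribute for
# populate_article_metadata below), rubric, byline, main image and body text.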
def load_article_from_json(raw, root):
# open('/t/raw.json', 'w').write(raw)
data = json.loads(raw)
body = root.xpath('//body')[0]
article = E(body, 'article')
    E(article, 'div', data['flyTitle'], style='color: red; font-size:small; font-weight:bold;')
E(article, 'h1', data['title'], title=safe_dict(data, "url", "canonical") or '')
E(article, 'div', data['rubric'], style='font-style: italic; color:#202020;')
E(article, 'div', data['byline'], style='font-style: italic; color:#202020;')
main_image_url = safe_dict(data, 'image', 'main', 'url').get('canonical')
if main_image_url:
div = E(article, 'div')
try:
E(div, 'img', src=main_image_url)
except Exception:
pass
for node in data.get('text') or ():
process_node(node, article)

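
# Reduce a fetched web page to its <main> element, retagged as <article>,
# with inline styles and <button> elements stripped.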
def cleanup_html_article(root):
main = root.xpath('//main')[0]
body = root.xpath('//body')[0]
for child in tuple(body):
body.remove(child)
body.append(main)
main.set('id', '')
main.tag = 'article'
for x in root.xpath('//*[@style]'):
x.set('style', '')
for x in root.xpath('//button'):
x.getparent().remove(x)


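# Local stand-in for calibre's `classes` helper: matches tags whose class
# attribute intersects the given space-separated set of class names.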
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={
'class': lambda x: x and frozenset(x.split()).intersection(q)})


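# Construct a tag for the current BeautifulSoup: prefer soup.new_tag() when
# the soup provides it, otherwise fall back to the Tag constructor.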
def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
return Tag(soup, name, attrs=attrs or None)


class NoArticles(Exception):
pass


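# Economist links are often root-relative; make them absolute.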
def process_url(url):
if url.startswith('/'):
url = 'https://www.economist.com' + url
return url


class Espresso(BasicNewsRecipe):
title = 'The Economist Espresso'
language = 'en'
__author__ = 'unkn0wn'
encoding = 'utf-8'
masthead_url = 'https://www.livemint.com/lm-img/dev/economist-logo-oneline.png'
cover_url = 'https://downloadr2.apkmirror.com/wp-content/uploads/2021/10/75/615777cc6611b.png'
description = (
'Espresso is a rich, full-flavoured shot of daily global analysis'
' from the editors of The Economist to get you up to speed, fast.'
        ' Maximise your understanding of the most significant business, '
'economic, political and cultural developments globally.'
)

remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
dict(attrs={'aria-label': "Article Teaser"}),
dict(attrs={
'class': [
'dblClkTrk', 'ec-article-info', 'share_inline_header',
'related-items', 'main-content-container', 'ec-topic-widget',
'teaser', 'blog-post__bottom-panel-bottom', 'blog-post__comments-label',
'blog-post__foot-note', 'blog-post__sharebar', 'blog-post__bottom-panel',
'newsletter-form','share-links-header','teaser--wrapped', 'latest-updates-panel__container',
'latest-updates-panel__article-link','blog-post__section'
]
}
),
dict(attrs={
'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
classes(
'share-links-header teaser--wrapped latest-updates-panel__container'
' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
)
]
keep_only_tags = [dict(name='article', id=lambda x: not x)]
no_stylesheets = True
remove_attributes = ['data-reactid', 'width', 'height']

def get_browser(self, *args, **kwargs):
        # Needed to bypass Cloudflare
kwargs['user_agent'] = 'common_words/based'
br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
br.addheaders += [('Accept-Language', 'en-GB,en-US;q=0.9,en;q=0.8')]
return br

def economist_return_index(self, ans):
if not ans:
raise NoArticles(
'Could not find any articles, either the '
'economist.com server is having trouble and you should '
'try later or the website format has changed and the '
'recipe needs to be updated.'
)
return ans

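    # Fetch the briefing straight from the Economist's GraphQL gateway: the
    # EspressoQuery document below asks for the newest daily briefing and its
    # article parts; urlencode(..., safe='()!', quote_via=quote) keeps the
    # parentheses and '!' characters of the query text unescaped.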
def parse_index(self):
# return self.economist_test_article()
# url = 'https://www.economist.com/weeklyedition/archive'
query = {
'query': 'query EspressoQuery($ref:String!){espresso:canonical(ref:$ref){...EspressoFragment __typename}}fragment EspressoFragment on Content{id type hasPart(size:1 sort:"datePublished:desc"){parts{id type rubric:description hasPart(sort:"publication.context.position:asc,datePublished:desc"){parts{...ArticleFragment __typename}__typename}__typename}__typename}__typename}fragment ArticleFragment on Content{ad{grapeshot{channels{name __typename}__typename}__typename}articleSection{internal{id title:headline __typename}__typename}audio{main{id duration(format:"seconds")source:channel{id __typename}url{canonical __typename}__typename}__typename}byline dateline dateModified datePublished dateRevised flyTitle:subheadline id image{...ImageInlineFragment ...ImageMainFragment ...ImagePromoFragment __typename}print{title:headline flyTitle:subheadline rubric:description section{id title:headline __typename}__typename}publication{id tegID title:headline flyTitle:subheadline datePublished regionsAllowed url{canonical __typename}__typename}rubric:description source:channel{id __typename}tegID text(format:"json")title:headline type url{canonical __typename}topic contentIdentity{forceAppWebview mediaType articleType __typename}__typename}fragment ImageInlineFragment on Media{inline{url{canonical __typename}width height __typename}__typename}fragment ImageMainFragment on Media{main{url{canonical __typename}width height __typename}__typename}fragment ImagePromoFragment on Media{promo{url{canonical __typename}id width height __typename}__typename}', # noqa
'operationName': 'EspressoQuery',
'variables': '{"ref":"/content/jakj5ed3rml75i8j0d5i74p8adf6eem4"}',
}
url = 'https://cp2-graphql-gateway.p.aws.economist.com/graphql?' + urlencode(query, safe='()!', quote_via=quote)
try:
raw = self.index_to_soup(url, raw=True)
except Exception:
raise ValueError('Server is not reachable, try again after some time.')
ans = self.economist_parse_index(raw)
return self.economist_return_index(ans)

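    # The GraphQL response embeds each article as JSON, so every part is
    # dumped into its own temporary .html file and handed to calibre as a
    # file:// URL; preprocess_raw_html() below rebuilds HTML from that JSON.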
def economist_parse_index(self, raw):
data = json.loads(raw)['data']['espresso']['hasPart']['parts'][0]
self.description = data['rubric']

ans = []
for part in safe_dict(data, "hasPart", "parts"):
title = safe_dict(part, "title")
pt = PersistentTemporaryFile('.html')
pt.write(json.dumps(part).encode('utf-8'))
pt.close()
url = 'file:///' + pt.name
ans.append({"title": title, "url": url})
return [('The world in brief', ans)]

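    # load_article_from_json() stashes the canonical economist.com URL in the
    # <h1> title attribute; recover it here for the article metadata.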
def populate_article_metadata(self, article, soup, first):
article.url = soup.find('h1')['title']

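    # 'raw' is the JSON written by economist_parse_index(), not real HTML:
    # build a skeleton document, fill it from the JSON, then repair lazy-loaded
    # images and restyle small caps, headings, captions and quotes.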
def preprocess_raw_html(self, raw, url):
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
body = '<html><body><article></article></body></html>'
root = parse(body)
load_article_from_json(raw, root)

for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript'))
if noscript and noscript[0].text:
img = list(parse(noscript[0].text).iter('img'))
if img:
p = noscript[0].getparent()
idx = p.index(noscript[0])
p.insert(idx, p.makeelement('img', src=img[0].get('src')))
p.remove(noscript[0])
for x in root.xpath('//*[name()="script" or name()="style" or name()="source" or name()="meta"]'):
x.getparent().remove(x)
# the economist uses <small> for small caps with a custom font
for init in root.xpath('//span[@data-caps="initial"]'):
init.set('style', 'font-weight:bold;')
for x in root.xpath('//small'):
if x.text and len(x) == 0:
x.text = x.text.upper()
x.tag = 'span'
x.set('style', 'font-variant: small-caps')
for h2 in root.xpath('//h2'):
h2.tag = 'h4'
for x in root.xpath('//figcaption'):
x.set('style', 'text-align:center; font-size:small;')
for x in root.xpath('//cite'):
x.tag = 'blockquote'
x.set('style', 'color:#404040;')
raw = etree.tostring(root, encoding='unicode')
return raw


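    # Older Economist markup wraps captioned images in <table> tags; yield
    # only the tables that are really figures (exactly one <img> plus one or
    # two <font> captions) so postprocess_html() can flatten them to <div>s.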
def eco_find_image_tables(self, soup):
for x in soup.findAll('table', align=['right', 'center']):
if len(x.findAll('font')) in (1, 2) and len(x.findAll('img')) == 1:
yield x

def postprocess_html(self, soup, first):
for img in soup.findAll('img', srcset=True):
del img['srcset']
for table in list(self.eco_find_image_tables(soup)):
caption = table.find('font')
img = table.find('img')
div = new_tag(soup, 'div')
div['style'] = 'text-align:left;font-size:70%'
ns = NavigableString(self.tag_to_string(caption))
div.insert(0, ns)
div.insert(1, new_tag(soup, 'br'))
del img['width']
del img['height']
img.extract()
div.insert(2, img)
table.replaceWith(div)
return soup

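    # Treat '/print' variants as the same article when resolving internal links.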
def canonicalize_internal_url(self, url, is_link=True):
if url.endswith('/print'):
url = url.rpartition('/')[0]
return BasicNewsRecipe.canonicalize_internal_url(self, url, is_link=is_link)


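# Manual test stub: running this recipe file directly merely echoes the
# supplied arguments; no login is performed anywhere in the recipe.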
def get_login_cookies(username, password):
print(33333333333, username, password)


if __name__ == '__main__':
import sys
get_login_cookies(sys.argv[-2], sys.argv[-1])