Skip to content

Commit

Permalink
Update Mediapart
Browse files Browse the repository at this point in the history
  • Loading branch information
kovidgoyal committed Sep 9, 2020
1 parent a45c2fa commit 91f95e5
Showing 1 changed file with 16 additions and 27 deletions.
43 changes: 16 additions & 27 deletions recipes/mediapart.recipe
Expand Up @@ -14,10 +14,16 @@ from calibre.web.feeds import feeds_from_index
from datetime import date, timedelta


def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a whitespace-separated string of class names. The
    returned dict has a single ``'attrs'`` key whose ``'class'`` entry is a
    predicate: given an element's ``class`` attribute value it returns a
    truthy value when the element carries at least one of the requested
    class names, and a falsy value otherwise (including when the attribute
    is missing or empty).
    """
    # split() with no argument tolerates repeated spaces, tabs and newlines,
    # matching how the element's own class string is tokenised below.
    wanted = frozenset(classes.split())

    def has_wanted_class(x):
        # ``x`` is None/'' for elements without a class attribute; the
        # short-circuit keeps that falsy value as the result.
        return x and frozenset(x.split()).intersection(wanted)

    return dict(attrs={'class': has_wanted_class})


class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
__author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
description = 'Global news in french from news site Mediapart'
description = 'Global news in French from news site Mediapart'
publication_type = 'newspaper'
language = 'fr'
needs_subscription = True
Expand All @@ -26,6 +32,15 @@ class Mediapart(BasicNewsRecipe):
use_embedded_content = False
no_stylesheets = True

# Tag specs for the parts of an article page to retain: any <h1>, any <div>
# whose class list contains 'author', and any element whose class list
# contains 'introduction' or 'content-article'.
# NOTE(review): presumably consumed by BasicNewsRecipe to discard everything
# else on the page -- confirm against the calibre recipe API.
keep_only_tags = [
dict(name='h1'),
dict(name='div', **classes('author')),
classes('introduction content-article')
]
# Tag specs for elements to drop: anything whose class list contains
# 'login-subscribe' or 'print-source_url'.
remove_tags = [
classes('login-subscribe print-source_url')
]

cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

# --
Expand Down Expand Up @@ -116,8 +131,6 @@ class Mediapart(BasicNewsRecipe):

conversion_options = {'smarten_punctuation': True}

remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

# Locale-independent date parsing (strptime("%d %b %Y", s) would only work
# when the process runs under a French locale)
def parse_french_date(self, date_str):
Expand All @@ -127,21 +140,6 @@ class Mediapart(BasicNewsRecipe):
month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

def print_version(self, url):
    """Return the URL of the print version of the article at ``url``.

    The article page is fetched with ``self.index_to_soup`` and searched
    for an ``<li class="print">`` element containing an ``<a>`` whose
    ``href`` matches ``/print/...``. That href is appended to
    ``https://mediapart.fr``.

    Returns ``None`` when the page has no print toolbar entry or no print
    link, instead of failing with an AttributeError/TypeError on the
    missing element.
    """
    soup = self.index_to_soup(url)

    tools = soup.find('li', {'class': 'print'})
    if tools is None:
        return None

    link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
    if link is None:
        return None

    # The href is site-relative, so prefix the host to get an absolute URL.
    return 'https://mediapart.fr' + link['href']

# -- Handle login
def get_browser(self):
def is_form_login(form):
Expand All @@ -154,12 +152,3 @@ class Mediapart(BasicNewsRecipe):
br['password'] = self.password
br.submit()
return br

# Workaround for articles with embedded Scribd content, which include
# <body></body> tags _within_ the document body.
# Each entry is a (compiled pattern, replacement callable) pair. The pattern
# captures the opening <body ...> tag as group 1 and, because (.*) is greedy
# under DOTALL, everything up to the last </body> as group 2. The callable
# rebuilds the document with every bare <body> / </body> tag stripped from
# group 2, leaving a single outer body element.
# NOTE(review): presumably applied by the recipe framework to the downloaded
# HTML before parsing -- confirm against the calibre recipe API.
preprocess_regexps = [
(re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
lambda match:
match.group(1) + re.sub(
re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '', match.group(2)) + '</body>')
]

0 comments on commit 91f95e5

Please sign in to comment.