Update Focus (PL)

kovidgoyal · Oct 4, 2013 · 9a9daa7 · 9a9daa7
1 parent 002886b
commit 9a9daa7
Showing 1 changed file with 43 additions and 77 deletions.
diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe
@@ -1,85 +1,51 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
+from calibre.web.feeds.recipes import BasicNewsRecipe
 
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
 
-class FocusRecipe(BasicNewsRecipe):
+class NYTimes(BasicNewsRecipe):
 
-    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
+    title = 'Focus'
+    __author__ = 'Krittika Goyal'
     language = 'pl'
-    version = 1
+    description = 'Polish scientific monthly magazine'
+    timefmt = ' [%d %b, %Y]'
+    needs_subscription = False
 
-    title = u'Focus'
-    publisher = u'Gruner + Jahr Polska'
-    category = u'News'
-    description = u'Focus.pl - pierwszy w Polsce portal społecznościowy dla miłośników nauki. Tematyka: nauka, historia, cywilizacja, technika, przyroda, sport, gadżety'
-    category = 'magazine'
-    cover_url = ''
-    remove_empty_feeds = True
     no_stylesheets = True
-    oldest_article = 7
-    max_articles_per_feed = 100000
-    recursions = 0
-
-    no_stylesheets = True
-    remove_javascript = True
-    encoding = 'utf-8'
-    # Seems to work best, but YMMV
-    simultaneous_downloads = 5
-
-    r = re.compile('.*(?P<url>http:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
-    keep_only_tags = []
-    keep_only_tags.append(dict(name='div', attrs={'id': 'cll'}))
-
-    remove_tags = []
-    remove_tags.append(dict(name='div', attrs={'class': 'ulm noprint'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'txb'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'h2'}))
-    remove_tags.append(dict(name='ul', attrs={'class': 'txu'}))
-    remove_tags.append(dict(name='div', attrs={'class': 'ulc'}))
-
-    extra_css = '''
-                    body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
-                    h1{text-align: left;}
-                    h2{font-size: medium; font-weight: bold;}
-                    p.lead {font-weight: bold; text-align: left;}
-                    .authordate {font-size: small; color: #696969;}
-                    .fot{font-size: x-small; color: #666666;}
-                    '''
-
-    feeds = [
-        ('Nauka', 'http://www.focus.pl/nauka/rss/'),
-        ('Historia', 'http://www.focus.pl/historia/rss/'),
-        ('Cywilizacja', 'http://www.focus.pl/cywilizacja/rss/'),
-        ('Sport', 'http://www.focus.pl/sport/rss/'),
-        ('Technika', 'http://www.focus.pl/technika/rss/'),
-        ('Przyroda', 'http://www.focus.pl/przyroda/rss/'),
-        ('Technologie', 'http://www.focus.pl/gadzety/rss/')
+    keep_only_tags = dict(name='article', attrs={'class': 'content'})
+    remove_tags_after = dict(name='div', attrs={'class': 'inner_article'})
+    remove_tags = [
+        dict(name='div', attrs={'class': ['social_btns']}),
     ]
 
-    def skip_ad_pages(self, soup):
-        if ('advertisement' in soup.find('title').string.lower()):
-            href = soup.find('a').get('href')
-            return self.index_to_soup(href, raw=True)
-        else:
-            return None
-
-    def get_cover_url(self):
-        soup = self.index_to_soup('http://www.focus.pl/magazyn/')
-        tag = soup.find(name='div', attrs={'class': 'clr fl'})
-        if tag:
-            self.cover_url = 'http://www.focus.pl/' + tag.a['href']
-            return getattr(self, 'cover_url', self.cover_url)
-
-    def print_version(self, url):
-        if url.count('focus.pl.feedsportal.com'):
-            u = url.find('focus0Bpl')
-            u = 'http://www.focus.pl/' + url[u + 11:]
-            u = u.replace('0C', '/')
-            u = u.replace('A', '')
-            u = u.replace('0E', '-')
-            u = u.replace('/nc/1//story01.htm', '/do-druku/1')
-        else:
-            u = url.replace('/nc/1', '/do-druku/1')
-        return u
+    def nejm_get_index(self):
+            """Return ``self.index_to_soup('http://www.focus.pl/')``, the page used as the article TOC."""
+            return self.index_to_soup('http://www.focus.pl/')
+
+    # To parse article toc
+    def parse_index(self):
+        """Build the single 'Focus Articles' feed from <h1> links in div#wrapper.
+
+        Returns [(section_title, articles)]; raises ValueError if the wrapper is missing.
+        """
+        soup = self.nejm_get_index()
+        toc = soup.find('div', id='wrapper')
+        if toc is None:
+            raise ValueError('div#wrapper not found on http://www.focus.pl/')
+
+        articles = []
+        for heading in toc.findAll('h1'):
+            a = heading.find('a')
+            if a is None:
+                continue
+            title = self.tag_to_string(a)
+            url = a.get('href', False)
+            if not url or not title:
+                continue
+            if not url.startswith(('http://', 'https://')):
+                # Relative href: anchor it on the Focus host; absolute links pass through.
+                url = 'http://www.focus.pl/' + url.lstrip('/')
+            self.log('\t\tFound article:', title)
+            self.log('\t\t\t', url)
+            articles.append({'title': title, 'url': url,
+                             'description': '', 'date': ''})
+        return [('Focus Articles', articles)]