Commit 5d136ad: Update New Yorker
kovidgoyal committed Jul 27, 2014
1 parent 124310d commit 5d136ad
Showing 1 changed file with 19 additions and 243 deletions.
262 changes: 19 additions & 243 deletions recipes/new_yorker.recipe
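The diff below drops the recipe's hand-rolled scraper for m.newyorker.com (section crawling, URL filtering, summary extraction) in favour of calibre's auto_cleanup heuristics plus a single parse_index pass over the magazine index page. For orientation, the general shape of that pattern is sketched here; this is a minimal illustration, not the committed code, and the index URL and link class are hypothetical:

    from calibre.web.feeds.news import BasicNewsRecipe

    class MinimalRecipe(BasicNewsRecipe):
        title = 'Example'
        auto_cleanup = True  # let calibre's readability heuristics strip page chrome

        def parse_index(self):
            # Return a list of (section title, list of article dicts)
            soup = self.index_to_soup('http://example.com/latest')  # hypothetical URL
            articles = []
            for a in soup.findAll('a', attrs={'class': 'headline'}):  # hypothetical class
                articles.append({'title': self.tag_to_string(a), 'url': a['href'],
                                 'date': '', 'description': ''})
            return [('Articles', articles)]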
@@ -1,22 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'

'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup

class NewYorker(BasicNewsRecipe):

    title = u'New Yorker Magazine'
    newyorker_prefix = 'http://m.newyorker.com'
    description = u'Content from the New Yorker website'
    fp_tag = 'CAN_TC'

    masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'

@@ -27,254 +19,38 @@ class NewYorker(BasicNewsRecipe):

    url_list = []
    language = 'en'
-   __author__ = 'Nick Redding'
+   __author__ = 'Krittika Goyal'
    no_stylesheets = True
    auto_cleanup = True
    timefmt = ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
        .byline { font-size:xx-small; font-weight: bold;}
        h3 { margin-bottom: 6px; }
        .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
    '''
    keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]

    remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]

    def get_cover_url(self):
        # Fall back to a default (1925) cover if the magazine page yields nothing
        cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
        soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine')
        cover_item = soup.find('div',attrs={'id':'media-count-1'})
        if cover_item:
            cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
        return cover_url

    def fixChars(self,string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)
        # Replace rsquo entity
        fixed = re.sub("&#x2019;","’",fixed)
        return fixed

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&' with '&#38;'
            massaged = re.sub("&","&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def populate_article_metadata(self, article, soup, first):
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta',attrs={'property':'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']
        shortparagraph = ""
        ## try:
        if len(article.text_summary.strip()) == 0:
            articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
            if articlebodies:
                for articlebody in articlebodies:
                    if articlebody:
                        paras = articlebody.findAll('p')
                        for p in paras:
                            refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
                            # account for blank paragraphs and short paragraphs by appending them to longer ones
                            if len(refparagraph) > 0:
                                if len(refparagraph) > 70:  # approximately one line of text
                                    newpara = shortparagraph + refparagraph
                                    article.summary = article.text_summary = newpara.strip()
                                    return
                                else:
                                    shortparagraph = refparagraph + " "
                                    if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
                                        shortparagraph = shortparagraph + "- "
        else:
            article.summary = article.text_summary = self.massageNCXText(article.text_summary)
        ## except:
        ##     self.log("Error creating article descriptions")
        ##     return


    def strip_anchors(self,soup):
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

    def preprocess_html(self,soup):
        dateline = soup.find('div','published')
        byline = soup.find('div','byline')
        title = soup.find('h1','entry-title')
        if title is None:
            return self.strip_anchors(soup)
        if byline is None:
            title.append(dateline)
            return self.strip_anchors(soup)
        byline.append(dateline)
        return self.strip_anchors(soup)

    def load_global_nav(self,soup):
        seclist = []
        ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
        if ul is not None:
            for li in ul.findAll('li'):
                if li.a is not None:
                    securl = li.a['href']
                    if securl != '/' and securl != '/magazine' and securl.startswith('/'):
                        seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
        return seclist

    def exclude_url(self,url):
        # Skip duplicates, non-article pages and known non-convertible content
        if url in self.url_list:
            return True
        if not url.endswith('html'):
            return True
        if 'goings-on-about-town-app' in url:
            return True
        if 'something-to-be-thankful-for' in url:
            return True
        if '/shouts/' in url:
            return True
        if 'out-loud' in url:
            return True
        if '/rss/' in url:
            return True
        if '/video-' in url:
            return True
        self.url_list.append(url)
        return False


    def load_index_page(self,soup):
        # Collect articles from the rotator features and the h3 headers on a section page
        article_list = []
        for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
            h2 = div.h2
            if h2 is not None:
                a = h2.a
                if a is not None:
                    url = a['href']
                    if not self.exclude_url(url):
                        if url.startswith('/'):
                            url = self.newyorker_prefix+url
                        byline = h2.span
                        if byline is not None:
                            author = self.tag_to_string(byline)
                            if author.startswith('by '):
                                author = author.replace('by ','')
                            byline.extract()
                        else:
                            author = ''
                        if h2.br is not None:
                            h2.br.replaceWith(' ')
                        title = self.tag_to_string(h2)
                        desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
                        if desc is not None:
                            description = self.tag_to_string(desc)
                        else:
                            description = ''
                        article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
            ul = div.find('ul','feature-blurb-links')
            if ul is not None:
                for li in ul.findAll('li'):
                    a = li.a
                    if a is not None:
                        url = a['href']
                        if not self.exclude_url(url):
                            if url.startswith('/'):
                                url = self.newyorker_prefix+url
                            if a.br is not None:
                                a.br.replaceWith(' ')
                            title = '>>'+self.tag_to_string(a)
                            article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
        for h3 in soup.findAll('h3','header'):
            a = h3.a
            if a is not None:
                url = a['href']
                if not self.exclude_url(url):
                    if url.startswith('/'):
                        url = self.newyorker_prefix+url
                    byline = h3.span
                    if byline is not None:
                        author = self.tag_to_string(byline)
                        if author.startswith('by '):
                            author = author.replace('by ','')
                        byline.extract()
                    else:
                        author = ''
                    if h3.br is not None:
                        h3.br.replaceWith(' ')
                    title = self.tag_to_string(h3).strip()
                    article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
        return article_list

    def load_global_section(self,securl):
        article_list = []
        try:
            soup = self.index_to_soup(securl)
        except:
            return article_list
        if '/blogs/' not in securl:
            return self.load_index_page(soup)
        for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
            h3 = div.h3
            if h3 is not None:
                a = h3.a
                if a is not None:
                    url = a['href']
                    if not self.exclude_url(url):
                        if url.startswith('/'):
                            url = self.newyorker_prefix+url
                        if h3.br is not None:
                            h3.br.replaceWith(' ')
                        title = self.tag_to_string(h3)
                        article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
        return article_list

    def filter_ans(self, ans):
        total_article_count = 0
        idx = 0
        idx_max = len(ans)-1
        while idx <= idx_max:
            if True:  # self.verbose
                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])))
            for article in ans[idx][1]:
                total_article_count += 1
                if True:  # self.verbose
                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
                             article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
            idx = idx+1
        self.log("Queued %d articles" % total_article_count)
        return ans


-    def parse_index(self):
-        ans = []
-        try:
-            soup = self.index_to_soup(self.newyorker_prefix)
-        except:
-            return ans
-        seclist = self.load_global_nav(soup)
-        ans.append(('Front Page',self.load_index_page(soup)))
-        for (sectitle,securl) in seclist:
-            ans.append((sectitle,self.load_global_section(securl)))
-        return self.filter_ans(ans)

+    def parse_index(self):
+        soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine')
+        articles = []
+        for story in soup.findAll('article', attrs={'itemtype':'http://schema.org/NewsArticle'}):
+            h2 = story.find('h2')
+            url = h2.find('a', href=True)['href']
+            a = h2.find('a')
+            title = self.tag_to_string(a)
+            desc = ''
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
+            articles.append({'title':title, 'url':url, 'date':'',
+                'description':desc})
+
+        return [('Current Issue', articles)]
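To try the updated recipe locally, the file can be passed straight to ebook-convert; --test limits the download to a handful of articles and -vv prints verbose progress:

    ebook-convert recipes/new_yorker.recipe new_yorker.epub --test -vv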
