diff --git a/recipes/new_yorker.recipe b/recipes/new_yorker.recipe
index 93a231792c74..862428fbc0d6 100644
--- a/recipes/new_yorker.recipe
+++ b/recipes/new_yorker.recipe
@@ -1,22 +1,14 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
 __license__ = 'GPL v3'
-'''
-www.canada.com
-'''
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-
 class NewYorker(BasicNewsRecipe):
-
     title = u'New Yorker Magazine'
     newyorker_prefix = 'http://m.newyorker.com'
     description = u'Content from the New Yorker website'
-    fp_tag = 'CAN_TC'
 
     masthead_url = 'http://www.newyorker.com/images/elements/print/newyorker_printlogo.gif'
@@ -27,8 +19,9 @@ class NewYorker(BasicNewsRecipe):
     url_list = []
     language = 'en'
-    __author__ = 'Nick Redding'
+    __author__ = 'Krittika Goyal'
     no_stylesheets = True
+    auto_cleanup = True
     timefmt = ' [%b %d]'
     encoding = 'utf-8'
     extra_css = '''
@@ -36,245 +29,28 @@ class NewYorker(BasicNewsRecipe):
                 h3 { margin-bottom: 6px; }
                 .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
                 '''
-    keep_only_tags = [dict(name='div', attrs={'id':re.compile('pagebody')})]
-
-    remove_tags = [{'class':'socialUtils'},{'class':'entry-keywords'}]
 
     def get_cover_url(self):
         cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
         soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine')
         cover_item = soup.find('div',attrs={'id':'media-count-1'})
         if cover_item:
-            cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
+            cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
         return cover_url
 
-    def fixChars(self,string):
-        # Replace lsquo (\x91)
-        fixed = re.sub("\x91","‘",string)
-        # Replace rsquo (\x92)
-        fixed = re.sub("\x92","’",fixed)
-        # Replace ldquo (\x93)
-        fixed = re.sub("\x93","“",fixed)
-        # Replace rdquo (\x94)
-        fixed = re.sub("\x94","”",fixed)
-        # Replace ndash (\x96)
-        fixed = re.sub("\x96","–",fixed)
-        # Replace mdash (\x97)
-        fixed = re.sub("\x97","—",fixed)
-        fixed = re.sub("&#x2019;","’",fixed)
-        return fixed
-
-    def massageNCXText(self, description):
-        # Kindle TOC descriptions won't render certain characters
-        if description:
-            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&amp;' with '&'
-            massaged = re.sub("&amp;","&", massaged)
-            return self.fixChars(massaged)
-        else:
-            return description
-
-    def populate_article_metadata(self, article, soup, first):
-        if first:
-            picdiv = soup.find('body').find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
-        xtitle = article.text_summary.strip()
-        if len(xtitle) == 0:
-            desc = soup.find('meta',attrs={'property':'og:description'})
-            if desc is not None:
-                article.summary = article.text_summary = desc['content']
-        shortparagraph = ""
-##        try:
-        if len(article.text_summary.strip()) == 0:
-            articlebodies = soup.findAll('div',attrs={'class':'entry-content'})
-            if articlebodies:
-                for articlebody in articlebodies:
-                    if articlebody:
-                        paras = articlebody.findAll('p')
-                        for p in paras:
-                            refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
-                            #account for blank paragraphs and short paragraphs by appending them to longer ones
-                            if len(refparagraph) > 0:
-                                if len(refparagraph) > 70: #approximately one line of text
-                                    newpara = shortparagraph + refparagraph
-                                    article.summary = article.text_summary = newpara.strip()
-                                    return
-                                else:
-                                    shortparagraph = refparagraph + " "
-                                    if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
-                                        shortparagraph = shortparagraph + "- "
-        else:
-            article.summary = article.text_summary = self.massageNCXText(article.text_summary)
-##        except:
-##            self.log("Error creating article descriptions")
-##            return
-
-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
-
-    def preprocess_html(self,soup):
-        dateline = soup.find('div','published')
-        byline = soup.find('div','byline')
-        title = soup.find('h1','entry-title')
-        if title is None:
-            return self.strip_anchors(soup)
-        if byline is None:
-            title.append(dateline)
-            return self.strip_anchors(soup)
-        byline.append(dateline)
-        return self.strip_anchors(soup)
-
-    def load_global_nav(self,soup):
-        seclist = []
-        ul = soup.find('ul',attrs={'id':re.compile('global-nav-menu')})
-        if ul is not None:
-            for li in ul.findAll('li'):
-                if li.a is not None:
-                    securl = li.a['href']
-                    if securl != '/' and securl != '/magazine' and securl.startswith('/'):
-                        seclist.append((self.tag_to_string(li.a),self.newyorker_prefix+securl))
-        return seclist
-
-    def exclude_url(self,url):
-        if url in self.url_list:
-            return True
-        if not url.endswith('html'):
-            return True
-        if 'goings-on-about-town-app' in url:
-            return True
-        if 'something-to-be-thankful-for' in url:
-            return True
-        if '/shouts/' in url:
-            return True
-        if 'out-loud' in url:
-            return True
-        if '/rss/' in url:
-            return True
-        if '/video-' in url:
-            return True
-        self.url_list.append(url)
-        return False
-
-    def load_index_page(self,soup):
-        article_list = []
-        for div in soup.findAll('div',attrs={'class':re.compile('^rotator')}):
-            h2 = div.h2
-            if h2 is not None:
-                a = h2.a
-                if a is not None:
-                    url = a['href']
-                    if not self.exclude_url(url):
-                        if url.startswith('/'):
-                            url = self.newyorker_prefix+url
-                        byline = h2.span
-                        if byline is not None:
-                            author = self.tag_to_string(byline)
-                            if author.startswith('by '):
-                                author.replace('by ','')
-                            byline.extract()
-                        else:
-                            author = ''
-                        if h2.br is not None:
-                            h2.br.replaceWith(' ')
-                        title = self.tag_to_string(h2)
-                        desc = div.find(attrs={'class':['rotator-ad-body','feature-blurb-text']})
-                        if desc is not None:
-                            description = self.tag_to_string(desc)
-                        else:
-                            description = ''
-                        article_list.append(dict(title=title,url=url,date='',description=description,author=author,content=''))
-            ul = div.find('ul','feature-blurb-links')
-            if ul is not None:
-                for li in ul.findAll('li'):
-                    a = li.a
-                    if a is not None:
-                        url = a['href']
-                        if not self.exclude_url(url):
-                            if url.startswith('/'):
-                                url = self.newyorker_prefix+url
-                            if a.br is not None:
-                                a.br.replaceWith(' ')
-                            title = '>>'+self.tag_to_string(a)
-                            article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
-        for h3 in soup.findAll('h3','header'):
-            a = h3.a
-            if a is not None:
-                url = a['href']
-                if not self.exclude_url(url):
-                    if url.startswith('/'):
-                        url = self.newyorker_prefix+url
-                    byline = h3.span
-                    if byline is not None:
-                        author = self.tag_to_string(byline)
-                        if author.startswith('by '):
-                            author = author.replace('by ','')
-                        byline.extract()
-                    else:
-                        author = ''
-                    if h3.br is not None:
-                        h3.br.replaceWith(' ')
-                    title = self.tag_to_string(h3).strip()
-                    article_list.append(dict(title=title,url=url,date='',description='',author=author,content=''))
-        return article_list
-
-    def load_global_section(self,securl):
-        article_list = []
-        try:
-            soup = self.index_to_soup(securl)
-        except:
-            return article_list
-        if '/blogs/' not in securl:
-            return self.load_index_page(soup)
-        for div in soup.findAll('div',attrs={'id':re.compile('^entry')}):
-            h3 = div.h3
-            if h3 is not None:
-                a = h3.a
-                if a is not None:
-                    url = a['href']
-                    if not self.exclude_url(url):
-                        if url.startswith('/'):
-                            url = self.newyorker_prefix+url
-                        if h3.br is not None:
-                            h3.br.replaceWith(' ')
-                        title = self.tag_to_string(h3)
-                        article_list.append(dict(title=title,url=url,date='',description='',author='',content=''))
-        return article_list
-
-    def filter_ans(self, ans) :
-        total_article_count = 0
-        idx = 0
-        idx_max = len(ans)-1
-        while idx <= idx_max:
-            if True: #self.verbose
-                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
-            for article in ans[idx][1]:
-                total_article_count += 1
-                if True: #self.verbose
-                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
-                              article['url'].replace('http://m.newyorker.com','').encode('cp1252','replace')))
-            idx = idx+1
-        self.log( "Queued %d articles" % total_article_count )
-        return ans
-
     def parse_index(self):
-        ans = []
-        try:
-            soup = self.index_to_soup(self.newyorker_prefix)
-        except:
-            return ans
-        seclist = self.load_global_nav(soup)
-        ans.append(('Front Page',self.load_index_page(soup)))
-        for (sectitle,securl) in seclist:
-            ans.append((sectitle,self.load_global_section(securl)))
-        return self.filter_ans(ans)
-
+        soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine')
+        articles = []
+        for story in soup.findAll('article', attrs={'itemtype':'http://schema.org/NewsArticle'}):
+            h2 = story.find('h2')
+            a = h2.find('a', href=True)
+            url = a['href']
+            title = self.tag_to_string(a)
+            desc = ''
+            self.log('Found article:', title)
+            self.log('\t', url)
+            self.log('\t', desc)
+            articles.append({'title':title, 'url':url, 'date':'',
+                'description':desc})
+
+        return [('Current Issue', articles)]
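
Note: with auto_cleanup = True the recipe no longer needs the hand-rolled preprocess_html/strip_anchors machinery, so the only custom logic left is get_cover_url and the parse_index pass over the schema.org NewsArticle markup on the magazine page. A quick way to smoke-test a recipe change like this is calibre's documented recipe workflow (the output name is whatever you choose; --test limits the fetch to a couple of articles, -vv prints what gets downloaded):

    ebook-convert new_yorker.recipe .epub --test -vv

If no 'Found article:' lines appear in the log, the site markup has likely changed again and the findAll selector in parse_index needs updating.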