In [None]:
import requests
from bs4 import BeautifulSoup
from abc import ABC
import re

In [None]:
class Site:
    '''A named website together with the posts collected from it.'''

    def __init__(self, name, posts):
        self.name, self.posts = name, posts

    def posts_to_str(self):
        '''Return every post rendered as text, separated by two blank lines.'''
        separator = '\n' * 3
        return separator.join(f'{entry}' for entry in self.posts)

    def filter_posts(self):
        '''
        Hook for dropping unwanted entries from ``self.posts``.

        The base implementation keeps every post; subclasses override it.
        '''

    def __repr__(self):
        # Site name first, then the rendered posts after two blank lines.
        heading = self.name
        body = self.posts_to_str()
        return f'{heading}\n\n\n{body}'

In [None]:
class Post:
    '''A single article: its title, its body text and a link to it.'''

    def __init__(self,title,content,link):
        self.title = title
        self.content = content
        self.link = link

    def title_contains(self, target):
        '''
        Return True if ``target`` occurs in the title, ignoring case.

        Both sides are lower-cased; previously only the title was, so a
        target containing any uppercase letter could never match.
        '''
        return target.lower() in self.title.lower()

    def __repr__(self):
        # Plain-text rendering: labelled title, content and link sections.
        return f'TITLE:\n\n{self.title}\n\nCONTENT:\n\n{self.content}\n\nLINK:\n\n{self.link}'

In [None]:
class Source():
    '''
    A downloadable web page that can be turned into a Site.

    Subclasses supply the page-specific logic by overriding ``parse``.
    '''

    def __init__(self,name,url):
        self.name = name
        self.url = url

    def download(self, timeout=30) -> Site:
        '''
        Fetch ``self.url``, parse it and return the filtered Site.

        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

        Raises ``requests.HTTPError`` when the server answers with a
        4xx/5xx status, so an error page is never parsed as content.
        '''
        response = requests.get(self.url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,'html.parser')

        site = self.parse(soup)
        site.filter_posts()

        return site

    def parse(self,soup) -> Site:
        '''
        Convert the parsed page into a Site. Must be implemented by a
        subclass.
        '''
        # Fail here with a clear message; returning None would only surface
        # later as an AttributeError inside download().
        raise NotImplementedError(
            f'{type(self).__name__} must implement parse()'
        )

In [None]:
class MacrumorsSite(Site):
    '''Site variant that drops giveaway and deal posts.'''

    def filter_posts(self):
        '''Remove giveaway posts first, then deal posts.'''
        self.remove_giveaways_posts()
        self.remove_deals()

    def remove_deals(self):
        '''Drop posts whose title contains "deals:".'''
        def is_deal(post):
            return post.title_contains('deals:')

        self.remove_posts(is_deal)

    def remove_giveaways_posts(self):
        '''Drop posts whose title contains "giveaway".'''
        def is_giveaway(post):
            return post.title_contains('giveaway')

        self.remove_posts(is_giveaway)

    def remove_posts(self, filter_fn):
        '''
        Rebuild ``self.posts`` without the posts for which ``filter_fn``
        returns a true value.
        '''
        self.posts = [post for post in self.posts if not filter_fn(post)]

In [None]:
class MacrumorsSource(Source):
    '''
    Source that turns a parsed MacRumors page into a MacrumorsSite.

    Titles and links are read from the elements with class ``js-article``;
    post bodies are read from the elements with class ``js-contentInner``.
    '''

    def parse(self,soup) -> MacrumorsSite:
        '''
        Build a MacrumorsSite from the parsed page.

        Titles, contents and links are collected independently and paired
        by position. zip() stops at the shortest of the three lists, so
        surplus items in the longer lists are dropped.
        '''
        contents = self.parse_content(soup)
        titles = self.parse_titles(soup)
        links = self.parse_links(soup)

        posts = [
            Post(title, content, link)
            for title, content, link in zip(titles, contents, links)
        ]

        return MacrumorsSite(self.name, posts)

    def parse_content(self,soup):
        '''Return the cleaned-up text of every ``js-contentInner`` element.'''
        content_list = soup.find_all(class_='js-contentInner')
        return [self.cleanup_content(content.get_text()) for content in content_list]

    def parse_titles(self,soup):
        '''Return the text of each article's title element.'''
        return self.get_all_titles(lambda title: title.get_text(), soup)

    def parse_links(self,soup):
        '''Return the href of the first <a> inside each title element.'''
        return self.get_all_titles(lambda title: title.find('a')['href'],soup)

    def get_all_titles(self,fn,soup):
        '''returns [fn(title)] for the title element of every ``js-article``.'''
        articles = soup.find_all(class_='js-article')
        return [fn(self.get_title(article)) for article in articles]

    def get_title(self,article):
        '''
        gets the first child element of the article.

        This is more resilient than looking for certain tags.

        Raises ValueError when the article has no child element.
        '''
        # find(True, recursive=False) returns the first direct child that is
        # a tag, skipping text nodes such as the whitespace between tags.
        # The former next(article.children) raised StopIteration on an empty
        # article; inside map() that silently ended the iteration and
        # truncated the resulting list.
        first_element = article.find(True, recursive=False)
        if first_element is None:
            raise ValueError('article has no child element to use as its title')
        return first_element

    def cleanup_content(self,content):
        '''
        Normalise the raw text of a post body.

        Paragraphs end up separated by exactly one blank line, known
        scraping artifacts are removed, and each paragraph as well as the
        whole text is stripped of surrounding whitespace.
        '''
        # corrects the content text.
        replacements = [
            ('\r ','\n'*2), #replace linebreaks
            ('  ', ' '), #fix double spaces
            ('\n','\n\n')
        ]
        for old, new in replacements:
            content = content.replace(old, new)

        # removes repetitive whitespace: 3+ newlines collapse to one blank line
        content = re.sub(r'\n{3,}','\n\n',content)

        # remove errors in the content text. The variant with the trailing
        # space must come before its prefix, otherwise the prefix is removed
        # first and the trailing space is left behind.
        errors = [
            '\u200c',
            'img.lazyload { display: none; } ',
            'img.lazyload { display: none; }'
        ]

        for e in errors:
            # removes error
            content = content.replace(e, '')

        # remove leading/trailing blankspace of every paragraph and drop
        # paragraphs that became empty (e.g. ones that held only an artifact),
        # so no run of blank lines is reintroduced.
        paragraphs = [p.strip() for p in content.split('\n'*2)]
        content = ('\n'*2).join(p for p in paragraphs if p)

        # remove trailing blankspace
        return content.strip()

In [None]:
def prevent_exit():
    '''Block until the user presses Enter.'''
    prompt = 'Press enter to exit.'
    input(prompt)

In [None]:
# Build the MacRumors source and fetch its front page. download() performs
# the HTTP request, parses the HTML and applies the site's post filters.
m = MacrumorsSource('Macrumors','https://www.macrumors.com/')
site = m.download()

In [None]:
# Print the site name followed by every remaining post, then wait for Enter
# (presumably so a console window stays open when this runs as a script).
print(site)
prevent_exit()