In [1]:
import requests
from bs4 import BeautifulSoup
from abc import ABC
import re

In [5]:
class Source():
    '''A named web page that can be downloaded and parsed into a Site.

    The site-specific extraction lives in parse(), which subclasses override.
    '''

    def __init__(self,name,url):
        '''Store the display name and the URL that download() fetches.'''
        self.name = name
        self.url = url

    # The return annotations are strings on purpose: Site is defined in a later
    # cell, and a bare name here is evaluated while this class is being defined,
    # which raises NameError on a fresh top-to-bottom run.
    def download(self, timeout=30) -> 'Site':
        '''Fetch self.url and return the result of parse() on its HTML.

        timeout: seconds handed to requests.get so a stalled server cannot
            block the call forever.

        Raises whatever requests raises for connection problems or timeouts,
        and an HTTP error for 4xx/5xx responses (via raise_for_status), so an
        error page is never parsed as if it were the real content.
        '''
        response = requests.get(self.url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,'html.parser')

        return self.parse(soup)

    def parse(self,soup) -> 'Site':
        '''Turn the parsed document into a Site; to be implemented by a subclass.'''
        # Failing loudly beats silently handing None back to download()'s caller.
        raise NotImplementedError(
            f'{type(self).__name__} must implement parse()'
        )

In [None]:
class MacrumorsSource(Source):
    '''Source whose parse() extracts posts (title, content, link) from a page.'''

    # Fragments that leak into the extracted text and are deleted outright.
    # '\u200c' is the zero-width non-joiner. The lazyload rule is listed with
    # its trailing space first so that the space is removed together with the
    # rule instead of being left behind by the shorter match.
    _ARTIFACTS = (
        '\u200c',
        'img.lazyload { display: none; } ',
        'img.lazyload { display: none; }',
    )

    def parse(self,soup):
        '''Build a Site named self.name from the posts found in soup.'''
        titles = self.parse_titles(soup)
        contents = self.parse_content(soup)
        links = self.parse_links(soup)

        # The three lists are paired purely by position, and zip stops at the
        # shortest one.
        # NOTE(review): this assumes every 'js-article' has exactly one
        # 'js-contentInner' in the same document order - confirm against the page.
        posts = [Post(title, content, link)
                 for title, content, link in zip(titles, contents, links)]

        return Site(self.name, posts)

    def parse_content(self,soup):
        '''Return the cleaned text of every element with class js-contentInner.'''
        return [self.cleanup_content(node.get_text())
                for node in soup.find_all(class_='js-contentInner')]

    def parse_titles(self,soup):
        '''Return one title string per js-article ('' when it has no <h2>).'''
        return [heading.get_text() if heading is not None else ''
                for heading in self._headings(soup)]

    def parse_links(self,soup):
        '''Return one href per js-article (None when heading, <a> or href is missing).'''
        links = []
        for heading in self._headings(soup):
            anchor = heading.find('a') if heading is not None else None
            links.append(anchor.get('href') if anchor is not None else None)
        return links

    def get_title(self,article):
        '''Return the first <h2> inside article, or None if there is none.'''
        return article.find('h2')

    def _headings(self,soup):
        '''Return get_title() of every js-article, keeping None placeholders.

        Placeholders keep titles and links the same length and in the same
        order, so they stay aligned with each other.
        '''
        return [self.get_title(article)
                for article in soup.find_all(class_='js-article')]

    def cleanup_content(self,content):
        '''Normalise extracted article text into paragraphs separated by blank lines.

        Steps: drop known artefacts, treat "\\r " and every run of newlines as
        a paragraph break, collapse runs of spaces to one space, strip each
        paragraph and discard the ones that end up empty.
        '''
        # Remove artefacts first: doing it after whitespace handling could
        # leave doubled spaces or empty paragraphs behind.
        for artifact in self._ARTIFACTS:
            content = content.replace(artifact, '')

        # A carriage return followed by a space marks a line break in the raw text.
        content = content.replace('\r ', '\n')

        paragraphs = []
        for raw in re.split(r'\n+', content):
            # Collapse any run of spaces, not only exact pairs.
            paragraph = re.sub(r' {2,}', ' ', raw).strip()
            if paragraph:
                paragraphs.append(paragraph)

        return ('\n'*2).join(paragraphs)

In [3]:
class Site:
    '''A named collection of posts.'''

    # Blank-line gap placed after the name and between consecutive posts.
    _GAP = '\n\n\n'

    def __init__(self,name,posts):
        '''Keep the site name and its iterable of posts.'''
        self.name = name
        self.posts = posts

    def __repr__(self):
        '''Return the name, a gap, then all posts rendered by posts_to_str().'''
        #stub
        rendered_posts = self.posts_to_str()
        return '{}{}{}'.format(self.name, self._GAP, rendered_posts)

    def posts_to_str(self):
        '''Return every post formatted as text, separated by the gap.'''
        return self._GAP.join(map(format, self.posts))

In [4]:
class Post:
    '''One post: a title, its text content and a link.'''

    def __init__(self,title,content,link):
        '''Store the three fields unchanged.'''
        self.title, self.content, self.link = title, content, link

    def __repr__(self):
        '''Return labelled TITLE / CONTENT / LINK sections separated by blank lines.'''
        #stub
        sections = (
            'TITLE:', format(self.title),
            'CONTENT:', format(self.content),
            'LINK:', format(self.link),
        )
        return '\n\n'.join(sections)

In [None]:
def prevent_exit():
    '''Pause until the user presses Enter.

    Returns None. When standard input is closed or already exhausted there is
    nobody to wait for, so the function returns immediately instead of
    letting EOFError escape.
    '''
    try:
        input('Press enter to exit.')
    except EOFError:
        # input() signals end-of-input this way when no line can be read.
        pass

In [None]:
# Describe the page to scrape: display name plus the URL to fetch.
m = MacrumorsSource('Macrumors','https://www.macrumors.com/')
# Performs the HTTP request and parses the response into a Site.
site = m.download()

In [None]:
# Site defines no __str__, so print() shows its __repr__: the name followed by every post.
print(site)
# Wait for the user to press Enter before the cell finishes.
prevent_exit()