In [1]:
import requests
from bs4 import BeautifulSoup
from abc import ABC

In [14]:
class Source():
    '''A named web page that can be downloaded and parsed.

    Subclasses override parse() with the scraping logic for one website.
    '''

    def __init__(self,name,url):
        '''Remember the display name and the URL that download() fetches.'''
        self.name = name
        self.url = url

    def download(self, timeout=10):
        '''Fetch self.url and return the result of parse() on its HTML.

        timeout -- seconds handed to requests.get; without it a server that
                   never answers would block the caller indefinitely.

        An HTTP error status (4xx/5xx) raises requests.HTTPError instead of
        letting the error page be parsed as if it were real content.
        '''
        response = requests.get(self.url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,'html.parser')

        return self.parse(soup)

    def parse(self,soup):
        '''to be implemented by a subclass'''
        # Fail loudly: returning None here would make download() hand back
        # None with no hint that the subclass forgot to implement parse().
        raise NotImplementedError(
            f'{type(self).__name__} must implement parse(soup)'
        )

In [3]:
class MacrumorsSource(Source):
    '''Source whose parse() builds a Site of Posts from the fetched page.'''

    def parse(self,soup):
        '''Return a Site named self.name holding one Post per article.

        Titles, contents and links are collected separately and paired by
        position; zip() stops at the shortest of the three lists.
        '''
        contents = self.parse_content(soup)
        titles = self.parse_titles(soup)
        links = self.parse_links(soup)

        posts_raw = zip(titles,contents,links)
        posts = [Post(*post_data) for post_data in posts_raw]

        return Site(self.name, posts)

    def parse_content(self,soup):
        '''Return the cleaned text of every element with class "content_inner".'''
        content_list = soup.find_all(class_='content_inner')
        content_list = [self.cleanup_content(content.get_text()) for content in content_list]
        return content_list

    def parse_titles(self,soup):
        '''Return the text of every element with class "title".'''
        titles = soup.find_all(class_='title')
        return [title.get_text() for title in titles]

    def parse_links(self,soup):
        '''Return the href of the first <a> inside each "title" element.

        The last "title" element is deliberately left out.
        '''
        titles = soup.find_all(class_='title')
        # NOTE(review): presumably the final "title" match is not an article
        # heading (no <a> inside) - confirm against the live page markup.
        links = [title.find('a')['href'] for title in titles[:-1]]
        return links

    def cleanup_content(self,content):
        '''Return content with whitespace normalised and known artefacts removed.

        content -- raw text as produced by get_text().
        '''
        # corrects the content text.
        replacements = [
            ('\r ','\n'*2), #replace linebreaks
            ('  ', ' ') #fix double spaces
        ]
        for r in replacements:
            content = content.replace(*r)

        # remove errors in the content text
        # The variant with the trailing space must come first: if the shorter
        # string were removed first, the longer one could never match and the
        # trailing space would be left behind in the text.
        errors = [
            '\u200c',
            'img.lazyload { display: none; } ',
            'img.lazyload { display: none; }'
        ]
        for e in errors:
            # removes error
            content = content.replace(e, '')

        # remove leading blankspace
        paragraphs = content.split('\n'*2)
        paragraphs = [p.strip() for p in paragraphs]
        content = ('\n'*2).join(paragraphs)

        return content

In [4]:
class Site:
    '''Named container for the posts collected from one source.'''

    def __init__(self,name,posts):
        self.name, self.posts = name, posts

    def posts_to_str(self):
        '''Return every post formatted as text, separated by two blank lines.'''
        rendered = (format(post) for post in self.posts)
        return '\n\n\n'.join(rendered)

    def __repr__(self):
        #stub
        # Site name first, then the posts, with the same two-blank-line gap.
        return '\n\n\n'.join((format(self.name), self.posts_to_str()))

In [5]:
class Post:
    '''One article: its title, its body text and the link to it.'''

    def __init__(self,title,content,link):
        self.title, self.content, self.link = title, content, link

    def __repr__(self):
        #stub
        # Each section is a label, a blank line, then the value; sections are
        # themselves separated by a blank line.
        sections = (
            ('TITLE:', self.title),
            ('CONTENT:', self.content),
            ('LINK:', self.link),
        )
        return '\n\n'.join(f'{label}\n\n{value}' for label, value in sections)

In [13]:
def prevent_exit():
    '''Wait for the user to press enter before returning.'''
    prompt = 'Press enter to exit.'
    input(prompt)

In [11]:
# Create the Macrumors source and fetch its page; `site` holds whatever
# download() returns (the result of the source's parse()).
m = MacrumorsSource('Macrumors','https://www.macrumors.com/')
site = m.download()

In [12]:
# Print the text form of the downloaded site, then wait for the user to
# press enter before the cell finishes.
print(site)
prevent_exit()

Macrumors


TITLE:

Testing Samsung's New $1,380 Galaxy Z Flip Foldable Smartphone

CONTENT:

Samsung last week unveiled the Galaxy Z Flip, which began shipping out over the weekend. We managed to get our hands on one of the new foldable smartphones, and thought we'd check it out to see how it compares to the Galaxy Fold and how foldable smartphone technology is progressing.

Subscribe to the MacRumors YouTube channel for more videos. The Galaxy Z Flip is the followup to Samsung's original Galaxy Fold, which did not receive stellar reviews because it felt more like a prototype than an actual smartphone worth purchasing. The Galaxy Fold was a smartphone that unfolded into a tablet, but the Galaxy Z Flip is a smartphone that folds down to become more compact.

Like the flip phones of yore, the Galaxy Z Flip folds in half top over bottom, compressing down into a little pocketable square. It's thick, like two smartphones stacked on top of each other, in fact, but some people are going to p