In [1]:
!pip install beautifulsoup4 requests pandas



In [2]:
import urllib
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import pickle

In [3]:
def get_cna_articles():
    CNA_URL = "https://www.channelnewsasia.com/"
    # get cna main page

    page = requests.get(CNA_URL)
    soup = BeautifulSoup(page.text, "html.parser")
    # get links to different news articles (the class is for this url, will differ across websites)
    links = soup.find_all('a', attrs={'class': 'h6__link h6__link-- list-object__heading-link'})
    articles = []
    for link in links:
        article_page = requests.get(f"{CNA_URL}{link['href']}")
        article = BeautifulSoup(article_page.text, "html.parser")
        article_text = article.find_all('div', attrs={'class': 'text-long'})
        articles.append((' '.join([[' '.join([child.text for child in div.findChildren('p')])][0] for div in article_text]),
                                     f"{CNA_URL}{link['href']}"))
    
    return articles

In [4]:
# run this block to get saved data/scrape data
CNA_ARTICLES_ADDRESS = Path("cna_articles.pickle")
if CNA_ARTICLES_ADDRESS.is_file():
    with open(CNA_ARTICLES_ADDRESS, "rb") as f:
        cna_articles = pickle.load(f)

else:
    cna_articles = get_cna_articles()
    CNA_ARTICLES_ADDRESS.touch(exist_ok=True)
    with open(CNA_ARTICLES_ADDRESS, "wb") as f:
        pickle.dump(cna_articles, f)

In [5]:
CNA_ARTICLES_ADDRESS.is_file()

True

In [6]:
# scrape stackoverflow
from IPython.core.display import display, HTML
def get_stack_overflow_articles():
    STACK_OVERFLOW_URL = "https://stackoverflow.com/"
    page = requests.get(f"{STACK_OVERFLOW_URL}questions?tab=Frequent")
    soup = BeautifulSoup(page.text, "html.parser")
    # get links to different news articles (the class is for this url, will differ across websites)
    links = soup.find_all("a", "question-hyperlink")
    posts = []
    for idx, link in enumerate(links,1):
        article_site = requests.get(f"{STACK_OVERFLOW_URL}{link['href']}")
        article_page = BeautifulSoup(article_site.text, "html.parser")
        parent = article_page.find("div", "s-prose js-post-body")
        p_tags = parent.findChildren("p")
        post = []
        for p_tag in p_tags:
            post.append(p_tag.text)
            display(HTML(str(p_tag)))
        posts.append((" ".join(post), f"{STACK_OVERFLOW_URL}{link['href']}"))
        if idx == 50: break
    return posts

In [7]:
# run this block to get saved data/scrape data
STACK_OVERFLOW_ARTICLES_ADDRESS = Path("stack_overflow_articles.pickle")
if STACK_OVERFLOW_ARTICLES_ADDRESS.is_file():
    with open(STACK_OVERFLOW_ARTICLES_ADDRESS, "rb") as f:
        stack_overflow_articles = pickle.load(f)

else:
    stack_overflow_articles = get_stack_overflow_articles()
    STACK_OVERFLOW_ARTICLES_ADDRESS.touch(exist_ok=True)
    with open(STACK_OVERFLOW_ARTICLES_ADDRESS, "wb") as f:
        pickle.dump(stack_overflow_articles, f)

In [8]:
stack_overflow_articles

[('When discussing performance with colleagues, teaching, sending a bug report or searching for guidance on mailing lists and here on Stack\xa0Overflow, a reproducible example is often asked and always helpful. What are your tips for creating an excellent example? How do you paste data structures from r in a text format? What other information should you include? Are there other tricks in addition to using dput(), dump() or structure()?  When should you include library() or require() statements?  Which reserved words should one avoid, in addition to c, df, data, etc.? How does one make a great r reproducible example?',
  'https://stackoverflow.com//questions/5963269/how-to-make-a-great-r-reproducible-example'),
 ('What are Null Pointer Exceptions (java.lang.NullPointerException) and what causes them? What methods/tools can be used to determine the cause so that you stop the exception from causing the program to terminate prematurely?',
  'https://stackoverflow.com//questions/218384/wha

In [9]:
# scrape hardwarezone
def get_hwz_articles():
    HWZ_URL = "https://forums.hardwarezone.com.sg/"
    page = requests.get(HWZ_URL)
    soup = BeautifulSoup(page.text, "html.parser")
    # get links to different news articles (the class is for this url, will differ across websites)
    divs = soup.find_all("div", "structItem-title")
    content = []
    for div in divs:
        forum = BeautifulSoup(requests.get(f"{HWZ_URL}{div.find('a')['href']}").text, "html.parser")
#         parent = article_page.find("div", "s-prose js-post-body")
        posts = [comment.text for comment in forum.find_all("div", "bbWrapper")]
        content.append((posts, f"{HWZ_URL}{div.find('a')['href']}"))
    return content

In [10]:
# run this block to get saved data/scrape data
HWZ_ARTICLES_PATH = Path("hwz_articles.pickle")
if HWZ_ARTICLES_PATH.is_file():
    with open(HWZ_ARTICLES_PATH, "rb") as f:
        hwz_articles = pickle.load(f)

else:
    hwz_articles = get_hwz_articles()
    HWZ_ARTICLES_PATH.touch(exist_ok=True)
    with open(HWZ_ARTICLES_PATH, "wb") as f:
        pickle.dump(hwz_articles, f)

In [11]:
# dicussion of writing styles
import random
random.seed(200)

cna_sample = random.sample(cna_articles, 2)
stackoverflow_sample = random.sample(stack_overflow_articles, 2)
hwz_sample = random.sample(hwz_articles, 2)


In [12]:
# is the first word in a sentence capitalized; do sentences follow good grammars; are the proper
# nouns capitalized;

for samples in [cna_sample, stackoverflow_sample, hwz_sample]:
    (sample1, sample1_url), (sample2, sample2_url) = samples
    print(f"sample1: {sample1}\n"
          f"sample1 url: {sample1_url}\n"
          f"sample1: {sample2}\n"
          f"sample1 url: {sample2_url}\n")

MELBOURNE :     Australia have renewed belief after finishing runner-up in the Rugby Championship on the back of four straight wins but they must keep "emptying the tank" on the field to get the country behind them, coach Dave Rennie said. The Wallabies recovered from a 3-0 whitewash by New Zealand to beat South Africa and Argentina twice each in the Rugby Championship and bring momentum into their northern hemisphere tour for tests against Japan, England, Scotland and Wales. "You win a couple of tests and things change a bit but we also know things can swing the other way," Rennie told Australian broadcaster Channel Nine. "We honestly believe if we want to get the country behind us we need consistent high-quality performances and they want to see our boys emptying the tank week after week and if they do that people will respect us regardless of the result. "We've made a start but it's something that's a big focus for us." Rennie, in his second year in charge, said the Wallabies had ma

In [14]:
hwz_sample[0]

(['',
  'The news anchor got those sibeh act smart kind of bin. Sibeh er xin to watch. Wheres julie yoo sia.',
  'fug the expert\n\nCNA PRESENTER IS SUPER KY',
  "knn can Teoh Yik Yin comment on our healthcare situation?\n\nclose to collapsing still don't want leepork daily case...\n\n\nSent from EDMWER app!",
  'The presenter is KY  ',
  '\n\nredorangeyellow said:\n\n\n\n\t\t\tThe news anchor got those sibeh act smart kind of bin. Sibeh er xin to watch. Wheres julie yoo sia.\n\t\t\nClick to expand...\n\nyup called the sexpert "prof" like damn chummy with him',
  'No difference from a keyboard warrior',
  '\n\nredorangeyellow said:\n\n\n\n\t\t\tThe news anchor got those sibeh act smart kind of bin. Sibeh er xin to watch. Wheres julie yoo sia.\n\t\t\nClick to expand...\n\nagree. they copy american news anchor style. i hate to watch.',
  'only Angela is keyi',
  'Teo Yik Ying. One of the beasts who contributed to daily deaths and numbers.',
  'this gal is keyi!!!!!',
  'Wtf ish he talkin