# Scraping Data with Python


In [2]:
import requests
from bs4 import BeautifulSoup

## BeautifulSoup Basics

In [3]:
# Fetch the Hacker News front page and parse it.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error response into an exception
# instead of letting an error page be parsed as if it were the story list.
res = requests.get('https://news.ycombinator.com/news', timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# Things to try (uncomment one at a time):
# print(soup.body)  # returns the body of the page
# print(soup.body.contents)  # returns contents in body as list

# print(soup.find_all('div'))  # returns all the divs as a list
# print(soup.find(id="score_20514755"))  # find element by id

In [4]:
# Grab elements via CSS selector: everything with class "score".
score_elements = soup.select('.score')
print(score_elements)

[<span class="score" id="score_23666950">357 points</span>, <span class="score" id="score_23667688">55 points</span>, <span class="score" id="score_23668626">21 points</span>, <span class="score" id="score_23668507">5 points</span>, <span class="score" id="score_23666999">190 points</span>, <span class="score" id="score_23668110">111 points</span>, <span class="score" id="score_23664067">852 points</span>, <span class="score" id="score_23667908">19 points</span>, <span class="score" id="score_23662540">78 points</span>, <span class="score" id="score_23660123">44 points</span>, <span class="score" id="score_23664900">116 points</span>, <span class="score" id="score_23661278">286 points</span>, <span class="score" id="score_23661160">90 points</span>, <span class="score" id="score_23662443">109 points</span>, <span class="score" id="score_23666364">80 points</span>, <span class="score" id="score_23665731">70 points</span>, <span class="score" id="score_23667675">78 points</span>, <span c

In [5]:
# Collect the score elements and the story links as two separate lists.
votes = soup.select('.score')
links = soup.select('.storylink')

# Show the first score element as a quick sanity check.
print(votes[0])

<span class="score" id="score_23666950">357 points</span>


In [6]:
# Read an attribute off a tag with .get(); here, the id of the first score element.
first_score = votes[0]
first_score.get('id')

'score_23666950'

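We can chain these calls — for example, calling `.select()` again on an element returned by an earlier `.select()` — to build a custom list of stories: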

In [7]:
import pprint

# Per-story subtext rows (searched for the score later) and the story links.
hn_subtext = soup.select('.subtext')
hn_links = soup.select('.storylink')


def create_custom_hn(links, subtext, min_votes=100):
    """Build a list of high-scoring stories from parallel link/subtext lists.

    Args:
        links: Story link tags; each must provide ``getText()`` and
            ``get('href', default)``.
        subtext: Subtext tags in the same order as ``links``; each must
            provide ``select('.score')``.
        min_votes: Minimum score a story needs to be kept. Defaults to 100,
            which matches the previous hard-coded ``points > 99`` cut-off.

    Returns:
        A list of ``{'title': ..., 'link': ..., 'votes': ...}`` dicts, in
        input order, one per story whose score is at least ``min_votes``.
    """
    hn = []
    # zip() stops at the shorter list, so a links/subtext length mismatch
    # cannot raise IndexError.
    for item, sub in zip(links, subtext):
        vote = sub.select('.score')
        if not vote:
            # No score element for this row: nothing to rank, skip it.
            continue
        # The score text is "<n> points" or the singular "1 point"; taking the
        # first whitespace-separated token parses both forms.
        points = int(vote[0].getText().split()[0])
        if points >= min_votes:
            hn.append({
                'title': item.getText(),
                'link': item.get('href', None),
                'votes': points,
            })
    return hn

# Build the filtered story list from the scraped tags and pretty-print it.
data = create_custom_hn(links=hn_links, subtext=hn_subtext)
pprint.pprint(data)

[{'link': 'https://github.com/foambubble/foam',
  'title': 'Foam – A Roam Research alternative with VSCode, Markdown and '
           'GitHub',
  'votes': 357},
 {'link': 'https://www.sankalpjonna.com/posts/our-aws-bill-is-2-of-revenue-heres-how-we-did-it',
  'title': 'How we got our AWS bill to around 2% of revenue',
  'votes': 190},
 {'link': 'https://www.buzzfeednews.com/article/danvergano/cdc-coronavirus-containment-redfield',
  'title': 'The CDC lost control of the coronavirus pandemic, then the agency '
           'disappeared',
  'votes': 111},
 {'link': 'https://dev.lemmy.ml/post/35293',
  'title': 'Lemmy, an open-source federated Reddit alternative, gets funding '
           'for development',
  'votes': 852},
 {'link': 'https://paulromer.net/fda_perpetual_process_machine/',
  'title': "The FDA's perpetual process machine",
  'votes': 116},
 {'link': 'http://news.mit.edu/2020/astronomers-rhythm-radio-waves-0617',
  'title': 'Astronomers detect regular rhythm of radio waves, wi

In [8]:
def sort_stories_by_votes(hnlist):
    """Return a new list of story dicts ordered from most to fewest votes.

    The input is not modified. Every item must have a 'votes' key.
    """
    def vote_count(story):
        return story['votes']

    ranked = list(hnlist)
    ranked.sort(key=vote_count, reverse=True)
    return ranked


def create_custom_hn(links, subtext, min_votes=100):
    """Build a vote-sorted list of high-scoring stories.

    Args:
        links: Story link tags; each must provide ``getText()`` and
            ``get('href', default)``.
        subtext: Subtext tags in the same order as ``links``; each must
            provide ``select('.score')``.
        min_votes: Minimum score a story needs to be kept. Defaults to 100,
            which matches the previous hard-coded ``points > 99`` cut-off.

    Returns:
        The result of ``sort_stories_by_votes`` applied to a list of
        ``{'title': ..., 'link': ..., 'votes': ...}`` dicts, one per story
        whose score is at least ``min_votes``.
    """
    hn = []
    # zip() stops at the shorter list, so a links/subtext length mismatch
    # cannot raise IndexError.
    for item, sub in zip(links, subtext):
        vote = sub.select('.score')
        if not vote:
            # No score element for this row: nothing to rank, skip it.
            continue
        # The score text is "<n> points" or the singular "1 point"; taking the
        # first whitespace-separated token parses both forms.
        points = int(vote[0].getText().split()[0])
        if points >= min_votes:
            hn.append({
                'title': item.getText(),
                'link': item.get('href', None),
                'votes': points,
            })
    return sort_stories_by_votes(hn)


# Rebuild the story list with the sorting version of the function and pretty-print it.
data = create_custom_hn(links=hn_links, subtext=hn_subtext)
pprint.pprint(data)

[{'link': 'https://dev.lemmy.ml/post/35293',
  'title': 'Lemmy, an open-source federated Reddit alternative, gets funding '
           'for development',
  'votes': 852},
 {'link': 'https://unim.press/#dataisbeautiful',
  'title': 'Show HN: A Reddit reader that looks like the frontpage of a print '
           'newspaper',
  'votes': 565},
 {'link': 'https://raphlinus.github.io/xi/2020/06/27/xi-retrospective.html',
  'title': 'Xi-Editor Retrospective',
  'votes': 464},
 {'link': 'https://jvns.ca/blog/2014/09/27/how-does-sqlite-work-part-1-pages/',
  'title': 'How Does Sqlite Work? (2014)',
  'votes': 415},
 {'link': 'https://mobile.reuters.com/article/idUSKBN23X2TN',
  'title': 'Sensors detect rise in nuclear particles on Baltic Sea',
  'votes': 394},
 {'link': 'https://www.anfractuosity.com/projects/rainbow/',
  'title': 'Show HN: Rainbow – an attempt to display colour on a B&W monitor',
  'votes': 373},
 {'link': 'https://github.com/maxgoedjen/secretive',
  'title': 'Secretive – macOS

[HackerNews Scraper repo](https://github.com/hungrypc/python-scraper-hackernews)