Skip to content

Commit

Permalink
* Added support for pagination and corresponding nosetest
Browse files Browse the repository at this point in the history
  • Loading branch information
arch119 committed Dec 7, 2013
1 parent 4b99a0f commit 9c22f49
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 4 deletions.
44 changes: 40 additions & 4 deletions hn/hn.py
Expand Up @@ -32,17 +32,46 @@

from bs4 import BeautifulSoup
import requests
import time


BASE_URL = 'https://news.ycombinator.com'

INTERVAL_BETWEEN_REQUESTS = 1 #Sleep these many seconds between 2 consecutive page requests

class HN(object):
"""
The class that parses the HN page, and builds up all stories
"""


def _get_next_page(self, soup):
"""
Get the relative url of the next page (The "More" link @ the bottom of the page)
"""
table = soup.findChildren('table')[2] # the table with all submissions
return table.findChildren(['tr'])[-1].find('a').get('href') # the last row of the table contains the relative url of the next page


def _get_soups(self, page, fpl=0):
"""
Get bs4 objects for all pages in the chain starting from 'page' following up to fpl links
"""
soups = list()
soups.append(self._get_soup(page))

while len(soups) < (fpl+1):
cur_soup = soups[-1]
time.sleep(INTERVAL_BETWEEN_REQUESTS) #Be a good citizen.
next_page = self._get_next_page(cur_soup).lstrip("//")
next_soup = self._get_soup(next_page)
if not len(next_soup.findChildren("table")) == 0:
soups.append(next_soup)
else:
break

return soups


def _get_soup(self, page=''):
"""
Returns a bs4 object of the page requested
Expand Down Expand Up @@ -151,13 +180,20 @@ def _build_story(self, all_rows):
return all_stories


def get_stories(self, story_type=''):
def get_stories(self, story_type='', follow_pagination_limit=0):
"""
Returns a list of stories from the passed page
of HN. 'story_type' can be:
'' = top stories (homepage)
'newest' = most recent stories
'best' = best stories
'follow_pagination_limit' specifies the maximum number of additional paginated pages to follow.
"""
all_rows = self._get_zipped_rows(self._get_soup(page=story_type))
return self._build_story(all_rows)
story = list()
all_soups = self._get_soups(story_type, follow_pagination_limit)
for soup in all_soups:
all_rows = self._get_zipped_rows(soup)
story = story + self._build_story(all_rows)

return story
23 changes: 23 additions & 0 deletions tests/test_pagination.py
@@ -0,0 +1,23 @@
from hn import HN

hn = HN()

def test_pagination():
"""
This test checks if the pagination works for the front page.
"""
assert len(hn.get_stories(story_type='', follow_pagination_limit=1)) == 30+1*30


def test_pagination_newest():
"""
This test checks if the pagination works for the best page.
"""
assert len(hn.get_stories(story_type='newest', follow_pagination_limit=3)) == 30+3*30


def test_pagination_best():
"""
This test checks if the pagination works for the top stories.
"""
assert len(hn.get_stories(story_type='best', follow_pagination_limit=5)) == 30+5*30

0 comments on commit 9c22f49

Please sign in to comment.