In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
headers = {
    "Accept-Language":"en-US,en;q=0.9",
    "User-Agent":"Mozilla/5.0 (Macintosh; \
    Intel Mac OS X 10_15_7) \
    AppleWebKit/537.36 (KHTML, like Gecko) \
    Chrome/98.0.4758.102 Safari/537.36"
}


Functions written after today's exploration of the pages are immediately below. 

In [221]:
import re
import time
CRAWL_DELAY = 5

def collect_hn_post_data(num_pages=1):
    """Collect data on posts from the front page(s) of HackerNews.
    
    Params
    ----------
    num_pages: int, default = 1
        Total number of pages to be scraped. 
    Returns
    ----------
    List[dict]
        A list of dicts, each containing details of a HackerNews post. 
        If an error occurs, the current list (or empty list) is returned."""
    
    news_url = 'https://news.ycombinator.com/news?p='
    post_url = 'https://news.ycombinator.com/item?'
    detail_url = 'https://news.ycombinator.com/'
    posts = []
    
    for page_num in range(1, num_pages+1):
        
        try:
            page = requests.get(news_url+str(page_num))
        except:
            return posts
        
        if page.status_code != 200:
            return posts
        
        soup = BeautifulSoup(page.content)
        all_posts = soup.find_all('tr', class_='athing')

        for post in all_posts:
            post_data = {} 
            
            title = post.find_all('a')[-2]
            post_data['title'] = title.get_text(strip=True)
            
            link = title['href']
            if link.startswith('http'):
                post_data['url'] = link
            else:
                post_data['url'] = post_url + re.search(r'id=[0-9]+', link)[0]
            
            post_detail_url = detail_url + post.next_sibling.find_all('a')[-1]['href']
            print(post_detail_url, detail_url)
            post_data['detail_url'] = post_detail_url 
            
            subtitle = post.next_sibling.text.split()
            post_data['points'] = int(subtitle[0])
            post_data['comments'] = int(subtitle[-2]) if subtitle[-2].isnumeric() else 0
            
            posts.append(post_data)
            
            if num_pages > 1:
                time.sleep(CRAWL_DELAY)
            
        return posts
                

In [222]:
def collect_hn_comments(url, max_comments=50):
    """Collect comments from a HackerNews post.
    
    Params
    ----------
    url: str
        URL to a HackerNews post. 
    max_comments: int, default = 50
        Maximum comments to be retrieved. 
        
    Returns
    ----------
    List[str] | None
        A list of each comment as a single string. 
        If an error occurs, returns None."""
    
    try:
        page = requests.get(url)
    except:
        return None
    
    if page.status_code != 200:
        return None
    
    soup = BeautifulSoup(page.content)
    
    comments_list = []
    
    for comm in soup.find_all('tr', class_='athing comtr')[:max_comments]:
        comment = comm.find('span', class_='commtext').get_text(strip=True)
        comments_list.append(comment)
        
    return comments_list


## -------

In [3]:
res = requests.get('https://news.ycombinator.com/news')

In [8]:
res.status_code

200

In [7]:
soup = BeautifulSoup(res.content)

In [9]:
soup

<html lang="en" op="news"><head><meta content="origin" name="referrer"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><link href="news.css?BunVEUExo5PFSt1oHKIB" rel="stylesheet" type="text/css"/>
<link href="favicon.ico" rel="shortcut icon"/>
<link href="rss" rel="alternate" title="RSS" type="application/rss+xml"/>
<title>Hacker News</title></head><body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask

In [23]:
items = soup.find_all('tr', class_='athing')

In [24]:
len(items)

30

In [29]:
items[0]

<tr class="athing" id="36127703">
<td align="right" class="title" valign="top"><span class="rank">1.</span></td> <td class="votelinks" valign="top"><center><a href="vote?id=36127703&amp;how=up&amp;goto=news" id="up_36127703"><div class="votearrow" title="upvote"></div></a></center></td><td class="title"><span class="titleline"><a href="https://jmap.io/">JMAP – a much needed modern email open standard</a><span class="sitebit comhead"> (<a href="from?site=jmap.io"><span class="sitestr">jmap.io</span></a>)</span></span></td></tr>

In [30]:
items[0].text

'\n1. JMAP – a much needed modern email open standard (jmap.io)'

In [26]:
items[0].next_sibling

<tr><td colspan="2"></td><td class="subtext"><span class="subline">
<span class="score" id="score_36127703">312 points</span> by <a class="hnuser" href="user?id=tambourine_man">tambourine_man</a> <span class="age" title="2023-05-30T17:26:46"><a href="item?id=36127703">3 hours ago</a></span> <span id="unv_36127703"></span> | <a href="hide?id=36127703&amp;goto=news">hide</a> | <a href="item?id=36127703">98 comments</a> </span>
</td></tr>

In [31]:
items[0].next_sibling.text

'\n312 points by tambourine_man 3 hours ago  | hide | 98\xa0comments \n'

In [32]:
items[0].text.strip()

'1. JMAP – a much needed modern email open standard (jmap.io)'

In [33]:
items[0].next_sibling.text.strip()

'312 points by tambourine_man 3 hours ago  | hide | 98\xa0comments'

In [34]:
items[0].next_sibling.text.strip().split()

['312',
 'points',
 'by',
 'tambourine_man',
 '3',
 'hours',
 'ago',
 '|',
 'hide',
 '|',
 '98',
 'comments']

In [35]:
items[0].text.strip().split()

['1.',
 'JMAP',
 '–',
 'a',
 'much',
 'needed',
 'modern',
 'email',
 'open',
 'standard',
 '(jmap.io)']

In [51]:
items[0].find_all('a')[-2]

<a href="https://jmap.io/">JMAP – a much needed modern email open standard</a>

In [59]:
items[3].find_all('a')[-2]['href']

'https://www.databricks.com/blog/welcoming-bit-io-databricks-investing-developer-experience'

In [81]:
import re

In [93]:
search = re.search(r'id=[0-9]+', 'vote?id=36128618&how=up&goto=news')

In [101]:
search[0]

'id=36128618'

In [105]:
target_urls = []

for elem in items:
    link = elem.find_all('a')[-2]['href']
    if link.startswith('http'):
        target_urls.append(link)
    else:
        print('local', link)
        target_urls.append('https://news.ycombinator.com/item?' + re.search(r'id=[0-9]+', link)[0])
    

local vote?id=36128618&how=up&goto=news


In [75]:
items[0].find_all('a')[-1]['href']

'from?site=jmap.io'

In [107]:
items[12].find_all('a')[1]

<a href="item?id=36128618">Ask HN: Where have you found community outside of work?</a>

In [112]:
for url in target_urls:
    print(url)

https://jmap.io/
https://usehooks.com/
https://plato.stanford.edu/entries/camus/
https://www.databricks.com/blog/welcoming-bit-io-databricks-investing-developer-experience
https://caymannewsservice.com/2023/05/new-horror-revealed-in-sargassum-blob/
https://arxiv.org/abs/2304.09140
https://novalis.org/blog/2023-05-30-turds.html
https://fab.cba.mit.edu/classes/862.22/index.html
https://www.cs.cmu.edu/~rwh/students/okasaki.pdf
https://www.devever.net/~hl/backstage-cast
https://www.bsdcan.org/events/bsdcan_2023/sessions/session/142/slides/58/20230520-memory-safe-desktop-compressed.pdf
https://peteroupc.github.io/random.html
https://news.ycombinator.com/item?id=36128618
https://www.safe.ai/statement-on-ai-risk
https://github.com/kochrt/qr-designer
https://www.lyft.com/rev/posts/lyfts-secret-plan-to-take-control-of-its-maps-and-its-future
https://www.bloomberg.com/news/newsletters/2023-05-25/podcast-consolidation-reaches-smaller-studios-amid-advertising-downturn
https://tomforsyth1000.github

In [125]:
# Sometimes articles are job postings, and have no discussion. 

comment_urls = []
for elem in items:
    link = elem.next_sibling.find_all('a')[-1]['href']
    if link.startswith('hide'):
        print('nope')
    print(link)
    comment_urls.append('https://news.ycombinator.com/' + link)
    

item?id=36127703
item?id=36129622
item?id=36128235
item?id=36127230
item?id=36129757
item?id=36129227
item?id=36129387
item?id=36129199
item?id=36123651
item?id=36127543
item?id=36128067
item?id=36101962
item?id=36128618
item?id=36123082
item?id=36128082
item?id=36127058
item?id=36101907
item?id=36096604
item?id=36128484
item?id=36116192
item?id=36128435
item?id=36127800
nope
hide?id=36123766&goto=news
item?id=36126097
item?id=36128811
item?id=36129373
item?id=36112951
item?id=36122270
item?id=36126032
item?id=36117899


In [126]:
for link in comment_urls:
    print(link)

https://news.ycombinator.com/item?id=36127703
https://news.ycombinator.com/item?id=36129622
https://news.ycombinator.com/item?id=36128235
https://news.ycombinator.com/item?id=36127230
https://news.ycombinator.com/item?id=36129757
https://news.ycombinator.com/item?id=36129227
https://news.ycombinator.com/item?id=36129387
https://news.ycombinator.com/item?id=36129199
https://news.ycombinator.com/item?id=36123651
https://news.ycombinator.com/item?id=36127543
https://news.ycombinator.com/item?id=36128067
https://news.ycombinator.com/item?id=36101962
https://news.ycombinator.com/item?id=36128618
https://news.ycombinator.com/item?id=36123082
https://news.ycombinator.com/item?id=36128082
https://news.ycombinator.com/item?id=36127058
https://news.ycombinator.com/item?id=36101907
https://news.ycombinator.com/item?id=36096604
https://news.ycombinator.com/item?id=36128484
https://news.ycombinator.com/item?id=36116192
https://news.ycombinator.com/item?id=36128435
https://news.ycombinator.com/item?

In [108]:
items[12].next_sibling.find_all('a')[1]

<a href="item?id=36128618">2 hours ago</a>

In [110]:
items[12].next_sibling.find_all('a')[-1]

<a href="item?id=36128618">122 comments</a>

In [127]:
res_comment = requests.get('https://news.ycombinator.com/item?id=36127703')

In [128]:
res_comment.status_code

200

In [130]:
c_soup = BeautifulSoup(res_comment.content)

In [131]:
c_soup

<html lang="en" op="item"><head><meta content="origin" name="referrer"/><meta content="width=device-width, initial-scale=1.0" name="viewport"/><link href="news.css?BunVEUExo5PFSt1oHKIB" rel="stylesheet" type="text/css"/>
<link href="favicon.ico" rel="shortcut icon"/>
<title>JMAP – a modern email open standard | Hacker News</title></head><body><center><table bgcolor="#f6f6ef" border="0" cellpadding="0" cellspacing="0" id="hnmain" width="85%">
<tr><td bgcolor="#ff6600"><table border="0" cellpadding="0" cellspacing="0" style="padding:2px" width="100%"><tr><td style="width:18px;padding-right:4px"><a href="https://news.ycombinator.com"><img height="18" src="y18.svg" style="border:1px white solid; display:block" width="18"/></a></td>
<td style="line-height:12pt; height:10px;"><span class="pagetop"><b class="hnname"><a href="news">Hacker News</a></b>
<a href="newest">new</a> | <a href="front">past</a> | <a href="newcomments">comments</a> | <a href="ask">ask</a> | <a href="show">show</a> | <a 

In [138]:
comments = c_soup.find_all('tr', class_='athing comtr')

In [139]:
len(comments)

196

In [140]:
comments[0]

<tr class="athing comtr" id="36132480"><td><table border="0"> <tr> <td class="ind" indent="0"><img height="1" src="s.gif" width="0"/></td><td class="votelinks" valign="top">
<center><a href="vote?id=36132480&amp;how=up&amp;goto=item%3Fid%3D36127703" id="up_36132480"><div class="votearrow" title="upvote"></div></a></center> </td><td class="default"><div style="margin-top:2px; margin-bottom:-10px;"><span class="comhead">
<a class="hnuser" href="user?id=doodlesdev">doodlesdev</a> <span class="age" title="2023-05-30T23:18:37"><a href="item?id=36132480">22 minutes ago</a></span> <span id="unv_36132480"></span> <span class="navs">
             | <a aria-hidden="true" class="clicky" href="#36128749">next</a> <a class="togg clicky" href="javascript:void(0)" id="36132480" n="2">[–]</a><span class="onstory"></span> </span>
</span></div><br/><div class="comment">
<span class="commtext c00"><p></p><pre><code>   &gt; A lot of the optimisations for efficient client-server sync require the server to 

In [146]:
comment_1 = comments[0].find('span', class_='commtext')

In [147]:
comment_1.get_text(strip=True)

'> A lot of the optimisations for efficient client-server sync require the server to be able to read the message. If everything were encrypted, the server would basically be a dumb blob store. This is particularly bad for mobile, where you only want to sync partial information. Users expect to be able to search their whole archive, so either you need all the data in the client, or the server needs to have access to the data.\n\n   > JMAP is therefore not introducing any new measures to address end-to-end encryption. The best advice is probably to run your own "JMAP server" on trusted hardware; otherwise you need to sync the entire multi-gigabyte mail spool to all your devices. JMAP is also simple enough that you could run the server on multiple machines with an underlying replication protocol over encrypted links and have that do your smarts.They lost a huge opportunity. Encryption at rest of emails and E2EE should\'ve been how we built these protocols from the start, here we get a cha

In [148]:
lst = list(range(20))

In [169]:
def collect_hn_comments(url, max_comments=50):
    """Collect comments from a HackerNews post.
    
    Params
    ----------
    url: str
        URL to a HackerNews post. 
    max_comments: int, default = 50
        Maximum comments to be retrieved. 
        
    Returns
    ----------
    List[str]
        A list of each comment as a single string."""
    
    try:
        page = requests.get(url)
    except:
        return None
    
    if page.status_code != 200:
        return None
    
    soup = BeautifulSoup(page.content)
    
    comments_list = []
    
    for comm in soup.find_all('tr', class_='athing comtr')[:max_comments]:
        comment = comm.find('span', class_='commtext').get_text(strip=True)
        comments_list.append(comment)
        
    return comments_list


In [166]:
requests.get('http://localhost:8880')

ConnectionError: HTTPConnectionPool(host='localhost', port=8880): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x13a97e610>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [167]:
collect_hn_comments('http://localhost:8880')

In [171]:
test_posts = collect_hn_comments('https://news.ycombinator.com/item?id=36127703')

In [173]:
len(test_posts)

50

In [215]:
import re
import time
CRAWL_DELAY = 5

def collect_hn_post_data(num_pages=1):
    """Collect data on posts from the front page(s) of HackerNews.
    
    Params
    ----------
    num_pages: int, default = 1
        Total number of pages to be scraped. 
    Returns
    ----------
    List[dict]
        A list of dicts, each containing details of a HackerNews post."""
    
    news_url = 'https://news.ycombinator.com/news?p='
    post_url = 'https://news.ycombinator.com/item?'
    detail_url = 'https://news.ycombinator.com/'
    posts = []
    
    for page_num in range(1, num_pages+1):
        
        try:
            page = requests.get(news_url+str(page_num))
        except:
            return posts
        
        if page.status_code != 200:
            return posts
        
        soup = BeautifulSoup(page.content)
        all_posts = soup.find_all('tr', class_='athing')

        for post in all_posts:
            post_data = {} 
            
            title = post.find_all('a')[-2]
            post_data['title'] = title.get_text(strip=True)
            
            link = title['href']
            if link.startswith('http'):
                post_data['url'] = link
            else:
                post_data['url'] = post_url + re.search(r'id=[0-9]+', link)[0]
            
            post_detail_url = detail_url + post.next_sibling.find_all('a')[-1]['href']
            print(post_detail_url, detail_url)
            post_data['detail_url'] = post_detail_url 
            
            subtitle = post.next_sibling.text.split()
            post_data['points'] = int(subtitle[0])
            post_data['comments'] = int(subtitle[-2]) if subtitle[-2].isnumeric() else 0
            
            posts.append(post_data)
            
            if num_pages > 1:
                time.sleep(CRAWL_DELAY)
            
        return posts
                

In [196]:
items[0].find_all('a')[-2].text

'JMAP – a much needed modern email open standard'

In [184]:
items[0].next_sibling.text.split()

['312',
 'points',
 'by',
 'tambourine_man',
 '3',
 'hours',
 'ago',
 '|',
 'hide',
 '|',
 '98',
 'comments']

In [195]:
items[0].get_text(strip=True)

'1.JMAP – a much needed modern email open standard(jmap.io)'

In [214]:
posts = collect_hn_post_data()

https://news.ycombinator.comitem?id=36133226 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36127703 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36128617 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36129594 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36132693 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36128082 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36133263 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36130166 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36132265 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36131319 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36131610 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36129622 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36130191 https://news.ycombinator.com
https://news.ycombinator.comitem?id=36

In [216]:
posts

[{'title': 'Nvidia DGX GH200: 100 Terabyte GPU Memory System',
  'url': 'https://developer.nvidia.com/blog/announcing-nvidia-dgx-gh200-first-100-terabyte-gpu-memory-system/',
  'detail_url': 'https://news.ycombinator.comitem?id=36133226',
  'points': 172,
  'comments': 110},
 {'title': 'JMAP – a modern email open standard',
  'url': 'https://jmap.io/',
  'detail_url': 'https://news.ycombinator.comitem?id=36127703',
  'points': 774,
  'comments': 269},
 {'title': 'Hacking my “smart” toothbrush',
  'url': 'https://kuenzi.dev/toothbrush/',
  'detail_url': 'https://news.ycombinator.comitem?id=36128617',
  'points': 247,
  'comments': 72},
 {'title': 'Plane: Open-Source Alternative to Jira',
  'url': 'https://github.com/makeplane/plane',
  'detail_url': 'https://news.ycombinator.comitem?id=36129594',
  'points': 328,
  'comments': 110},
 {'title': 'A Mechanistic Interpretability Analysis of Grokking',
  'url': 'https://www.alignmentforum.org/posts/N6WM6hs7RQMKDhYjB/a-mechanistic-interpretab

In [217]:
comments = collect_hn_comments('https://news.ycombinator.com/item?id=36127703')

In [219]:
comments[0]

"I love JMAP. It's what allowed me and my team (at 1Password) to easily add support for Masked Emails, where we randomly generate your email address in addition to your password.Our own Madeline Hanley wrote about that experience, if you'd like to see what it's like to work with JMAP:https://blog.1password.com/making-masked-email-with-jmap/"

In [220]:
import pandas as pd
pd.DataFrame(posts)

Unnamed: 0,title,url,detail_url,points,comments
0,Nvidia DGX GH200: 100 Terabyte GPU Memory System,https://developer.nvidia.com/blog/announcing-n...,https://news.ycombinator.comitem?id=36133226,172,110
1,JMAP – a modern email open standard,https://jmap.io/,https://news.ycombinator.comitem?id=36127703,774,269
2,Hacking my “smart” toothbrush,https://kuenzi.dev/toothbrush/,https://news.ycombinator.comitem?id=36128617,247,72
3,Plane: Open-Source Alternative to Jira,https://github.com/makeplane/plane,https://news.ycombinator.comitem?id=36129594,328,110
4,A Mechanistic Interpretability Analysis of Gro...,https://www.alignmentforum.org/posts/N6WM6hs7R...,https://news.ycombinator.comitem?id=36132693,133,28
5,Show HN: I open sourced the QR designer from m...,https://github.com/kochrt/qr-designer,https://news.ycombinator.comitem?id=36128082,375,42
6,I try to answer “how to become a systems engin...,https://rachelbythebay.com/w/2023/05/30/eng/,https://news.ycombinator.comitem?id=36133263,58,25
7,Federal judge: Border searches of cell phones ...,https://www.eff.org/deeplinks/2023/05/federal-...,https://news.ycombinator.comitem?id=36130166,820,217
8,The role of cat eye narrowing movements in cat...,https://www.nature.com/articles/s41598-020-734...,https://news.ycombinator.comitem?id=36132265,95,21
9,"Who owns this camera, Nikon? Me or you? [video]",https://www.youtube.com/watch?v=TPyX1WBzxN8,https://news.ycombinator.comitem?id=36131319,111,55
