In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [23]:
# Browser-like request headers passed to requests.get(..., headers=headers).
headers = {
    "Accept-Language": "en-US,en;q=0.9",
    # Built from adjacent string literals: a backslash continuation inside a
    # single literal would embed the next line's indentation in the value.
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/98.0.4758.102 Safari/537.36"
    ),
}


I looked into some scraping strategies for Reddit, but they require authentication, which is kind of annoying. The API also requires auth and is moving to a paid model, so I'll skip it for now. I set up a scraper for Techmeme instead - interesting stuff, and definitely more corporate than Hacker News.

In [None]:
# Get a lightweight NN to dedupe these
# Create embeddings for each headline, store recent embeddings and compare.
# First appearing link should be the highest priority

def collect_techmeme_fp(mode: str = 'all'):
    """Collect data on posts from the front page of Techmeme.

    Params
    ----------
    mode: str, default = 'all'
        Determines how many articles to pull. Should be 'all' (every
        headline found on the page, ~48), 'most' (at most 30), or
        'some' (at most 12).
    Returns
    ----------
    List[dict] | None
        A list of dicts in page order, each with the keys 'title'
        (headline text, stripped) and 'url' (the link's href, or 0 when
        the link has no href).
        If the request fails or the response status is not 200,
        returns None.
    Raises
    ----------
    ValueError
        If mode is not 'all', 'most', or 'some'."""

    # Translate the mode into a maximum number of headlines (None = no limit).
    if mode == 'all':
        cap = None
    elif mode == 'most':
        cap = 30
    elif mode == 'some':
        cap = 12
    else:
        raise ValueError(f'Value for "mode" must be "all", "most", or "some". Not "{mode}".')

    url = 'https://www.techmeme.com/'
    try:
        # Without a timeout, requests may wait on an unresponsive server forever.
        res = requests.get(url, timeout=10)
    except requests.RequestException:
        # Connection errors and timeouts are reported as None, as documented.
        return None

    if res.status_code != 200:
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')
    all_articles = soup.find_all('a', class_='ourh')

    # Slicing with cap=None keeps every headline; otherwise keep the first `cap`.
    return [
        {
            'title': article.get_text(strip=True),
            'url': article.get('href', 0),
        }
        for article in all_articles[:cap]
    ]
    

---

In [24]:
# Front page of r/dataisbeautiful, requested with the browser-like `headers`.
url = 'https://www.reddit.com/r/dataisbeautiful/'
# A timeout keeps the cell from hanging indefinitely if the server never responds.
res = requests.get(url, headers=headers, timeout=10)

In [25]:
res.status_code

200

In [26]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and to avoid bs4's GuessedAtParserWarning).
soup = BeautifulSoup(res.content, 'html.parser')

In [27]:
soup

<!DOCTYPE html>
<html class="theme-beta theme-light" lang="en-US">
<head>
<script>
    var __SUPPORTS_TIMING_API = typeof performance === 'object' && !!performance.mark && !! performance.measure && !!performance.getEntriesByType;
    function __perfMark(name) { __SUPPORTS_TIMING_API && performance.mark(name); };
    var __firstPostLoaded = false;
    function __markFirstPostVisible() {
      if (__firstPostLoaded) { return; }
      __firstPostLoaded = true;
      __perfMark("first_post_title_image_loaded");
    }
    var __firstCommentLoaded = false;
    function __markFirstCommentVisible() {
      if (__firstCommentLoaded) { return; }
      __firstCommentLoaded = true;
      __perfMark("first_comment_loaded");
    }
  </script>
<script>__perfMark('head_tag_start');</script>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="origin-when-cross-origin" name="referrer"/>
<style>
  /* http://meyerweb.com/eric/tools/css/reset/
    v2

In [30]:
posts = soup.find_all("div", attrs={"data-testid": "post-container"})

In [31]:
len(posts)

7

In [37]:
post_0_children = list(posts[0].children)

In [43]:
posts[0].find('h3').get_text()

'[Topic][Open] Open Discussion Thread — Anybody can post a general visualization question or start a fresh discussion!'

In [56]:
posts[1].find_all('a')[-1]['href']

'/r/dataisbeautiful/comments/140zh2k/oc_seven_companies_account_for_all_of_the_gains/'

In [57]:
# Same subreddit with the URL-encoded query f=flair_name:"OC"
# (presumably limits the listing to OC-flaired posts - confirm against the page).
url_oc = 'https://www.reddit.com/r/dataisbeautiful/?f=flair_name%3A%22OC%22'
# A timeout keeps the cell from hanging indefinitely if the server never responds.
res_oc = requests.get(url_oc, headers=headers, timeout=10)

In [58]:
res_oc.status_code

200

In [59]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and to avoid bs4's GuessedAtParserWarning).
soup_oc = BeautifulSoup(res_oc.content, 'html.parser')

In [60]:
posts_oc = soup_oc.find_all("div", attrs={"data-testid": "post-container"})

In [61]:
len(posts_oc)

7

In [80]:
posts[6].find_all('a')[-1]['href']

'/r/dataisbeautiful/comments/1408bq2/oc_the_entire_history_of_premier_league_teams_in/'

In [82]:
posts_oc[0].find_all('a')

[<a class="SQnoC3ObvgnGjWt90zD9Z _2INHSNB8V5eaWp4P0rY_mE" data-click-id="body" href="/r/dataisbeautiful/comments/141kjqx/one_month_stay_in_new_england_cost_breakdown_oc/"><div class="_2SdHzo12ISmrC8H86TgSCp _3wqmjmv3tb_k-PROt7qFZe" style="--posttitletextcolor:#005BA1"><h3 class="_eYtD2XCVieq6emjKBH3m">One Month Stay in New England Cost Breakdown [OC]</h3></div></a>,
 <a href="/r/dataisbeautiful/?f=flair_name%3A%22OC%22"><span class="_1jNPl3YUk6zbpLWdjaJT1r _2VqfzH0dZ9dIl3XWNxs42y aJrgrewN9C8x1Fusdx4hh" style="background-color:#EDEFF1;color:#1A1A1B">OC</span></a>,
 <a href="/r/dataisbeautiful/comments/141kjqx/one_month_stay_in_new_england_cost_breakdown_oc/"><div class="_3Oa0THmZ3f5iZXAQ0hBJ0k" style="max-height:512px;margin:0 auto"><div><img alt="Post image" class="_2_tDEnGMLxpM6uOa2kaDB3 ImageBox-image media-element _1XWObl-3b9tPy64oaG6fax" src="https://preview.redd.it/fv90hoewa84b1.jpg?width=640&amp;crop=smart&amp;auto=webp&amp;v=enabled&amp;s=f09451b0e1f759a401c14859e95c9b007afc0b19

In [90]:
# Same subreddit with the URL-encoded query f=-flair_name:"OC"
# (presumably excludes OC-flaired posts - confirm against the page).
noc_url = 'https://www.reddit.com/r/dataisbeautiful/?f=-flair_name%3A%22OC%22'
# A timeout keeps the cell from hanging indefinitely if the server never responds.
noc_res = requests.get(noc_url, headers=headers, timeout=10)

In [91]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and to avoid bs4's GuessedAtParserWarning).
noc_soup = BeautifulSoup(noc_res.content, 'html.parser')

In [94]:
noc_posts = noc_soup.find_all("div", attrs={"data-testid": "post-container"})

In [98]:
# Title
noc_posts[0].find('h3').get_text()

'Big Pharma Revenues by product for 2020'

In [102]:
# Discussion link
comment_url = noc_posts[0].find_all('a')[-1]['href']

In [104]:
# comment_url is a site-relative path (it starts with '/'), so prefix the host.
base_url = 'https://reddit.com'
# A timeout keeps the cell from hanging indefinitely if the server never responds.
comment_res = requests.get(base_url + comment_url, headers=headers, timeout=10)

In [105]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and to avoid bs4's GuessedAtParserWarning).
comment_soup = BeautifulSoup(comment_res.content, 'html.parser')

In [107]:
comments = comment_soup.find_all("div", attrs={"data-testid": "comment"})

In [109]:
comment_soup.find_all('div')

[<div class="promotedlink" id="acceptabletest" style="width: 1px; height: 1px; position: absolute; left: -1e+06px; top: 0px; display: block;">
     Advertisement
   </div>,
 <div class="AdHeader AdUnit adsense-ads HeaderAd SidebarAd VerticalAd _has-ads ad--content ad-adsense ad-banner adsense-ads googads ad-banner-content ad-BANNER googleAd googlead hasads leftAd native-ad ADBAR ad-medium post-ad promoad rectad sidebar-ad small-ad sponsorAd sponsorPost" id="adblocktest" style="width: 1px; height: 1px; position: absolute; left: -1000%;"></div>,
 <div class="col-span-2 s:col-span-3 flex gap-xs items-center justify-start">
 <activate-feature activation="intent" name="hamburger-menu">
 <button class="s:hidden button-small px-[length:var(--rem6)] button-plain icon button flex items-center justify-center" id="navbar-menu-button" rpl="" type="button">
 <span class="flex items-center justify-center">
 <span class="flex"> <svg fill="currentColor" height="16" icon-name="menu-outline" rpl="" view

In [None]:
# Reference URL: https://www.reddit.com/r/dataisbeautiful/comments/141kzdq/big_pharma_revenues_by_product_for_2020/

In [111]:
base_url + comment_url

'https://reddit.com/r/dataisbeautiful/comments/141kzdq/big_pharma_revenues_by_product_for_2020/'

In [115]:
# comment access is limited to authenticated users 

In [124]:
noc_posts[0].find_all('a')[1]['href']

'https://remoracompany.com/big-pharma-revenues-for-2020/'

In [None]:
# all in all, too inconvenient. using the API would be preferred.

---

In [10]:
# Techmeme front page, requested without custom headers.
url_2 = 'https://www.techmeme.com/'
# A timeout keeps the cell from hanging indefinitely if the server never responds.
res_2 = requests.get(url_2, timeout=10)

In [11]:
res_2.status_code

200

In [13]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers are installed (and to avoid bs4's GuessedAtParserWarning).
soup_2 = BeautifulSoup(res_2.content, 'html.parser')

In [126]:
print(soup_2.get_text())



Techmeme




















 Mastodon
Open Links In New Tab



Mobile
Archives
Site News






June 5, 2023, 6:35 PM

Enter Techmeme snapshot date and time:

Cancel
 













 Mediagazer
 memeorandum
 WeSmirch

Home
River
Leaderboards
About
Sponsor
Events


Get our daily newsletter and never miss a story!   ⓧ

Newsletter





Top News






Adi Robertson / The Verge:


Tim Cook unveils Apple Vision Pro, a “Spatial Computing” headset controlled with the user's eyes, hands, and voice, coming to the US for $3,499+ in early 2024  —  Apple has announced an augmented reality headset called Apple Vision Pro that “seamlessly” blends the real and digital world.


More:
Apple, Ars Technica, 9to5Mac, Engadget, TechCrunch, AppleInsider, The Guardian, The Verge, Bloomberg, AppleInsider, Disconnect, The Register, Vox, Insider, Gizmodo, SiliconANGLE, VentureBeat, iTnews, Financial Times, MacRumors, Reuters, Tech Advisor, ZDNet, Thurrott, Road to VR, Axios, Forbes, Los Angeles Times, Silicon V

In [128]:
ourh = soup_2.find_all('a', class_='ourh')

In [131]:
for o in ourh:
    print(o.get_text())

Tim Cook unveils Apple Vision Pro, a “Spatial Computing” headset controlled with the user's eyes, hands, and voice, coming to the US for $3,499+ in early 2024
Apple Vision Pro specs: 12 cameras, five sensors, and six mics, an M2 and a new R1 chip, a digital crown, 23M pixels, and Zeiss lenses for vision correction
Apple announces visionOS, which powers the Vision Pro and displays icons and windows floating over real-world spaces; demos include Safari, a cinema, and gaming
Apple Vision Pro will have an Optic ID feature that authenticates users with an iris scan; Apple says Optic ID is more secure than Face ID, which twins can fool
Apple Vision Pro will launch with Disney+ and hundreds of thousands of existing iPhone and iPad apps, including Word and Excel, and support Unity game engine
A look at the Apple Vision Pro: much better-looking than other AR/VR headsets with silver sci-fi googles, cable, and battery pack, which offers two hours of use
A recording of Apple's WWDC 2023 keynote, w

In [139]:
ourh[3]

<a class="ourh" href="https://appleinsider.com/articles/23/06/05/new-optic-id-biometrics-use-your-iris-to-unlock-the-apple-vision-pro-headset">Apple Vision Pro will have an Optic ID feature that authenticates users with an iris scan; Apple says Optic ID is more secure than Face ID, which twins can fool</a>

In [140]:
ourh

[<a class="ourh" href="https://www.theverge.com/2023/6/5/23738968/apple-vision-pro-ar-headset-features-specs-price-release-date-wwdc-2023">Tim Cook unveils Apple Vision Pro, a “Spatial Computing” headset controlled with the user's eyes, hands, and voice, coming to the US for $3,499+ in early 2024</a>,
 <a class="ourh" href="https://techcrunch.com/2023/06/05/the-apple-vision-pro-features-an-m2-chip-a-ton-of-sensors-and-a-new-r1-chip/">Apple Vision Pro specs: 12 cameras, five sensors, and six mics, an M2 and a new R1 chip, a digital crown, 23M pixels, and Zeiss lenses for vision correction</a>,
 <a class="ourh" href="https://www.theverge.com/2023/6/5/23733874/apple-vision-pro-visionos-augmented-reality-os-specs-wwdc-2023">Apple announces visionOS, which powers the Vision Pro and displays icons and windows floating over real-world spaces; demos include Safari, a cinema, and gaming</a>,
 <a class="ourh" href="https://appleinsider.com/articles/23/06/05/new-optic-id-biometrics-use-your-iris-

In [149]:
# Get a lightweight NN to dedupe these
# Create embeddings for each headline, store recent embeddings and compare.
# First appearing link should be the highest priority

def collect_techmeme_fp(mode: str = 'all'):
    """Collect data on posts from the front page of Techmeme.

    Params
    ----------
    mode: str, default = 'all'
        Determines how many articles to pull. Should be 'all' (every
        headline found on the page, ~48), 'most' (at most 30), or
        'some' (at most 12).
    Returns
    ----------
    List[dict] | None
        A list of dicts in page order, each with the keys 'title'
        (headline text, stripped) and 'url' (the link's href, or 0 when
        the link has no href).
        If the request fails or the response status is not 200,
        returns None.
    Raises
    ----------
    ValueError
        If mode is not 'all', 'most', or 'some'."""

    # Translate the mode into a maximum number of headlines (None = no limit).
    if mode == 'all':
        cap = None
    elif mode == 'most':
        cap = 30
    elif mode == 'some':
        cap = 12
    else:
        raise ValueError(f'Value for "mode" must be "all", "most", or "some". Not "{mode}".')

    url = 'https://www.techmeme.com/'
    try:
        # Without a timeout, requests may wait on an unresponsive server forever.
        res = requests.get(url, timeout=10)
    except requests.RequestException:
        # Connection errors and timeouts are reported as None, as documented.
        return None

    if res.status_code != 200:
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(res.content, 'html.parser')
    all_articles = soup.find_all('a', class_='ourh')

    # Slicing with cap=None keeps every headline; otherwise keep the first `cap`.
    return [
        {
            'title': article.get_text(strip=True),
            'url': article.get('href', 0),
        }
        for article in all_articles[:cap]
    ]
    

In [150]:
tech_articles = collect_techmeme_fp()

In [151]:
tech_articles

[{'title': "Tim Cook unveils Apple Vision Pro, a “Spatial Computing” headset controlled with the user's eyes, hands, and voice, coming to the US for $3,499+ in early 2024",
  'url': 'https://www.theverge.com/2023/6/5/23738968/apple-vision-pro-ar-headset-features-specs-price-release-date-wwdc-2023'},
 {'title': 'Apple Vision Pro specs: 12 cameras, five sensors, and six mics, an M2 and a new R1 chip, a digital crown, 23M pixels, and Zeiss lenses for vision correction',
  'url': 'https://techcrunch.com/2023/06/05/the-apple-vision-pro-features-an-m2-chip-a-ton-of-sensors-and-a-new-r1-chip/'},
 {'title': 'Hands-on with Apple Vision Pro, which appears to be a genuine leapfrog in capability and execution of XR, with near perfect eye tracking and gesture control',
  'url': 'https://techcrunch.com/2023/06/05/first-impressions-yes-apple-vision-pro-works-and-yes-its-good/'},
 {'title': 'A look at the Apple Vision Pro: much better-looking than other AR/VR headsets with silver sci-fi googles, cable