In [160]:
import lxml.html
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

## Setup

In [166]:
ident = (
    "Stephanie Andrews (jellomoat@gmail.com), " + 
    "scraping for educational purposes"
)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}

## Fetching data

In [449]:
def get_next_url(count, after_id):
    base_url = "https://www.reddit.com/subreddits/new?limit=100"

    # handle if first page
    if not count or not after_id or (count < 1):
        print(base_url)
        return base_url
    else:
        return f"{base_url}&count={count}&after={after_id}"

def fetch_next_page(page_nr, fetch_limit=100, after_id=None):
    count = page_nr * fetch_limit
    print(after_id)
    url = get_next_url(count, after_id)
    print(f"page: {page_nr}")
    print(f"Fetching {url}")
    try:
        return requests.get(
            url,
            headers=headers
        ).text
    except:
        print("No more pages!")

In [450]:
page_nr = 0
html = fetch_next_page(page_nr)

None
https://www.reddit.com/subreddits/new?limit=100
page: 0
Fetching https://www.reddit.com/subreddits/new?limit=100


In [451]:
# html

## Exploring the data...

In [452]:
# dom = lxml.html.fromstring(html)
soup = BeautifulSoup(html, "lxml")
soup.title

<title>subreddits</title>

In [453]:
# r_els = soup.cssselect("div > #siteTable")[0].cssselect("div > .subreddit")
# r_els = soup.find("div", id="siteTable").find_all("div", attrs={"class": "subreddit"})
# r_els = soup.select_one("div[id=siteTable]").find_all("div", attrs={"class": "subreddit"})
r_els = soup.select("#siteTable > div.subreddit")

len(r_els)

100

In [454]:
# lxml.html.tostring(r_els[0])
r_els[0]

<div class="thing id-t5_8rbxpq odd subreddit" data-fullname="t5_8rbxpq" data-gildings="0" data-type="subreddit" data-whitelist-status="" id="thing_t5_8rbxpq" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="BypassingTheMatrix" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/BypassingTheMatrix/">r/BypassingTheMatrix: BypassingTheMatrix</a></p><p class="tagline"><span class="score dislikes" title="0"><span class="number">0</span> <span class="word">subscribers</span></span><span class="score unvoted" title="1"><span class="number">1</span> <span class="word">subscriber</span></span><span class="score likes" title="2"><span class="number">2</span> <span class="word">subscribers</span></span>, a community for 1 mi

In [455]:
# all text for a single subreddit element, incl children
# r_els[0].text_content()
r_els[0].text

'joinleaver/BypassingTheMatrix: BypassingTheMatrix0 subscribers1 subscriber2 subscribers, a community for 1 minutereport'

In [456]:
# all text for all subreddits on page
# text_content = [r.text_content().lstrip("joinleave").rstrip("report") for r in r_els]
text_content = [r.text.lstrip("joinleave").rstrip("report") for r in r_els]
text_content[:3]

['r/BypassingTheMatrix: BypassingTheMatrix0 subscribers1 subscriber2 subscribers, a community for 1 minu',
 'r/HDSpiderManVersenow2: Where Can I Watch Spider Man Across The Spider Verse Free Online For Reddit?Universal Pictures! Here’s options for downloading or watching Spider-Man: Across the Spider-Verse streaming the full 𝓂𝑜𝓋𝒾𝑒 online for free on 123𝓂𝑜𝓋𝒾𝑒s & Reddit including where to watch Universal Pictures’ 𝓂𝑜𝓋𝒾𝑒 at home. Is Spider-Man: Across the Spider-Verse 2023 available to stream? Is watching Spider-Man: Across the Spider-Verse on Disney Plus, HBO Max, Netflix or Amazon Prime? Yes we have found an authentic streaming option /\n\n0 subscribers1 subscriber2 subscribers, a community for 1 minu',
 'r/Ritam1: Ritam10 subscribers1 subscriber2 subscribers, a community for 1 minu']

## Parsing the data

In [474]:
from datetime import datetime

def get_all_desc(desc_elements_list):
    if len(desc_elements_list) > 0:
        return " ".join([el.text for el in desc_elements_list])
    else:
        ""

def parse_and_add_to_df(els_list, core_df):
    search_str = re.compile(r"a community for (.*)report")
    parsed_subs_list = []

    for r in els_list:
        sub_data = {
        "name": r.select_one(".titlerow").text.split(":")[0],
        "desc": get_all_desc(r.select("div.md > p")),
        "sub_age_tup": (re.search(search_str, r.text).group(1).split()),
        "page_id": r.get("id").lstrip("thing_"),
        "num_subscribers": r.select_one("p.tagline > span.unvoted > span.number").text,
        "dt_retrieved": datetime.now()
        }
        sub_data["age_num"] = sub_data["sub_age_tup"][0]
        sub_data["age_word"] = sub_data["sub_age_tup"][1]
        sub_data.pop("sub_age_tup")
        parsed_subs_list.append(sub_data)

#         print(f"name: {sub_data['name']}")
#         print(f"age: {sub_data['subs_num']} {sub_data['subs_word']}")
#         print(f"desc: {sub_data['desc']}")
#         print(sub_data["page_id"])
#         print("****")

#     print(parsed_subs_list)
    return pd.concat([df, pd.DataFrame(parsed_subs_list)], ignore_index=True)

df = pd.DataFrame()
print(r_els[0])
df = parse_and_add_to_df(r_els, df)

<div class="thing id-t5_8rc6ew odd subreddit" data-fullname="t5_8rc6ew" data-gildings="0" data-type="subreddit" data-whitelist-status="" id="thing_t5_8rc6ew" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="SoundFreedomnowFree" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span><span alt="not approved" class="sr-type-icon sr-type-icon-restricted" title="not approved"></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/SoundFreedomnowFree/">r/SoundFreedomnowFree: Where To Watch Sound of Freedom Online For Free ReddiT</a></p><div class="description"><form action="#" class="usertext warn-on-unload" id="form-t5_8rc6ewza3" onsubmit="return post_form(this, 'editusertext')"><input name="thing_id" type="hidden" value="t5_8rc6ew"/><div class="usertext-body may-blank

In [475]:
# number of records with desc
df[~df["desc"].isna()]

Unnamed: 0,name,desc,page_id,num_subscribers,dt_retrieved,age_num,age_word
0,r/SoundFreedomnowFree,Now Is Sound of Freedom available to stream? I...,5_8rc6ew,1,2023-07-04 16:10:34.262942,3,minutes
1,r/ArcadeParty,Arcade Party is a game developed by Team Arcad...,5_8rc66r,1,2023-07-04 16:10:34.263796,4,minutes
5,r/thedogmanreport,A place for more convenient discussion of The ...,5_8rc5we,1,2023-07-04 16:10:34.266836,5,minutes
21,r/IndianaJonesHR,"31sec ago, Adventure Movie! Here’s options for...",5_8rc3q0,2,2023-07-04 16:10:34.280661,13,minutes
31,r/ApolloAutomation,Designed and assembled in the USA,5_8rc2kw,1,2023-07-04 16:10:34.289158,17,minutes
32,r/stop_christ_bot,stop u/christ_bot_9001,5_8rc271,1,2023-07-04 16:10:34.289869,18,minutes
40,r/femalefashionadvice2,A secondary community for female fashion advice.,5_8rc1lf,1,2023-07-04 16:10:34.296981,20,minutes
44,r/SoundFreedomhdNow,"How To watch Sound of Freedom online free, whi...",5_8rc1hj,1,2023-07-04 16:10:34.300320,20,minutes
47,r/marmarauni,Unofficial subreddit of Marmara University,5_8rc0y6,3,2023-07-04 16:10:34.302677,22,minutes
48,r/scheletridelloxlao,Qui entrano solo i veri fans,5_8rc0wt,1,2023-07-04 16:10:34.303598,22,minutes


In [478]:
df["age_word"].value_counts()

age_word
minutes    100
Name: count, dtype: int64

## Get next page

In [479]:
from time import sleep

for i in range(3):
    print("Fetching page " + str(i + 1))
    sleep(1)

Fetching page 1
Fetching page 2
Fetching page 3


In [470]:
get_next_url(100, "t5_8r8ktl")
# https://www.reddit.com/subreddits/new?count=25&after=t5_8r8ktl

'https://www.reddit.com/subreddits/new?limit=100&count=100&after=t5_8r8ktl'

In [471]:
df.loc[df["dt_retrieved"] == df["dt_retrieved"].max()]["page_id"]

99    5_8rbjja
Name: page_id, dtype: object

In [472]:
last_page_id = df.loc[df["dt_retrieved"] == df["dt_retrieved"].max()]["page_id"].values[0] \
    if (len(df) > 1) else None
last_page_id

'5_8rbjja'

In [473]:
# setup
core_df = pd.DataFrame()
result_limit = 100

# fetch each page
# if last resultset returned less than the limit (100), stop fetching
for page_nr in range(3): # swap to while loop after
    last_page_id = core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"].values[0] \
        if (len(core_df) > 1) else None
    print(f"last page id => {last_page_id}")
    html = fetch_next_page(page_nr, result_limit, last_page_id)
    sleep(2)

    soup = BeautifulSoup(html, "lxml")
    print(soup.title)

    # parse each page, add to df
    r_els = soup.select("#siteTable > div.subreddit")
    core_df = parse_and_add_to_df(r_els, core_df)
    sleep(2)
    if len(r_els) < 25:
        print(len(r_els))
        break

last page id => None
None
https://www.reddit.com/subreddits/new?limit=100
page: 0
Fetching https://www.reddit.com/subreddits/new?limit=100
<title>subreddits</title>
last page id => 5_8rbvg2
5_8rbvg2
page: 1
Fetching https://www.reddit.com/subreddits/new?limit=100&count=100&after=5_8rbvg2
<title>subreddits</title>
last page id => 5_8rbvg2
5_8rbvg2
page: 2
Fetching https://www.reddit.com/subreddits/new?limit=100&count=200&after=5_8rbvg2
<title>subreddits</title>
