In [160]:
import lxml.html
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

## Setup

In [166]:
# Human-readable contact string saying who is scraping and why.
# NOTE(review): `ident` is not referenced anywhere else in the visible part of
# this notebook — presumably it was meant to be sent with the requests; confirm.
ident = (
    "Stephanie Andrews (jellomoat@gmail.com), " + 
    "scraping for educational purposes"
)
# HTTP headers passed to requests.get() by fetch_next_page. The User-Agent is
# a desktop Chrome browser string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}

## Fetching data

In [562]:
def get_next_url(count, after_id=None):
    """Build the URL for one page of the "new subreddits" listing.

    Args:
        count: Number of items already listed on earlier pages; sent as the
            ``count`` query parameter. Ignored for the first page.
        after_id: ID of the last subreddit on the previous page. The
            ``5_xxxxxx`` form stored in the ``page_id`` column gets its
            leading ``t`` added here; a full ``t5_xxxxxx`` name is passed
            through unchanged. A falsy value (``None`` or ``""``) means
            "first page".

    Returns:
        The bare listing URL for the first page, otherwise the listing URL
        with ``count`` and ``after`` query parameters.
    """
    base_url = "https://www.reddit.com/subreddits/new"

    # First page: there is no cursor yet, so the bare URL is the answer.
    # (No print here — building a URL should not write to the cell output;
    # fetch_next_page already logs the URL it requests.)
    if not after_id:
        return base_url

    # Stored ids start with the type digit ("5_..."), so a leading "t" means
    # the caller already passed a full name; don't double the prefix.
    after_id = str(after_id)
    fullname = after_id if after_id.startswith("t") else f"t{after_id}"
    return f"{base_url}?count={count}&after={fullname}"

def fetch_next_page(page_nr, fetch_limit=25, after_id=None, timeout=30):
    """Download one page of the subreddit listing and return its HTML.

    Args:
        page_nr: Zero-based page index; multiplied by ``fetch_limit`` to get
            the ``count`` query parameter.
        fetch_limit: Page size used to compute ``count``.
        after_id: ID of the last subreddit on the previous page, or ``None``
            for the first page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body as text, or ``None`` if the request failed.
        Callers must check for ``None`` before parsing.
    """
    count = page_nr * fetch_limit
    print(count)
    print(after_id)
    url = get_next_url(count, after_id)
    print(f"page: {page_nr}")
    print(f"Fetching {url}")
    try:
        return requests.get(
            url,
            headers=headers,
            timeout=timeout
        ).text
    except requests.RequestException as err:
        # Only request-level failures are handled here; anything else (typos,
        # KeyboardInterrupt, ...) propagates instead of being reported as
        # "no more pages".
        # NOTE(review): the HTTP status is not checked, so an error page
        # (e.g. rate limiting) is still returned as text.
        print(f"Request for {url} failed: {err}")
        return None

In [563]:
# Fetch the first listing page (page 0, default page size, no cursor) and keep
# the raw HTML for the exploration cells below.
page_nr = 0
html = fetch_next_page(page_nr)

0
None
https://www.reddit.com/subreddits/new
page: 0
Fetching https://www.reddit.com/subreddits/new


In [564]:
# html

## Exploring the data...

In [565]:
# lxml alternative, kept for reference:
# dom = lxml.html.fromstring(html)
# Parse the fetched HTML with BeautifulSoup (lxml parser) and display the
# page <title> as a quick check of what came back.
soup = BeautifulSoup(html, "lxml")
soup.title

<title>subreddits</title>

In [566]:
# Earlier attempts at the same selection, kept for reference:
# r_els = soup.cssselect("div > #siteTable")[0].cssselect("div > .subreddit")
# r_els = soup.find("div", id="siteTable").find_all("div", attrs={"class": "subreddit"})
# r_els = soup.select_one("div[id=siteTable]").find_all("div", attrs={"class": "subreddit"})
# Current version: div.subreddit elements that are direct children of #siteTable.
r_els = soup.select("#siteTable > div.subreddit")

# Number of subreddit entries found on this page.
len(r_els)

25

In [567]:
# lxml alternative, kept for reference:
# lxml.html.tostring(r_els[0])
# Show the markup of the first subreddit element to see which tags and
# classes are available for parsing.
r_els[0]

<div class="thing id-t5_8rf33y odd subreddit" data-fullname="t5_8rf33y" data-gildings="0" data-type="subreddit" data-whitelist-status="" id="thing_t5_8rf33y" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="khyjj6" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/khyjj6/">r/khyjj6: khyjj6</a></p><p class="tagline"><span class="score dislikes" title="0"><span class="number">0</span> <span class="word">subscribers</span></span><span class="score unvoted" title="1"><span class="number">1</span> <span class="word">subscriber</span></span><span class="score likes" title="2"><span class="number">2</span> <span class="word">subscribers</span></span>, a community for 4 minutes</p><ul class="flat-list buttons"><li class

In [568]:
# All text for a single subreddit element, including its children, run
# together into one string.
# (lxml equivalent: r_els[0].text_content())
r_els[0].text

'joinleaver/khyjj6: khyjj60 subscribers1 subscriber2 subscribers, a community for 4 minutesreport'

In [569]:
# all text for all subreddits on page, without the "joinleave" button text at
# the start and the "report" link text at the end.
# str.lstrip/rstrip treat their argument as a *set of characters*, not as a
# prefix/suffix, so rstrip("report") would also eat into ages ending in
# "hour", "minute" or "year". Anchored patterns remove exactly the literal.
text_content = [
    re.sub(r"report\Z", "", re.sub(r"\Ajoinleave", "", r.text))
    for r in r_els
]
text_content[:3]

['r/khyjj6: khyjj60 subscribers1 subscriber2 subscribers, a community for 4 minutes',
 'r/isabrunellionly: isabrunellionly0 subscribers1 subscriber2 subscribers, a community for 4 minutes',
 'r/rivershott: rivershott0 subscribers1 subscriber2 subscribers, a community for 4 minutes']

## Parsing the data

In [573]:
from datetime import datetime

def get_all_desc(desc_elements_list):
    """Join the text of a subreddit's description paragraphs into one string.

    Args:
        desc_elements_list: Sequence of elements exposing a ``.text``
            attribute (the caller passes ``select("div.md > p")`` results);
            may be empty.

    Returns:
        The paragraph texts joined by single spaces, or ``None`` when there
        are no paragraphs.
    """
    if not desc_elements_list:
        # Explicitly "missing" rather than "": the cells that list records
        # with a description filter on core_df["desc"].isna(), which only
        # recognises None/NaN.
        return None
    return " ".join(el.text for el in desc_elements_list)

def parse_and_add_to_df(els_list, start_df):
    """Parse subreddit listing elements and append them to a DataFrame.

    Args:
        els_list: Iterable of ``div.subreddit`` elements from one listing
            page (BeautifulSoup tags supporting ``select``, ``select_one``,
            ``get`` and ``.text``).
        start_df: DataFrame with the rows collected so far. It is not
            modified.

    Returns:
        A new DataFrame: ``start_df`` followed by one row per element, with
        the columns ``name``, ``desc``, ``page_id``, ``num_subscribers``,
        ``dt_retrieved``, ``age_num`` and ``age_word``. ``age_num`` and
        ``age_word`` are ``None`` when the age text cannot be found.
    """
    # Report the size of the frame that was passed in, not of whatever global
    # happens to be called core_df in the kernel.
    print(f"start count: {len(start_df)}")
    search_str = re.compile(r"a community for (.*)report")
    parsed_subs_list = []

    for r in els_list:
        # "a community for 4 minutes" -> ["4", "minutes"]; tolerate entries
        # where the pattern is absent or has fewer than two words.
        age_match = search_str.search(r.text)
        age_parts = age_match.group(1).split() if age_match else []

        parsed_subs_list.append({
            "name": r.select_one(".titlerow").text.split(":")[0],
            "desc": get_all_desc(r.select("div.md > p")),
            # Element ids look like "thing_t5_8rf33y"; keep "5_8rf33y", the
            # form get_next_url expects. An anchored pattern is used because
            # str.lstrip("thing_") strips a character set, not a prefix.
            "page_id": re.sub(r"\Athing_t", "", r.get("id")),
            "num_subscribers": r.select_one("p.tagline > span.unvoted > span.number").text,
            # Stamped per row on purpose: later cells locate the most
            # recently parsed row through dt_retrieved.max().
            "dt_retrieved": datetime.now(),
            "age_num": age_parts[0] if len(age_parts) > 0 else None,
            "age_word": age_parts[1] if len(age_parts) > 1 else None,
        })

    end_df = pd.concat([start_df, pd.DataFrame(parsed_subs_list)], ignore_index=True)
    print(f"final core_df len: {len(end_df)}")
    return end_df

# Start from an empty frame, print the first element that is about to be
# parsed, then parse the page currently held in r_els into core_df.
core_df = pd.DataFrame()
print(r_els[0])
core_df = parse_and_add_to_df(r_els, core_df)

<div class="thing id-t5_8rf33y odd subreddit" data-fullname="t5_8rf33y" data-gildings="0" data-type="subreddit" data-whitelist-status="" id="thing_t5_8rf33y" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="khyjj6" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/khyjj6/">r/khyjj6: khyjj6</a></p><p class="tagline"><span class="score dislikes" title="0"><span class="number">0</span> <span class="word">subscribers</span></span><span class="score unvoted" title="1"><span class="number">1</span> <span class="word">subscriber</span></span><span class="score likes" title="2"><span class="number">2</span> <span class="word">subscribers</span></span>, a community for 4 minutes</p><ul class="flat-list buttons"><li class

In [587]:
# records that have a description (rows whose desc is not missing)
core_df.loc[core_df["desc"].notna()]

Unnamed: 0,name,desc,page_id,num_subscribers,dt_retrieved,age_num,age_word
18,r/Anabolicminds,Annabolicminds is now live on Reddit,5_8rf22c,1,2023-07-04 23:08:03.664035,14,minutes
20,r/kiwihockwypgh,ice and dek hockey vids,5_8rf1vb,1,2023-07-04 23:08:03.670706,15,minutes
28,r/LabelBars,Labels you found that could be a rap bar shoul...,5_8rf108,1,2023-07-04 23:08:06.000500,19,minutes
31,r/rcwchatsworthga,welcome to the Renegade Championship Wrestling...,5_8rf02e,1,2023-07-04 23:08:06.001800,23,minutes
35,r/ReckoningEFed,Home of Reckoning Efed.,5_8rez0a,1,2023-07-04 23:08:06.003417,28,minutes
36,r/SWASD2,New official subreddit for the South Williamsp...,5_8reyyg,2,2023-07-04 23:08:06.003801,28,minutes
43,r/hegetsuz,HeGetsUz - submit your memes for HeGetsUz.com,5_8rexwb,1,2023-07-04 23:08:06.006737,32,minutes
50,r/Taliyaandgustavo_Pr,wellcome to https://newporntv.me/,5_8rewtt,2,2023-07-04 23:08:08.491291,37,minutes
51,r/DragonPuppetsClub,This community is for the people who love drag...,5_8rewq5,1,2023-07-04 23:08:08.491794,38,minutes
53,r/TickleLand,tigglemytoes community,5_8rew5j,1,2023-07-04 23:08:08.492851,40,minutes


In [588]:
# How often each age unit (the word after the number in
# "a community for <n> <unit>") occurs in the collected rows.
core_df["age_word"].value_counts()

age_word
minutes    75
Name: count, dtype: int64

## Get next page

In [579]:
from time import sleep

# Dry run of a paced loop: print a message for pages 1-3 and pause one second
# after each; no requests are made here.
for i in range(3):
    print("Fetching page " + str(i + 1))
    sleep(1)

Fetching page 1
Fetching page 2
Fetching page 3


In [580]:
# Check the URL built for a later page (count=100, cursor "5_8r8ktl").
get_next_url(100, "5_8r8ktl")
# target URL shape, for comparison:
# https://www.reddit.com/subreddits/new?count=25&after=t5_8r8ktl

'https://www.reddit.com/subreddits/new?count=100&after=t5_8r8ktl'

In [593]:
# page_id of the row(s) whose dt_retrieved equals the latest timestamp in
# core_df.
core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"]

74    5_8resct
Name: page_id, dtype: object

In [590]:
# Cursor for the next request: the page_id of the last row added to core_df.
# Rows are appended in listing order, so the last row is the most recently
# parsed one. Taking it by position (instead of matching dt_retrieved.max())
# stays correct if two rows share a timestamp, and `> 0` also covers a frame
# that holds exactly one row.
last_page_id = core_df["page_id"].iloc[-1] if len(core_df) > 0 else None
last_page_id

'5_8rescs'

In [591]:
# setup
core_df = pd.DataFrame()
result_limit = 25

# fetch each page
# if last resultset returned less than the limit, stop fetching
for page_nr in range(3): # swap to while loop after
    # Cursor = page_id of the last row collected so far (None on the first
    # pass). Taken by position so tied timestamps cannot pick the wrong row.
    last_page_id = core_df["page_id"].iloc[-1] if len(core_df) > 0 else None
    print(f"last page id => {last_page_id}")
    html = fetch_next_page(page_nr, result_limit, last_page_id)
    sleep(2)

    # fetch_next_page returns None when the request failed; stop instead of
    # handing None to the parser.
    if html is None:
        print(f"page #{page_nr}: fetch failed, stopping")
        break

    soup = BeautifulSoup(html, "lxml")
    print(soup.title)

    # parse each page, add to df
    r_els = soup.select("#siteTable > div.subreddit")
    core_df = parse_and_add_to_df(r_els, core_df)
    # A short page means the listing is exhausted.
    if len(r_els) < result_limit:
        print(len(r_els))
        break

last page id => None
0
None
https://www.reddit.com/subreddits/new
page: 0
Fetching https://www.reddit.com/subreddits/new
<title>subreddits</title>
start count: 0
final core_df len: 25
last page id => 5_8rf1a4
25
5_8rf1a4
page: 1
Fetching https://www.reddit.com/subreddits/new?count=25&after=t5_8rf1a4
<title>subreddits</title>
start count: 25
final core_df len: 50
last page id => 5_8rewyo
50
5_8rewyo
page: 2
Fetching https://www.reddit.com/subreddits/new?count=50&after=t5_8rewyo
<title>subreddits</title>
start count: 50
final core_df len: 75


In [592]:
# Sanity check on pagination: count how often each page_id occurs. A count
# above 1 would mean the same listing entries were collected more than once.
core_df["page_id"].value_counts()
# FIXME: last_page_id appeared not to change between requests — investigate.

page_id
5_8rf5od    1
5_8revg0    1
5_8rew5j    1
5_8rew9p    1
5_8rewq5    1
           ..
5_8rf1a4    1
5_8rf1dq    1
5_8rf1tg    1
5_8rf1vb    1
5_8resct    1
Name: count, Length: 75, dtype: int64

In [595]:
# records that have a description (rows whose desc is not missing)
core_df.loc[core_df["desc"].notna()]

Unnamed: 0,name,desc,page_id,num_subscribers,dt_retrieved,age_num,age_word
19,r/Anabolicminds,Annabolicminds is now live on Reddit,5_8rf22c,1,2023-07-04 23:09:53.816635,16,minutes
21,r/kiwihockwypgh,ice and dek hockey vids,5_8rf1vb,1,2023-07-04 23:09:53.817436,17,minutes
29,r/LabelBars,Labels you found that could be a rap bar shoul...,5_8rf108,1,2023-07-04 23:09:56.189381,21,minutes
32,r/rcwchatsworthga,welcome to the Renegade Championship Wrestling...,5_8rf02e,1,2023-07-04 23:09:56.190518,25,minutes
36,r/ReckoningEFed,Home of Reckoning Efed.,5_8rez0a,1,2023-07-04 23:09:56.192022,29,minutes
37,r/SWASD2,New official subreddit for the South Williamsp...,5_8reyyg,2,2023-07-04 23:09:56.192373,30,minutes
44,r/hegetsuz,HeGetsUz - submit your memes for HeGetsUz.com,5_8rexwb,1,2023-07-04 23:09:56.194912,34,minutes
51,r/Taliyaandgustavo_Pr,wellcome to https://newporntv.me/,5_8rewtt,2,2023-07-04 23:09:58.701896,39,minutes
52,r/DragonPuppetsClub,This community is for the people who love drag...,5_8rewq5,1,2023-07-04 23:09:58.702402,40,minutes
54,r/TickleLand,tigglemytoes community,5_8rew5j,1,2023-07-04 23:09:58.703250,42,minutes


## Caching

In [604]:
from pathlib import Path
pages_dir = "raw-pages/"

# setup
core_df = pd.DataFrame()
result_limit = 25

# Make sure the cache directory exists before anything is written into it;
# without this the first run fails when the log file is created.
Path(pages_dir).mkdir(parents=True, exist_ok=True)

# create or get fetch log
log = Path(pages_dir + "log.csv")

if log.exists():
    print("log exists")
else:
    # Header uses plain commas, matching the rows appended by the fetch loop
    # (a ", " separator would give the column names a leading space).
    with open(log, "w") as log_file:
        log_file.write("fetch_dt,url,page_nr\n")

log exists


In [605]:
# fetch each page
# if last resultset returned less than the limit (result_limit), stop fetching
for page_nr in range(3): # swap to while loop after
    dest = Path(pages_dir + str(page_nr) + ".html")

    if dest.exists(): # load it from file
        print(f"Already have {dest}, loading!")
        with open(dest, "r") as cached_file:
            page_html = cached_file.read()

    else: # fetch it!
        # Cursor = page_id of the last row collected so far (None when
        # nothing has been parsed yet), taken by position.
        last_page_id = core_df["page_id"].iloc[-1] if len(core_df) > 0 else None
        print(f"page #{page_nr}: last page id => {last_page_id}")
        page_html = fetch_next_page(page_nr, result_limit, last_page_id)

        # fetch_next_page returns None when the request failed. Stop before
        # opening dest for writing, otherwise an empty cache file is left
        # behind and silently reused on the next run.
        if page_html is None:
            print(f"page #{page_nr}: fetch failed, stopping")
            break

        # save to file
        with open(dest, "w") as f:
            f.write(page_html)

        with open(log, "a") as log_file:
            fetch_dt = datetime.now()
            url = get_next_url(page_nr * result_limit, last_page_id)
            log_file.write(",".join([str(fetch_dt), url, str(page_nr)]) + "\n")

        # Pause only after a real network request; cached pages need no delay.
        sleep(2)

    soup = BeautifulSoup(page_html, "lxml")
    print(soup.title)

    # parse each page, add to df
    r_els = soup.select("#siteTable > div.subreddit")
    core_df = parse_and_add_to_df(r_els, core_df)
    # A short page means the listing is exhausted.
    if len(r_els) < result_limit:
        print(len(r_els))
        break

Already have raw-pages/0.html, loading!
<title>subreddits</title>
start count: 0
final core_df len: 25
Already have raw-pages/1.html, loading!
<title>subreddits</title>
start count: 25
final core_df len: 50
Already have raw-pages/2.html, loading!
<title>subreddits</title>
start count: 50
final core_df len: 75


In [606]:
# Latest dt_retrieved timestamp among the collected rows.
core_df["dt_retrieved"].max()

Timestamp('2023-07-04 23:39:37.885966')

In [607]:
# page_id of the row(s) whose dt_retrieved equals that latest timestamp.
core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"]

74    5_8reztl
Name: page_id, dtype: object