In [147]:
import lxml.html
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from pathlib import Path

## Setup

In [122]:
ident = (
    "Stephanie Andrews (jellomoat@gmail.com), " + 
    "scraping for educational purposes"
)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}

## Fetching data

In [155]:
# https://www.reddit.com/subreddits/new/?count=25&after=t5_8rmj0v


def get_next_url(count, fetch_limit=100, after_id=None):
    base_url = f"https://www.reddit.com/subreddits/new/?limit={fetch_limit}&show=all"

    # handle if first page
    if not after_id:
        print(base_url)
        return base_url
    else:
        return f"{base_url}&count={count}&after=t{after_id}&show=all"

def fetch_next_page(page_nr, count, fetch_limit=100, after_id=None):
    url = get_next_url(count, fetch_limit, after_id)
    print(f"page: {page_nr}")
    print(f"Fetching {url}")
    try:
        return requests.get(
            url,
            headers=headers
        )
    except:
        print("No more pages!")

In [181]:
# page_nr = 0
# html = fetch_next_page(page_nr)


def fetch_pages(start_pg, end_pg, core_df):
    # setup
    pages_dir = "raw-pages/"
    result_limit = 100
    results_count = 0

    # create or get fetch log
    log = Path(pages_dir + "log.csv")

    if log.exists():
        print("log exists")
    else:
        with open(log, "w") as l:
            print("test")
            l.write("fetch_dt, url, page_nr\n")

    # fetch each page
    # if last resultset returned less than the limit, stop fetching
    for page_nr in range(start_pg, end_pg): # swap to while loop after
        dest = Path(pages_dir + str(page_nr) + ".html")

        if dest.exists(): # load it from file
            print(f"Already have {dest}, loading!")
            file = open(dest, "r")
            page_html = file.read()
            file.close()

        else: # fetch it!
            last_page_id = core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"].values[0] \
                if (len(core_df) > 1) else None
            print(f"page #{page_nr}: last page id => {last_page_id}")
            page_resp = fetch_next_page(page_nr, results_count, fetch_limit=100, after_id=last_page_id)
            page_html = page_resp.text
            sleep(2)

            # save to file
            with open(dest, "w") as f:
                f.write(page_html)

            with open(log, "a") as l:
                fetch_dt = datetime.now()
                url = page_resp.url
                l.write(",".join([str(fetch_dt), url, str(page_nr)]) + "\n")

        # parse each page, add to df
        soup = BeautifulSoup(page_html, "lxml")
        print(soup.title)

        r_els = soup.select("#siteTable > div.subreddit")
        core_df = parse_and_add_to_df(r_els, core_df, page_nr)
        last_count = len(r_els)
        print(f"last_count: {last_count}")
        results_count += last_count
        print(f"results_count: {results_count}")
        print(f"df count: {len(core_df)}")
        if len(r_els) < 10:
            print(len(r_els))
            break
    return core_df

core_df = pd.DataFrame()
core_df = fetch_pages(0, 500, core_df)

test
page #0: last page id => None
https://www.reddit.com/subreddits/new/?limit=100&show=all
page: 0
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all
<title>subreddits</title>
start count: 0
final core_df len: 88
last_count: 88
results_count: 88
df count: 88
page #1: last page id => 5_8rsnhi
page: 1
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=88&after=t5_8rsnhi&show=all
<title>subreddits</title>
start count: 88
final core_df len: 179
last_count: 91
results_count: 179
df count: 179
page #2: last page id => 5_8rsd08
page: 2
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=179&after=t5_8rsd08&show=all
<title>subreddits</title>
start count: 179
final core_df len: 266
last_count: 87
results_count: 266
df count: 266
page #3: last page id => 5_8rs2ew
page: 3
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=266&after=t5_8rs2ew&show=all
<title>subreddits</title>
start count: 266
final core_df len: 352

<title>subreddits</title>
start count: 2621
final core_df len: 2721
last_count: 100
results_count: 2721
df count: 2721
page #32: last page id => 5_8rhygl
page: 32
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=2721&after=t5_8rhygl&show=all
<title>subreddits</title>
start count: 2721
final core_df len: 2821
last_count: 100
results_count: 2821
df count: 2821
page #33: last page id => 5_8rhi3o
page: 33
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=2821&after=t5_8rhi3o&show=all
<title>subreddits</title>
start count: 2821
final core_df len: 2921
last_count: 100
results_count: 2921
df count: 2921
page #34: last page id => 5_8rh02i
page: 34
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=2921&after=t5_8rh02i&show=all
<title>subreddits</title>
start count: 2921
final core_df len: 3021
last_count: 100
results_count: 3021
df count: 3021
page #35: last page id => 5_8rgmf6
page: 35
Fetching https://www.reddit.com/subredd

<title>subreddits</title>
start count: 5721
final core_df len: 5821
last_count: 100
results_count: 5821
df count: 5821
page #63: last page id => 5_8r3mvb
page: 63
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=5821&after=t5_8r3mvb&show=all
<title>subreddits</title>
start count: 5821
final core_df len: 5921
last_count: 100
results_count: 5921
df count: 5921
page #64: last page id => 5_8r37ng
page: 64
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=5921&after=t5_8r37ng&show=all
<title>subreddits</title>
start count: 5921
final core_df len: 6021
last_count: 100
results_count: 6021
df count: 6021
page #65: last page id => 5_8r2mj1
page: 65
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6021&after=t5_8r2mj1&show=all
<title>subreddits</title>
start count: 6021
final core_df len: 6121
last_count: 100
results_count: 6121
df count: 6121
page #66: last page id => 5_8r2696
page: 66
Fetching https://www.reddit.com/subredd

<title>subreddits</title>
start count: 8821
final core_df len: 8921
last_count: 100
results_count: 8921
df count: 8921
page #94: last page id => 5_8qpvbl
page: 94
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=8921&after=t5_8qpvbl&show=all
<title>subreddits</title>
start count: 8921
final core_df len: 9021
last_count: 100
results_count: 9021
df count: 9021
page #95: last page id => 5_8qpc5x
page: 95
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9021&after=t5_8qpc5x&show=all
<title>subreddits</title>
start count: 9021
final core_df len: 9121
last_count: 100
results_count: 9121
df count: 9121
page #96: last page id => 5_8qoua4
page: 96
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9121&after=t5_8qoua4&show=all
<title>subreddits</title>
start count: 9121
final core_df len: 9221
last_count: 100
results_count: 9221
df count: 9221
page #97: last page id => 5_8qofn8
page: 97
Fetching https://www.reddit.com/subredd

<title>subreddits</title>
start count: 11921
final core_df len: 12021
last_count: 100
results_count: 12021
df count: 12021
page #125: last page id => 5_8qcbts
page: 125
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12021&after=t5_8qcbts&show=all
<title>subreddits</title>
start count: 12021
final core_df len: 12121
last_count: 100
results_count: 12121
df count: 12121
page #126: last page id => 5_8qbqty
page: 126
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12121&after=t5_8qbqty&show=all
<title>subreddits</title>
start count: 12121
final core_df len: 12221
last_count: 100
results_count: 12221
df count: 12221
page #127: last page id => 5_8qbbcc
page: 127
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12221&after=t5_8qbbcc&show=all
<title>subreddits</title>
start count: 12221
final core_df len: 12321
last_count: 100
results_count: 12321
df count: 12321
page #128: last page id => 5_8qarm7
page: 128
Fetching htt

<title>subreddits</title>
start count: 15021
final core_df len: 15121
last_count: 100
results_count: 15121
df count: 15121
page #156: last page id => 5_8pyiel
page: 156
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15121&after=t5_8pyiel&show=all
<title>subreddits</title>
start count: 15121
final core_df len: 15221
last_count: 100
results_count: 15221
df count: 15221
page #157: last page id => 5_8py88z
page: 157
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15221&after=t5_8py88z&show=all
<title>subreddits</title>
start count: 15221
final core_df len: 15321
last_count: 100
results_count: 15321
df count: 15321
page #158: last page id => 5_8pxtjm
page: 158
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15321&after=t5_8pxtjm&show=all
<title>subreddits</title>
start count: 15321
final core_df len: 15421
last_count: 100
results_count: 15421
df count: 15421
page #159: last page id => 5_8pxdo3
page: 159
Fetching htt

<title>subreddits</title>
start count: 18121
final core_df len: 18221
last_count: 100
results_count: 18221
df count: 18221
page #187: last page id => 5_8pkq91
page: 187
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18221&after=t5_8pkq91&show=all
<title>subreddits</title>
start count: 18221
final core_df len: 18321
last_count: 100
results_count: 18321
df count: 18321
page #188: last page id => 5_8pk669
page: 188
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18321&after=t5_8pk669&show=all
<title>subreddits</title>
start count: 18321
final core_df len: 18421
last_count: 100
results_count: 18421
df count: 18421
page #189: last page id => 5_8pjmbx
page: 189
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18421&after=t5_8pjmbx&show=all
<title>subreddits</title>
start count: 18421
final core_df len: 18521
last_count: 100
results_count: 18521
df count: 18521
page #190: last page id => 5_8pj99o
page: 190
Fetching htt

<title>subreddits</title>
start count: 21221
final core_df len: 21321
last_count: 100
results_count: 21321
df count: 21321
page #218: last page id => 5_8p6rdf
page: 218
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21321&after=t5_8p6rdf&show=all
<title>subreddits</title>
start count: 21321
final core_df len: 21421
last_count: 100
results_count: 21421
df count: 21421
page #219: last page id => 5_8p6ei6
page: 219
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21421&after=t5_8p6ei6&show=all
<title>subreddits</title>
start count: 21421
final core_df len: 21521
last_count: 100
results_count: 21521
df count: 21521
page #220: last page id => 5_8p5zft
page: 220
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21521&after=t5_8p5zft&show=all
<title>subreddits</title>
start count: 21521
final core_df len: 21621
last_count: 100
results_count: 21621
df count: 21621
page #221: last page id => 5_8p5lxa
page: 221
Fetching htt

<title>subreddits</title>
start count: 24321
final core_df len: 24421
last_count: 100
results_count: 24421
df count: 24421
page #249: last page id => 5_8oskr8
page: 249
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24421&after=t5_8oskr8&show=all
<title>subreddits</title>
start count: 24421
final core_df len: 24521
last_count: 100
results_count: 24521
df count: 24521
page #250: last page id => 5_8os2hf
page: 250
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24521&after=t5_8os2hf&show=all
<title>subreddits</title>
start count: 24521
final core_df len: 24621
last_count: 100
results_count: 24621
df count: 24621
page #251: last page id => 5_8orlya
page: 251
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24621&after=t5_8orlya&show=all
<title>subreddits</title>
start count: 24621
final core_df len: 24721
last_count: 100
results_count: 24721
df count: 24721
page #252: last page id => 5_8or53m
page: 252
Fetching htt

<title>subreddits</title>
start count: 27421
final core_df len: 27521
last_count: 100
results_count: 27521
df count: 27521
page #280: last page id => 5_8oerwu
page: 280
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=27521&after=t5_8oerwu&show=all
<title>subreddits</title>
start count: 27521
final core_df len: 27621
last_count: 100
results_count: 27621
df count: 27621
page #281: last page id => 5_8oedg2
page: 281
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=27621&after=t5_8oedg2&show=all
<title>subreddits</title>
start count: 27621
final core_df len: 27721
last_count: 100
results_count: 27721
df count: 27721
page #282: last page id => 5_8oe174
page: 282
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=27721&after=t5_8oe174&show=all
<title>subreddits</title>
start count: 27721
final core_df len: 27821
last_count: 100
results_count: 27821
df count: 27821
page #283: last page id => 5_8odk2a
page: 283
Fetching htt

<title>subreddits</title>
start count: 30521
final core_df len: 30621
last_count: 100
results_count: 30621
df count: 30621
page #311: last page id => 5_8nzimz
page: 311
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=30621&after=t5_8nzimz&show=all
<title>subreddits</title>
start count: 30621
final core_df len: 30721
last_count: 100
results_count: 30721
df count: 30721
page #312: last page id => 5_8nyxi9
page: 312
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=30721&after=t5_8nyxi9&show=all
<title>subreddits</title>
start count: 30721
final core_df len: 30821
last_count: 100
results_count: 30821
df count: 30821
page #313: last page id => 5_8nydlh
page: 313
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=30821&after=t5_8nydlh&show=all
<title>subreddits</title>
start count: 30821
final core_df len: 30921
last_count: 100
results_count: 30921
df count: 30921
page #314: last page id => 5_8nxtui
page: 314
Fetching htt

<title>subreddits</title>
start count: 33621
final core_df len: 33721
last_count: 100
results_count: 33721
df count: 33721
page #342: last page id => 5_8ngxht
page: 342
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=33721&after=t5_8ngxht&show=all
<title>subreddits</title>
start count: 33721
final core_df len: 33821
last_count: 100
results_count: 33821
df count: 33821
page #343: last page id => 5_8ngmj3
page: 343
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=33821&after=t5_8ngmj3&show=all
<title>subreddits</title>
start count: 33821
final core_df len: 33921
last_count: 100
results_count: 33921
df count: 33921
page #344: last page id => 5_8ng5ux
page: 344
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=33921&after=t5_8ng5ux&show=all
<title>subreddits</title>
start count: 33921
final core_df len: 34021
last_count: 100
results_count: 34021
df count: 34021
page #345: last page id => 5_8nfp5y
page: 345
Fetching htt

<title>subreddits</title>
start count: 36721
final core_df len: 36821
last_count: 100
results_count: 36821
df count: 36821
page #373: last page id => 5_8n2q3c
page: 373
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=36821&after=t5_8n2q3c&show=all
<title>subreddits</title>
start count: 36821
final core_df len: 36921
last_count: 100
results_count: 36921
df count: 36921
page #374: last page id => 5_8n2b1j
page: 374
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=36921&after=t5_8n2b1j&show=all
<title>subreddits</title>
start count: 36921
final core_df len: 37021
last_count: 100
results_count: 37021
df count: 37021
page #375: last page id => 5_8n1tny
page: 375
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=37021&after=t5_8n1tny&show=all
<title>subreddits</title>
start count: 37021
final core_df len: 37121
last_count: 100
results_count: 37121
df count: 37121
page #376: last page id => 5_8n1cv6
page: 376
Fetching htt

<title>subreddits</title>
start count: 39821
final core_df len: 39921
last_count: 100
results_count: 39921
df count: 39921
page #404: last page id => 5_8moulg
page: 404
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=39921&after=t5_8moulg&show=all
<title>subreddits</title>
start count: 39921
final core_df len: 40021
last_count: 100
results_count: 40021
df count: 40021
page #405: last page id => 5_8moe91
page: 405
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=40021&after=t5_8moe91&show=all
<title>subreddits</title>
start count: 40021
final core_df len: 40121
last_count: 100
results_count: 40121
df count: 40121
page #406: last page id => 5_8mo18v
page: 406
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=40121&after=t5_8mo18v&show=all
<title>subreddits</title>
start count: 40121
final core_df len: 40221
last_count: 100
results_count: 40221
df count: 40221
page #407: last page id => 5_8mnol1
page: 407
Fetching htt

<title>subreddits</title>
start count: 42921
final core_df len: 43021
last_count: 100
results_count: 43021
df count: 43021
page #435: last page id => 5_8md1bp
page: 435
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=43021&after=t5_8md1bp&show=all
<title>subreddits</title>
start count: 43021
final core_df len: 43121
last_count: 100
results_count: 43121
df count: 43121
page #436: last page id => 5_8mcr74
page: 436
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=43121&after=t5_8mcr74&show=all
<title>subreddits</title>
start count: 43121
final core_df len: 43221
last_count: 100
results_count: 43221
df count: 43221
page #437: last page id => 5_8mcgbl
page: 437
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=43221&after=t5_8mcgbl&show=all
<title>subreddits</title>
start count: 43221
final core_df len: 43321
last_count: 100
results_count: 43321
df count: 43321
page #438: last page id => 5_8mc0bg
page: 438
Fetching htt

<title>subreddits</title>
start count: 46021
final core_df len: 46121
last_count: 100
results_count: 46121
df count: 46121
page #466: last page id => 5_8lz9fj
page: 466
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=46121&after=t5_8lz9fj&show=all
<title>subreddits</title>
start count: 46121
final core_df len: 46221
last_count: 100
results_count: 46221
df count: 46221
page #467: last page id => 5_8lyu5q
page: 467
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=46221&after=t5_8lyu5q&show=all
<title>subreddits</title>
start count: 46221
final core_df len: 46321
last_count: 100
results_count: 46321
df count: 46321
page #468: last page id => 5_8lyfwj
page: 468
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=46321&after=t5_8lyfwj&show=all
<title>subreddits</title>
start count: 46321
final core_df len: 46421
last_count: 100
results_count: 46421
df count: 46421
page #469: last page id => 5_8ly1zc
page: 469
Fetching htt

<title>subreddits</title>
start count: 49121
final core_df len: 49221
last_count: 100
results_count: 49221
df count: 49221
page #497: last page id => 5_8lomb0
page: 497
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=49221&after=t5_8lomb0&show=all
<title>subreddits</title>
start count: 49221
final core_df len: 49321
last_count: 100
results_count: 49321
df count: 49321
page #498: last page id => 5_8lo9vx
page: 498
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=49321&after=t5_8lo9vx&show=all
<title>subreddits</title>
start count: 49321
final core_df len: 49421
last_count: 100
results_count: 49421
df count: 49421
page #499: last page id => 5_8lnvo1
page: 499
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=49421&after=t5_8lnvo1&show=all
<title>subreddits</title>
start count: 49421
final core_df len: 49521
last_count: 100
results_count: 49521
df count: 49521


In [182]:
core_df = fetch_pages(500, 1000, core_df)

log exists
page #500: last page id => 5_8lnk0m
page: 500
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=0&after=t5_8lnk0m&show=all
<title>subreddits</title>
start count: 49521
final core_df len: 49621
last_count: 100
results_count: 100
df count: 49621
page #501: last page id => 5_8ln6so
page: 501
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=100&after=t5_8ln6so&show=all
<title>subreddits</title>
start count: 49621
final core_df len: 49721
last_count: 100
results_count: 200
df count: 49721
page #502: last page id => 5_8lmu40
page: 502
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=200&after=t5_8lmu40&show=all
<title>subreddits</title>
start count: 49721
final core_df len: 49821
last_count: 100
results_count: 300
df count: 49821
page #503: last page id => 5_8lmgo3
page: 503
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=300&after=t5_8lmgo3&show=all
<title>subreddits</title>
start coun

<title>subreddits</title>
start count: 52521
final core_df len: 52621
last_count: 100
results_count: 3100
df count: 52621
page #531: last page id => 5_8lbgtq
page: 531
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3100&after=t5_8lbgtq&show=all
<title>subreddits</title>
start count: 52621
final core_df len: 52721
last_count: 100
results_count: 3200
df count: 52721
page #532: last page id => 5_8lb3uj
page: 532
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3200&after=t5_8lb3uj&show=all
<title>subreddits</title>
start count: 52721
final core_df len: 52821
last_count: 100
results_count: 3300
df count: 52821
page #533: last page id => 5_8laq9j
page: 533
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3300&after=t5_8laq9j&show=all
<title>subreddits</title>
start count: 52821
final core_df len: 52921
last_count: 100
results_count: 3400
df count: 52921
page #534: last page id => 5_8laeck
page: 534
Fetching https://ww

<title>subreddits</title>
start count: 55621
final core_df len: 55721
last_count: 100
results_count: 6200
df count: 55721
page #562: last page id => 5_8l0oni
page: 562
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6200&after=t5_8l0oni&show=all
<title>subreddits</title>
start count: 55721
final core_df len: 55821
last_count: 100
results_count: 6300
df count: 55821
page #563: last page id => 5_8l0ef9
page: 563
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6300&after=t5_8l0ef9&show=all
<title>subreddits</title>
start count: 55821
final core_df len: 55921
last_count: 100
results_count: 6400
df count: 55921
page #564: last page id => 5_8l044z
page: 564
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6400&after=t5_8l044z&show=all
<title>subreddits</title>
start count: 55921
final core_df len: 56021
last_count: 100
results_count: 6500
df count: 56021
page #565: last page id => 5_8kzui1
page: 565
Fetching https://ww

<title>subreddits</title>
start count: 58721
final core_df len: 58821
last_count: 100
results_count: 9300
df count: 58821
page #593: last page id => 5_8kosao
page: 593
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9300&after=t5_8kosao&show=all
<title>subreddits</title>
start count: 58821
final core_df len: 58921
last_count: 100
results_count: 9400
df count: 58921
page #594: last page id => 5_8ko5k8
page: 594
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9400&after=t5_8ko5k8&show=all
<title>subreddits</title>
start count: 58921
final core_df len: 59021
last_count: 100
results_count: 9500
df count: 59021
page #595: last page id => 5_8kngp3
page: 595
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9500&after=t5_8kngp3&show=all
<title>subreddits</title>
start count: 59021
final core_df len: 59121
last_count: 100
results_count: 9600
df count: 59121
page #596: last page id => 5_8kmygb
page: 596
Fetching https://ww

<title>subreddits</title>
start count: 61821
final core_df len: 61921
last_count: 100
results_count: 12400
df count: 61921
page #624: last page id => 5_8k7qqy
page: 624
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12400&after=t5_8k7qqy&show=all
<title>subreddits</title>
start count: 61921
final core_df len: 62021
last_count: 100
results_count: 12500
df count: 62021
page #625: last page id => 5_8k79td
page: 625
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12500&after=t5_8k79td&show=all
<title>subreddits</title>
start count: 62021
final core_df len: 62121
last_count: 100
results_count: 12600
df count: 62121
page #626: last page id => 5_8k6ted
page: 626
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12600&after=t5_8k6ted&show=all
<title>subreddits</title>
start count: 62121
final core_df len: 62221
last_count: 100
results_count: 12700
df count: 62221
page #627: last page id => 5_8k6bcz
page: 627
Fetching htt

<title>subreddits</title>
start count: 64921
final core_df len: 65021
last_count: 100
results_count: 15500
df count: 65021
page #655: last page id => 5_8jux9f
page: 655
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15500&after=t5_8jux9f&show=all
<title>subreddits</title>
start count: 65021
final core_df len: 65121
last_count: 100
results_count: 15600
df count: 65121
page #656: last page id => 5_8juhd6
page: 656
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15600&after=t5_8juhd6&show=all
<title>subreddits</title>
start count: 65121
final core_df len: 65221
last_count: 100
results_count: 15700
df count: 65221
page #657: last page id => 5_8ju5jl
page: 657
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15700&after=t5_8ju5jl&show=all
<title>subreddits</title>
start count: 65221
final core_df len: 65321
last_count: 100
results_count: 15800
df count: 65321
page #658: last page id => 5_8jtrp3
page: 658
Fetching htt

<title>subreddits</title>
start count: 68021
final core_df len: 68121
last_count: 100
results_count: 18600
df count: 68121
page #686: last page id => 5_8jgl75
page: 686
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18600&after=t5_8jgl75&show=all
<title>subreddits</title>
start count: 68121
final core_df len: 68221
last_count: 100
results_count: 18700
df count: 68221
page #687: last page id => 5_8jfzoe
page: 687
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18700&after=t5_8jfzoe&show=all
<title>subreddits</title>
start count: 68221
final core_df len: 68321
last_count: 100
results_count: 18800
df count: 68321
page #688: last page id => 5_8jfiff
page: 688
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18800&after=t5_8jfiff&show=all
<title>subreddits</title>
start count: 68321
final core_df len: 68421
last_count: 100
results_count: 18900
df count: 68421
page #689: last page id => 5_8jf25k
page: 689
Fetching htt

<title>subreddits</title>
start count: 71121
final core_df len: 71221
last_count: 100
results_count: 21700
df count: 71221
page #717: last page id => 5_8j2vif
page: 717
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21700&after=t5_8j2vif&show=all
<title>subreddits</title>
start count: 71221
final core_df len: 71321
last_count: 100
results_count: 21800
df count: 71321
page #718: last page id => 5_8j2izr
page: 718
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21800&after=t5_8j2izr&show=all
<title>subreddits</title>
start count: 71321
final core_df len: 71421
last_count: 100
results_count: 21900
df count: 71421
page #719: last page id => 5_8j26lu
page: 719
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21900&after=t5_8j26lu&show=all
<title>subreddits</title>
start count: 71421
final core_df len: 71521
last_count: 100
results_count: 22000
df count: 71521
page #720: last page id => 5_8j1v81
page: 720
Fetching htt

<title>subreddits</title>
start count: 74221
final core_df len: 74321
last_count: 100
results_count: 24800
df count: 74321
page #748: last page id => 5_8ir0b3
page: 748
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24800&after=t5_8ir0b3&show=all
<title>subreddits</title>
start count: 74321
final core_df len: 74421
last_count: 100
results_count: 24900
df count: 74421
page #749: last page id => 5_8iql8i
page: 749
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24900&after=t5_8iql8i&show=all
<title>subreddits</title>
start count: 74421
final core_df len: 74521
last_count: 100
results_count: 25000
df count: 74521
page #750: last page id => 5_8iq5dm
page: 750
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=25000&after=t5_8iq5dm&show=all
<title>subreddits</title>
start count: 74521
final core_df len: 74621
last_count: 100
results_count: 25100
df count: 74621
page #751: last page id => 5_8ipo1m
page: 751
Fetching htt

<title>subreddits</title>
start count: 77321
final core_df len: 77421
last_count: 100
results_count: 27900
df count: 77421
page #779: last page id => 5_8ieefr
page: 779
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=27900&after=t5_8ieefr&show=all
<title>subreddits</title>
start count: 77421
final core_df len: 77521
last_count: 100
results_count: 28000
df count: 77521
page #780: last page id => 5_8idyyz
page: 780
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=28000&after=t5_8idyyz&show=all
<title>subreddits</title>
start count: 77521
final core_df len: 77621
last_count: 100
results_count: 28100
df count: 77621
page #781: last page id => 5_8idmdy
page: 781
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=28100&after=t5_8idmdy&show=all
<title>subreddits</title>
start count: 77621
final core_df len: 77721
last_count: 100
results_count: 28200
df count: 77721
page #782: last page id => 5_8idaum
page: 782
Fetching htt

<title>subreddits</title>
start count: 80421
final core_df len: 80521
last_count: 100
results_count: 31000
df count: 80521
page #810: last page id => 5_8i25ai
page: 810
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=31000&after=t5_8i25ai&show=all
<title>subreddits</title>
start count: 80521
final core_df len: 80621
last_count: 100
results_count: 31100
df count: 80621
page #811: last page id => 5_8i1oon
page: 811
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=31100&after=t5_8i1oon&show=all
<title>subreddits</title>
start count: 80621
final core_df len: 80721
last_count: 100
results_count: 31200
df count: 80721
page #812: last page id => 5_8i12u5
page: 812
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=31200&after=t5_8i12u5&show=all
<title>subreddits</title>
start count: 80721
final core_df len: 80821
last_count: 100
results_count: 31300
df count: 80821
page #813: last page id => 5_8i0ixk
page: 813
Fetching htt

<title>subreddits</title>
start count: 83521
final core_df len: 83621
last_count: 100
results_count: 34100
df count: 83621
page #841: last page id => 5_8hp2y6
page: 841
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=34100&after=t5_8hp2y6&show=all
<title>subreddits</title>
start count: 83621
final core_df len: 83721
last_count: 100
results_count: 34200
df count: 83721
page #842: last page id => 5_8hoppn
page: 842
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=34200&after=t5_8hoppn&show=all
<title>subreddits</title>
start count: 83721
final core_df len: 83821
last_count: 100
results_count: 34300
df count: 83821
page #843: last page id => 5_8hodxo
page: 843
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=34300&after=t5_8hodxo&show=all
<title>subreddits</title>
start count: 83821
final core_df len: 83921
last_count: 100
results_count: 34400
df count: 83921
page #844: last page id => 5_8ho3go
page: 844
Fetching htt

<title>subreddits</title>
start count: 86621
final core_df len: 86721
last_count: 100
results_count: 37200
df count: 86721
page #872: last page id => 5_8hcvtz
page: 872
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=37200&after=t5_8hcvtz&show=all
<title>subreddits</title>
start count: 86721
final core_df len: 86821
last_count: 100
results_count: 37300
df count: 86821
page #873: last page id => 5_8hce4o
page: 873
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=37300&after=t5_8hce4o&show=all
<title>subreddits</title>
start count: 86821
final core_df len: 86921
last_count: 100
results_count: 37400
df count: 86921
page #874: last page id => 5_8hc1zx
page: 874
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=37400&after=t5_8hc1zx&show=all
<title>subreddits</title>
start count: 86921
final core_df len: 87021
last_count: 100
results_count: 37500
df count: 87021
page #875: last page id => 5_8hblfs
page: 875
Fetching htt

<title>subreddits</title>
start count: 89721
final core_df len: 89821
last_count: 100
results_count: 40300
df count: 89821
page #903: last page id => 5_8h0437
page: 903
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=40300&after=t5_8h0437&show=all
<title>subreddits</title>
start count: 89821
final core_df len: 89921
last_count: 100
results_count: 40400
df count: 89921
page #904: last page id => 5_8gzru5
page: 904
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=40400&after=t5_8gzru5&show=all
<title>subreddits</title>
start count: 89921
final core_df len: 90021
last_count: 100
results_count: 40500
df count: 90021
page #905: last page id => 5_8gzdm2
page: 905
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=40500&after=t5_8gzdm2&show=all
<title>subreddits</title>
start count: 90021
final core_df len: 90121
last_count: 100
results_count: 40600
df count: 90121
page #906: last page id => 5_8gz09s
page: 906
Fetching htt

<title>subreddits</title>
start count: 92821
final core_df len: 92921
last_count: 100
results_count: 43400
df count: 92921
page #934: last page id => 5_8gozff
page: 934
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=43400&after=t5_8gozff&show=all
<title>subreddits</title>
start count: 92921
final core_df len: 93021
last_count: 100
results_count: 43500
df count: 93021
page #935: last page id => 5_8gonr4
page: 935
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=43500&after=t5_8gonr4&show=all
<title>subreddits</title>
start count: 93021
final core_df len: 93121
last_count: 100
results_count: 43600
df count: 93121
page #936: last page id => 5_8gobtl
page: 936
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=43600&after=t5_8gobtl&show=all
<title>subreddits</title>
start count: 93121
final core_df len: 93221
last_count: 100
results_count: 43700
df count: 93221
page #937: last page id => 5_8gnzdi
page: 937
Fetching htt

<title>subreddits</title>
start count: 95921
final core_df len: 96021
last_count: 100
results_count: 46500
df count: 96021
page #965: last page id => 5_8gcwmw
page: 965
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=46500&after=t5_8gcwmw&show=all
<title>subreddits</title>
start count: 96021
final core_df len: 96121
last_count: 100
results_count: 46600
df count: 96121
page #966: last page id => 5_8gcicw
page: 966
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=46600&after=t5_8gcicw&show=all
<title>subreddits</title>
start count: 96121
final core_df len: 96221
last_count: 100
results_count: 46700
df count: 96221
page #967: last page id => 5_8gc4m4
page: 967
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=46700&after=t5_8gc4m4&show=all
<title>subreddits</title>
start count: 96221
final core_df len: 96321
last_count: 100
results_count: 46800
df count: 96321
page #968: last page id => 5_8gbslp
page: 968
Fetching htt

<title>subreddits</title>
start count: 99021
final core_df len: 99121
last_count: 100
results_count: 49600
df count: 99121
page #996: last page id => 5_8fxux2
page: 996
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=49600&after=t5_8fxux2&show=all
<title>subreddits</title>
start count: 99121
final core_df len: 99221
last_count: 100
results_count: 49700
df count: 99221
page #997: last page id => 5_8fxhcs
page: 997
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=49700&after=t5_8fxhcs&show=all
<title>subreddits</title>
start count: 99221
final core_df len: 99321
last_count: 100
results_count: 49800
df count: 99321
page #998: last page id => 5_8fx3iv
page: 998
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=49800&after=t5_8fx3iv&show=all
<title>subreddits</title>
start count: 99321
final core_df len: 99421
last_count: 100
results_count: 49900
df count: 99421
page #999: last page id => 5_8fwpcy
page: 999
Fetching htt

In [184]:
core_df = fetch_pages(1000, 1250, core_df)

log exists
page #1000: last page id => 5_8fwdhu
page: 1000
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=0&after=t5_8fwdhu&show=all
<title>subreddits</title>
start count: 99521
final core_df len: 99621
last_count: 100
results_count: 100
df count: 99621
page #1001: last page id => 5_8fw0wv
page: 1001
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=100&after=t5_8fw0wv&show=all
<title>subreddits</title>
start count: 99621
final core_df len: 99721
last_count: 100
results_count: 200
df count: 99721
page #1002: last page id => 5_8fvn1r
page: 1002
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=200&after=t5_8fvn1r&show=all
<title>subreddits</title>
start count: 99721
final core_df len: 99821
last_count: 100
results_count: 300
df count: 99821
page #1003: last page id => 5_8fv612
page: 1003
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=300&after=t5_8fv612&show=all
<title>subreddits</title>
st

<title>subreddits</title>
start count: 102521
final core_df len: 102621
last_count: 100
results_count: 3100
df count: 102621
page #1031: last page id => 5_8fjlof
page: 1031
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3100&after=t5_8fjlof&show=all
<title>subreddits</title>
start count: 102621
final core_df len: 102721
last_count: 100
results_count: 3200
df count: 102721
page #1032: last page id => 5_8fj99q
page: 1032
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3200&after=t5_8fj99q&show=all
<title>subreddits</title>
start count: 102721
final core_df len: 102821
last_count: 100
results_count: 3300
df count: 102821
page #1033: last page id => 5_8fivur
page: 1033
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3300&after=t5_8fivur&show=all
<title>subreddits</title>
start count: 102821
final core_df len: 102921
last_count: 100
results_count: 3400
df count: 102921
page #1034: last page id => 5_8fihz9
page: 1034

<title>subreddits</title>
start count: 105521
final core_df len: 105621
last_count: 100
results_count: 6100
df count: 105621
page #1061: last page id => 5_8f8ch3
page: 1061
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6100&after=t5_8f8ch3&show=all
<title>subreddits</title>
start count: 105621
final core_df len: 105721
last_count: 100
results_count: 6200
df count: 105721
page #1062: last page id => 5_8f7wf8
page: 1062
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6200&after=t5_8f7wf8&show=all
<title>subreddits</title>
start count: 105721
final core_df len: 105821
last_count: 100
results_count: 6300
df count: 105821
page #1063: last page id => 5_8f7izs
page: 1063
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6300&after=t5_8f7izs&show=all
<title>subreddits</title>
start count: 105821
final core_df len: 105921
last_count: 100
results_count: 6400
df count: 105921
page #1064: last page id => 5_8f76ud
page: 1064

<title>subreddits</title>
start count: 108521
final core_df len: 108621
last_count: 100
results_count: 9100
df count: 108621
page #1091: last page id => 5_8exnvg
page: 1091
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9100&after=t5_8exnvg&show=all
<title>subreddits</title>
start count: 108621
final core_df len: 108721
last_count: 100
results_count: 9200
df count: 108721
page #1092: last page id => 5_8ex8h0
page: 1092
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9200&after=t5_8ex8h0&show=all
<title>subreddits</title>
start count: 108721
final core_df len: 108821
last_count: 100
results_count: 9300
df count: 108821
page #1093: last page id => 5_8ewuub
page: 1093
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9300&after=t5_8ewuub&show=all
<title>subreddits</title>
start count: 108821
final core_df len: 108921
last_count: 100
results_count: 9400
df count: 108921
page #1094: last page id => 5_8ewkye
page: 1094

<title>subreddits</title>
start count: 111521
final core_df len: 111621
last_count: 100
results_count: 12100
df count: 111621
page #1121: last page id => 5_8el0v0
page: 1121
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12100&after=t5_8el0v0&show=all
<title>subreddits</title>
start count: 111621
final core_df len: 111721
last_count: 100
results_count: 12200
df count: 111721
page #1122: last page id => 5_8ekny0
page: 1122
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12200&after=t5_8ekny0&show=all
<title>subreddits</title>
start count: 111721
final core_df len: 111821
last_count: 100
results_count: 12300
df count: 111821
page #1123: last page id => 5_8ekbv8
page: 1123
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12300&after=t5_8ekbv8&show=all
<title>subreddits</title>
start count: 111821
final core_df len: 111921
last_count: 100
results_count: 12400
df count: 111921
page #1124: last page id => 5_8ejz8z
pag

<title>subreddits</title>
start count: 114521
final core_df len: 114621
last_count: 100
results_count: 15100
df count: 114621
page #1151: last page id => 5_8e92bc
page: 1151
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15100&after=t5_8e92bc&show=all
<title>subreddits</title>
start count: 114621
final core_df len: 114721
last_count: 100
results_count: 15200
df count: 114721
page #1152: last page id => 5_8e8nxo
page: 1152
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15200&after=t5_8e8nxo&show=all
<title>subreddits</title>
start count: 114721
final core_df len: 114821
last_count: 100
results_count: 15300
df count: 114821
page #1153: last page id => 5_8e88v8
page: 1153
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15300&after=t5_8e88v8&show=all
<title>subreddits</title>
start count: 114821
final core_df len: 114921
last_count: 100
results_count: 15400
df count: 114921
page #1154: last page id => 5_8e7vfc
pag

<title>subreddits</title>
start count: 117521
final core_df len: 117621
last_count: 100
results_count: 18100
df count: 117621
page #1181: last page id => 5_8dxlgt
page: 1181
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18100&after=t5_8dxlgt&show=all
<title>subreddits</title>
start count: 117621
final core_df len: 117721
last_count: 100
results_count: 18200
df count: 117721
page #1182: last page id => 5_8dx68h
page: 1182
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18200&after=t5_8dx68h&show=all
<title>subreddits</title>
start count: 117721
final core_df len: 117821
last_count: 100
results_count: 18300
df count: 117821
page #1183: last page id => 5_8dwpx6
page: 1183
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18300&after=t5_8dwpx6&show=all
<title>subreddits</title>
start count: 117821
final core_df len: 117921
last_count: 100
results_count: 18400
df count: 117921
page #1184: last page id => 5_8dwb0w
pag

<title>subreddits</title>
start count: 120521
final core_df len: 120621
last_count: 100
results_count: 21100
df count: 120621
page #1211: last page id => 5_8dknyy
page: 1211
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21100&after=t5_8dknyy&show=all
<title>subreddits</title>
start count: 120621
final core_df len: 120721
last_count: 100
results_count: 21200
df count: 120721
page #1212: last page id => 5_8dk983
page: 1212
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21200&after=t5_8dk983&show=all
<title>subreddits</title>
start count: 120721
final core_df len: 120821
last_count: 100
results_count: 21300
df count: 120821
page #1213: last page id => 5_8djxik
page: 1213
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21300&after=t5_8djxik&show=all
<title>subreddits</title>
start count: 120821
final core_df len: 120921
last_count: 100
results_count: 21400
df count: 120921
page #1214: last page id => 5_8djk00
pag

<title>subreddits</title>
start count: 123521
final core_df len: 123621
last_count: 100
results_count: 24100
df count: 123621
page #1241: last page id => 5_8d8z36
page: 1241
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24100&after=t5_8d8z36&show=all
<title>subreddits</title>
start count: 123621
final core_df len: 123721
last_count: 100
results_count: 24200
df count: 123721
page #1242: last page id => 5_8d8i83
page: 1242
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24200&after=t5_8d8i83&show=all
<title>subreddits</title>
start count: 123721
final core_df len: 123821
last_count: 100
results_count: 24300
df count: 123821
page #1243: last page id => 5_8d7v30
page: 1243
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24300&after=t5_8d7v30&show=all
<title>subreddits</title>
start count: 123821
final core_df len: 123921
last_count: 100
results_count: 24400
df count: 123921
page #1244: last page id => 5_8d77np
pag

In [185]:
core_df = fetch_pages(1250, 1500, core_df)

log exists
page #1250: last page id => 5_8d4nv3
page: 1250
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=0&after=t5_8d4nv3&show=all
<title>subreddits</title>
start count: 124521
final core_df len: 124621
last_count: 100
results_count: 100
df count: 124621
page #1251: last page id => 5_8d49ht
page: 1251
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=100&after=t5_8d49ht&show=all
<title>subreddits</title>
start count: 124621
final core_df len: 124721
last_count: 100
results_count: 200
df count: 124721
page #1252: last page id => 5_8d3tlx
page: 1252
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=200&after=t5_8d3tlx&show=all
<title>subreddits</title>
start count: 124721
final core_df len: 124821
last_count: 100
results_count: 300
df count: 124821
page #1253: last page id => 5_8d3gry
page: 1253
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=300&after=t5_8d3gry&show=all
<title>subreddits</

<title>subreddits</title>
start count: 127521
final core_df len: 127621
last_count: 100
results_count: 3100
df count: 127621
page #1281: last page id => 5_8crmcm
page: 1281
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3100&after=t5_8crmcm&show=all
<title>subreddits</title>
start count: 127621
final core_df len: 127721
last_count: 100
results_count: 3200
df count: 127721
page #1282: last page id => 5_8cr8la
page: 1282
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3200&after=t5_8cr8la&show=all
<title>subreddits</title>
start count: 127721
final core_df len: 127821
last_count: 100
results_count: 3300
df count: 127821
page #1283: last page id => 5_8cqt0y
page: 1283
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=3300&after=t5_8cqt0y&show=all
<title>subreddits</title>
start count: 127821
final core_df len: 127921
last_count: 100
results_count: 3400
df count: 127921
page #1284: last page id => 5_8cqbu4
page: 1284

<title>subreddits</title>
start count: 130521
final core_df len: 130621
last_count: 100
results_count: 6100
df count: 130621
page #1311: last page id => 5_8ce9e2
page: 1311
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6100&after=t5_8ce9e2&show=all
<title>subreddits</title>
start count: 130621
final core_df len: 130721
last_count: 100
results_count: 6200
df count: 130721
page #1312: last page id => 5_8cdr4h
page: 1312
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6200&after=t5_8cdr4h&show=all
<title>subreddits</title>
start count: 130721
final core_df len: 130821
last_count: 100
results_count: 6300
df count: 130821
page #1313: last page id => 5_8cddsj
page: 1313
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=6300&after=t5_8cddsj&show=all
<title>subreddits</title>
start count: 130821
final core_df len: 130921
last_count: 100
results_count: 6400
df count: 130921
page #1314: last page id => 5_8cd2zk
page: 1314

<title>subreddits</title>
start count: 133521
final core_df len: 133621
last_count: 100
results_count: 9100
df count: 133621
page #1341: last page id => 5_8c2dd2
page: 1341
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9100&after=t5_8c2dd2&show=all
<title>subreddits</title>
start count: 133621
final core_df len: 133721
last_count: 100
results_count: 9200
df count: 133721
page #1342: last page id => 5_8c20g2
page: 1342
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9200&after=t5_8c20g2&show=all
<title>subreddits</title>
start count: 133721
final core_df len: 133821
last_count: 100
results_count: 9300
df count: 133821
page #1343: last page id => 5_8c1p2e
page: 1343
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=9300&after=t5_8c1p2e&show=all
<title>subreddits</title>
start count: 133821
final core_df len: 133921
last_count: 100
results_count: 9400
df count: 133921
page #1344: last page id => 5_8c1bl5
page: 1344

<title>subreddits</title>
start count: 136521
final core_df len: 136621
last_count: 100
results_count: 12100
df count: 136621
page #1371: last page id => 5_8br08n
page: 1371
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12100&after=t5_8br08n&show=all
<title>subreddits</title>
start count: 136621
final core_df len: 136721
last_count: 100
results_count: 12200
df count: 136721
page #1372: last page id => 5_8bqnso
page: 1372
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12200&after=t5_8bqnso&show=all
<title>subreddits</title>
start count: 136721
final core_df len: 136821
last_count: 100
results_count: 12300
df count: 136821
page #1373: last page id => 5_8bq74h
page: 1373
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=12300&after=t5_8bq74h&show=all
<title>subreddits</title>
start count: 136821
final core_df len: 136921
last_count: 100
results_count: 12400
df count: 136921
page #1374: last page id => 5_8bpqwg
pag

<title>subreddits</title>
start count: 139521
final core_df len: 139621
last_count: 100
results_count: 15100
df count: 139621
page #1401: last page id => 5_8be83v
page: 1401
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15100&after=t5_8be83v&show=all
<title>subreddits</title>
start count: 139621
final core_df len: 139721
last_count: 100
results_count: 15200
df count: 139721
page #1402: last page id => 5_8bdskp
page: 1402
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15200&after=t5_8bdskp&show=all
<title>subreddits</title>
start count: 139721
final core_df len: 139821
last_count: 100
results_count: 15300
df count: 139821
page #1403: last page id => 5_8bdf2h
page: 1403
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=15300&after=t5_8bdf2h&show=all
<title>subreddits</title>
start count: 139821
final core_df len: 139921
last_count: 100
results_count: 15400
df count: 139921
page #1404: last page id => 5_8bd1h8
pag

<title>subreddits</title>
start count: 142521
final core_df len: 142621
last_count: 100
results_count: 18100
df count: 142621
page #1431: last page id => 5_8b016p
page: 1431
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18100&after=t5_8b016p&show=all
<title>subreddits</title>
start count: 142621
final core_df len: 142721
last_count: 100
results_count: 18200
df count: 142721
page #1432: last page id => 5_8azfpy
page: 1432
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18200&after=t5_8azfpy&show=all
<title>subreddits</title>
start count: 142721
final core_df len: 142821
last_count: 100
results_count: 18300
df count: 142821
page #1433: last page id => 5_8ayt3u
page: 1433
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=18300&after=t5_8ayt3u&show=all
<title>subreddits</title>
start count: 142821
final core_df len: 142921
last_count: 100
results_count: 18400
df count: 142921
page #1434: last page id => 5_8ay6x0
pag

<title>subreddits</title>
start count: 145521
final core_df len: 145621
last_count: 100
results_count: 21100
df count: 145621
page #1461: last page id => 5_8am4vi
page: 1461
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21100&after=t5_8am4vi&show=all
<title>subreddits</title>
start count: 145621
final core_df len: 145721
last_count: 100
results_count: 21200
df count: 145721
page #1462: last page id => 5_8alpne
page: 1462
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21200&after=t5_8alpne&show=all
<title>subreddits</title>
start count: 145721
final core_df len: 145821
last_count: 100
results_count: 21300
df count: 145821
page #1463: last page id => 5_8alagk
page: 1463
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=21300&after=t5_8alagk&show=all
<title>subreddits</title>
start count: 145821
final core_df len: 145921
last_count: 100
results_count: 21400
df count: 145921
page #1464: last page id => 5_8akwq6
pag

<title>subreddits</title>
start count: 148521
final core_df len: 148621
last_count: 100
results_count: 24100
df count: 148621
page #1491: last page id => 5_8aaj7s
page: 1491
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24100&after=t5_8aaj7s&show=all
<title>subreddits</title>
start count: 148621
final core_df len: 148721
last_count: 100
results_count: 24200
df count: 148721
page #1492: last page id => 5_8aa4tg
page: 1492
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24200&after=t5_8aa4tg&show=all
<title>subreddits</title>
start count: 148721
final core_df len: 148821
last_count: 100
results_count: 24300
df count: 148821
page #1493: last page id => 5_8a9qqp
page: 1493
Fetching https://www.reddit.com/subreddits/new/?limit=100&show=all&count=24300&after=t5_8a9qqp&show=all
<title>subreddits</title>
start count: 148821
final core_df len: 148921
last_count: 100
results_count: 24400
df count: 148921
page #1494: last page id => 5_8a9d8m
pag

In [187]:
core_df.to_csv("./070623_df_0_1499_v2.csv")

In [183]:
len(core_df)
# html

99521

In [180]:
core_df["page_nr"].value_counts()

page_nr
1    91
0    88
Name: count, dtype: int64

## Exploring the data...

In [107]:
# dom = lxml.html.fromstring(html)
soup = BeautifulSoup(html, "lxml")
soup.title

<title>subreddits</title>

In [108]:
# r_els = soup.cssselect("div > #siteTable")[0].cssselect("div > .subreddit")
# r_els = soup.find("div", id="siteTable").find_all("div", attrs={"class": "subreddit"})
# r_els = soup.select_one("div[id=siteTable]").find_all("div", attrs={"class": "subreddit"})
r_els = soup.select("#siteTable > div.subreddit")

len(r_els)

89

In [109]:
# lxml.html.tostring(r_els[0])
r_els[0]

<div class="thing id-t5_8rse3j odd subreddit" data-fullname="t5_8rse3j" data-gildings="0" data-type="subreddit" data-whitelist-status="" id="thing_t5_8rse3j" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="Twitter_dating" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/Twitter_dating/">r/Twitter_dating: Twitter_dating</a></p><p class="tagline"><span class="score dislikes" title="0"><span class="number">0</span> <span class="word">subscribers</span></span><span class="score unvoted" title="1"><span class="number">1</span> <span class="word">subscriber</span></span><span class="score likes" title="2"><span class="number">2</span> <span class="word">subscribers</span></span>, a community for 48 seconds</p><ul cl

In [110]:
# all text for a single subreddit element, incl children
# r_els[0].text_content()
r_els[0].text

'joinleaver/Twitter_dating: Twitter_dating0 subscribers1 subscriber2 subscribers, a community for 48 secondsreport'

In [111]:
# all text for all subreddits on page
# text_content = [r.text_content().lstrip("joinleave").rstrip("report") for r in r_els]
text_content = [r.text.lstrip("joinleave").rstrip("report") for r in r_els]
text_content[:3]

['r/Twitter_dating: Twitter_dating0 subscribers1 subscriber2 subscribers, a community for 48 seconds',
 'r/Linabelfioreeee_New: Linabelfioreeee_New0 subscribers1 subscriber2 subscribers, a community for 55 seconds',
 'r/gotohellspez: gotohellspez0 subscribers1 subscriber2 subscribers, a community for 1 minu']

## Parsing the data

In [177]:
from datetime import datetime

def get_all_desc(desc_elements_list):
    if len(desc_elements_list) > 0:
        return " ".join([el.text for el in desc_elements_list])
    else:
        ""

def parse_and_add_to_df(els_list, start_df, page_nr):
    print(f"start count: {len(start_df)}")
    search_str = re.compile(r"a community for (.*)report")
    parsed_subs_list = []

    for r in els_list:
        sub_data = {
        "name": r.select_one(".titlerow").text.split(":")[0],
        "desc": get_all_desc(r.select("div.md > p")),
        "sub_age_tup": (re.search(search_str, r.text).group(1).split()),
        "page_id": r.get("id").lstrip("thing_"),
        "num_subscribers": r.select_one("p.tagline > span.unvoted > span.number").text,
        "page_nr": page_nr,
        "dt_retrieved": datetime.now()
        }
        sub_data["age_num"] = sub_data["sub_age_tup"][0]
        sub_data["age_word"] = sub_data["sub_age_tup"][1]
        sub_data.pop("sub_age_tup")
        parsed_subs_list.append(sub_data)

#         print(f"name: {sub_data['name']}")
#         print(f"age: {sub_data['subs_num']} {sub_data['subs_word']}")
#         print(f"desc: {sub_data['desc']}")
#         print(sub_data["page_id"])
#         print("****")

#     print(parsed_subs_list)
    end_df = pd.concat([start_df, pd.DataFrame(parsed_subs_list)], ignore_index=True)
    print(f"final core_df len: {len(end_df)}")
    return end_df

core_df = pd.DataFrame()
print(r_els[0])
core_df = parse_and_add_to_df(r_els, core_df, page_nr=0)

<div class="thing id-t5_8rse3j odd subreddit" data-fullname="t5_8rse3j" data-gildings="0" data-type="subreddit" data-whitelist-status="" id="thing_t5_8rse3j" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="Twitter_dating" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/Twitter_dating/">r/Twitter_dating: Twitter_dating</a></p><p class="tagline"><span class="score dislikes" title="0"><span class="number">0</span> <span class="word">subscribers</span></span><span class="score unvoted" title="1"><span class="number">1</span> <span class="word">subscriber</span></span><span class="score likes" title="2"><span class="number">2</span> <span class="word">subscribers</span></span>, a community for 48 seconds</p><ul cl

In [176]:
core_df

Unnamed: 0,name,desc,page_id,num_subscribers,page_nr,dt_retrieved,age_num,age_word
0,r/Twitter_dating,,5_8rse3j,1,0,2023-07-06 08:21:25.154162,48,seconds
1,r/Linabelfioreeee_New,,5_8rse2p,1,0,2023-07-06 08:21:25.154850,55,seconds
2,r/gotohellspez,,5_8rsdzz,1,0,2023-07-06 08:21:25.155566,1,minute
3,r/naturefree,,5_8rsdpr,1,0,2023-07-06 08:21:25.156250,2,minutes
4,r/qidianhehdhu,,5_8rsdix,1,0,2023-07-06 08:21:25.156938,3,minutes
...,...,...,...,...,...,...,...,...
84,r/temptingyou69,,5_8rs3t4,1,0,2023-07-06 08:21:25.191254,46,minutes
85,r/gnhjui,,5_8rs3qd,1,0,2023-07-06 08:21:25.191637,47,minutes
86,r/Public_marketing09,Public marketing community. With a deep unders...,5_8rs3o2,2,0,2023-07-06 08:21:25.192085,47,minutes
87,r/TeespringFashionZone,"Hey there, I'm Tarun Pratap Singh as Teespring...",5_8rs3mv,1,0,2023-07-06 08:21:25.192507,47,minutes


In [136]:
# number of records with desc
core_df[~core_df["desc"].isna()]

Unnamed: 0,name,desc,page_id,num_subscribers,dt_retrieved,age_num,age_word
5,r/NinjaKitchenRestocks,We'll help you find Ninja Kitchen products lik...,5_8rsdhv,1,2023-07-06 07:25:01.025794,3,minutes
9,r/IndianGamingLaptop,Everything about the Gaming Laptops sold in In...,5_8rsd9i,1,2023-07-06 07:25:01.028818,4,minutes
17,r/TheDigitalPost,Life in the digital age,5_8rscgo,1,2023-07-06 07:25:01.034303,8,minutes
21,r/L2ckx,Hello welcome to my community have a nice day ...,5_8rsc4c,1,2023-07-06 07:25:01.035881,9,minutes
24,r/STLGobbo,A place for me and other small creators to pro...,5_8rsc2q,1,2023-07-06 07:25:01.037142,9,minutes
35,r/visitnewzealand,Community for anyone planning on visiting New ...,5_8rsa66,1,2023-07-06 07:25:01.041250,18,minutes
37,r/Fab_Stuff,"Here You can post suggestions , some textures ...",5_8rsa00,1,2023-07-06 07:25:01.042045,18,minutes
40,r/LynxCampsite,"A site for Lynx Landau enthusiasts, the cutest...",5_8rs9mb,1,2023-07-06 07:25:01.043214,20,minutes
41,r/Seamlessai,Seamless - The WORST and most UNETHICAL lead g...,5_8rs9l2,1,2023-07-06 07:25:01.043640,20,minutes
47,r/cuckik_fist,Selling Amateur Teens Dropboxes. Telegram: @ch...,5_8rs96t,5,2023-07-06 07:25:01.046024,22,minutes


In [114]:
core_df["age_word"].value_counts()

age_word
minutes    86
seconds     2
minute      1
Name: count, dtype: int64

## Get next page

In [61]:
from time import sleep

for i in range(3):
    print("Fetching page " + str(i + 1))
    sleep(1)

Fetching page 1
Fetching page 2
Fetching page 3


In [580]:
get_next_url(100, "5_8r8ktl")
# https://www.reddit.com/subreddits/new?count=25&after=t5_8r8ktl

'https://www.reddit.com/subreddits/new?count=100&after=t5_8r8ktl'

In [593]:
core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"]

74    5_8resct
Name: page_id, dtype: object

In [590]:
last_page_id = core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"].values[0] \
    if (len(core_df) > 1) else None
last_page_id

'5_8rescs'

In [101]:
# setup
core_df = pd.DataFrame()
result_limit = 100

# fetch each page
# if last resultset returned less than the limit, stop fetching
for page_nr in range(3): # swap to while loop after
    last_page_id = core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"].values[0] \
        if (len(core_df) > 1) else None
    print(f"last page id => {last_page_id}")
    html = fetch_next_page(page_nr, result_limit, last_page_id)
    sleep(2)

    soup = BeautifulSoup(html, "lxml")
    print(soup.title)

    # parse each page, add to df
    r_els = soup.select("#siteTable > div.subreddit")
    core_df = parse_and_add_to_df(r_els, core_df)
    if len(r_els) < 25:
        print(len(r_els))
        break

last page id => None
0
None
https://www.reddit.com/subreddits/new/?limit=50&show=all
page: 0
Fetching https://www.reddit.com/subreddits/new/?limit=50&show=all
<title>subreddits</title>
start count: 0
final core_df len: 44
last page id => 5_8rs8ex
50
5_8rs8ex
page: 1
Fetching https://www.reddit.com/subreddits/new/?limit=50&show=all&count=50&after=t5_8rs8ex&show=all
<title>subreddits</title>
start count: 44
final core_df len: 85
last page id => 5_8rs2uq
100
5_8rs2uq
page: 2
Fetching https://www.reddit.com/subreddits/new/?limit=50&show=all&count=100&after=t5_8rs2uq&show=all
<title>subreddits</title>
start count: 85
final core_df len: 128


In [102]:
core_df["page_id"].value_counts()
# FIX: LAST_PAGE_ID IS NOT CHANGING WHAT!

page_id
5_8rscyu    1
5_8rscv9    1
5_8rs15n    1
5_8rs171    1
5_8rs1n5    1
           ..
5_8rs8zp    1
5_8rs96t    1
5_8rs97d    1
5_8rs97f    1
5_8rrxsl    1
Name: count, Length: 128, dtype: int64

In [103]:
core_df[~core_df["desc"].isna()]

Unnamed: 0,name,desc,page_id,num_subscribers,dt_retrieved,age_num,age_word
10,r/L2ckx,Hello welcome to my community have a nice day ...,5_8rsc4c,1,2023-07-06 06:56:54.463267,4,minutes
13,r/STLGobbo,A place for me and other small creators to pro...,5_8rsc2q,1,2023-07-06 06:56:54.464370,4,minutes
24,r/visitnewzealand,Community for anyone planning on visiting New ...,5_8rsa66,1,2023-07-06 06:56:54.467916,13,minutes
26,r/Fab_Stuff,"Here You can post suggestions , some textures ...",5_8rsa00,1,2023-07-06 06:56:54.468584,14,minutes
29,r/LynxCampsite,"A site for Lynx Landau enthusiasts, the cutest...",5_8rs9mb,1,2023-07-06 06:56:54.469600,15,minutes
30,r/Seamlessai,Seamless - The WORST and most UNETHICAL lead g...,5_8rs9l2,1,2023-07-06 06:56:54.469979,16,minutes
36,r/cuckik_fist,Selling Amateur Teens Dropboxes. Telegram: @ch...,5_8rs96t,3,2023-07-06 06:56:54.471935,17,minutes
38,r/robloxfighting,just fight LOL,5_8rs8yc,1,2023-07-06 06:56:54.472603,18,minutes
40,r/parasite_the_maxim,This is a community dedicated to Anime Parasit...,5_8rs8re,1,2023-07-06 06:56:54.473212,19,minutes
52,r/stillGTA4,This sub is for the glorious men and women who...,5_8rs6z7,1,2023-07-06 06:56:57.116619,27,minutes


## Caching

In [604]:
from pathlib import Path
pages_dir = "raw-pages/"

# setup
core_df = pd.DataFrame()
result_limit = 25

# create or get fetch log
log = Path(pages_dir + "log.csv")

if log.exists():
    print("log exists")
else:
    with open(log, "w") as l:
        print("test")
        l.write("fetch_dt, url, page_nr\n")

log exists


In [605]:
# fetch each page
# if last resultset returned less than the limit (100), stop fetching
for page_nr in range(3): # swap to while loop after
    dest = Path(pages_dir + str(page_nr) + ".html")
    
    if dest.exists(): # load it from file
        print(f"Already have {dest}, loading!")
        file = open(dest, "r")
        page_html = file.read()
        file.close()

    else: # fetch it!
        last_page_id = core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"].values[0] \
            if (len(core_df) > 1) else None
        print(f"page #{page_nr}: last page id => {last_page_id}")
        page_html = fetch_next_page(page_nr, 25, last_page_id)
        
        # save to file
        with open(dest, "w") as f:
            f.write(page_html)

        with open(log, "a") as l:
            fetch_dt = datetime.now()
            url = get_next_url(page_nr * 25, last_page_id)
            l.write(",".join([str(fetch_dt), url, str(page_nr)]) + "\n")

    soup = BeautifulSoup(page_html, "lxml")
    print(soup.title)

    # parse each page, add to df
    r_els = soup.select("#siteTable > div.subreddit")
    core_df = parse_and_add_to_df(r_els, core_df)
    sleep(2)
    if len(r_els) < 25:
        print(len(r_els))
        break

Already have raw-pages/0.html, loading!
<title>subreddits</title>
start count: 0
final core_df len: 25
Already have raw-pages/1.html, loading!
<title>subreddits</title>
start count: 25
final core_df len: 50
Already have raw-pages/2.html, loading!
<title>subreddits</title>
start count: 50
final core_df len: 75


In [606]:
core_df["dt_retrieved"].max()

Timestamp('2023-07-04 23:39:37.885966')

In [607]:
core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"]

74    5_8reztl
Name: page_id, dtype: object