In [206]:
import lxml.html
import pandas as pd
import re
import requests

from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from pathlib import Path
from time import sleep

## Setup

In [244]:
# Contact string describing who is scraping and why.
# NOTE(review): nothing in the visible request code sends `ident`; only
# `headers` is passed to requests.
ident = "Stephanie Andrews (jellomoat@gmail.com), scraping for educational purposes"

# HTTP headers used for page requests (a desktop-browser User-Agent).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/113.0.0.0 Safari/537.36"
    ),
}

## Fetching and Caching functions

In [245]:
def get_next_url(count, fetch_limit=100, after_id=None):
    """Build the URL for one page of the popular-subreddits listing.

    Args:
        count: Value for the `count` query parameter; only used on
            follow-up pages.
        fetch_limit: Value for the `limit` query parameter.
        after_id: Id of the last entry already seen, without its leading
            "t" (it is re-added here). Any falsy value means "first page".

    Returns:
        The listing URL as a string.
    """
    listing_url = f"https://www.reddit.com/subreddits/popular/?limit={fetch_limit}&show=all"

    if after_id:
        # follow-up page: continue after the given entry
        return f"{listing_url}&count={count}&after=t{after_id}"

    # first page: no paging parameters; the URL is echoed as well
    print(listing_url)
    return listing_url

def fetch_next_page(page_nr, count, fetch_limit=100, after_id=None):
    """Request one page of the popular-subreddits listing.

    Args:
        page_nr: Page number, used only for progress output.
        count: Passed through to `get_next_url`.
        fetch_limit: Passed through to `get_next_url`.
        after_id: Passed through to `get_next_url`.

    Returns:
        The `requests.Response` for the page, or None when the request
        itself failed (connection error, timeout, ...).
    """
    url = get_next_url(count, fetch_limit, after_id)
    print(f"page: {page_nr}")
    print(f"Fetching {url}")
    try:
        # timeout: without one a stalled connection blocks the notebook forever
        return requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Only network-level failures are handled here; anything else
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        print(f"Request for page {page_nr} failed: {exc}")
        return None

In [246]:
# page_nr = 0
# html = fetch_next_page(page_nr)

def _logged_fetch_dt(log_df, page_nr, dest):
    """Return the fetch timestamp (as a string) for an already-cached page.

    The page is looked up by its `page_nr` value in the fetch log. If the log
    has no entry for it, the cached file's modification time is used instead.
    """
    if {"fetch_dt", "page_nr"} <= set(log_df.columns):
        logged_pages = pd.to_numeric(log_df["page_nr"], errors="coerce")
        logged = log_df.loc[logged_pages == page_nr, "fetch_dt"]
        if len(logged) > 0:
            return str(logged.iloc[-1]).strip()
    return str(datetime.fromtimestamp(dest.stat().st_mtime))


def fetch_pages(start_pg, end_pg, core_df):
    """Load (from the on-disk cache) or fetch listing pages and parse them.

    For every page number in `range(start_pg, end_pg)` the raw HTML is read
    from `raw-pages-070723/<page_nr>.html` if present. Otherwise it is
    downloaded, saved there, and recorded in `raw-pages-070723/log.csv`.
    Each page is then parsed and its subreddits appended to the frame.

    Args:
        start_pg: First page number (inclusive).
        end_pg: Last page number (exclusive).
        core_df: DataFrame of subreddits collected so far (may be empty).

    Returns:
        A DataFrame containing `core_df`'s rows plus all newly parsed rows.
        Stops early when a page yields fewer than 10 entries or a download
        fails.
    """
    # setup
    pages_dir = Path("raw-pages-070723/")
    result_limit = 100
    min_results_to_continue = 10
    # rows already collected count as "results seen so far" when resuming
    results_count = len(core_df)

    # create or get fetch log (the existence check must come before reading)
    pages_dir.mkdir(parents=True, exist_ok=True)
    log = pages_dir / "log.csv"
    if log.exists():
        print("log exists")
    else:
        with open(log, "w") as log_file:
            log_file.write("fetch_dt,url,page_nr\n")

    # create log df; older logs have spaces after the header commas
    log_df = pd.read_csv(log)
    log_df.columns = log_df.columns.str.strip()

    # fetch each page
    for page_nr in range(start_pg, end_pg): # swap to while loop after
        dest = pages_dir / f"{page_nr}.html"

        if dest.exists(): # load it from file
            print(f"Already have {dest}, loading!")
            with open(dest, "r", encoding="utf-8") as f:
                page_html = f.read()
            fetch_dt = _logged_fetch_dt(log_df, page_nr, dest)

        else: # fetch it!
            # paging continues after the last subreddit collected so far
            last_page_id = core_df["page_id"].iloc[-1] if len(core_df) > 0 else None
            print(f"page #{page_nr}: last page id => {last_page_id}")
            page_resp = fetch_next_page(page_nr, results_count, fetch_limit=result_limit, after_id=last_page_id)
            if page_resp is None or not page_resp.ok:
                # do not cache a failed/blocked response as if it were a page
                status = "no response" if page_resp is None else page_resp.status_code
                print(f"page #{page_nr}: fetch failed ({status}), stopping")
                break
            page_html = page_resp.text
            sleep(2)

            # save to file
            with open(dest, "w", encoding="utf-8") as f:
                f.write(page_html)

            fetch_dt = str(datetime.now())
            with open(log, "a") as log_file:
                log_file.write(",".join([fetch_dt, page_resp.url, str(page_nr)]) + "\n")

        # parse each page, add to df
        soup = BeautifulSoup(page_html, "lxml")
        print(soup.title)

        r_els = soup.select("#siteTable > div.subreddit")
        core_df = parse_and_add_to_df(r_els, core_df, page_nr, fetch_dt)
        last_count = len(r_els)
        print(f"last_count: {last_count}")
        results_count += last_count
        print(f"results_count: {results_count}")
        print(f"df count: {len(core_df)}")
        # a nearly empty page means the listing is exhausted
        if last_count < min_results_to_continue:
            print(last_count)
            break
    return core_df

## Parsing functions

In [247]:
def parse_and_add_to_df(els_list, start_df, page_nr, fetch_dt=None):
    """Parse subreddit listing elements and append them to a DataFrame.

    Args:
        els_list: Parsed HTML elements, one per subreddit entry.
        start_df: DataFrame to append to; it is not modified.
        page_nr: Listing page number the elements came from.
        fetch_dt: When the page was fetched, as a datetime or as a string
            whose first whitespace-separated token is the date. Defaults to
            the current time.

    Returns:
        A new DataFrame with `start_df`'s rows followed by one row per
        element, with columns name, desc, page_id, num_subscribers, page_nr,
        date_retrieved, age_num and age_word.
    """
    print(f"start count: {len(start_df)}")
    if fetch_dt is None:
        fetch_dt = datetime.now()
    date_retrieved = str(fetch_dt).split()[0]
    search_str = re.compile(r"a community for (.*)report")
    parsed_subs_list = []

    for r in els_list:
        # e.g. "... a community for 14 yearsreport" -> ["14", "years"]
        age_match = search_str.search(r.text)
        age_parts = age_match.group(1).split() if age_match else []
        if len(age_parts) >= 2:
            age_num, age_word = age_parts[0], age_parts[1]
        else:
            # keep the row (paging relies on it) but flag the missing age
            print(f"could not parse community age for {r.get('id')}")
            age_num, age_word = None, None

        parsed_subs_list.append({
            "name": r.select_one(".titlerow").text.split(":")[0],
            "desc": get_all_desc(r.select("div.md > p")),
            # lstrip() strips a *set of characters*, so for ids such as
            # "thing_t5_2qs0k" it also removes the "t" of "t5", leaving
            # "5_2qs0k". get_next_url re-adds that "t", so this is kept as is.
            "page_id": r.get("id").lstrip("thing_"),
            "num_subscribers": r.select_one("p.tagline > span.unvoted > span.number").text,
            "page_nr": page_nr,
            "date_retrieved": date_retrieved,
            "age_num": age_num,
            "age_word": age_word,
        })

    end_df = pd.concat([start_df, pd.DataFrame(parsed_subs_list)], ignore_index=True)
    print(f"final core_df len: {len(end_df)}")
    return end_df

In [248]:
def get_all_desc(desc_elements_list):
    """Join the text of all description elements into a single string.

    Args:
        desc_elements_list: Elements exposing a `.text` attribute.

    Returns:
        The element texts joined by single spaces, or "" when there are no
        elements.
    """
    # join() over an empty sequence already yields "", so no branch is needed
    return " ".join(el.text for el in desc_elements_list)

In [264]:
def convert_age_ago_to_created_ts(row):
    """Estimate a creation datetime from an "age" and a retrieval date.

    Args:
        row: Mapping with "age_word" (year/month/day/hour, optionally with a
            trailing "s"), "age_num" (integer-like) and "date_retrieved"
            (a "YYYY-MM-DD" string).

    Returns:
        The retrieval date minus the age, as a datetime. Years count as 365
        days and months as 30 days.
    """
    days_per_unit = {"year": 365, "month": 30, "day": 1, "hour": (1/24)}

    per_unit = days_per_unit[row["age_word"].rstrip("s")]
    age_in_days = per_unit * int(row["age_num"])
    retrieved_on = datetime.strptime(row["date_retrieved"], "%Y-%m-%d")

    return retrieved_on - timedelta(days=age_in_days)

In [265]:
# Execute this in final run
# core_df["created_dt"] = core_df.apply(lambda x: convert_age_ago_to_created_ts(x).year, axis=1)

## Get and Parse Data

In [266]:
# Start from an empty frame and load (or fetch) listing pages 0 through 49.
core_df = fetch_pages(0, 50, pd.DataFrame())

log exists
Already have raw-pages-070723/0.html, loading!
<title>subreddits</title>
start count: 0
final core_df len: 100
last_count: 100
results_count: 100
df count: 100
Already have raw-pages-070723/1.html, loading!
<title>subreddits</title>
start count: 100
final core_df len: 200
last_count: 100
results_count: 200
df count: 200
Already have raw-pages-070723/2.html, loading!
<title>subreddits</title>
start count: 200
final core_df len: 300
last_count: 100
results_count: 300
df count: 300
Already have raw-pages-070723/3.html, loading!
<title>subreddits</title>
start count: 300
final core_df len: 400
last_count: 100
results_count: 400
df count: 400
Already have raw-pages-070723/4.html, loading!
<title>subreddits</title>
start count: 400
final core_df len: 500
last_count: 100
results_count: 500
df count: 500
Already have raw-pages-070723/5.html, loading!
<title>subreddits</title>
start count: 500
final core_df len: 600
last_count: 100
results_count: 600
df count: 600
Already have raw-pa

In [267]:
# core_df = fetch_pages(500, 1000, core_df)

In [268]:
# core_df = fetch_pages(1000, 1250, core_df)

In [269]:
# core_df = fetch_pages(1250, 1500, core_df)

In [270]:
# core_df = fetch_pages(1500, 1750, core_df)

In [271]:
# core_df.to_csv("./070623_df_0_1499_v2.csv")

In [272]:
len(core_df)
# html

4088

In [273]:
core_df.head()

Unnamed: 0,name,desc,page_id,num_subscribers,page_nr,date_retrieved,age_num,age_word
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years
2,r/mildlyinfuriating,jugkfmghgug,5_2ubgg,5961250,0,2023-07-07,11,years
3,r/facepalm,/r/facepalm has gone private in protest of the...,5_2r5rp,7469361,0,2023-07-07,13,years
4,r/diablo4,Welcome to the un official Diablo 4 subreddit!...,5_2rzx9,746468,0,2023-07-07,12,years


In [274]:
# Estimate when each subreddit was created from its reported age.
# Despite the column name, only the year (an int) is stored.
def _created_year(sub_row):
    return convert_age_ago_to_created_ts(sub_row).year

core_df["created_dt"] = core_df.apply(_created_year, axis=1)

In [276]:
core_df.head()

Unnamed: 0,name,desc,page_id,num_subscribers,page_nr,date_retrieved,age_num,age_word,created_dt
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135237,0,2023-07-07,14,years,2009
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41833971,0,2023-07-07,15,years,2008
2,r/mildlyinfuriating,jugkfmghgug,5_2ubgg,5961250,0,2023-07-07,11,years,2012
3,r/facepalm,/r/facepalm has gone private in protest of the...,5_2r5rp,7469361,0,2023-07-07,13,years,2010
4,r/diablo4,Welcome to the un official Diablo 4 subreddit!...,5_2rzx9,746468,0,2023-07-07,12,years,2011


In [96]:
# Slice ends are exclusive, so [:100] is exactly the first 100 rows
# ([0:101] wrote 101 rows into the "top_100" file).
core_df.iloc[:100].to_csv("./subreddits_set_070723_dt_top_100.csv")

In [97]:
# Slice ends are exclusive, so [:1000] is exactly the first 1000 rows
# ([0:1001] wrote 1001 rows into the "top_1000" file).
core_df.iloc[:1000].to_csv("./subreddits_set_070723_dt_top_1000.csv")

## Exploring the data...

In [185]:
# dom = lxml.html.fromstring(html)
# Download the first listing page and parse it for interactive exploration.
first_page = fetch_next_page(0, 0)
html = first_page.text
soup = BeautifulSoup(html, "lxml")
soup.title

https://www.reddit.com/subreddits/popular/?limit=100&show=all
page: 0
Fetching https://www.reddit.com/subreddits/popular/?limit=100&show=all


<title>subreddits</title>

In [186]:
# r_els = soup.cssselect("div > #siteTable")[0].cssselect("div > .subreddit")
# r_els = soup.find("div", id="siteTable").find_all("div", attrs={"class": "subreddit"})
# r_els = soup.select_one("div[id=siteTable]").find_all("div", attrs={"class": "subreddit"})
# Select every <div class="subreddit"> that is a direct child of the element
# with id "siteTable" (one element per listed subreddit).
r_els = soup.select("#siteTable > div.subreddit")

# number of entries matched on this page
len(r_els)

100

In [187]:
# lxml.html.tostring(r_els[0])
r_els[0]

<div class="thing id-t5_2qs0k odd subreddit" data-fullname="t5_2qs0k" data-gildings="0" data-type="subreddit" data-whitelist-status="all_ads" id="thing_t5_2qs0k" onclick="click_thing(this)"><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" data-sr_name="Home" style=""><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a class="title" href="https://www.reddit.com/r/Home/">r/Home: Home</a></p><div class="description"><form action="#" class="usertext warn-on-unload" id="form-t5_2qs0kvqa" onsubmit="return post_form(this, 'editusertext')"><input name="thing_id" type="hidden" value="t5_2qs0k"/><div class="usertext-body may-blank-within md-container"><div class="md"><p>Everything home related: interior design, home improvement, architecture.</p>
</div>
</div></form></div><p class="tagline"><span class="score dislikes

In [197]:
# all text for a single subreddit element, incl children
# r_els[0].text_content()
r_els[0].text

'joinleaver/Home: HomeEverything home related: interior design, home improvement, architecture.\n\n135,308 subscribers135,309 subscribers135,310 subscribers, a community for 14 yearsreport'

In [198]:
# all text for all subreddits on page, incl children
# text_content = [r.text_content().lstrip("joinleave").rstrip("report") for r in r_els]
# str.lstrip/rstrip remove a *set of characters*, not a substring, so
# rstrip("report") would turn "... 1 yearreport" into "... 1 yea".
# Remove exactly the leading "joinleave" and trailing "report" instead.
text_content = [re.sub(r"\Ajoinleave|report\Z", "", r.text) for r in r_els]
text_content[:2]

['r/Home: HomeEverything home related: interior design, home improvement, architecture.\n\n135,308 subscribers135,309 subscribers135,310 subscribers, a community for 14 years',
 'r/AskReddit: Ask Reddit...r/AskReddit is the place to ask and answer thought-provoking questions.\n\n41,836,673 subscribers41,836,674 subscribers41,836,675 subscribers, a community for 15 years']

In [199]:
test_parsing_df = pd.DataFrame()
# parse_and_add_to_df takes the page's fetch timestamp as its fourth argument;
# this page was just downloaded, so "now" is the fetch time.
test_parsing_df = parse_and_add_to_df(r_els, test_parsing_df, page_nr=0, fetch_dt=str(datetime.now()))

start count: 0
final core_df len: 100


In [203]:
test_parsing_df.tail(2)

Unnamed: 0,name,desc,page_id,num_subscribers,page_nr,dt_retrieved,age_num,age_word
98,r/FashionReps,Reddit's largest community for the discussion ...,5_31hcv,1325116,0,2023-07-07 23:35:05.314047,9,years
99,r/maybemaybemaybe,For those videos that make you think maybe...,5_38e1l,2709200,0,2023-07-07 23:35:05.314374,8,years


In [204]:
# core_df.loc[core_df["dt_retrieved"] == core_df["dt_retrieved"].max()]["page_id"]
# Paging continues after the last parsed entry, i.e. the final row. The
# parser's frame has a `date_retrieved` column (no `dt_retrieved`), and a
# single-row frame still has a valid last entry, hence `> 0`.
last_page_id = test_parsing_df["page_id"].iloc[-1] if len(test_parsing_df) > 0 else None
last_page_id

'5_38e1l'

In [205]:
# first two rows whose description is present (not NaN)
has_desc = test_parsing_df["desc"].notna()
test_parsing_df.loc[has_desc].head(2)

Unnamed: 0,name,desc,page_id,num_subscribers,page_nr,dt_retrieved,age_num,age_word
0,r/Home,"Everything home related: interior design, home...",5_2qs0k,135309,0,2023-07-07 23:35:05.260853,14,years
1,r/AskReddit,r/AskReddit is the place to ask and answer tho...,5_2qh1i,41836674,0,2023-07-07 23:35:05.261666,15,years
