In [51]:
import lxml.html
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

## Setup

In [8]:
base_url = "https://www.reddit.com/subreddits/new"
ident = (
    "Stephanie Andrews (jellomoat@gmail.com), " + 
    "scraping for educational purposes"
)
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}

## Fetching data

In [9]:
def get_next_url(count, after_id):
    if not count:
        return base_url
    else:
        return f"{base_url}?count={count}&after={after_id}"

def fetch_next_page(page_nr, after_id=None):
    print(f"page: {page_nr}")
    count = (page_nr - 1) * 25
    url = get_next_url(count, after_id)
    try:
        return requests.get(
            url,
            headers=headers
        ).text
    except:
        print("No more pages!")

In [10]:
page_nr = 0
html = fetch_next_page(page_nr)

page: 0


## Exploring the data...

In [11]:
dom = lxml.html.fromstring(html)

In [12]:
r_els = dom.cssselect("div > #siteTable")[0].cssselect("div > .subreddit")

In [13]:
lxml.html.tostring(r_els[0])

b'<div class=" thing id-t5_8r8mya odd  subreddit " id="thing_t5_8r8mya" onclick="click_thing(this)" data-fullname="t5_8r8mya" data-type="subreddit" data-gildings="0" data-whitelist-status=""><p class="parent"></p><div class="midcol"><span class="fancy-toggle-button subscribe-button toggle" style="" data-sr_name="everythingeverskies"><a class="option active add login-required" href="#" tabindex="100">join</a><a class="option remove" href="#">leave</a></span></div><div class="entry unvoted"><p class="titlerow"><a href="https://www.reddit.com/r/everythingeverskies/" class="title">r/everythingeverskies: everythingeverskies</a></p><p class="tagline"><span class="score dislikes" title="0"><span class="number">0</span> <span class="word">subscribers</span></span><span class="score unvoted" title="1"><span class="number">1</span> <span class="word">subscriber</span></span><span class="score likes" title="2"><span class="number">2</span> <span class="word">subscribers</span></span>, a community

In [14]:
# all text for a single subreddit element, incl children
r_els[0].text_content()

'joinleaver/everythingeverskies: everythingeverskies0 subscribers1 subscriber2 subscribers, a community for 1 minutereport'

In [15]:
# all text for all subreddits on page
text_content = [r.text_content().lstrip("joinleave").rstrip("report") for r in r_els]
text_content[:3]

['r/everythingeverskies: everythingeverskies0 subscribers1 subscriber2 subscribers, a community for 1 minu',
 'r/GHNowatHome: GHNowatHome0 subscribers1 subscriber2 subscribers, a community for 1 minu',
 'r/eircgazer_world: eircgazer_world0 subscribers1 subscriber2 subscribers, a community for 1 minu']

## Parsing the data

In [64]:
from datetime import datetime

def get_all_desc(desc_elements_list):
    if not desc_elements_list:
        return ""
    else:
        return " ".join([el.text_content() for el in desc_elements_list])

search_str = re.compile(r"a community for (.*)report")

parsed_subs_list = []

for r in r_els:
    # print((re.search(search_str, r.text_content())).group(0).rstrip("report").lstrip("a community for "))
    sub_data = {
    "name": r.cssselect("p.titlerow > a")[0].text_content(),
    "desc": get_all_desc(r.cssselect("div.md > p")),
    "sub_age_tup": (re.search(search_str, r.text_content())).group(0) \
        .rstrip("report").lstrip("a community for ").split(),
    "page_id": r.get("id").lstrip("thing_"),
    "dt_retrieved": datetime.now()
    }    
    sub_data["subs_num"] = sub_data["sub_age_tup"][0]
    sub_data["subs_word"] = sub_data["sub_age_tup"][1]
    sub_data.pop("sub_age_tup")
    parsed_subs_list.append(sub_data)
#     print(f"page id: {sub_data['page_id']}")
#     print(f"name: {sub_data['name']}")
#     print(f"desc: {sub_data['desc']}")
#     print(f"age: {sub_data['subs_num']} {sub_data['subs_word']}")
#     print(f"{sub_data['subs_word']}: {sub_data['subs_num']}")
#     print("****")

# print(parsed_subs_list)
    
df = pd.DataFrame(parsed_subs_list)


In [61]:
df.head(20)

Unnamed: 0,name,desc,page_id,dt_retrieved,subs_num,subs_word
0,r/everythingeverskies: everythingeverskies,,5_8r8mya,2023-07-04 08:35:02.139974,1,minu
1,r/GHNowatHome: GHNowatHome,,5_8r8mw3,2023-07-04 08:35:02.140347,1,minu
2,r/eircgazer_world: eircgazer_world,,5_8r8mqn,2023-07-04 08:35:02.140659,1,minu
3,r/helIokitty: helIokitty,,5_8r8mif,2023-07-04 08:35:02.140972,2,minutes
4,r/Ninecholas: Ninecholas,,5_8r8mfz,2023-07-04 08:35:02.141347,2,minutes
5,r/veidmainiai: veidmainiai,,5_8r8maw,2023-07-04 08:35:02.141687,3,minutes
6,r/MinecraftButGameplay: MinecraftButGameplay,,5_8r8m9k,2023-07-04 08:35:02.142045,3,minutes
7,r/latetjapublictest: latetjapublictest,,5_8r8m97,2023-07-04 08:35:02.142514,3,minutes
8,r/lifestyleupdates: lifestyleupdates,,5_8r8m4g,2023-07-04 08:35:02.142972,3,minutes
9,r/Moyennementenrageant: Moyennementenrageant,,5_8r8m1r,2023-07-04 08:35:02.143427,4,minutes
