In [1]:
import os, json, time
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
import praw
from dotenv import load_dotenv
load_dotenv() 

True

In [2]:
reddit = praw.Reddit(
    client_id     = os.environ["REDDIT_ID"],
    client_secret = os.environ["REDDIT_SECRET"],
    user_agent    = os.environ.get("REDDIT_UA","teen_scraper/0.1"),
    check_for_async=False
)

In [None]:
TARGET_SUBS   = [
    "teenagers", "AdviceForTeens", "mentalhealth",
    "relationship_advice", "AskReddit", "DecidingToBeBetter",
    "bullying", "dating_advice", "depression", "Parenting",
    "trueteenagers", "teenrelationships", "AdviceForTeens", "toddlers"
]


THEMES = {
    "mental":          ["anxiety", "depression", "therapy", "self-harm", "panic", "adhd"],
    "online_safety":   ["cyberbullying", "doxxing", "grooming", "sextortion", "privacy"],
    "dating":          ["dating", "crush", "breakup", "relationship", "red flags"]
}


RESULTS_PER_SUB  = 250       # submissions per sub to pull (adjust ↑ or ↓)
COMMENT_DEPTH    = 2         # levels of comment replies to collect
SLEEP_BETWEEN    = 1.0       # seconds – stay under rate limits
OUTFILE          = Path("teen_wellbeing_dump.jsonl")

In [4]:
def matched_theme(text):
    text_low = text.lower()
    for theme, kwds in THEMES.items():
        if any(k in text_low for k in kwds):
            return theme
    return None

with OUTFILE.open("w", encoding="utf-8") as f_out:
    for sub_name in TARGET_SUBS:
        sub = reddit.subreddit(sub_name)
        search_q = " OR ".join(set(sum(THEMES.values(), [])))
        for submission in tqdm(
            sub.search(search_q, limit=RESULTS_PER_SUB, sort="new", time_filter="year"),
            desc=f"r/{sub_name}"
        ):
            theme = matched_theme(submission.title + " " + submission.selftext)
            if not theme:
                continue 

            row = {
                "type":      "submission",
                "theme":     theme,
                "subreddit": sub_name,
                "title":     submission.title,
                "body":      submission.selftext,
            }
            f_out.write(json.dumps(row) + "\n")

            # fetch comments lazily
            submission.comments.replace_more(limit=0)
            for comment in submission.comments.list():
                if comment.depth > COMMENT_DEPTH:
                    continue
                theme_c = matched_theme(comment.body)
                if not theme_c:
                    continue
                row_c = {
                    "type":      "comment",
                    "theme":     theme_c,
                    "parent_sub": sub_name,
                    "body":      comment.body,
                }
                f_out.write(json.dumps(row_c) + "\n")
            time.sleep(SLEEP_BETWEEN)

print(f"✅  Finished. Data written to {OUTFILE.resolve()}")

r/teenagers: 0it [00:00, ?it/s]
r/AdviceForTeens: 0it [00:00, ?it/s]
r/mentalhealth: 0it [00:00, ?it/s]
r/relationship_advice: 0it [00:00, ?it/s]
r/AskReddit: 0it [00:00, ?it/s]
r/DecidingToBeBetter: 0it [00:00, ?it/s]
r/bullying: 0it [00:00, ?it/s]
r/dating_advice: 0it [00:00, ?it/s]
r/depression: 0it [00:00, ?it/s]
r/Parenting: 0it [00:00, ?it/s]
r/trueteenagers: 0it [00:00, ?it/s]
r/teenrelationships: 0it [00:00, ?it/s]
r/AdviceForTeens: 0it [00:00, ?it/s]
r/toddlers: 0it [00:00, ?it/s]

✅  Finished. Data written to C:\Users\LE NGUYEN DUY PHUC\Documents\AI_Lab_Data\teen_wellbeing_dump.jsonl



