In [1]:
from __future__ import annotations
import os, re, json, time, asyncio
from pathlib import Path
from typing import Tuple, List, Dict
import pandas as pd
import praw
from dotenv import load_dotenv
from tqdm import tqdm

In [2]:
load_dotenv()
reddit = praw.Reddit(
    client_id     = os.environ["REDDIT_ID_2"],
    client_secret = os.environ["REDDIT_SECRET_2"],
    user_agent    = os.environ.get("REDDIT_UA_2","teen_scraper/5"),
)

In [None]:
TARGET_SUBS = [
    # "TheGirlSurvivalGuide", "technology",
    "xbox","videogames"
]

THEMES = {
    "mental_health": [
        "anxiety", "depression", "self-harm", "panic", "adhd",
        "bipolar", "eating disorder", "anorexia", "bulimia", "trauma", "ptsd",
        "schizophrenia", "psychosis", "mood swings", "mental illness",
        "suicidal ideation", "suicide", "overdose", "hopelessness",
        "worthlessness", "self hatred", "self mutilation", "cutting",
        "panic attack", "insomnia", "hallucinations",
        "delusion", "paranoia", "dissociation", "intrusive thoughts",
        "derealization", "depersonalization", "loneliness", "isolation",
        "numb", "void", "mental breakdown", "meltdown", "shutdown",
        "sensory overload", "fear", "no will to live",
    ],
    "behavioral_health": [
        "anger", "rage", "addiction", "substance abuse", "alcohol abuse",
        "drug abuse", "binge drinking", "blackout", "impulse control",
        "stress", "burnout", "gambling addiction",
        "codependency", "people-pleasing", "manipulation", "aggression",
        "violent outburst", "fight", "punching", "reckless behavior", "risk-taking", "truancy", "runaway",
        "shoplifting", "stealing", "lying", "vandalism", "self sabotage",
        "executive dysfunction", "hoarding", "compulsive behavior",
        "social anxiety", "avoidance", "phobias", "procrastination",
    ],
    "online_safety": [
        "cyberbullying", "bullying", "harassment", "online harassment",
        "doxxing", "grooming", "groomer", "blackmail","clickbait",
        "predator", "online predators", "child exploitation", "sex trafficking",
        "nudes leak", "snapchat leak", "revenge porn", "catfish", "deepfake",
        "identity theft", "phishing", "malware", "hacked", "data breach",
        "scams", "swatting", "impersonation", "stalking", "online stalking",
        "hate speech", "death threat", "trolling", "flaming", "fake news",
        "disinformation", "misinformation", "sadfishing", "stranger danger",
    ],
    "dating": [
        "heartbreak", "toxic", "abuse", "emotional abuse",
        "physical abuse", "domestic violence", "sexual assault", "rape",
        "coercion", "cheating", "cheater", "gaslighting", "love bombing",
        "red flags", "jealousy", "insecurity", "obsession", "control",
        "manipulation", "breadcrumbing", "ghosting", "situationship",
        "mixed signals", "unrequited love", "abandonment", "attachment issues",
        "boundaries", "violated boundaries", "consent", "lack of consent",
        "sexting pressure", "nudes pressure", "stalking ex", "toxic ex",
        "hate relationship", "fight",
    ],
}

PATTERNS = {
    theme: [(kw, re.compile(rf"\b{re.escape(kw)}\b", re.I))
            for kw in kws]
    for theme, kws in THEMES.items()
}

YOUTH_TOKENS = [
    "teen", "teens", "teenager", "teenagers",
    "preteen", "preadolescent", "youth", "youngster",
    "high school", "high-schooler", "highschooler",
    "middle school", "middleschooler",
    "grade 6", "grade 7", "grade 8", "grade 9",
    "grade 10", "grade 11", "grade 12"
]
YOUTH_RGX = re.compile(r"\b(?:%s)\b" % "|".join(map(re.escape, YOUTH_TOKENS)), re.I)

AGE_NUM_RGX = re.compile(
    r"""
    (                
       \b(?:i[' ]?m|im)\s+         
       (1[0-9])                    
    | \b(1[0-9])[fm]\b             
    | \((1[0-9])[fm]\)             
    )
    """,
    re.I | re.X
)

In [4]:
def is_age(text):
    return bool(YOUTH_RGX.search(text) or AGE_NUM_RGX.search(text))

SEARCH_KEYWORDS = sorted({w for v in THEMES.values() for w in v})

def build_chunks(max_len = 50):
    out, cur, ln = [], [], 0
    for w in SEARCH_KEYWORDS:
        add = len(w) + 4
        if ln + add > max_len and cur:
            out.append(" OR ".join(cur)); cur, ln = [w], len(w)
        else:
            cur.append(w); ln += add
    if cur:
        out.append(" OR ".join(cur))
    return out

QUERY_CHUNKS = build_chunks()

def theme_match(text):
    for theme, regs in PATTERNS.items():
        for kw, rgx in regs:
            if rgx.search(text):
                return theme, kw
    return None, None

def summary(text, theme):
    sents = re.split(r"(?<=[.!?])\s+", text)
    hits = [
        s.strip() for s in sents
        if any(rgx.search(s) for _, rgx in PATTERNS[theme])
    ]
    if len(hits) >= 2:
        return f"{hits[0]} {hits[1]}"
    if len(hits) == 1:
        nxt = next((s for s in sents if s not in hits), "")
        return f"{hits[0]} {nxt}"
    return f"This post discusses {theme}."

In [5]:
SLEEP_SUB   = 2.0          
SLEEP_POST  = 0.5          
rows = []

for sub in TARGET_SUBS:
    sr   = reddit.subreddit(sub)
    roots_by_post = {}
    for c in sr.comments(limit=None):         
        if c.is_root:
            pid = c.link_id[3:]               
            roots_by_post.setdefault(pid, []).append(c.body)

    pool = {}
    for chunk in QUERY_CHUNKS:
        for s in sr.search(f"({chunk})", sort="new",
                           time_filter="all", limit=None):
            if s.id not in pool or s.score > pool[s.id].score:
                pool[s.id] = s

    if not pool:
        for s in sr.new(limit=None):
            pool[s.id] = s

    for s in tqdm(pool.values(), desc=f"r/{sub}"):
        roots = roots_by_post.get(s.id, [])      
        txt   = f"{s.title} {s.selftext} " + " ".join(roots)

        if not is_age(txt):
            time.sleep(SLEEP_POST); continue
        th, kw = theme_match(txt)
        if th is None:
            time.sleep(SLEEP_POST); continue

        rows.append(
            {
                "subreddit": sub,
                "theme"   : th,
                "keyword" : kw,
                "title"   : s.title,
                "body"    : summary(txt, th),
            }
        )
        time.sleep(SLEEP_POST)                    
    time.sleep(SLEEP_SUB)             
    
df = pd.DataFrame(rows)
outfile = Path("Reddit_data5.xlsx")
df.to_excel(outfile, index=False)  
print(f"Finished. Data written to {outfile.resolve()}")

r/TheGirlSurvivalGuide: 100%|██████████| 4554/4554 [38:08<00:00,  1.99it/s]
r/technology: 100%|██████████| 7484/7484 [1:02:37<00:00,  1.99it/s]


Finished. Data written to C:\Users\LE NGUYEN DUY PHUC\Documents\AI_Lab_Data\Reddit_data5.xlsx
