In [1]:
import requests
import time
import json
from datetime import datetime, timezone

# Name of the subreddit whose "new" listing is scraped.
SUBREDDIT = "archlinux"

# HTTP headers sent with every request made by this script.
HEADERS = {"User-Agent": "archlinux-json-scraper/1.0"}


def clean_text(text):
    """Return *text* flattened onto one line with single-space separators.

    Every run of whitespace (spaces, tabs, newlines) becomes one space and
    leading/trailing whitespace is dropped. Falsy input such as ``None`` or
    ``""`` yields an empty string.
    """
    if text:
        # str.split() with no separator splits on any whitespace run,
        # newlines included, and discards empty pieces.
        return " ".join(text.split())
    return ""


def fetch_posts_last_xh(hours=4):
    """Collect posts from the subreddit's "new" listing created in the last *hours* hours.

    Pages through the listing 100 entries at a time, pausing one second
    between pages, and stops at the first post older than the cutoff, at an
    empty page, or when the listing provides no further ``after`` cursor.

    NOTE(review): stopping at the first too-old post presumes the listing is
    ordered newest-first -- confirm against the API.

    Args:
        hours: Size of the look-back window, in hours.

    Returns:
        A list of the raw post ``data`` dicts, in listing order.

    Raises:
        requests.HTTPError: If a listing request returns an error status.
    """
    oldest_allowed = datetime.now(timezone.utc).timestamp() - (hours * 3600)
    cursor = None
    recent = []

    while True:
        listing_url = f"https://www.reddit.com/r/{SUBREDDIT}/new.json?limit=100"
        if cursor:
            listing_url += f"&after={cursor}"

        response = requests.get(listing_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        listing = response.json()["data"]

        batch = listing["children"]
        if not batch:
            return recent

        for entry in batch:
            submission = entry["data"]
            if submission["created_utc"] >= oldest_allowed:
                recent.append(submission)
                continue
            # First post outside the window: nothing further is collected.
            return recent

        cursor = listing.get("after")
        if not cursor:
            return recent

        # Pause between page requests.
        time.sleep(1)


def extract_comments(children, lines, indent=""):
    """Append an indented bullet for each comment in *children* to *lines*.

    Walks the comment tree depth-first: a comment's bullet is added before
    those of its replies, and each reply level is indented five more spaces.
    Entries whose ``kind`` is not ``"t1"`` are skipped, as are comments whose
    cleaned body is empty (their replies are still visited).

    Args:
        children: List of comment nodes, each with ``"kind"`` and ``"data"``.
        lines: Output list of strings; mutated in place.
        indent: Prefix placed before the bullets at this nesting level.
    """
    deeper = indent + "     "
    for node in children:
        if node["kind"] == "t1":
            payload = node["data"]

            text = clean_text(payload.get("body"))
            if text:
                lines.append(f"{indent}- {text}")

            thread = payload.get("replies")
            # Only a dict is treated as a nested listing; any other value
            # (for example an empty string) means there is nothing to descend into.
            if isinstance(thread, dict):
                extract_comments(thread["data"]["children"], lines, deeper)


def fetch_post_content(post):
    """Fetch one post with its comments and render them as indented text.

    The rendered text starts with the post's creation time in UTC, followed
    by the title, an optional "Description:" section holding the cleaned
    selftext, and an optional "Comments:" section holding the comment tree.

    Args:
        post: Post dict from the listing; only its ``"permalink"`` is read.

    Returns:
        A dict with a single ``"content"`` key holding the rendered text.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    url = f"https://www.reddit.com{post['permalink']}.json"
    # Without raw_json=1 Reddit returns <, > and & as HTML entities
    # (&lt;, &gt;, &amp;), which would leak into the scraped text.
    resp = requests.get(
        url,
        headers=HEADERS,
        params={"raw_json": 1},
        timeout=10,
    )
    resp.raise_for_status()
    data = resp.json()

    # The response is a two-element list: the post listing, then the
    # comment listing.
    post_data = data[0]["data"]["children"][0]["data"]
    comment_tree = data[1]["data"]["children"]

    lines = []

    time_str = datetime.fromtimestamp(
        post_data["created_utc"], tz=timezone.utc
    ).strftime("%Y-%m-%d %H:%M UTC")

    lines.append(time_str)
    lines.append(f" - {post_data['title']}")

    description = clean_text(post_data.get("selftext"))
    if description:
        lines.append("   Description:")
        lines.append(f"    {description}")

    if comment_tree:
        lines.append("   Comments:")
        extract_comments(comment_tree, lines)

    return {
        "content": "\n".join(lines)
    }


def build_content_json(posts):
    """Render every post in *posts* and return the results as a list.

    Posts are fetched one at a time, in order, with a one-second pause
    after each fetch.

    Args:
        posts: Iterable of post dicts accepted by ``fetch_post_content``.

    Returns:
        A list with one ``fetch_post_content`` result per post.
    """
    def _paced():
        for submission in posts:
            yield fetch_post_content(submission)
            # Runs when the consumer asks for the next item, so the pause
            # follows every fetch, the last one included.
            time.sleep(1)

    return list(_paced())


if __name__ == "__main__":
    # Scrape the recent posts, render each one, and print the result as
    # pretty-printed JSON.
    print(json.dumps(build_content_json(fetch_posts_last_xh()), indent=2))


[
  {
    "content": "2026-02-06 17:38 UTC\n - I built a minimalist Pomodoro Timer for i3wm"
  },
  {
    "content": "2026-02-06 17:28 UTC\n - black screen after bootloader\n   Description:\n    i was trying to install operazer applications(snake, razercommander, etc) to get my mice to 1600dpi, nothing was starting. everytime i tried to open them through cli, there was a import error or share library error concerning icu, fast forward to trying to fix icu. someone recommended me to uninstall icu and reinstall it, there goes my pacman, also my kitty broke because it couldnt find gi, did a reinstall of kitty before doing uninstall for icu, that fixed it. luckily before black screening i had created a boot drive for arch, i installed icu with the \\`pacman --root=/mnt -Syu icu\\` now my system wont boot. Also, i can see libvirt fails before i see the black screen with a cursor. what do i do? where do i even look? Should I just reinstall arch? I have a different partition for home director