### Below is a concise Python template that:
1. Loops over the subreddit list.
2. Downloads both submissions and comments for 2021–2025 from Arctic Shift's API.
3. Writes two raw JSONL files (posts and comments) per subreddit.
4. Merges everything and filters by the keyword list into a final dataset.

In [1]:
import json
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests

## Obtain the base URLs of the search endpoints
from the Arctic Shift GitHub README (https://github.com/ArthurHeitmann/arctic_shift/blob/master/api/README.md)

In [2]:
# Both search endpoints share one API root; build them from it so the host
# only has to be changed in a single place.
_ARCTIC_SHIFT_API_ROOT = "https://arctic-shift.photon-reddit.com/api"
BASE_URL_POSTS = _ARCTIC_SHIFT_API_ROOT + "/posts/search"
BASE_URL_COMMENTS = _ARCTIC_SHIFT_API_ROOT + "/comments/search"

# Run switch: True scrapes a single test subreddit, False scrapes the full list.
TEST_MODE = False

In [None]:
# Search parameters

# Core topics
# Subreddits scraped in a full (non-test) run. Each name is passed unchanged
# as the `subreddit` query parameter and is reused in the raw output file names.
ALL_SUBREDDITS = [
    "economy",
    "Economics",
    "AskEconomics",
    "personalfinance",
    "povertyfinance",
    "financialindependence",
    "PersonalFinanceCanada",
    "investing",
    "stocks",
    "RealEstate",
    "RealEstateInvesting",
    "personalfinancebanking",
    "creditcards",
]

# Search period (from...to...)
# Handed to the fetch helpers as their `after` / `before` arguments.
AFTER_DATE = "2021-01-01"
BEFORE_DATE = "2024-03-31"

In [4]:
### CHANGE TEST_MODE TO FALSE FOR FORMAL SCRAPING
# Subreddit used on its own when TEST_MODE is enabled.
TEST_SUBREDDIT = "personalfinance"

# Test runs scrape only TEST_SUBREDDIT; full runs scrape every configured one.
SUBREDDITS = [TEST_SUBREDDIT] if TEST_MODE else ALL_SUBREDDITS

In [5]:
# Root folder for everything the scraper writes.
BASE_PATH = Path("/Users/apple/Desktop/30112_python/Scrape_Reddit/scraped_data")

# Raw per-subreddit downloads go into a sub-folder of BASE_PATH, which is
# created up front together with any missing parent directories.
OUT_DIR = BASE_PATH.joinpath("arctic_raw")
OUT_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
# Keywords to search for if needed later

# Keyword phrases grouped by theme; KEYWORDS below is their concatenation in
# this exact order.

# Core inflation/econ
_CORE_ECON_KEYWORDS = (
    "inflation",
    "cost of living",
    "cost-of-living",
    "high cost of living",
    "living costs",
    "price increases",
    "rising prices",
    "cpi",
    "consumer price index",
    "interest rates",
    "mortgage rates",
    "fed",
    "federal reserve",
    "rate hike",
    "rate hikes",
    "rate increase",
    "rate increases",
)

# Everyday expenses
_EXPENSE_KEYWORDS = (
    "gas prices",
    "gas price",
    "grocery prices",
    "grocery bill",
    "food prices",
    "food bill",
    "rent increase",
    "rent hike",
    "higher rent",
    "rent is too high",
    "housing costs",
    "housing affordability",
    "property taxes",
    "electric bill",
    "electricity bill",
    "energy bill",
    "heating bill",
    "gas bill",
    "utility bills",
    "utilities",
)

# Income / strain
_STRAIN_KEYWORDS = (
    "wage stagnation",
    "wages not keeping up",
    "real wages",
    "paycheck to paycheck",
    "making ends meet",
    "can't afford",
    "cannot afford",
)

KEYWORDS = [*_CORE_ECON_KEYWORDS, *_EXPENSE_KEYWORDS, *_STRAIN_KEYWORDS]

# Lower-cased copy used for case-insensitive matching.
KEYWORDS_LOWER = list(map(str.lower, KEYWORDS))

### Helper functions

>Fetch posts

In [7]:
def fetch_posts(subreddit: str, after: str, before: str, outfile: Path):
    """Download all posts of *subreddit* in the date window into a JSONL file.

    Pages backwards in time (newest first) through the posts search endpoint
    and writes one JSON object per line to *outfile*, which is truncated first.

    Paging does not step to ``oldest - 1``: that skips records sharing the
    oldest second of a page that did not fit in it (and, if the API treats
    ``before`` as exclusive, the whole preceding second as well). Instead the
    boundary second is requested again and records already written are dropped
    by id, so the result is complete whether ``before`` is inclusive or
    exclusive.

    Args:
        subreddit: Subreddit name, sent as the ``subreddit`` parameter.
        after: Lower bound of the window, sent unchanged as ``after``.
        before: Upper bound of the window, used as the first ``before`` value.
        outfile: Destination path of the JSONL file.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Date string on the first request, epoch seconds (int) afterwards.
    cursor = before
    # id -> created_utc of records already written that a later request may
    # serve again; only records near the current page boundary are kept.
    boundary_seen = {}

    with outfile.open("w", encoding="utf-8") as f_out:
        while True:
            params = {
                "subreddit": subreddit,
                "after": after,
                "before": str(cursor),
                "limit": "auto",
                "sort": "desc",
            }
            r = requests.get(BASE_URL_POSTS, params=params, timeout=60)
            r.raise_for_status()
            data = r.json()

            # The payload is either the list itself or wrapped in a 'data' field.
            items = data.get("data", []) if isinstance(data, dict) else data

            if not isinstance(items, list) or not items:
                break

            progressed = False  # did this page contain a not-yet-written id?
            stamps = []
            for it in items:
                ts = it.get("created_utc")
                numeric_ts = isinstance(ts, (int, float))
                if numeric_ts:
                    stamps.append(ts)

                item_id = it.get("id")
                if item_id is not None:
                    if item_id in boundary_seen:
                        continue  # already written by an earlier page
                    # -inf keeps records without a usable timestamp tracked
                    # forever, since they can never be aged out by time.
                    boundary_seen[item_id] = ts if numeric_ts else float("-inf")
                    progressed = True
                f_out.write(json.dumps(it) + "\n")

            if not stamps:
                break

            oldest = min(stamps)
            if oldest <= 0:
                break

            # The next request never reaches past oldest + 1, so anything
            # newer cannot be served again and need not be remembered.
            boundary_seen = {
                i: t for i, t in boundary_seen.items() if t <= oldest + 1
            }

            # Re-request the boundary second after a productive page; after a
            # page of pure duplicates, move strictly downward so the loop
            # always terminates.
            next_cursor = int(oldest) + 1 if progressed else int(oldest)
            if isinstance(cursor, int):
                if progressed:
                    next_cursor = min(next_cursor, cursor)
                elif next_cursor >= cursor:
                    next_cursor = cursor - 1
            cursor = next_cursor
            time.sleep(0.2)

>Fetch comments

In [8]:
def fetch_comments(subreddit: str, after: str, before: str, outfile: Path):
    """Download all comments of *subreddit* in the date window into a JSONL file.

    Pages backwards in time (newest first) through the comments search
    endpoint and writes one JSON object per line to *outfile*, which is
    truncated first.

    Paging does not step to ``oldest - 1``: that skips records sharing the
    oldest second of a page that did not fit in it (and, if the API treats
    ``before`` as exclusive, the whole preceding second as well). Instead the
    boundary second is requested again and records already written are dropped
    by id, so the result is complete whether ``before`` is inclusive or
    exclusive.

    Args:
        subreddit: Subreddit name, sent as the ``subreddit`` parameter.
        after: Lower bound of the window, sent unchanged as ``after``.
        before: Upper bound of the window, used as the first ``before`` value.
        outfile: Destination path of the JSONL file.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Date string on the first request, epoch seconds (int) afterwards.
    cursor = before
    # id -> created_utc of records already written that a later request may
    # serve again; only records near the current page boundary are kept.
    boundary_seen = {}

    with outfile.open("w", encoding="utf-8") as f_out:
        while True:
            params = {
                "subreddit": subreddit,
                "after": after,
                "before": str(cursor),
                "limit": "auto",
                "sort": "desc",
            }
            r = requests.get(BASE_URL_COMMENTS, params=params, timeout=60)
            r.raise_for_status()
            data = r.json()

            # The payload is either the list itself or wrapped in a 'data' field.
            items = data.get("data", []) if isinstance(data, dict) else data

            if not isinstance(items, list) or not items:
                break

            progressed = False  # did this page contain a not-yet-written id?
            stamps = []
            for it in items:
                ts = it.get("created_utc")
                numeric_ts = isinstance(ts, (int, float))
                if numeric_ts:
                    stamps.append(ts)

                item_id = it.get("id")
                if item_id is not None:
                    if item_id in boundary_seen:
                        continue  # already written by an earlier page
                    # -inf keeps records without a usable timestamp tracked
                    # forever, since they can never be aged out by time.
                    boundary_seen[item_id] = ts if numeric_ts else float("-inf")
                    progressed = True
                f_out.write(json.dumps(it) + "\n")

            if not stamps:
                break

            oldest = min(stamps)
            if oldest <= 0:
                break

            # The next request never reaches past oldest + 1, so anything
            # newer cannot be served again and need not be remembered.
            boundary_seen = {
                i: t for i, t in boundary_seen.items() if t <= oldest + 1
            }

            # Re-request the boundary second after a productive page; after a
            # page of pure duplicates, move strictly downward so the loop
            # always terminates.
            next_cursor = int(oldest) + 1 if progressed else int(oldest)
            if isinstance(cursor, int):
                if progressed:
                    next_cursor = min(next_cursor, cursor)
                elif next_cursor >= cursor:
                    next_cursor = cursor - 1
            cursor = next_cursor
            time.sleep(0.2)

In [9]:
# Compiled keyword patterns, keyed by the keyword tuple they were built from,
# so the regex is compiled once rather than for every post or comment.
_KEYWORD_PATTERN_CACHE = {}


def _keyword_pattern(keywords):
    """Return a compiled regex matching any of *keywords* at a word start."""
    key = tuple(keywords)
    pattern = _KEYWORD_PATTERN_CACHE.get(key)
    if pattern is None:
        alternatives = [re.escape(k.lower()) for k in key if k]
        if alternatives:
            # (?<!\w) rejects hits that begin in the middle of a word.
            source = r"(?<!\w)(?:" + "|".join(alternatives) + ")"
        else:
            # (?!) never matches: an empty keyword list matches nothing.
            source = r"(?!)"
        pattern = re.compile(source)
        _KEYWORD_PATTERN_CACHE[key] = pattern
    return pattern


def match_keywords(text: str, keywords=None) -> bool:
    """Return True if *text* contains any keyword, ignoring case.

    A keyword only counts when it begins at the start of a word, so short
    keywords such as "fed" no longer match inside unrelated words like
    "offered". Suffixed forms still match ("rate hike" in "rate hikes",
    "inflation" in "inflationary").

    Args:
        text: Text to search; empty or None text never matches.
        keywords: Keywords to look for. Defaults to the module-level
            KEYWORDS_LOWER list.
    """
    if not text:
        return False
    if keywords is None:
        keywords = KEYWORDS_LOWER
    return _keyword_pattern(keywords).search(text.lower()) is not None

In [None]:
def _iter_jsonl(path: Path):
    """Yield one decoded JSON object per non-blank line of *path*."""
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # tolerate blank lines instead of crashing json.loads
                yield json.loads(line)


def build_filtered_dataset():
    """Merge the raw JSONL files into one keyword-filtered parquet dataset.

    For every subreddit in SUBREDDITS, reads the raw posts and comments files
    from OUT_DIR. A post is kept when its title or selftext matches a keyword.
    A comment is kept when its own body matches, or when it belongs to a kept
    post. All kept rows are written to a single parquet file under BASE_PATH
    (with a ``_TEST`` suffix when TEST_MODE is on).

    A missing raw file is reported and skipped rather than ignored silently,
    so a file-name mismatch does not quietly produce an empty dataset.
    """
    # Fixed column list so that an empty result still carries the full schema.
    columns = [
        "type",
        "id",
        "link_id",
        "parent_id",
        "subreddit",
        "title",
        "body",
        "created_utc",
        "score",
        "num_comments",
    ]
    matched_posts = []
    matched_post_ids = set()
    matched_comments = []

    # Submissions: must all be processed first, because comments are also
    # kept based on the ids of matched posts.
    for subreddit in SUBREDDITS:
        sub_safe = subreddit.replace("/", "_")
        posts_path = OUT_DIR / f"{sub_safe}_posts_2021_2024.jsonl"
        if not posts_path.exists():
            print(f"Warning: raw posts file not found, skipping: {posts_path}")
            continue

        for d in _iter_jsonl(posts_path):
            title = d.get("title", "") or ""
            body = d.get("selftext", "") or ""
            if not match_keywords(f"{title} {body}"):
                continue

            post_id = d.get("id")
            matched_posts.append(
                {
                    "type": "submission",
                    "id": post_id,
                    "link_id": post_id,
                    "parent_id": None,
                    "subreddit": d.get("subreddit"),
                    "title": title,
                    "body": body,
                    "created_utc": d.get("created_utc"),
                    "score": d.get("score"),
                    "num_comments": d.get("num_comments"),
                }
            )
            if post_id is not None:
                matched_post_ids.add(post_id)

    # Comments
    for subreddit in SUBREDDITS:
        sub_safe = subreddit.replace("/", "_")
        comments_path = OUT_DIR / f"{sub_safe}_comments_2021_2024.jsonl"
        if not comments_path.exists():
            print(f"Warning: raw comments file not found, skipping: {comments_path}")
            continue

        for d in _iter_jsonl(comments_path):
            body = d.get("body", "") or ""
            link_id = d.get("link_id")

            # Strip the "t3_" prefix so the value is comparable to post ids.
            norm_link_id = link_id
            if isinstance(link_id, str) and link_id.startswith("t3_"):
                norm_link_id = link_id[3:]

            in_matched_post = bool(norm_link_id) and norm_link_id in matched_post_ids
            if not (match_keywords(body) or in_matched_post):
                continue

            matched_comments.append(
                {
                    "type": "comment",
                    "id": d.get("id"),
                    "link_id": norm_link_id,
                    "parent_id": d.get("parent_id"),
                    "subreddit": d.get("subreddit"),
                    "title": None,
                    "body": body,
                    "created_utc": d.get("created_utc"),
                    "score": d.get("score"),
                    "num_comments": None,
                }
            )

    df = pd.DataFrame(matched_posts + matched_comments, columns=columns)
    out_name = (
        "reddit_inflation_2021_2024_posts_and_comments_TEST.parquet"
        if TEST_MODE
        else "reddit_inflation_2021_2024_posts_and_comments.parquet"
    )
    df.to_parquet(BASE_PATH / out_name, engine="pyarrow", index=False)

### Running

In [None]:
if __name__ == "__main__":
    # Echo the run configuration first so a wrong setting is spotted before
    # the long download starts.
    print(f"TEST_MODE = {TEST_MODE}")
    print("Subreddits:", SUBREDDITS)
    print("Date range:", AFTER_DATE, "to", BEFORE_DATE)
    print("Base path:", BASE_PATH)
    print("Raw output dir:", OUT_DIR)

    for sub in SUBREDDITS:
        print(f"\nDownloading posts for r/{sub}...")
        sub_safe = sub.replace("/", "_")
        # These names must match what build_filtered_dataset() reads
        # ("*_2021_2024.jsonl"); with any other suffix the filter step finds
        # no raw files and produces an empty dataset.
        posts_out = OUT_DIR / f"{sub_safe}_posts_2021_2024.jsonl"
        comments_out = OUT_DIR / f"{sub_safe}_comments_2021_2024.jsonl"

        fetch_posts(sub, AFTER_DATE, BEFORE_DATE, posts_out)
        print(f"Saved posts to {posts_out}")

        print(f"Downloading comments for r/{sub}...")
        fetch_comments(sub, AFTER_DATE, BEFORE_DATE, comments_out)
        print(f"Saved comments to {comments_out}")

    print("\nBuilding filtered dataset...")
    build_filtered_dataset()
    print("Done.")

TEST_MODE = False
Subreddits: ['economy', 'Economics', 'AskEconomics', 'personalfinance', 'povertyfinance', 'financialindependence', 'PersonalFinanceCanada', 'investing', 'stocks', 'RealEstate', 'RealEstateInvesting', 'personalfinancebanking', 'creditcards']
Date range: 2020-01-01 to 2025-12-31
Base path: /Users/apple/Desktop/30112_python/Scrape_Reddit/scraped_data
Raw output dir: /Users/apple/Desktop/30112_python/Scrape_Reddit/scraped_data/arctic_raw

Downloading posts for r/economy...
Saved posts to /Users/apple/Desktop/30112_python/Scrape_Reddit/scraped_data/arctic_raw/economy_posts_2020_2025.jsonl
Downloading comments for r/economy...


KeyboardInterrupt: 