# Data Scraping

In [13]:
import json
from pathlib import Path

import pandas as pd
import praw

To scrape posts from subreddits, first put your Reddit app's `client_id`, `client_secret`, and `user_agent` in a `secrets.json` file in the project root directory (this notebook reads it from `../secrets.json`). If you are not sure what these values are or how to obtain them, see [this blog post on the Reddit API](https://www.jcchouinard.com/get-reddit-api-credentials-with-praw/).

In [14]:
# Parse the credentials file that lives one directory above this notebook
secrets = json.loads(Path("../secrets.json").read_text())

# Pull out the three values used to authenticate against the Reddit API;
# a missing key raises KeyError right here rather than later during scraping
CLIENT_ID, CLIENT_SECRET, USER_AGENT = (
    secrets[key] for key in ("client_id", "client_secret", "user_agent")
)

Define some more configuration variables.

In [15]:
# Subreddits whose posts are collected
SUBREDDIT_NAMES = [
    "MachineLearning",
    "LearnMachineLearning",
]

# Upper bound on the number of posts requested from each subreddit.
# Fewer posts may end up being kept, since link posts (no text body) are skipped.
LIMIT = 1_000

# Output location: make sure the directory exists, then derive the CSV path from it
OUTPUT_DIR = Path("..") / "data" / "raw"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
OUTPUT_FILEPATH = OUTPUT_DIR.joinpath("reddit_posts.csv")

A helper function for scraping a subreddit.

In [16]:
def scrape_one_subreddit(subreddit):
    """Collect the newest text posts of a single subreddit.

    Requests up to ``LIMIT`` of the subreddit's newest submissions and keeps
    only self (text) posts that have a non-empty title.

    Args:
        subreddit: A praw subreddit object, i.e. anything exposing
            ``new(limit=...)`` and a ``display_name`` attribute.

    Returns:
        A list of dicts, one per kept post, with the keys ``id``,
        ``created_utc``, ``title``, ``selftext`` and ``subreddit_name``.
    """
    # Derive the name from the object that was passed in. Reading a
    # notebook-level ``subreddit_name`` variable here would raise NameError in
    # a fresh kernel and could mislabel posts with a stale loop value.
    name = subreddit.display_name

    posts = [
        {
            "id": post.id,
            "created_utc": post.created_utc,
            "title": post.title,
            "selftext": post.selftext,
            "subreddit_name": name,
        }
        for post in subreddit.new(limit=LIMIT)
        # Keep text posts only, and drop any post whose title is missing/empty
        if post.is_self and post.title
    ]
    print(f"Scraped {len(posts)} posts from r/{name}")
    return posts

Start scraping!

In [17]:
# Create the Reddit API client from the credentials loaded earlier
reddit = praw.Reddit(
    user_agent=USER_AGENT,
    client_secret=CLIENT_SECRET,
    client_id=CLIENT_ID,
)

# Column order of the resulting CSV file
headers = ["id", "created_utc", "title", "selftext", "subreddit_name"]

# Gather the posts of every configured subreddit into one flat list of records
data = []
for subreddit_name in SUBREDDIT_NAMES:
    subreddit = reddit.subreddit(subreddit_name)
    data.extend(scrape_one_subreddit(subreddit))

# Build the table once from all records and write it out without the index column
df = pd.DataFrame(data, columns=headers)
df.to_csv(OUTPUT_FILEPATH, index=False)

Scraped 847 posts from r/MachineLearning
Scraped 715 posts from r/LearnMachineLearning
