# Different Approaches to Getting Reddit Data (best so far: chunking)

In [5]:
# Use the %pip magic so the package lands in the running kernel's environment,
# and pin the version so re-running the notebook installs the same release.
%pip install praw==7.8.1

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


# Basic API Usage on Keyword (can only get 100 posts)

In [27]:
import os

import praw

# Set up your Reddit API client.
# Credentials come from the environment so they are never stored in the
# notebook; set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting
# Jupyter. Keys that were previously committed in this notebook should be
# treated as leaked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'your_user_agent')
)

# Search for posts with the keyword "Northwestern University"
posts = reddit.subreddit('all').search('Northwestern University', sort='relevance', limit=1000)

# Extract data: print the main fields of every returned submission
for post in posts:
    print(f"Title: {post.title}")
    print(f"Score: {post.score}")
    print(f"URL: {post.url}")
    print(f"Created: {post.created_utc}")
    print(f"Subreddit: {post.subreddit}")
    print(f"Text: {post.selftext}")
    print("\n")


Title: Thoughts on Northwestern University
Score: 2
URL: https://www.reddit.com/r/ApplyingToCollege/comments/1du9mxd/thoughts_on_northwestern_university/
Created: 1719996566.0
Subreddit: ApplyingToCollege
Text: For starters, I am going into my Sophomore year of High School. I’m young, I know, and have been taking a look into the various colleges and universities that have peaked my interest. I want to study Journalism and my dream goal is to write for CNN or possibly help produce specials. Northwestern University has from what I’ve seen, a very solid Journalism program but everywhere I look, all I see are these massive price points. My question is: is it worth it? Scholarships are a possibility as I’m second in my class with a 4.727 GPA. I plan on taking as many English and World AP Classes as reasonably possible. I have a loose tie to the school as my mom’s uncle is an alumni. An additional question I have would be: What classes or clubs would you recommend to show that I can particip

In [18]:
# Tally the submissions returned by the keyword search.
# Relies on the `reddit` client created in an earlier cell.
post_count = len(list(
    reddit.subreddit('all').search('Northwestern University', sort='relevance')
))

print(f"Total number of posts found: {post_count}")


Total number of posts found: 100


# Try to Access All Posts

In [24]:
import os

# Reddit API settings, read from the environment so no secret is stored in the
# notebook (empty string when the variable is not set).
# Note: the earlier version ended the first two assignments with a trailing
# comma, which made them 1-tuples instead of strings.
YOUR_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID', '')
YOUR_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET', '')
YOUR_USER_AGENT = os.environ.get('REDDIT_USER_AGENT', 'your_user_agent')

# Store the posts and comments locally

In [31]:
import praw
import time
import json
import os

# Set up your Reddit API client.
# Credentials come from the environment (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET)
# so they are never stored in the notebook; a missing variable raises KeyError
# here instead of failing later with an opaque authentication error.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'your_user_agent')
)

def fetch_posts_and_comments(subreddit_name, query, output_dir, batch_size=10000):
    """Search a subreddit and store the matching posts with their comments as JSON.

    Search results are requested page by page. Every post is stored together
    with its fully expanded comment list. Whenever `batch_size` posts have
    accumulated they are written to ``posts_batch_<n>.json`` in `output_dir`;
    whatever is left at the end is written to one more file.

    Args:
        subreddit_name: Name handed to ``reddit.subreddit`` (e.g. ``'all'``).
        query: Search string.
        output_dir: Existing directory that receives the batch files.
        batch_size: Number of collected posts that triggers a file write.

    Any exception raised while fetching is printed and ends the crawl; posts
    collected up to that point are still saved. Uses the module-level `reddit`
    client.
    """
    subreddit = reddit.subreddit(subreddit_name)
    limit = 100  # Fetch 100 posts per request (maximum allowed per API request)
    after = None
    posts_collected = []
    seen_ids = set()  # guards against the same post arriving on two pages
    file_count = 0

    def save_batch(remaining=False):
        """Write the buffered posts to the next batch file and empty the buffer."""
        nonlocal file_count, posts_collected
        file_count += 1
        output_file = os.path.join(output_dir, f"posts_batch_{file_count}.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(posts_collected, f, ensure_ascii=False, indent=4)
        label = "remaining " if remaining else ""
        print(f"Saved {label}{len(posts_collected)} posts to {output_file}")
        posts_collected = []  # Reset for the next batch

    while True:
        try:
            results = subreddit.search(query, sort='relevance', limit=limit, params={'after': after})
            batch_posts = list(results)

            if not batch_posts:  # If no posts are returned, we've reached the end
                break

            for submission in batch_posts:
                if submission.id in seen_ids:
                    continue
                seen_ids.add(submission.id)

                post_data = {
                    'post_id': submission.id,
                    'title': submission.title,
                    'selftext': submission.selftext,
                    'created_utc': submission.created_utc,
                    'subreddit': submission.subreddit.display_name,
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'ups': submission.ups,
                    'downs': submission.downs,
                    'url': submission.url,
                    'comments': []
                }

                # Fetch comments
                submission.comments.replace_more(limit=None)  # Expand all comments
                for comment in submission.comments.list():
                    post_data['comments'].append({
                        'comment_id': comment.id,
                        'body': comment.body,
                        'created_utc': comment.created_utc,
                        'score': comment.score,
                        'parent_id': comment.parent_id
                    })

                posts_collected.append(post_data)

                # Save intermediate results
                if len(posts_collected) >= batch_size:
                    save_batch()

            # Reddit listings are anchored by *fullname* (e.g. "t3_abc123"),
            # not by the bare id, so the fullname must be sent as 'after'.
            next_after = batch_posts[-1].fullname
            if next_after == after:
                break  # The listing did not advance; stop instead of looping forever
            after = next_after
            time.sleep(2)  # Avoid hitting Reddit's rate limit
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    # Save remaining posts if any
    if posts_collected:
        save_batch(remaining=True)

# Search configuration
subreddit_name, query = 'all', 'Northwestern University'
output_dir = './downloads/reddit_data'  # batch files are written here

# Make sure the target directory exists before anything is written to it
os.makedirs(output_dir, exist_ok=True)

# Start the crawl
fetch_posts_and_comments(subreddit_name, query, output_dir)


An error occurred: received 429 HTTP response
Saved remaining 23 posts to ./reddit_data/posts_batch_1.json


In [32]:
import praw
import time
import json
import os

# Set up your Reddit API client.
# Credentials come from the environment (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET)
# so they are never stored in the notebook; a missing variable raises KeyError
# here instead of failing later with an opaque authentication error.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'your_user_agent')
)

def fetch_posts_and_comments(subreddit_name, query, output_dir, batch_size=10000):
    """Resumable search crawl: store matching posts and their comments as JSON.

    Post ids found in the ``*.json`` files already present in `output_dir` are
    treated as done and skipped. New posts are buffered and written to
    ``posts_batch_<n>.json`` whenever `batch_size` posts have accumulated; the
    remainder is written at the end. Existing batch files are never
    overwritten: the next unused index is chosen for every new file.

    Args:
        subreddit_name: Name handed to ``reddit.subreddit`` (e.g. ``'all'``).
        query: Search string.
        output_dir: Existing directory holding previous and new batch files.
        batch_size: Number of collected posts that triggers a file write.

    Rate-limit errors while loading comments are retried (up to three attempts,
    one minute apart). Any other exception is printed and ends the crawl; posts
    collected up to that point are still saved. Uses the module-level `reddit`
    client.
    """
    subreddit = reddit.subreddit(subreddit_name)
    limit = 100  # Fetch 100 posts per request (maximum allowed per API request)
    after = None
    posts_collected = []
    file_count = 0
    processed_ids = set()
    max_comment_attempts = 3

    # Load existing batch files to get processed post IDs
    for filename in os.listdir(output_dir):
        if filename.endswith('.json'):
            with open(os.path.join(output_dir, filename), 'r', encoding='utf-8') as f:
                for post in json.load(f):
                    processed_ids.add(post['post_id'])
    print(f"Resuming with {len(processed_ids)} posts already processed.")

    def is_rate_limited(error):
        """True for an HTTP 429 response error or an API error mentioning RATELIMIT."""
        # prawcore response errors carry the HTTP response on `.response`
        # (the failure seen in this notebook reads "received 429 HTTP response").
        status = getattr(getattr(error, 'response', None), 'status_code', None)
        return status == 429 or 'RATELIMIT' in str(error)

    def fetch_comments(submission):
        """Return all comments of `submission`, retrying when rate limited."""
        for attempt in range(1, max_comment_attempts + 1):
            try:
                submission.comments.replace_more(limit=None)
                return [
                    {
                        'comment_id': comment.id,
                        'body': comment.body,
                        'created_utc': comment.created_utc,
                        'score': comment.score,
                        'parent_id': comment.parent_id
                    }
                    for comment in submission.comments.list()
                ]
            except Exception as e:
                if not is_rate_limited(e) or attempt == max_comment_attempts:
                    raise  # Handled by the caller, which saves what was collected
                print("Rate limit exceeded. Retrying in 1 minute...")
                time.sleep(60)

    def save_batch(remaining=False):
        """Write the buffered posts to the next unused batch file and empty the buffer."""
        nonlocal file_count, posts_collected
        # Skip indexes that already exist so a resumed run cannot overwrite the
        # files its processed ids were loaded from.
        while True:
            file_count += 1
            output_file = os.path.join(output_dir, f"posts_batch_{file_count}.json")
            if not os.path.exists(output_file):
                break
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(posts_collected, f, ensure_ascii=False, indent=4)
        label = "remaining " if remaining else ""
        print(f"Saved {label}{len(posts_collected)} posts to {output_file}")
        posts_collected = []  # Reset for the next batch

    while True:
        try:
            results = subreddit.search(query, sort='relevance', limit=limit, params={'after': after})
            batch_posts = list(results)

            if not batch_posts:  # If no posts are returned, we've reached the end
                break

            for submission in batch_posts:
                if submission.id in processed_ids:
                    continue  # Skip posts already processed

                post_data = {
                    'post_id': submission.id,
                    'title': submission.title,
                    'selftext': submission.selftext,
                    'created_utc': submission.created_utc,
                    'subreddit': submission.subreddit.display_name,
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'ups': submission.ups,
                    'downs': submission.downs,
                    'url': submission.url,
                    'comments': []
                }

                # Fetch comments with retry and rate-limit handling
                post_data['comments'] = fetch_comments(submission)
                time.sleep(2)  # Add delay between fetching posts and comments

                posts_collected.append(post_data)
                processed_ids.add(submission.id)

                # Save intermediate results
                if len(posts_collected) >= batch_size:
                    save_batch()

            # Reddit listings are anchored by *fullname* (e.g. "t3_abc123"),
            # not by the bare id, so the fullname must be sent as 'after'.
            next_after = batch_posts[-1].fullname
            if next_after == after:
                break  # The listing did not advance; stop instead of looping forever
            after = next_after
            time.sleep(5)  # Avoid hitting Reddit's rate limit
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    # Save remaining posts if any
    if posts_collected:
        save_batch(remaining=True)

# Search configuration
subreddit_name, query = 'all', 'Northwestern University'
output_dir = './downloads/reddit_data'  # batch files are written here

# Make sure the target directory exists before it is listed or written to
os.makedirs(output_dir, exist_ok=True)

# Start (or resume) the crawl
fetch_posts_and_comments(subreddit_name, query, output_dir)


Resuming with 23 posts already processed.


KeyboardInterrupt: 

# Only posts with substantial text (selftext longer than 100 characters), plus their comments

In [34]:
import praw
import time
import json
import os

# Set up your Reddit API client.
# Credentials come from the environment (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET)
# so they are never stored in the notebook; a missing variable raises KeyError
# here instead of failing later with an opaque authentication error.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'your_user_agent')
)

def fetch_posts_and_comments(subreddit_name, query, output_dir, batch_size=100):
    """Resumable search crawl that keeps only posts with more than 100 characters of selftext.

    Post ids found in the ``*.json`` files already present in `output_dir` are
    treated as done and skipped, as are posts whose selftext has 100 characters
    or fewer. Kept posts are stored with their fully expanded comments and
    written to ``posts_batch_<n>.json`` whenever `batch_size` posts have
    accumulated; the remainder is written at the end. Existing batch files are
    never overwritten: the next unused index is chosen for every new file.

    Args:
        subreddit_name: Name handed to ``reddit.subreddit`` (e.g. ``'all'``).
        query: Search string.
        output_dir: Existing directory holding previous and new batch files.
        batch_size: Number of collected posts that triggers a file write.

    Rate-limit errors while loading comments are retried (up to three attempts,
    one minute apart). Any other exception is printed and ends the crawl; posts
    collected up to that point are still saved. Uses the module-level `reddit`
    client.
    """
    subreddit = reddit.subreddit(subreddit_name)
    limit = 100  # Fetch 100 posts per request (maximum allowed per API request)
    after = None
    posts_collected = []
    file_count = 0
    processed_ids = set()
    max_comment_attempts = 3

    # Load existing batch files to get processed post IDs
    for filename in os.listdir(output_dir):
        if filename.endswith('.json'):
            with open(os.path.join(output_dir, filename), 'r', encoding='utf-8') as f:
                for post in json.load(f):
                    processed_ids.add(post['post_id'])
    print(f"Resuming with {len(processed_ids)} posts already processed.")

    def is_rate_limited(error):
        """True for an HTTP 429 response error or an API error mentioning RATELIMIT."""
        # prawcore response errors carry the HTTP response on `.response`
        # (the failure seen in this notebook reads "received 429 HTTP response").
        status = getattr(getattr(error, 'response', None), 'status_code', None)
        return status == 429 or 'RATELIMIT' in str(error)

    def fetch_comments(submission):
        """Return all comments of `submission`, retrying when rate limited."""
        for attempt in range(1, max_comment_attempts + 1):
            try:
                submission.comments.replace_more(limit=None)
                return [
                    {
                        'comment_id': comment.id,
                        'body': comment.body,
                        'created_utc': comment.created_utc,
                        'score': comment.score,
                        'parent_id': comment.parent_id
                    }
                    for comment in submission.comments.list()
                ]
            except Exception as e:
                if not is_rate_limited(e) or attempt == max_comment_attempts:
                    raise  # Handled by the caller, which saves what was collected
                print("Rate limit exceeded. Retrying in 1 minute...")
                time.sleep(60)

    def save_batch(remaining=False):
        """Write the buffered posts to the next unused batch file and empty the buffer."""
        nonlocal file_count, posts_collected
        # Skip indexes that already exist so a resumed run cannot overwrite the
        # files its processed ids were loaded from.
        while True:
            file_count += 1
            output_file = os.path.join(output_dir, f"posts_batch_{file_count}.json")
            if not os.path.exists(output_file):
                break
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(posts_collected, f, ensure_ascii=False, indent=4)
        label = "remaining " if remaining else ""
        print(f"Saved {label}{len(posts_collected)} posts to {output_file}")
        posts_collected = []  # Reset for the next batch

    while True:
        try:
            results = subreddit.search(query, sort='relevance', limit=limit, params={'after': after})
            batch_posts = list(results)

            if not batch_posts:  # If no posts are returned, we've reached the end
                break

            for submission in batch_posts:
                if submission.id in processed_ids:
                    continue  # Skip posts already processed

                # Filter by selftext length
                if len(submission.selftext) <= 100:
                    continue  # Skip posts with selftext <= 100 characters

                post_data = {
                    'post_id': submission.id,
                    'title': submission.title,
                    'selftext': submission.selftext,
                    'created_utc': submission.created_utc,
                    'subreddit': submission.subreddit.display_name,
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'ups': submission.ups,
                    'downs': submission.downs,
                    'url': submission.url,
                    'comments': []
                }

                # Fetch comments for the post
                post_data['comments'] = fetch_comments(submission)
                time.sleep(2)  # Add delay between fetching posts and comments

                posts_collected.append(post_data)
                processed_ids.add(submission.id)

                # Save intermediate results
                if len(posts_collected) >= batch_size:
                    save_batch()

            # Reddit listings are anchored by *fullname* (e.g. "t3_abc123"),
            # not by the bare id, so the fullname must be sent as 'after'.
            next_after = batch_posts[-1].fullname
            if next_after == after:
                break  # The listing did not advance; stop instead of looping forever
            after = next_after
            time.sleep(5)  # Avoid hitting Reddit's rate limit
        except Exception as e:
            print(f"An error occurred: {e}")
            break

    # Save remaining posts if any
    if posts_collected:
        save_batch(remaining=True)

# Search configuration
subreddit_name, query = 'all', 'Northwestern University'
output_dir = './downloads/reddit_data_text'  # batch files are written here

# Make sure the target directory exists before it is listed or written to
os.makedirs(output_dir, exist_ok=True)

# Start (or resume) the crawl
fetch_posts_and_comments(subreddit_name, query, output_dir)


Resuming with 0 posts already processed.
An error occurred: received 429 HTTP response
Saved remaining 9 posts to ./reddit_data_text/posts_batch_1.json


# Chunking instead of "after" --> BEST SO FAR

In [39]:
import praw
import time
import json
import os

# Set up your Reddit API client.
# Credentials come from the environment (REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET)
# so they are never stored in the notebook; a missing variable raises KeyError
# here instead of failing later with an opaque authentication error.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'your_user_agent')
)

def fetch_subreddit_posts(subreddit_name, output_dir, batch_size=1000, chunk_size=100):
    """Walk a subreddit's top listing and store text posts as JSON batches.

    Posts are pulled from ``subreddit.top(limit=None)`` in chunks of
    `chunk_size`. A post is kept when its id does not appear in the ``*.json``
    files already present in `output_dir` and its selftext is longer than 100
    characters. Kept posts are written to ``posts_batch_<n>.json`` once at
    least `batch_size` of them have accumulated; the remainder is written at
    the end. Existing batch files are never overwritten: the next unused index
    is chosen for every new file.

    Args:
        subreddit_name: Subreddit name without the ``r/`` prefix.
        output_dir: Existing directory holding previous and new batch files.
        batch_size: Number of collected posts that triggers a file write.
        chunk_size: Number of listing items consumed between pauses; nothing
            is fetched when it is smaller than 1.

    Any exception raised while fetching is printed and ends the crawl; posts
    collected up to that point are still saved. Uses the module-level `reddit`
    client.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts_collected = []
    file_count = 0
    processed_ids = set()

    # Load existing batch files to get processed post IDs
    for filename in os.listdir(output_dir):
        if filename.endswith('.json'):
            with open(os.path.join(output_dir, filename), 'r', encoding='utf-8') as f:
                for post in json.load(f):
                    processed_ids.add(post['post_id'])
    print(f"Resuming with {len(processed_ids)} posts already processed.")

    def save_batch(remaining=False):
        """Write the buffered posts to the next unused batch file and empty the buffer."""
        nonlocal file_count, posts_collected
        # Skip indexes that already exist so a resumed run cannot overwrite the
        # files its processed ids were loaded from.
        while True:
            file_count += 1
            output_file = os.path.join(output_dir, f"posts_batch_{file_count}.json")
            if not os.path.exists(output_file):
                break
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(posts_collected, f, ensure_ascii=False, indent=4)
        label = "remaining " if remaining else ""
        print(f"Saved {label}{len(posts_collected)} posts to {output_file}")
        posts_collected = []  # Reset for the next batch

    # Consume one long-lived listing in chunks of size 'chunk_size'
    post_iterator = iter(subreddit.top(limit=None))  # or subreddit.new() based on your preference
    exhausted = False

    # Termination is driven by the iterator running dry, not by a chunk being
    # empty: a whole chunk can be filtered out while more posts remain.
    while chunk_size > 0 and not exhausted:
        try:
            batch_posts = []
            for _ in range(chunk_size):
                submission = next(post_iterator, None)
                if submission is None:
                    exhausted = True  # No more posts left to process
                    break

                if submission.id in processed_ids:
                    continue  # Skip posts already processed

                # Filter by selftext length (only process posts with selftext > 100 characters)
                if len(submission.selftext) <= 100:
                    continue  # Skip posts with selftext <= 100 characters

                batch_posts.append({
                    'post_id': submission.id,
                    'title': submission.title,
                    'selftext': submission.selftext,
                    'created_utc': submission.created_utc,
                    'subreddit': submission.subreddit.display_name,
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'ups': submission.ups,
                    'downs': submission.downs,
                    'url': submission.url
                })
                processed_ids.add(submission.id)

            # Add the fetched posts to the collected posts list
            posts_collected.extend(batch_posts)

            # Save intermediate results in batches
            if len(posts_collected) >= batch_size:
                save_batch()

            if not exhausted:
                time.sleep(2)  # Avoid hitting Reddit's rate limit

        except Exception as e:
            # Broad on purpose: HTTP/network failures are raised by prawcore and
            # are not PRAWException subclasses; catching them here keeps the
            # already collected posts from being lost.
            print(f"An error occurred: {e}")
            break

    # Save remaining posts if any
    if posts_collected:
        save_batch(remaining=True)

# Crawl configuration
output_dir = './downloads/reddit_data_subreddit'  # batch files are written here
subreddit_name = 'northwestern'  # subreddit to crawl, given without the 'r/' prefix

# Make sure the target directory exists before it is listed or written to
os.makedirs(output_dir, exist_ok=True)

# Start (or resume) the crawl
fetch_subreddit_posts(subreddit_name, output_dir)


Resuming with 0 posts already processed.
Saved remaining 483 posts to ./reddit_data_subreddit/posts_batch_1.json


# Connect via Pushshift.io (attempted: the request is rejected with 403 "Not authenticated")

In [3]:
import requests

# Make a query to Pushshift to get Reddit posts
url = 'https://api.pushshift.io/reddit/search/submission/'
params = {
    'q': 'Northwestern',
    'size': 100,  # Number of posts to retrieve
}

try:
    # Without a timeout the cell would wait indefinitely on an unresponsive server
    response = requests.get(url, params=params, timeout=30)
except requests.RequestException as exc:
    response = None
    print(f"Request failed: {exc}")

# Check the status code to ensure the request was successful
if response is not None and response.status_code == 200:
    try:
        response_data = response.json()
    except ValueError:
        # A 200 answer whose body is not JSON (e.g. an HTML error page)
        response_data = None
        print("Response was not valid JSON:", response.text)

    # Check if the 'data' key exists in the response
    if isinstance(response_data, dict) and 'data' in response_data:
        posts = response_data['data']

        # Process and print the posts; .get() tolerates entries lacking a field
        for post in posts:
            print(f"Title: {post.get('title')}")
            print(f"URL: {post.get('url')}")
            print(f"Subreddit: {post.get('subreddit')}")
            print(f"Created: {post.get('created_utc')}")
            print("\n")
    elif response_data is not None:
        print("No data found or incorrect response format:", response_data)
elif response is not None:
    print(f"Request failed with status code: {response.status_code}")
    print("Response:", response.text)


Request failed with status code: 403
Response: {"detail":"Not authenticated"}
