In [3]:
!pip install --upgrade praw
!pip install pandas

## Authentication

We import PRAW and authenticate with the Reddit API.

In [2]:
import praw
import pandas as pd
import os
import json

# Load the API credentials and the subreddit list from config.json in the working
# directory. The context manager closes the file handle even if parsing fails.
with open(os.path.join(os.getcwd(), 'config.json'), encoding='utf-8') as config_file:
    config = json.load(config_file)

# Display only the key names: echoing `config` itself would write the client
# secret and account password into the notebook's saved output.
sorted(config)

In [3]:
# Create the PRAW client used for every Reddit API request below, authenticating
# with the application and account credentials loaded from config.json.
reddit = praw.Reddit(
    user_agent="CS242 scraper assignment at UC Riverside",
    client_id=config['clientID'],
    client_secret=config['secret'],
    username=config['username'],
    password=config['password'],
)

## Retrieving Top 1000 Hot Posts of Several Subreddits

We retrieve up to 1000 hot posts from each of several subreddits. For each subreddit, we call `reddit.subreddit(name)` to get the subreddit and its `hot` method, with the `limit` argument set to 1000, to list its hot posts. For each post, we retrieve up to 1000 of its top-level comments: we set the `comment_sort` attribute to "top" to retrieve the top comments first, and the `comment_limit` attribute to 1000 to cap the number of comments retrieved. We call `replace_more(limit=0)` to remove the 'more comments' placeholders so that iterating through the comments yields only real comments. Afterward, we write each subreddit's posts into a JSON file in the `posts` directory, and we stop once the directory holds 500 MB of data.

In [1]:
# Subreddit names to scrape, in the order they are listed in config.json.
subreddits = config['subreddits']

# Resume point: the subreddit we were retrieving data from when the previous run
# stopped, so that we won't have to go back to the beginning.
# Set to None to process every configured subreddit.
start_subreddit = 'lifeofnorman'

if start_subreddit is not None:
    # Fail with an explicit message instead of list.index's bare "x is not in list".
    if start_subreddit not in subreddits:
        raise ValueError(
            f"start_subreddit {start_subreddit!r} is not listed in config['subreddits']"
        )
    # start the subreddits list with the one we want to start at
    subreddits = subreddits[subreddits.index(start_subreddit):]

# prevent duplicate posts by creating a set that tracks the id of all posts
existing_posts = set()

# we only need 500 MB (decimal megabytes) worth of data, so we'll stop once we reach it
MAX_SIZE = 500 * 1000 * 1000

# Scrape each configured subreddit, write one JSON file per subreddit into the
# `posts` directory, and stop early once that directory reaches MAX_SIZE bytes.

# Resolve the output directory once and create it before any scraping starts, so a
# missing folder cannot cause a whole subreddit's worth of requests to be discarded.
posts_dir = os.path.join(os.getcwd(), 'posts')
os.makedirs(posts_dir, exist_ok=True)

for subreddit in subreddits:
    subreddit_posts = []
    try:
        for post in reddit.subreddit(subreddit).hot(limit=1000):
            try:
                # prevent existing posts and NSFW posts from being added
                if post.over_18 or post.id in existing_posts:
                    continue

                # Sort order and limit are configured before `post.comments` is
                # first accessed, which is when the comments get requested.
                post.comment_sort = "top"
                post.comment_limit = 1000
                # limit=0 drops the 'more comments' placeholders instead of expanding them
                post.comments.replace_more(limit=0)

                # Iterating the comment forest yields the top-level comments.
                # Authors are kept as-is; `default_handler=str` serialises them on write.
                subreddit_post_comments = [
                    {
                        "author": comment.author,
                        "text": comment.body,
                        "upvotes": comment.score,
                        "created_utc": comment.created_utc,
                    }
                    for comment in post.comments
                ]

                subreddit_posts.append({
                    "title": post.title,
                    "author": post.author,
                    "url": post.url,
                    "text": post.selftext,
                    "upvote_ratio": post.upvote_ratio,
                    "created_utc": post.created_utc,
                    "comments": subreddit_post_comments,
                })

                # Record the id only once the post is stored, so a post that failed
                # part-way through is not treated as already collected.
                existing_posts.add(post.id)
            except Exception as err:
                # best effort: skip the failing post and keep going
                print("Something went wrong: ", err)
    except Exception as err:
        # The listing request itself failed (it is raised by the iterator, outside
        # the per-post handler). Keep whatever was collected and move on instead of
        # aborting the whole run.
        print(f"Something went wrong while listing r/{subreddit}: ", err)

    try:
        subreddit_df = pd.DataFrame(subreddit_posts)
        subreddit_df.to_json(
            os.path.join(posts_dir, subreddit + '.json'),
            default_handler=str,
            orient='records',
        )
        print(f'Successfully wrote r/{subreddit} posts to file.')
    except Exception as err:
        print('Something went wrong: ', err)

    try:
        # Total size in bytes of the files written so far.
        with os.scandir(posts_dir) as entries:
            cur_size = sum(entry.stat().st_size for entry in entries if entry.is_file())
        print('current size', cur_size)
        if cur_size >= MAX_SIZE:
            print('Reached 500 MB. Terminating subreddit data retrieval.')
            break
    except Exception as err:
        print('Something went wrong: ', err)
else:
    # Runs only when the loop was not ended by `break`, so the message is no longer
    # printed after the size limit cut the run short.
    print('Successfully retrieved data for all subreddits.')