In [None]:
!pip install --upgrade praw
!pip install pandas

## Authentication

We import PRAW and authenticate with the Reddit API.

In [None]:
import praw
import pandas as pd
import os
import json

# Load the API credentials and the subreddit list from config.json in the
# current working directory. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open(os.path.join(os.getcwd(), 'config.json'), encoding='utf-8') as config_file:
    config = json.load(config_file)

# Display only the key names as a sanity check. The values include the client
# secret and account password, which must not be rendered into the notebook
# output where they would be saved and shared along with the file.
sorted(config)

In [None]:
# Build the authenticated Reddit client. Each PRAW keyword argument is paired
# with the config.json key that supplies its value; the user agent is a fixed
# string identifying this scraper.
reddit = praw.Reddit(
    user_agent="CS242 scraper assignment at UC Riverside",
    **{
        praw_kwarg: config[config_key]
        for praw_kwarg, config_key in (
            ("client_id", "clientID"),
            ("client_secret", "secret"),
            ("password", "password"),
            ("username", "username"),
        )
    },
)

## Retrieving Top 100 Hot Posts of Several Subreddits

We retrieve up to 100 hot posts from each of several subreddits. `reddit.subreddit(name)` selects a subreddit, its `hot()` method returns a listing of that subreddit's hot posts, and the `limit=100` argument caps the number of posts retrieved. Afterward, we write each subreddit's posts to a JSON file in the `posts` directory, stopping once the directory holds 500 MB of data.

In [None]:
def _post_to_record(post):
  """Flatten a submission into a plain dict that serializes cleanly to JSON."""
  return {
    "title": post.title,
    # Store the author as text (None when the post has no author), which is
    # what the default_handler=str fallback in to_json would produce anyway.
    "author": None if post.author is None else str(post.author),
    "url": post.url,
    "text": post.selftext,
    "upvote_ratio": post.upvote_ratio,
    "created_utc": post.created_utc
  }


def _directory_size(path):
  """Return the total size in bytes of the regular files directly inside path."""
  with os.scandir(path) as entries:
    return sum(entry.stat().st_size for entry in entries if entry.is_file())


subreddits = config['subreddits']

# Resume point: the subreddit to (re)start from so that an interrupted run does
# not have to go back to the beginning. Set to None to process the whole list.
start_subreddit = 'ucr'

# Start the subreddits list with the one we want to start at. The resume point
# itself is included: a subreddit's file is only written after all of its posts
# were fetched, so an interrupted subreddit has to be retrieved again.
if start_subreddit is not None:
  if start_subreddit not in subreddits:
    raise ValueError(f"start_subreddit {start_subreddit!r} is not listed in config['subreddits'].")
  subreddits = subreddits[subreddits.index(start_subreddit):]

# prevent duplicate posts by creating a set that tracks the id of all posts
existing_posts = set()

# we only need 500 MB worth of data, so we'll stop once we reach it
MAX_SIZE = 500 * 1_000_000

# Make sure the output folder exists before spending API calls, instead of
# discovering it is missing only after a subreddit has already been scraped.
posts_dir = os.path.join(os.getcwd(), 'posts')
os.makedirs(posts_dir, exist_ok=True)

for subreddit in subreddits:
  subreddit_posts = []

  # The listing is fetched lazily while iterating, so request failures surface
  # from the for statement itself; the outer try keeps one bad subreddit from
  # aborting the whole run.
  try:
    for post in reddit.subreddit(subreddit).hot(limit=100):
      try:
        # prevent existing posts and NSFW posts from being added
        if post.over_18 or post.id in existing_posts:
          continue
        subreddit_posts.append(_post_to_record(post))
        # Mark the id as seen only once its record was actually stored.
        existing_posts.add(post.id)

        # todo: add comments here, there is a comments attribute on the post variable
        # comments are in CommentForest (https://praw.readthedocs.io/en/stable/code_overview/other/commentforest.html#praw.models.comment_forest.CommentForest) attribute on PRAW
        # we have to determine how many levels deep we will retrieve comments

      except Exception as err:
        print("Something went wrong: ", err)
  except Exception as err:
    print(f'Something went wrong while fetching r/{subreddit}: ', err)

  if not subreddit_posts:
    # Nothing was collected (fetch failure, or every post was filtered out);
    # skip the write so an earlier file for this subreddit is not replaced
    # with an empty one.
    print(f'No new posts collected for r/{subreddit}; skipping file write.')
    continue

  try:
    subreddit_df = pd.DataFrame(subreddit_posts)
    subreddit_df.to_json(os.path.join(posts_dir, subreddit + '.json'), default_handler=str, orient='records')
    print(f'Successfully wrote r/{subreddit} posts to file.')
  except Exception as err:
    print('Something went wrong: ', err)

  try:
    if _directory_size(posts_dir) >= MAX_SIZE:
      print('Reached 500 MB. Terminating subreddit data retrieval.')
      break
  except Exception as err:
    print('Something went wrong: ', err)
else:
  # Runs only when the loop was not cut short by the size limit.
  print('Successfully retrieved data for all subreddits.')