<a href="https://colab.research.google.com/github/lagodw/RedditBot/blob/master/reddit_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Install PRAW, the Reddit API client imported in the next cell; %%capture hides pip's output.
!pip install praw

In [2]:
#!/usr/bin/env python3
import praw
import pandas as pd
import datetime as dt
from praw.models import MoreComments
import json

# Mount Google Drive at /content/drive; the scrape CSV is read from and written to
# /content/drive/MyDrive further down in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# need to upload file containing reddit account info and secret into json file
# The file must provide the keys: client_id, client_secret, user_agent, username, password.
# Use a context manager so the credentials file is closed as soon as it has been parsed.
with open('reddit_info.json') as f:
  reddit_info = json.load(f)

# Authenticated PRAW client used by the scraping cell below.
# NOTE(review): check_for_async=False presumably silences PRAW's warning about running
# inside an async environment such as a notebook - confirm against the PRAW config docs.
reddit = praw.Reddit(
    client_id=reddit_info['client_id'],
    client_secret=reddit_info['client_secret'],
    user_agent=reddit_info['user_agent'],
    username=reddit_info['username'],
    password=reddit_info['password'],
    check_for_async=False,
)

In [None]:
# list of subreddits with mostly text posts
# subreddits = ['IAmA', 'interestingasfuck', 'books', 'tifu', 'news', 'todayilearned', 'damnthatsinteresting',
#               'science', 'movies', 'lifeprotips', 'gaming', 'TwoXChromosomes', 'funny', 'explainlikeimfive',
#               'mildlyinteresting', 'AskReddit', 'personalfinance', 'Jokes']
# alternatively can simply grab the top overall posts
subreddits = ['all']
append = True        # True: add to the existing CSV; False: start a fresh dataset
write_output = True  # True: write the CSV back to Drive after each subreddit
output_path = '/content/drive/MyDrive/reddit_scrape.csv'

# no cap on the listing when reading r/all, otherwise ask for up to 1000 threads per subreddit
limit = None if 'all' in subreddits else 1000

# start from the previous scrape when appending, otherwise from an empty frame
if append:
  try:
    output = pd.read_csv(output_path)
  except FileNotFoundError:
    # first run: there is nothing to append to yet
    print(f'{output_path} not found, starting from an empty dataset')
    output = pd.DataFrame()
else:
  output = pd.DataFrame()
obs_start = output.shape[0]

# pull list of previous threads to make sure we don't duplicate
# (top-level comments store their thread id in parent_id, so every thread that
# contributed at least one comment shows up in that column)
if 'parent_id' in output.columns:
  old_id_list = list(output['parent_id'].dropna().astype(str).unique())
else:
  old_id_list = []
# set copy of old_id_list so the per-thread duplicate check is a hash lookup
seen_ids = set(old_id_list)

# print out occasionally to check progress
print_every = 100 if limit is None else max(1, limit // 10)


def collect_thread_comments(submission):
  """Return one row per comment of `submission`, each paired with its parent's text.

  The returned DataFrame has the columns subreddit, title, parent_id, comment_id,
  comment_text and parent_text. parent_text is the post body for top-level comments
  and the parent comment's body for replies; it is NaN when the parent is not among
  the collected comments. Returns None when the thread has no comments.
  """
  # limiting factor for speed is Reddit API's 30 query/min where any query of parent text counts as additional query
  # to avoid extra requests, instead only record the parent id and match it with other comments
  # limit = 0 drops the "load more comments" placeholders instead of fetching them
  submission.comments.replace_more(limit = 0)
  thread_comments = submission.comments.list()
  if len(thread_comments) == 0:
    return None

  current_subreddit = str(submission.subreddit)
  title = submission.title

  # build all rows first and create the frame once instead of growing it row by row
  comments = pd.DataFrame([{
      'subreddit': current_subreddit,
      'title': title,
      'parent_id': str(comment.parent()),
      'comment_id': comment.id,
      'comment_text': comment.body
  } for comment in thread_comments])

  # lookup table containing the post and every comment, keyed by id, that can then be
  # matched with every comment to obtain parent text
  post_row = pd.DataFrame({
      'parent_id': [str(submission.id)],
      'parent_text': [submission.selftext]
  })
  comment_rows = comments[['comment_id', 'comment_text']].rename(
      columns = {'comment_text': 'parent_text', 'comment_id': 'parent_id'})
  lookup_table = pd.concat([post_row, comment_rows], ignore_index = True)

  return pd.merge(comments, lookup_table, how = 'left', on = 'parent_id')


# structure of model will be to predict comments based on the text from the parent comment/post
# therefore focus on collecting all comments and their parent text
for sub in subreddits:
  i = 0
  start = dt.datetime.now()
  print(f'starting {sub}')

  subreddit = reddit.subreddit(sub)
  hot_sub = subreddit.hot(limit = limit)

  new_frames = []  # one DataFrame per newly scraped thread of this subreddit
  try:
    for submission in hot_sub:
      submission_id = str(submission.id)
      if submission_id in seen_ids:
        continue

      i += 1
      if i % print_every == 0:
        print('starting thread {} of subreddit {}'.format(i, sub))

      # record the whole id (not its characters) to ensure we don't duplicate in future
      seen_ids.add(submission_id)
      old_id_list.append(submission_id)

      thread = collect_thread_comments(submission)
      if thread is not None:
        new_frames.append(thread)
  finally:
    # merge whatever was collected, even if the scrape was interrupted part-way,
    # so `output` always holds every thread scraped so far
    if new_frames:
      output = pd.concat([output] + new_frames, ignore_index = True)

  if write_output:
    output.to_csv(output_path, index = False)
  end = dt.datetime.now()
  print(f'finished {sub}, took {str(end - start).split(".")[0]}')
print(f'added {output.shape[0] - obs_start} observations')