# Setup

Tutorial for reddit scraping: https://www.geeksforgeeks.org/scraping-reddit-using-python/

In [15]:
import praw
import pandas as pd
from praw.models import MoreComments

# ids for scraping (from christians setup)
client_id = 'Ut5UgaAMOEWBELtYRWnw0g'
client_secret = '5xGs1w6mav5Ke685afpG28Q8nfusmg'
user_agent = 'polarity search'

# Scraping

First we initialize a read-only instance. A read-only instance can only scrape publicly available information and cannot upvote or otherwise interact like users can.

In [2]:
# Read-only instance
reddit_read_only = praw.Reddit(client_id=client_id,         # your client id
                               client_secret=client_secret,      # your client secret
                               user_agent=user_agent)        # your user agent

## Getting comments on a specific post

This code scrapes over the comments of a specified post. It looks only at the lead comments (none of the replies to comments). It only goes over the first 112 comments for some reason.

In [35]:
def scrape_post(url):
    # Creating a submission object
    submission = reddit_read_only.submission(url=url)

    post_authors = []
    post_comments = []

    for comment in submission.comments:
        if type(comment) == MoreComments:
            continue

        post_authors.append(comment.author)
        post_comments.append(comment.body)

    post_dict = {'author': post_authors, 'comment': post_comments}
    post_df = pd.DataFrame(post_dict)
    
    return post_df

In [38]:
df = scrape_post("https://www.reddit.com/r/politics/comments/10h1dc6/trump_must_pay_hillary_clinton_171631_in_legal/")
print(df.shape)
df.head()

(112, 2)


Unnamed: 0,author,comment
0,AutoModerator,"\nAs a reminder, this subreddit [is for civil ..."
1,PandaMuffin1,> Hours after Middlebrooks' filing became pub...
2,Dagonet_the_Motley,LOL imagine all the suckers who gave money to ...
3,lufecaep,They should double it every time he tries to a...
4,FortySixAndYou,"So, this is on top of the $1M in sanctions tha..."


## Getting top month posts on specified subreddit
This code grabs the top 100 posts of the past month and saves various information on them into a dictionary

In [40]:
def scrape_top_month(subreddit):
    # specifying subreddit
    subreddit = reddit_read_only.subreddit(subreddit)

    # Specifying to look at top posts of the current month
    posts = subreddit.top("month")

    # Initializing dictionary to save post data to
    posts_dict = {"Title": [], "Post Text": [],
                  "ID": [], "Score": [],
                  "Total Comments": [], "Post URL": []
                  }

    # Loop for saving post details
    for post in posts:
        # Title of each post
        posts_dict["Title"].append(post.title)

        # Text inside a post
        posts_dict["Post Text"].append(post.selftext)

        # Unique ID of each post
        posts_dict["ID"].append(post.id)

        # The score of a post
        posts_dict["Score"].append(post.score)

        # Total number of comments inside the post
        posts_dict["Total Comments"].append(post.num_comments)

        # URL of each post
        posts_dict["Post URL"].append('https://www.reddit.com'+f'{post.permalink}')
    
    return posts_dict

In [41]:
dict_ = scrape_top_month('politics')

Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month")


In [42]:
# post samples
print(dict_['Title'][0])
print(dict_['Post Text'][0])
print(dict_['ID'][0])
print(dict_['Score'][0])
print(dict_['Total Comments'][0])
print(dict_['Post URL'][0])
print(len(dict_['Title']))

The American public no longer believes the Supreme Court is impartial

1092xhl
81842
3812
https://www.reddit.com/r/politics/comments/1092xhl/the_american_public_no_longer_believes_the/
100


## Getting comments on top monthly posts on multiple subreddits

In [43]:
# OLD
def scrape_multiple(subreddits):
    
    subreddit_df_dict = dict()
    
    # looping through subreddits
    for subreddit in subreddits:
        posts_dict = scrape_top_month(subreddit) # getting top of the month post info
        
        subreddit_post_dfs = []
        
        # looping through posts
        for idx, url in enumerate(posts_dict['Post URL']):
            # information about the post
            post_info = [posts_dict['Title'][idx],
                         posts_dict['Post Text'][idx],
                         posts_dict['ID'][idx],
                         posts_dict['Score'][idx],
                         posts_dict['Total Comments'][idx],
                         posts_dict['Post URL'][idx]]
            
            # df for comments on the post
            comment_df = scrape_post(url)
            
            pre_df_dict = {'post_info': post_info, 'comment_df': comment_df}
            post_df = pd.DataFrame(pre_df_dict)
        
            subreddit_post_dfs.append(post_df)
        
        subreddit_df_dict[subreddit] = subreddit_post_dfs
    
    subbreddit_df = pd.DataFrame(subredit_df_dict)
    
    return subreddit_df

In [52]:
def scrape_multiple_save(subreddits):
    '''scrapes and saves subreddit comments to csv files
       Naming convention is: SUBREDDIT_POSTID.csv / SUBREDDIT_POSTID_INFO.txt'''
    # looping through subreddits
    for subreddit in subreddits:
        posts_dict = scrape_top_month(subreddit) # getting top of the month post info
        
        # looping through posts
        for idx, url in enumerate(posts_dict['Post URL']):
            # information about the post
            post_info = [posts_dict['Title'][idx],
                         posts_dict['Post Text'][idx],
                         posts_dict['ID'][idx],
                         posts_dict['Score'][idx],
                         posts_dict['Total Comments'][idx],
                         posts_dict['Post URL'][idx]]
            
            # df for comments on the post
            comment_df = scrape_post(url)
            
            comment_df.to_csv(f'../data/{subreddit}_{post_info[2]}.csv', index=False)
            
    print('Done!')
    return None

In [54]:
scrape_multiple_save(['politics', 'gaming', 'MaraudersGame', 'EscapefromTarkov', 'SatisfactoryGame'])

Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month")


Done!


In [44]:
df = scrape_multiple(['politics', 'gaming'])

Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month")


ValueError: Buffer has wrong number of dimensions (expected 1, got 2)