# Setup

Tutorial for reddit scraping: https://www.geeksforgeeks.org/scraping-reddit-using-python/

In [1]:
import praw
import pandas as pd
from praw.models import MoreComments

# ids for scraping (from christians setup)
client_id = 'Ut5UgaAMOEWBELtYRWnw0g'
client_secret = '5xGs1w6mav5Ke685afpG28Q8nfusmg'
user_agent = 'polarity search'

# Scraping

First we initialize a read-only instance. A read-only instance can only scrape publicly available information and cannot upvote or otherwise interact like users can.

In [2]:
# Read-only instance
reddit_read_only = praw.Reddit(client_id=client_id,         # your client id
                               client_secret=client_secret,      # your client secret
                               user_agent=user_agent)        # your user agent

## Getting comments on a specific post

This code scrapes over the comments of a specified post. It looks only at the lead comments (none of the replies to comments). It only goes over the first 112 comments for some reason.

In [14]:
def scrape_post(url, all_comments=False):
    # Creating a submission object
    submission = reddit_read_only.submission(url=url)
    
    # should get all top level comments on the post
    if all_comments==True:
        submission.comments.replace_more(limit=None)

    post_authors = []
    post_comments = []

    for comment in submission.comments:
        if type(comment) == MoreComments:
            continue

        post_authors.append(comment.author)
        post_comments.append(comment.body)

    post_dict = {'author': post_authors, 'comment': post_comments}
    post_df = pd.DataFrame(post_dict)
    
    return post_df

In [4]:
df = scrape_post("https://www.reddit.com/r/MaraudersGame/comments/ylxsq4/marauders_be_like/")

iv1i25s
iv18ujc
iv3ks39
iv19egq
iv0plt0
iv1t4z6
iv1v1xc
iv1aj7l
iv1p2k7
iv21u69
iv3l2xe
iv3rxbj


In [5]:
print(df.shape)
df.head()

(12, 2)


Unnamed: 0,author,comment
0,Lozsta,Why is there not a toggle to turn that off. I ...
1,OpossumHades,...that destroyed ÖRTH
2,JEClockwork,For 70 years we have long lived in the shadows...
3,l3lNova,Ok but real talk that movie was wack
4,sw4mpy_1,Well no more!!!!


In [122]:
df = scrape_post("https://www.reddit.com/r/politics/comments/1092xhl/the_american_public_no_longer_believes_the/")

j3vqpob
j3vt4y9
j3vt4qp
j3vrddo
j3vtqy7
j3vs7oi
j3w7n18
j3vquhk
j3vsyea
j3w35hr
j3vredy
j3vzw4y
j3vtqnp
j3w17ku
j3vzrk3
j3w2729
j3vulzh
j3vuwow
j3w0qro
j3vtp8w
j3vzazw
j3w636b
j3w0ghe
j3w0uqk
j3vzyvh
j3w0j0s
j3vw8qa
j3w7n36
j3vynpx
j3vyelf
j3wf8xh
j3vz4xx
j3vu1z2
j3vuusy
j3vy2li
j3w1v3g
j3w2d09
j3w3hux
j3wg94u
j3xaxqh
j3vr2av
j3vxkcr
j3vy5pa
j3vyi7s
j3w039k
j3w5edi
j3weeiz
j3wf0ki
j3wfsno
j3wgiag
j3wgnx0
j3wm2bg
j3wm6al
j3yppcz
j3yqr4c
j3yrvib
j3ysut3
j3wf92w
j3vv60h
j3vyju0
j3vuavn
j3vzl4u
j3wfdii
j3vwr6f
j3vxliz
j3vyx60
j3w0prt
j3w0rbi
j3w0wd2
j3w1867
j3w1cj9
j3w2o1j
j3w2yl1
j3w358c
j3w3dzg
j3w4x42
j3wankp
j3wdltc
j3wfipg
j3wlmrp
j3wnddi
j3wnez6
j3wng8p
j3wnil8
j3wog0e
j3wp3ff
j3wvi9k
j3wy6sx
j3wyb6d
j3wyc3u
j3x33b4
j3x39dh
j3x5fea
j3x5fyy
j3x5kk7
j3xai9c
j3xf8jz
j3xk240
j3xlnlw
j3xlta8
j3xm3x8
j3xmutr
j3xnppk
j3xqakf
j3xsdvi
j3y1jim
j3yguck
j3yhiu7
j3ypobl
<MoreComments count=2421, children=['j3wlfya', 'j3wbsar', 'j3w8jd0', '...']>


In [6]:
print(df.shape)
df.head()

(12, 2)


Unnamed: 0,author,comment
0,Lozsta,Why is there not a toggle to turn that off. I ...
1,OpossumHades,...that destroyed ÖRTH
2,JEClockwork,For 70 years we have long lived in the shadows...
3,l3lNova,Ok but real talk that movie was wack
4,sw4mpy_1,Well no more!!!!


## Getting top month posts on specified subreddit
This code grabs the top 100 posts of the past month and saves various information on them into a dictionary

In [7]:
def scrape_top_month(subreddit, ppsr=100):
    # specifying subreddit
    subreddit = reddit_read_only.subreddit(subreddit)

    # Specifying to look at top posts of the current month
    posts = subreddit.top("month", limit=ppsr)

    # Initializing dictionary to save post data to
    posts_dict = {"Title": [], "Post Text": [],
                  "ID": [], "Score": [],
                  "Total Comments": [], "Post URL": []
                  }

    # Loop for saving post details
    for post in posts:
        # Title of each post
        posts_dict["Title"].append(post.title)

        # Text inside a post
        posts_dict["Post Text"].append(post.selftext)

        # Unique ID of each post
        posts_dict["ID"].append(post.id)

        # The score of a post
        posts_dict["Score"].append(post.score)

        # Total number of comments inside the post
        posts_dict["Total Comments"].append(post.num_comments)

        # URL of each post
        posts_dict["Post URL"].append('https://www.reddit.com'+f'{post.permalink}')
    
    return posts_dict

In [8]:
dict_ = scrape_top_month('politics')

Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month", limit=ppsr)


In [9]:
# post samples
print(dict_['Title'][0])
print(dict_['Post Text'][0])
print(dict_['ID'][0])
print(dict_['Score'][0])
print(dict_['Total Comments'][0])
print(dict_['Post URL'][0])
print(len(dict_['Title']))

Trump Must Pay Hillary Clinton $171,631 in Legal Fees Over Bogus Lawsuit

10h1dc6
68515
2003
https://www.reddit.com/r/politics/comments/10h1dc6/trump_must_pay_hillary_clinton_171631_in_legal/
100


In [10]:
dict_ = scrape_top_month('politics', ppsr=150)

Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month", limit=ppsr)


In [11]:
# post samples
print(dict_['Title'][0])
print(dict_['Post Text'][0])
print(dict_['ID'][0])
print(dict_['Score'][0])
print(dict_['Total Comments'][0])
print(dict_['Post URL'][0])
print(len(dict_['Title']))

Trump Must Pay Hillary Clinton $171,631 in Legal Fees Over Bogus Lawsuit

10h1dc6
68509
2003
https://www.reddit.com/r/politics/comments/10h1dc6/trump_must_pay_hillary_clinton_171631_in_legal/
150


## Getting comments on top monthly posts on multiple subreddits

In [16]:
def scrape_multiple_save(subreddits, ppsr=100, all_comments=False):
    '''scrapes and saves subreddit comments to csv files
       Naming convention is: SUBREDDIT_POSTID.csv / SUBREDDIT_POSTID_INFO.txt'''
    
    
    if all_comments==False:
        print(f'Scraping {ppsr} posts per subreddit and ~100 comments per post')
    else:
        print(f'Scraping {ppsr} posts per subreddit and all comments per post')
    
    # looping through subreddits
    for subreddit in subreddits:
        print(f'Scraping r/{subreddit}...')
        
        # initialize dictionary for saving all comments and post info
        sub_dict = {'post_title': [],
                    'post_text': [],
                    'post_id': [],
                    'post_score': [],
                    'post_total_comments': [],
                    'post_url': [],
                    'comment_author': [],
                    'comment_text': []}
        
        posts_dict = scrape_top_month(subreddit, ppsr) # getting top of the month post info
        
        # looping through posts
        for idx, url in enumerate(posts_dict['Post URL']):
            
            # df for comments on the post
            comment_df = scrape_post(url, all_comments=all_comments)
            
            # looping through comments on post and appending all comment info to sub_dict
            for row_idx, row in comment_df.iterrows():
                sub_dict['post_title'].append(posts_dict['Title'][idx])
                sub_dict['post_text'].append(posts_dict['Post Text'][idx])
                sub_dict['post_id'].append(posts_dict['ID'][idx])
                sub_dict['post_score'].append(posts_dict['Score'][idx])
                sub_dict['post_total_comments'].append(posts_dict['Total Comments'][idx])
                sub_dict['post_url'].append(posts_dict['Post URL'][idx])
                sub_dict['comment_author'].append(row['author'])
                sub_dict['comment_text'].append(row['comment'])
            
        # changing sub_dict to pandas dataframe
        sub_df = pd.DataFrame.from_dict(sub_dict)
        
        # saving to csv
        sub_df.to_csv(f'../data/{subreddit}.csv', index=False)
        
    print('Done!')
    return None

In [17]:
scrape_multiple_save(['politics', 'gaming', 'MaraudersGame', 'EscapefromTarkov', 'SatisfactoryGame'])

Scraping 100 posts per subreddit and ~100 comments per post
Scraping r/politics...


Call this function with 'time_filter' as a keyword argument.
  posts = subreddit.top("month", limit=ppsr)


Scraping r/gaming...
Scraping r/MaraudersGame...
Scraping r/EscapefromTarkov...
Scraping r/SatisfactoryGame...
Done!
