# 01_Fetch_Posts_and_Save

Purpose of this notebook:
 - Fetch N rounds of 500 posts from a selected subreddit
 - Save all rounds to a file for post-processing

In [1]:
## SET GLOBALS HERE, then just run the rest of the file all at once
# Number of fetch rounds; each round asks the API for 500 posts.
N = 8

# Leave exactly ONE of these uncommented, or make a new one -- some ideas
# subreddit_to_fetch = 'DnD'
subreddit_to_fetch = 'CallOfDuty'

# Total number of posts requested across all rounds.
nposts = N*500
print(f"These settings will fetch {nposts} posts from '{subreddit_to_fetch}'")

These settings will fetch 4000 posts from 'CallOfDuty'


## Imports and prep

In [2]:
import requests
import pandas as pd
import time

In [3]:
# Global display options: show every column and every row of a DataFrame
pd.set_option('display.max_columns', None)
# pd.reset_option('display.max_columns')  # to reset back to limited columns

# Use the fully-qualified option name here as well: abbreviated names are
# matched as patterns, and 'max_rows' can match more than one option on
# newer pandas releases, which makes set_option raise an OptionError.
pd.set_option('display.max_rows', None)

In [4]:
# Base endpoint of the API used to reach reddit submissions.
# Queried without filters, it covers EVERYTHING on reddit.
url = 'https://api.pushshift.io/reddit/search/submission'

# Example of limiting the search to a single subreddit:
#   https://api.pushshift.io/reddit/search/submission?subreddit=boardgames
# See also: https://www.youtube.com/watch?v=AcrjEWsMi_E&feature=youtu.be
#
# 'size' is the number of posts to get: 25 is the default and it can be
# set as high as 500.
params = dict(subreddit='boardgames', size=25)

## Fetch and save routine

In [5]:
def fetch_posts(topic, before_date=0, timeout=30):
    """Request one batch of 500 submissions for a subreddit from the global `url`.

    Args:
        topic: Subreddit name, sent as the 'subreddit' query parameter.
        before_date: Epoch timestamp sent as the 'before' query parameter
            when it is positive, to set a date limit.  With 0 (the default)
            or None no limit is sent and the most recent posts are requested.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The value stored under the 'data' key of the JSON response, or None
        when the response status code is not 200.

    Raises:
        requests.exceptions.RequestException: presumably on connection
            failures and timeouts, which are not caught here.
    """
    params = {
        'subreddit'   : topic,
        'size'        : 500        # get 500 posts at a time
    }

    # Set a date limit only if one is passed in; otherwise, grab most recent.
    # The truthiness check keeps before_date=None from failing the comparison.
    if before_date and before_date > 0:
        params['before'] = before_date

    print(f"About to send request:\n\t{url}\n\t{params}")

    res = requests.get(url, params=params, timeout=timeout)
    code = res.status_code
    if code != 200:
        # Report the failure and hand back None so the caller can decide
        # whether to stop or retry.
        print(f"Exit with code {code}")
        return None

    data = res.json()
    return data['data']

## Make the fetch

In [6]:
# Make up to N calls, spacing them apart
list_of_posts = []
before_time = 0   # 0 means the first request carries no date limit
for i in range(N):
    posts = fetch_posts(subreddit_to_fetch, before_time)

    # fetch_posts returns None on a non-200 response, and a batch may also be
    # empty; in both cases indexing posts[0] would crash, so stop cleanly
    # and keep whatever has been collected so far.
    if not posts:
        print(f"Fetch {i+1} returned no posts; stopping with {len(list_of_posts)} posts collected\n")
        break

    first_post = posts[0]['created_utc']
    last_post  = posts[-1]['created_utc']
    ep_human_first = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(first_post))
    ep_human_last  = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(last_post))
    print(f"Fetch {i+1} complete: most recent post was {ep_human_first} and earliest was {ep_human_last}\n")

    # save and get ready for the next loop: the next request asks only for
    # posts before the last one of this batch
    list_of_posts += posts
    before_time = last_post

    # Pause between requests (min 3 second delay); there is nothing to wait
    # for after the final request.
    if i < N - 1:
        time.sleep(5)

About to send request:
	https://api.pushshift.io/reddit/search/submission
	{'subreddit': 'CallOfDuty', 'size': 500}
Fetch 1 complete: most recent post was 2020-05-28 12:19:22 and earliest was 2020-05-26 09:13:36

About to send request:
	https://api.pushshift.io/reddit/search/submission
	{'subreddit': 'CallOfDuty', 'size': 500, 'before': 1590509616}
Fetch 2 complete: most recent post was 2020-05-26 09:11:12 and earliest was 2020-05-24 05:11:25

About to send request:
	https://api.pushshift.io/reddit/search/submission
	{'subreddit': 'CallOfDuty', 'size': 500, 'before': 1590322285}
Fetch 3 complete: most recent post was 2020-05-24 05:11:01 and earliest was 2020-05-21 21:27:34

About to send request:
	https://api.pushshift.io/reddit/search/submission
	{'subreddit': 'CallOfDuty', 'size': 500, 'before': 1590121654}
Fetch 4 complete: most recent post was 2020-05-21 21:18:11 and earliest was 2020-05-20 07:00:38

About to send request:
	https://api.pushshift.io/reddit/search/submission
	{'subre

## Convert to DataFrame and Save

In [7]:
# Build a single DataFrame from all the posts accumulated in list_of_posts
df = pd.DataFrame(list_of_posts)
# Display (number of rows, number of columns) as the cell output
df.shape

(4000, 80)

In [8]:
# Key columns: 'subreddit', 'selftext' (main text), 'title'
# Display every column name of the DataFrame
df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_template_id', 'link_flair_text', 'link_flair_text_color',
       'link_flair_type', 'locked', 'media_only', 'no_follow', 'num_comments',
       'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink',
       'pinned', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'suggested_sort',
       'thumbnail', 'tit

In [9]:
# Preview the first two rows of the DataFrame
df.head(2)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,awarders,can_mod_post,contest_mode,created_utc,domain,full_link,gildings,id,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_robot_indexable,is_self,is_video,link_flair_background_color,link_flair_richtext,link_flair_template_id,link_flair_text,link_flair_text_color,link_flair_type,locked,media_only,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,pinned,pwls,retrieved_on,score,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_subscribers,subreddit_type,suggested_sort,thumbnail,title,total_awards_received,treatment_tags,upvote_ratio,url,whitelist_status,wls,removed_by_category,post_hint,preview,thumbnail_height,thumbnail_width,media,media_embed,secure_media,secure_media_embed,link_flair_css_class,poll_data,crosspost_parent,crosspost_parent_list,author_flair_background_color,author_flair_text_color,banned_by,edited,media_metadata,author_cakeday,author_flair_template_id
0,[],False,ant_thomas93,,[],,text,t2_4mw0pp9a,False,False,[],False,False,1590693562,self.CallOfDuty,https://www.reddit.com/r/CallOfDuty/comments/g...,{},gscqtp,True,False,False,False,True,True,False,#f06292,"[{'e': 'text', 't': 'Question'}]",bf289bc0-7434-11e9-8860-0e4c6d542516,Question,light,richtext,False,False,True,0,0,False,all_ads,/r/CallOfDuty/comments/gscqtp/bo2_got_a_question/,False,6.0,1590693568,1,I tried MW2 for ages earlier and couldn’t get ...,True,False,False,CallOfDuty,t5_2rcq2,310597,public,top,self,[BO2] got a question,0,[],0.99,https://www.reddit.com/r/CallOfDuty/comments/g...,all_ads,6.0,,,,,,,,,,,,,,,,,,,,
1,[],False,cameronlund08,,[],,text,t2_6lpa2fga,False,False,[],False,False,1590693467,self.CallOfDuty,https://www.reddit.com/r/CallOfDuty/comments/g...,{},gscpo5,False,False,False,False,False,True,False,#f06292,"[{'e': 'text', 't': 'Question'}]",bf289bc0-7434-11e9-8860-0e4c6d542516,Question,light,richtext,False,False,True,0,0,False,all_ads,/r/CallOfDuty/comments/gscpo5/cod_most_populat...,False,6.0,1590693473,1,[removed],True,False,False,CallOfDuty,t5_2rcq2,310597,public,top,self,"[COD] Most Populated COD’s? (XB1, MP)",0,[],1.0,https://www.reddit.com/r/CallOfDuty/comments/g...,all_ads,6.0,moderator,,,,,,,,,,,,,,,,,,,


In [10]:
# Build a unique file name: subreddit, requested post count and a UTC timestamp
ts = time.gmtime()
mytimestamp = time.strftime("%Y-%m-%d_%H%M%S", ts)

# Pieces are joined with underscores and placed under ../posts/ as a .csv
fname = '../posts/' + '_'.join((subreddit_to_fetch, str(nposts), mytimestamp)) + '.csv'
fname

'../posts/CallOfDuty_4000_2020-05-28_192353.csv'

In [11]:
# Write the DataFrame to the CSV file named by fname, leaving out the index
df.to_csv(fname,index=False)