In [4]:
import time
import requests
import pandas as pd
import datetime as dt

In [5]:
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 300, pause = 1, timeout = 30):
    """Collect posts from one subreddit through the Pushshift search API.

    Issues `n` GET requests against the `kind` search endpoint. Request `i`
    (1-based) sets the `after` query parameter to `{day_window * i}d` and asks
    for up to 500 items. All non-empty pages are concatenated into one frame.

    Parameters
    ----------
    subreddit : str
        Subreddit name, inserted verbatim into the query string.
    kind : str
        Endpoint name appended to the search URL. For 'submission' the result
        is reduced to a fixed set of columns and exact duplicate rows are dropped.
    day_window : int
        Step, in days, between the `after` values of consecutive requests.
    n : int
        Number of requests to issue.
    pause : float
        Seconds to wait between consecutive requests.
    timeout : float
        Seconds to wait for each HTTP response before requests gives up.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a 'timestamp' column holding the UTC calendar
        date of 'created_utc'. Empty (but with the expected columns) when no
        request returned any data.

    Raises
    ------
    AssertionError
        If any response has a status code other than 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score']

    #establish base url and stem

    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}"
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500"

    #instantiate empty list
    posts = []

    #one request per window, pausing between requests

    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        #a timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get(URL, timeout = timeout)
        #explicit raise (not `assert`) so the check survives `python -O`;
        #AssertionError is kept so existing callers see the same exception type
        if response.status_code != 200:
            raise AssertionError(
                "Request to {} failed with status {}".format(URL, response.status_code)
            )
        mine = response.json()['data']
        #skip empty pages so they cannot affect the concatenated dtypes
        if mine:
            posts.append(pd.DataFrame.from_dict(mine))
        #no need to wait after the final request
        if i < n:
            time.sleep(pause)

    if posts:
        full = pd.concat(posts, sort = False)
    else:
        #nothing came back: return an empty frame with the expected columns
        #instead of failing inside pd.concat
        full = pd.DataFrame(columns = SUBFIELDS if kind == 'submission' else ['created_utc'])

    if kind == 'submission':

        #select desired columns and drop duplicated rows; chaining returns a
        #fresh frame, so the column assignment below never touches a slice

        full = full[SUBFIELDS].drop_duplicates()

    #create timestamp column; 'created_utc' is converted in UTC so the
    #resulting date does not depend on the local timezone of the machine

    full['timestamp'] = full['created_utc'].map(
        lambda ts: dt.datetime.fromtimestamp(ts, tz = dt.timezone.utc).date()
    )

    print('Query Complete!')

    return full
        

In [6]:
#pull r/TheOnion submissions using the function's default settings
results = query_pushshift(subreddit = 'TheOnion')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=30d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=90d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=120d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=150d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=180d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=210d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=240d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=270d
Querying from: https://api.pushshift.io/reddit/search/submission?su

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2400d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2430d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2460d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2490d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2520d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2550d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2580d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2610d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=2640d
Querying from: https://api.pushshift.io/reddit/search/s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4770d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4800d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4830d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4860d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4890d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4920d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4950d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=4980d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=5010d
Querying from: https://api.pushshift.io/reddit/search/s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7140d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7170d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7200d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7230d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7260d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7290d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7320d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=TheOnion&size=500&after=7380d
Querying from: https://api.pushshift.io/reddit/search/s

In [7]:
#sanity check: (rows, columns) of the frame returned by query_pushshift
results.shape

(8343, 8)

In [8]:
#persist the scraped posts to disk, leaving the row index out of the file
results.to_csv(path_or_buf = 'theonion.csv', index = False)