In [1]:
import requests
import json, time, datetime, math
import numpy as np
import pandas as pd
import re
from requests_futures.sessions import FuturesSession

def print_time(msg, unix):
    print(msg, time.ctime(int(unix)))
    
def get_readable_time(unix):
    return [time.ctime(int(u)) for u in unix]

def print_progress(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '+'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    ETA = (total - iteration) * 60/REQUESTS_PER_MIN
    estimated = datetime.datetime.now() + datetime.timedelta(seconds=ETA)
    ETA = estimated - datetime.datetime.now()
    print('\r%s |%s| %s%% %s - ETA: %s - %s' % (prefix, bar, percent, suffix, ETA, estimated), end = '\r')
    # Print New Line on Complete
    if iteration == total: 
        print()
        
def retrieve_data(url):
    r = requests.get(url)    
    if(r.status_code >= 400):
        print('Sleeping for one minute...')
        time.sleep(60)
        return retrieve_data(url)
    else:
        return r

def query_data(url, *args, **kwargs):
    payload = {}
    if kwargs is not None:
        for key, value in kwargs.items():
            payload[key] = value
    print(payload)
    
    new_url = url
    if args is not None:
        for value in args:
            new_url = new_url + value + '/'
    
    print(new_url)
    
    r = requests.get(url, params=payload)    
    if(400 <= r.status_code < 500):
        print(r.status_code, ' Trying again')
        time.sleep(1)
        return query_data(url, *args, **kwargs)
    elif r.status_code >= 500:
        print('Server error:', r.status_code, r.content)
        print(r.request.body, r.request.headers)
        return None
    else:
        return r.json()
            

In [2]:
URL_COM = 'https://api.pushshift.io/reddit/search/comment/'
URL_SUB = 'https://api.pushshift.io/reddit/search/submission/'

response = query_data(URL_COM, q='bitcoin', size=10, after='1d', subreddit='bitcoin')
popular_subreddits = query_data(URL_COM, q='bitcoin', size=0, aggs='subreddit', after='30d')

#print(response['data'][0])

{'q': 'bitcoin', 'size': 10, 'after': '1d', 'subreddit': 'bitcoin'}
https://api.pushshift.io/reddit/search/comment/
{'q': 'bitcoin', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/


In [3]:
subreddit_list = []
idx = 0
for item in popular_subreddits['aggs']['subreddit']:
    subreddit_list.append(item)
    if idx > 20:
        break
    idx = idx + 1

TIME_STEP = 6
DAYS = 3

data_temp = []
for idx, sub in enumerate(subreddit_list): 
    data_temp = []
    for idx in range(int(24/TIME_STEP * DAYS)):
        _before = str(TIME_STEP*idx) + 'h'
        _after = str(TIME_STEP*(idx+1)) + 'h'
        r = query_data(URL_SUB, subreddit=sub['key'], size=500, before=_before, after=_after)['data']
        data_temp.append(pd.DataFrame(r))
        print(len(r))
    break
    
    
fields = ['author', 'subreddit', 'parent_id', 'score', 'link_id']

idx = 0
print('idx', '\t', '\t'.join(fields))
for item in response['data']:
    row = []
    for field in fields:
        row.append(str(item[field]))
    #print(idx, '\t', '\t'.join(row))
    idx = idx + 1
    #print(item['author'], item['subreddit'], item)
    


{'subreddit': 'Bitcoin', 'size': 500, 'before': '0h', 'after': '6h'}
https://api.pushshift.io/reddit/search/submission/
129
{'subreddit': 'Bitcoin', 'size': 500, 'before': '6h', 'after': '12h'}
https://api.pushshift.io/reddit/search/submission/
116
{'subreddit': 'Bitcoin', 'size': 500, 'before': '12h', 'after': '18h'}
https://api.pushshift.io/reddit/search/submission/
96
{'subreddit': 'Bitcoin', 'size': 500, 'before': '18h', 'after': '24h'}
https://api.pushshift.io/reddit/search/submission/
113
{'subreddit': 'Bitcoin', 'size': 500, 'before': '24h', 'after': '30h'}
https://api.pushshift.io/reddit/search/submission/
171
{'subreddit': 'Bitcoin', 'size': 500, 'before': '30h', 'after': '36h'}
https://api.pushshift.io/reddit/search/submission/
285
{'subreddit': 'Bitcoin', 'size': 500, 'before': '36h', 'after': '42h'}
https://api.pushshift.io/reddit/search/submission/
288
{'subreddit': 'Bitcoin', 'size': 500, 'before': '42h', 'after': '48h'}
https://api.pushshift.io/reddit/search/submission/


In [7]:
#sequential calls
print_fields = ['id', 'subreddit','author','score', 'num_comments']
bitcoin_submissions = pd.concat(data_temp, ignore_index=True)
#print(bitcoin_submissions.sort_values(by=['num_comments'], ascending=False)[print_fields])

comment_ids = []

for i, sub in bitcoin_submissions.iterrows():
    print(sub.loc['id'])
    comment_ids.extend(query_data('https://api.pushshift.io/reddit/submission/comment_ids/' + sub.loc['id'])['data'])
    if i > 10:
        break

print(comment_ids)
    

7uzye6
7uzz9o
7v0057
7v02kl
7v05o9
7v06jt
7v07n7
7v07vc
7v090i
7v09p1
7v0ahv
7v0aoj
['dtoesxs', 'dtofzr1', 'dtoiozn', 'dtoj2mr', 'dtogmo2', 'dtogqkj', 'dtogt06', 'dtogwts', 'dtohhlc', 'dtohnv4', 'dtojiu0', 'dtondef', 'dtorha7', 'dtoi855', 'dtoivru', 'dtojvoi', 'dtonpfv', 'dtoo9p6', 'dtop9z4', 'dtor6q2', 'dtos01q', 'dtos4an', 'dtohvze', 'dtoio2n', 'dtoizko', 'dtoj8ya', 'dtojc0p', 'dtojqj3', 'dtojsjx', 'dtojuoy', 'dtorlrj', 'dtougmk']


In [4]:
#get comment ids for every submission obtained above
#concurrent calls
print_fields = ['id', 'subreddit','author','score', 'num_comments']
bitcoin_submissions = pd.concat(data_temp, ignore_index=True)
bitcoin_submissions = bitcoin_submissions.sort_values(by=['num_comments'], ascending=False).reset_index(drop=True)
#print(bitcoin_submissions[print_fields])

fut = []
comment_ids = []
num_calls = 10
with FuturesSession(max_workers=10) as session:
    for i, sub in bitcoin_submissions.iterrows():
        fut.append(session.get('https://api.pushshift.io/reddit/submission/comment_ids/' + sub.loc['id']))
        print('fut:', i)
        if i >= num_calls - 1:
            break
        
    for i in range(len(fut)):
        print(i)
        tmp = fut[i].result()
        tmp = tmp.json()['data']
        comment_ids.extend(tmp)

print(len(comment_ids))        


fut: 0
fut: 1
fut: 2
fut: 3
fut: 4
fut: 5
fut: 6
fut: 7
fut: 8
fut: 9
0
1
2
3
4
5
6
7
8
9
9355


In [5]:
#get data for every comment
#print(comment_ids[-1])

print(','.join(comment_ids[0:1]))
#r = query_data(URL_COM, ids=','.join(comment_ids[0:2]))

NUM_CONCAT = 500
fut = []
comment_data = []
with FuturesSession(max_workers=5) as session:
    _done = 0
    idx = 0
    while _done != -1:
        print('working')
        _end = _done + NUM_CONCAT
        if _end >= len(comment_ids) + 1:
            _end = -1
        fut.append(session.get('https://api.pushshift.io/reddit/search/comment/?ids=' + ','.join(comment_ids[_done:_end])))
        _done = _end
    
    for i in range(len(fut)):
        print(i)
        tmp = fut[i].result()
        tmp = tmp.json()['data']
        comment_data.extend(tmp)
        
    #print(r)
print(len(comment_data))

dtkd1na
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
9354


In [9]:
comments_df = pd.DataFrame(comment_data)
print(comment_data[0])

{'approved_at_utc': None, 'author': 'Asdn1220', 'author_flair_background_color': '', 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_text': None, 'author_flair_text_color': 'dark', 'author_flair_type': 'text', 'banned_at_utc': None, 'body': 'I am panicking', 'can_mod_post': False, 'collapsed': False, 'collapsed_reason': None, 'created_utc': 1517479281, 'distinguished': None, 'edited': False, 'id': 'dtkd1na', 'is_submitter': False, 'link_id': 't3_7uhqjf', 'mod_note': None, 'mod_reason_by': None, 'mod_reason_title': None, 'parent_id': 't3_7uhqjf', 'permalink': '/r/Bitcoin/comments/7uhqjf/daily_discussion_february_01_2018/dtkd1na/', 'retrieved_on': 1517479283, 'rte_mode': 'markdown', 'score': 1, 'stickied': False, 'subreddit': 'Bitcoin', 'subreddit_id': 't5_2s3qj'}
