In [96]:
import requests
import json, time, datetime, math
import numpy as np
import pandas as pd
import re
import os
from requests_futures.sessions import FuturesSession

now = datetime.datetime.now()
ROOT_PATH = 'data/'
DATA_PATH = '-'.join(str(x) for x in [(now.year), now.month, now.day])

def print_time(msg, unix):
    print(msg, time.ctime(int(unix)))
    
def get_readable_time(unix):
    return [time.ctime(int(u)) for u in unix]

def print_progress(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, errors = 0, fill = '+'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s - errors: %s' % (prefix, bar, percent, suffix, errors), end = '\r')
    # Print New Line on Complete
    if iteration == total - 1: 
        print()
        
def retrieve_data(url):
    r = requests.get(url)    
    if(r.status_code >= 400):
        print('Sleeping for one minute...')
        time.sleep(60)
        return retrieve_data(url)
    else:
        return r

def query_data(url, *args, **kwargs):
    payload = {}
    if kwargs is not None:
        for key, value in kwargs.items():
            payload[key] = value
    print(payload)
    
    new_url = url
    if args is not None:
        for value in args:
            new_url = new_url + value + '/'
    print(new_url)
    r = requests.get(url, params=payload)    
    if(400 <= r.status_code < 500):
        print(r.status_code, ' Trying again')
        time.sleep(1)
        return query_data(url, *args, **kwargs)
    elif r.status_code >= 500:
        print('Server error:', r.status_code, r.content)
        print(r.request.body, r.request.headers)
        return None
    else:
        return r.json()

def query_data_future(session, url, *args, **kwargs):
    payload = {}
    if kwargs is not None:
        for key, value in kwargs.items():
            payload[key] = value
    #print(payload)
    new_url = url
    if args is not None:
        for value in args:
            new_url = new_url + value + '/'
    #print(new_url)
    r = session.get(url, params=payload)    
    return r

def save_data(df, name):
    if os.path.isdir(ROOT_PATH + DATA_PATH) == False:
        os.mkdir(ROOT_PATH + DATA_PATH)
        print('Creating', DATA_PATH, 'directory...')
    df.to_pickle(ROOT_PATH + DATA_PATH + '/' + name)
    

URL_COM = 'https://api.pushshift.io/reddit/search/comment/'
URL_SUB = 'https://api.pushshift.io/reddit/search/submission/'

QUERIES = ['bitcoin', 'ethereum', 'crypto', 'ripple', 'litecoin', 'btc']
DROP_SUBREDDITS = ['AskReddit', 'news', 'Sexsells']
TOP_SUBREDDITS_TO_QUERY = 20
DOWNLOAD_NEW_DATA = False
LOAD_DATA_PATH = '2018-2-5'

In [81]:
# Retrieve top 10 subreddits for each query
popular_subreddits = []
popular_subreddits_df = pd.DataFrame()
if DOWNLOAD_NEW_DATA:    
    for query in QUERIES:
        tmp_data = query_data(URL_COM, q=query, size=0, aggs='subreddit', after='30d')['aggs']['subreddit'][0:TOP_SUBREDDITS_TO_QUERY]
        popular_subreddits.extend(tmp_data)

    popular_subreddits_df = pd.DataFrame(popular_subreddits)
    popular_subreddits_df = popular_subreddits_df.drop_duplicates('key').reset_index(drop=True)
    popular_subreddits_df = popular_subreddits_df[~popular_subreddits_df['key'].isin(DROP_SUBREDDITS)].reset_index(drop=True)
    save_data(popular_subreddits_df, 'popular_subreddits.pkl')
else:
    popular_subreddits_df = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'popular_subreddits.pkl')
    popular_subreddits = popular_subreddits_df.to_dict()
    
#TEMP - DELETE
print(popular_subreddits_df)

{'q': 'bitcoin', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/
{'q': 'ethereum', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/
{'q': 'crypto', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/
{'q': 'ripple', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/
{'q': 'litecoin', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/
{'q': 'btc', 'size': 0, 'aggs': 'subreddit', 'after': '30d'}
https://api.pushshift.io/reddit/search/comment/
    bg_count  doc_count                  key     score
0    1521817      40812              Bitcoin   2.68179
1     676344      27633                  btc   4.08564
2    1559250      22551       CryptoCurrency   1.44627
3     153798       9978           BitcoinAll   6.48773
4      45201       6630  noncensored_bitcoin  14.66

In [118]:
TIME_STEP = 6
START_AFTER = 24
DAYS = 30

sub_fields = ['author', 'subreddit', 'score', 'num_comments']
sub_fields_save = ['author', 'subreddit_id', 'subreddit', 'score', 'num_comments', 'id', 'created_utc', 'retrieved_on', 'num_crossposts', 'title', 'url', 'stickied', 'pinned', 'gilded', ]

submissions = []
submissions_df = pd.DataFrame()
fut = []
errors = []
with FuturesSession(max_workers=10) as session:
    for i, sub in popular_subreddits_df.iterrows(): 
        for idx in range(int(24/TIME_STEP * DAYS)):
            _before = str(TIME_STEP*idx + START_AFTER) + 'h'
            _after = str(TIME_STEP*(idx+1) + START_AFTER) + 'h'
            fut.append(query_data_future(session, URL_SUB, subreddit=sub['key'], size=500, before=_before, after=_after))
        if i > 1:
            pass
            #break
            
    for idx in range(len(fut)):
        print_progress(idx, len(fut), prefix = 'Start', suffix = str(idx) + '/' + str(len(fut)), length=50, errors=len(errors))
        tmp = {}
        try:
            tmp = fut[idx].result().json()['data']
            submissions.append(pd.DataFrame(tmp))
            if idx and idx % 30 == 0:
                submissions_df = pd.concat(submissions)[sub_fields_save].query('num_comments>1').reset_index(drop=True)
                save_data(submissions_df, 'submissions_df.pkl')
        except:
            errors.append(fut[idx]) 
    
    submissions_df = pd.concat(submissions)[sub_fields_save].query('num_comments>1').reset_index(drop=True)
    save_data(submissions_df, 'submissions_df.pkl')
    print('Done...')


Done...+++++++++++++++++++++++++++++++++++++++++++++++++-| 100.0% 4559/4560 - errs: 0


In [139]:
submissions_df = (submissions_df
      .sort_values(by=['num_comments'], ascending=False)
      .query('subreddit!="worldnews"&author!="AutoModerator"&author!="[deleted]"')
      .reset_index(drop=True))
print(submissions_df[sub_fields])

                      author       subreddit  score  num_comments
0       SuperficialPickle444       garlicoin  52479         49636
1                     Amidza      technology  70405          6870
2                 skyler4722       garlicoin  64137          6544
3        Religion__of__Peace       garlicoin   2433          5099
4           acacia-club-road      technology  34667          4838
5                       mvea      technology  92622          4489
6                    Z_staff    pcmasterrace    613          4443
7              Gabriel-Lewis  CryptoCurrency  18038          4231
8                       mvea      technology  57675          3767
9                       mvea      technology  26800          3674
10       A_Internet_Stranger  CryptoCurrency  35935          3396
11                      mvea      technology  34105          3362
12                    speckz      technology  47406          3358
13            MichaelRahmani      technology  18870          3304
14        

In [4]:
#get comment ids for every submission obtained above
#concurrent calls
fut = []
comment_ids = []
num_calls = 10
with FuturesSession(max_workers=50) as session:
    for i, sub in bitcoin_submissions.iterrows():
        fut.append(session.get('https://api.pushshift.io/reddit/submission/comment_ids/' + sub.loc['id']))
        print('fut:', i)
        if i >= num_calls - 1:
            pass
            break
        
    for i in range(len(fut)):
        print(i)
        tmp = fut[i].result()
        tmp = tmp.json()['data']
        comment_ids.extend(tmp)

print(len(comment_ids))        


fut: 0
fut: 1
fut: 2
fut: 3
fut: 4
fut: 5
fut: 6
fut: 7
fut: 8
fut: 9
0
1
2
3
4
5
6
7
8
9
9355


In [5]:
#get data for every comment
#print(comment_ids[-1])

print(','.join(comment_ids[0:1]))
#r = query_data(URL_COM, ids=','.join(comment_ids[0:2]))

NUM_CONCAT = 500
fut = []
comment_data = []
with FuturesSession(max_workers=5) as session:
    _done = 0
    idx = 0
    while _done != -1:
        print('working')
        _end = _done + NUM_CONCAT
        if _end >= len(comment_ids) + 1:
            _end = -1
        fut.append(session.get('https://api.pushshift.io/reddit/search/comment/?ids=' + ','.join(comment_ids[_done:_end])))
        _done = _end
    
    for i in range(len(fut)):
        print(i)
        tmp = fut[i].result()
        tmp = tmp.json()['data']
        comment_data.extend(tmp)
        
    #print(r)
print(len(comment_data))

dtkd1na
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
9354


In [9]:
comments_df = pd.DataFrame(comment_data)
print(comment_data[0])

{'approved_at_utc': None, 'author': 'Asdn1220', 'author_flair_background_color': '', 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_text': None, 'author_flair_text_color': 'dark', 'author_flair_type': 'text', 'banned_at_utc': None, 'body': 'I am panicking', 'can_mod_post': False, 'collapsed': False, 'collapsed_reason': None, 'created_utc': 1517479281, 'distinguished': None, 'edited': False, 'id': 'dtkd1na', 'is_submitter': False, 'link_id': 't3_7uhqjf', 'mod_note': None, 'mod_reason_by': None, 'mod_reason_title': None, 'parent_id': 't3_7uhqjf', 'permalink': '/r/Bitcoin/comments/7uhqjf/daily_discussion_february_01_2018/dtkd1na/', 'retrieved_on': 1517479283, 'rte_mode': 'markdown', 'score': 1, 'stickied': False, 'subreddit': 'Bitcoin', 'subreddit_id': 't5_2s3qj'}
