In [1]:
import requests
import json, time, datetime, math
import numpy as np
import pandas as pd
import re
import os
from requests_futures.sessions import FuturesSession

now = datetime.datetime.now()
ROOT_PATH = 'data/'
DATA_PATH = '-'.join(str(x) for x in [(now.year), now.month, now.day])

def print_time(msg, unix):
    print(msg, time.ctime(int(unix)))
    
def get_readable_time(unix):
    return [time.ctime(int(u)) for u in unix]

def print_progress(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, errors = 0, fill = '+'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s - errors: %s' % (prefix, bar, percent, suffix, errors), end = '\r')
    # Print New Line on Complete
    if iteration == (total - 2): 
        print()
        
def retrieve_data(url):
    r = requests.get(url)    
    if(r.status_code >= 400):
        print('Sleeping for one minute...')
        time.sleep(60)
        return retrieve_data(url)
    else:
        return r

def query_data(url, *args, **kwargs):
    payload = {}
    if kwargs is not None:
        for key, value in kwargs.items():
            payload[key] = value
    print(payload)
    
    new_url = url
    if args is not None:
        for value in args:
            new_url = new_url + value + '/'
    print(new_url)
    r = requests.get(url, params=payload)    
    if(400 <= r.status_code < 500):
        print(r.status_code, ' Trying again')
        time.sleep(1)
        return query_data(url, *args, **kwargs)
    elif r.status_code >= 500:
        print('Server error:', r.status_code, r.content)
        print(r.request.body, r.request.headers)
        return None
    else:
        return r.json()

def query_data_future(session, url, *args, **kwargs):
    payload = {}
    if kwargs is not None:
        for key, value in kwargs.items():
            payload[key] = value
    #print(payload)
    new_url = url
    if args is not None:
        for value in args:
            new_url = new_url + value + '/'
    #print(new_url)
    r = session.get(url, params=payload)    
    return r

def save_data(df, name):
    if os.path.isdir(ROOT_PATH + DATA_PATH) == False:
        os.mkdir(ROOT_PATH + DATA_PATH)
        print('Creating', DATA_PATH, 'directory...')
    df.to_pickle(ROOT_PATH + DATA_PATH + '/' + name)
    

URL_COM = 'https://api.pushshift.io/reddit/search/comment/'
URL_SUB = 'https://api.pushshift.io/reddit/search/submission/'

QUERIES = ['bitcoin', 'ethereum', 'crypto', 'ripple', 'litecoin', 'btc']
DROP_SUBREDDITS = ['AskReddit', 'news', 'Sexsells']
TOP_SUBREDDITS_TO_QUERY = 20
DOWNLOAD_NEW_DATA = False
LOAD_DATA_PATH = '2018-2-5'

In [2]:
# Retrieve top 10 subreddits for each query
popular_subreddits = []
popular_subreddits_df = pd.DataFrame()
if DOWNLOAD_NEW_DATA:    
    for query in QUERIES:
        tmp_data = query_data(URL_COM, q=query, size=0, aggs='subreddit', after='30d')['aggs']['subreddit'][0:TOP_SUBREDDITS_TO_QUERY]
        popular_subreddits.extend(tmp_data)

    popular_subreddits_df = pd.DataFrame(popular_subreddits)
    popular_subreddits_df = popular_subreddits_df.drop_duplicates('key').reset_index(drop=True)
    popular_subreddits_df = popular_subreddits_df[~popular_subreddits_df['key'].isin(DROP_SUBREDDITS)].reset_index(drop=True)
    save_data(popular_subreddits_df, 'popular_subreddits.pkl')
else:
    popular_subreddits_df = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'popular_subreddits.pkl')
    popular_subreddits = popular_subreddits_df.to_dict()
    
#TEMP - DELETE
print(popular_subreddits_df)

    bg_count  doc_count                  key     score
0    1521817      40812              Bitcoin   2.68179
1     676344      27633                  btc   4.08564
2    1559250      22551       CryptoCurrency   1.44627
3     153798       9978           BitcoinAll   6.48773
4      45201       6630  noncensored_bitcoin  14.66782
5     285720       6140       BitcoinMarkets   2.14896
6     740446       5661            ethtrader   0.76454
7      62712       3596             Buttcoin   5.73415
8     290118       2595               Ripple   0.89446
9    1350445       1835         pcmasterrace   0.13588
10    180451       1797            RaiBlocks   0.99584
11     26680       1619             xyMarket   6.06822
12    179150       1560            investing   0.87078
13     52570       1440     BitcoinBeginners   2.73920
14    236659       1408             litecoin   0.59495
15    623007       1376           technology   0.22086
16   3211589       1342            worldnews   0.04179
17    1094

In [3]:
TIME_STEP = 6
START_AFTER = 24
DAYS = 30

sub_fields = ['id', 'author', 'subreddit', 'score', 'num_comments']
sub_fields_save = ['author', 'subreddit_id', 'subreddit', 'score', 'num_comments', 'id', 'created_utc', 'retrieved_on', 'num_crossposts', 'title', 'url', 'stickied', 'pinned', 'gilded', ]

submissions = []
submissions_df = pd.DataFrame()
fut = []
errors = []
if DOWNLOAD_NEW_DATA:
    with FuturesSession(max_workers=10) as session:
        for i, sub in popular_subreddits_df.iterrows(): 
            for idx in range(int(24/TIME_STEP * DAYS)):
                _before = str(TIME_STEP*idx + START_AFTER) + 'h'
                _after = str(TIME_STEP*(idx+1) + START_AFTER) + 'h'
                fut.append(query_data_future(session, URL_SUB, subreddit=sub['key'], size=500, before=_before, after=_after))
            if i > 1:
                pass
                #break

        for idx in range(len(fut)):
            print_progress(idx, len(fut), prefix = 'Start', suffix = str(idx) + '/' + str(len(fut)), length=50, errors=len(errors))
            tmp = {}
            try:
                tmp = fut[idx].result().json()['data']
                submissions.append(pd.DataFrame(tmp))
                if idx and idx % 30 == 0:
                    submissions_df = pd.concat(submissions)[sub_fields_save].query('num_comments>1').reset_index(drop=True)
                    save_data(submissions_df, 'submissions.pkl')
            except:
                errors.append(fut[idx]) 

        submissions_df = pd.concat(submissions)[sub_fields_save].query('num_comments>1').reset_index(drop=True)
        save_data(submissions_df, 'submissions.pkl')
        print('Downloading submissions done...')
else:
    submissions_df = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'submissions.pkl')
    print('Submissions successfully loaded')

Submissions successfully loaded


In [4]:
submissions_df = submissions_df.sort_values(by=['num_comments'], ascending=False).query('subreddit!="worldnews"&author!="AutoModerator"&author!="[deleted]"').reset_index(drop=True)
#submissions_df = submissions_df.query('subreddit=="Bitcoin"&num_comments>25').reset_index(drop=True)
#print(submissions_df[sub_fields])
print(submissions_df.info())
#print(submissions_df[1:104000]['num_comments'].sum())
#print(submissions_df[submissions_df['author'].str.contains('Bot')][sub_fields])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104127 entries, 0 to 104126
Data columns (total 14 columns):
author            104127 non-null object
subreddit_id      104127 non-null object
subreddit         104127 non-null object
score             104127 non-null int64
num_comments      104127 non-null int64
id                104127 non-null object
created_utc       104127 non-null int64
retrieved_on      104127 non-null int64
num_crossposts    104127 non-null int64
title             104127 non-null object
url               104127 non-null object
stickied          104127 non-null bool
pinned            104127 non-null bool
gilded            362 non-null float64
dtypes: bool(2), float64(1), int64(5), object(6)
memory usage: 9.7+ MB
None


In [7]:
#get comment ids for every submission obtained above
#concurrent calls
fut = []
comment_ids = []
comment_ids_df = pd.DataFrame()
errors = []
IDS_DOWNLOAD = False
if IDS_DOWNLOAD:
    with FuturesSession(max_workers=10) as session:
        print('Starting...')
        for i, sub in submissions_df.iterrows():
            fut.append(session.get('https://api.pushshift.io/reddit/submission/comment_ids/' + sub.loc['id']))

        for i in range(len(fut)):
            print_progress(i, len(fut), prefix = 'Start', suffix = str(i) + '/' + str(len(fut)), length=50, errors=len(errors))
            try:
                tmp = fut[i].result()
                tmp = tmp.json()['data']
                comment_ids.extend(tmp)
            except:
                print(fut[i].result().status_code)
                errors.append(fut[i])
                
        comment_ids_df = pd.DataFrame({
            'id': comment_ids
        })
        save_data(comment_ids_df, 'comment_ids.pkl')
else:
    comment_ids_df = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'comment_ids_39k.pkl')
    comment_ids = comment_ids_df['id'].values.tolist()
    print('Comment ids successfully loaded...')

print(len(comment_ids))
#print('https://api.pushshift.io/reddit/search/comment/?ids=' + ','.join(comment_ids[0:500]))

Comment ids successfully loaded...
1926303


In [8]:
#get data for every comment
NUM_CONCAT = 500
fut = []
errors = []
comments = []
comments_df = pd.DataFrame()
if False:
    with FuturesSession(max_workers=5) as session:
        _done = 0
        idx = 0
        while _done != -1:
            _end = _done + NUM_CONCAT
            if _end >= len(comment_ids) + 1:
                _end = -1
            fut.append(session.get('https://api.pushshift.io/reddit/search/comment/?ids=' + ','.join(comment_ids[_done:_end])))
            _done = _end

        for i in range(len(fut)):
            print_progress(i, len(fut), prefix = 'Start', suffix = str(i) + '/' + str(len(fut)), length=50, errors=len(errors))
            try:
                tmp = fut[i].result()
                tmp = tmp.json()['data']
                comments.extend(tmp)
                
                if i and i % 100 == 0:
                    comments_df = pd.DataFrame(comments)
                    save_data(comments_df, 'comments.pkl')
            except:
                errors.append(fut[i])
        
        comments_df = pd.DataFrame(comments)
        save_data(comments_df, 'comments.pkl')
else:
    comments_df = pd.read_pickle(ROOT_PATH + LOAD_DATA_PATH + '/' + 'comments.pkl')
    print('Comments successfully loaded...')
    
print(comments_df.info())

Creating 2018-2-6 directory...---------------------------| 2.6% 100/3853 - errors: 0
Start |+++++++++++++++++++++++++++++++++++++++++++++++++-| 99.9% 3851/3853 - errors: 0
<class 'pandas.core.frame.DataFrame'>+++++++++++++++++++-| 100.0% 3852/3853 - errors: 0
RangeIndex: 1926302 entries, 0 to 1926301
Data columns (total 31 columns):
approved_at_utc                  object
author                           object
author_cakeday                   object
author_flair_background_color    object
author_flair_css_class           object
author_flair_richtext            object
author_flair_text                object
author_flair_text_color          object
author_flair_type                object
banned_at_utc                    object
body                             object
can_mod_post                     bool
collapsed                        bool
collapsed_reason                 object
created_utc                      int64
distinguished                    object
edited                        

In [9]:
comments_df = pd.DataFrame(comment_data)
print(comment_data[0])

{'approved_at_utc': None, 'author': 'Asdn1220', 'author_flair_background_color': '', 'author_flair_css_class': None, 'author_flair_richtext': [], 'author_flair_text': None, 'author_flair_text_color': 'dark', 'author_flair_type': 'text', 'banned_at_utc': None, 'body': 'I am panicking', 'can_mod_post': False, 'collapsed': False, 'collapsed_reason': None, 'created_utc': 1517479281, 'distinguished': None, 'edited': False, 'id': 'dtkd1na', 'is_submitter': False, 'link_id': 't3_7uhqjf', 'mod_note': None, 'mod_reason_by': None, 'mod_reason_title': None, 'parent_id': 't3_7uhqjf', 'permalink': '/r/Bitcoin/comments/7uhqjf/daily_discussion_february_01_2018/dtkd1na/', 'retrieved_on': 1517479283, 'rte_mode': 'markdown', 'score': 1, 'stickied': False, 'subreddit': 'Bitcoin', 'subreddit_id': 't5_2s3qj'}
