In [1]:
import requests_cache
# Install a process-wide cache for the `requests` library, stored under the name 'web_cache'.
# NOTE(review): with the cache installed, re-running the crawl may be served from
# previously cached responses instead of fresh API data — confirm this is intended.
requests_cache.install_cache('web_cache')

In [2]:
import praw
import requests
from pymongo import MongoClient
import json
import datetime
from tqdm.notebook import tqdm
from IPython.display import JSON

In [3]:
# MongoDB connection and the collections used throughout this notebook.
# NOTE(review): the host address is hard-coded; 172.17.0.1 looks like a Docker
# bridge gateway address — confirm it is valid in the environment running this.
mongo = MongoClient('172.17.0.1', 27017)
db = mongo['bad-vis']
# Submissions as returned by Pushshift (written by digest_submission).
raw = db['pushshift-raw']
# Only read in this notebook; presumably populated by an earlier crawl — verify.
reddit_raw = db['reddit-raw']
# Receives submissions fetched through PRAW plus the documents copied from reddit_raw.
reddit_merge = db['reddit-merge']

In [4]:
# Crawl jobs, one dict per subreddit:
#   'r'     - subreddit name, passed to retrieve_submissions as `r`
#   'limit' - passed to retrieve_submissions as `limit` (ends up as the request's 'size')
crawler_queue = [
    {'r': 'dataisugly', 'limit': 1000}
]

In [5]:
def digest_submission (submission):
    """Store a submission in the `raw` collection unless its id is already there.

    Args:
        submission: mapping with at least an 'id' key; it is inserted as-is.

    Returns:
        The result of `raw.insert_one(submission)` when the submission was
        inserted, or the empty string '' when a document with the same 'id'
        already exists.
    """
    # Only the _id is projected: we just need to know whether a match exists.
    already_stored = raw.find_one({'id': submission['id']}, {'_id': 1})
    if already_stored:
        return ''
    return raw.insert_one(submission)

def retrieve_pushshift (r, before_epoch, limit=1000):
    """Request one page of submissions from the Pushshift submission-search endpoint.

    Args:
        r: subreddit name, sent as the 'subreddit' query parameter.
        before_epoch: epoch timestamp sent as the 'before' query parameter.
        limit: sent as the 'size' query parameter.

    Returns:
        The value stored under 'data' in the JSON reply.

    Raises:
        requests.HTTPError: when the server answers with a 4xx/5xx status. The
            status code and the request payload are printed first so the
            failing request can be reproduced.
    """
    payload = {
        'subreddit': r,
        'sort': 'desc',
        'sort_type': 'created_utc',
        'before': before_epoch,
        'size': limit
    }
    reply = requests.get('https://api.pushshift.io/reddit/search/submission/', params=payload)
    if reply.status_code != 200:
        print(reply.status_code)
        print(json.dumps(payload))
        # Fail here with the real HTTP error instead of an unrelated JSON/KeyError
        # raised while trying to parse an error body below.
        reply.raise_for_status()
    return reply.json()['data']
    
def retrieve (r, before_epoch, limit=200):
    """Search submissions through a `PushshiftAPI` wrapper built around `reddit`.

    NOTE(review): `PushshiftAPI` is not imported in any cell visible here and
    `reddit` is only created in a later cell, so this function presumably
    cannot run on a fresh kernel — confirm whether it is still needed.

    Args:
        r: subreddit name, passed as `subreddit`.
        before_epoch: passed as `before`.
        limit: passed as `limit`.

    Returns:
        A list holding every item produced by `search_submissions`.
    """
    client = PushshiftAPI(reddit)
    return list(client.search_submissions(before=before_epoch, subreddit=r, limit=limit))

def submissions_iter (r, limit=200):
    """Yield submissions of subreddit `r`, paging backwards in time via Pushshift.

    Paging starts at 2020-01-01 (a naive datetime, so the timestamp is computed
    in the machine's local time zone). After each page the cut-off becomes the
    last submission's 'created_utc' + 1, and iteration stops when a page ends
    with the same submission id as the previous page, or when a page is empty.

    NOTE(review): because of the '+ 1', the submission at the page boundary is
    presumably returned again by the next request, so callers should expect
    duplicates (digest_submission skips ids it has already stored).

    Args:
        r: subreddit name.
        limit: page size forwarded to retrieve_pushshift.

    Yields:
        Submission dicts as returned by retrieve_pushshift.
    """
    before_epoch = int(datetime.datetime(2020, 1, 1).timestamp())
    last_submission_id = 0
    while True:
        submissions = retrieve_pushshift(r, before_epoch, limit)
        if not submissions:
            # An empty page has no last element to inspect; nothing is left to crawl.
            print(f'No submissions before epoch: {before_epoch}')
            break

        oldest = submissions[-1]
        if last_submission_id == oldest['id']:
            # Same tail as the previous page: nothing older exists.
            print(f'Last: {oldest["id"]} {oldest["created_utc"]}')
            break

        for submission in submissions:
            yield submission

        before_epoch = oldest['created_utc'] + 1
        last_submission_id = oldest['id']
        print(f'New epoch: {before_epoch} {datetime.datetime.fromtimestamp(before_epoch).isoformat()}')

def retrieve_submissions (r, limit=200):
    """Build a lazy pipeline that crawls subreddit `r` and stores each submission.

    Nothing is fetched or stored until the returned generator is consumed.

    Args:
        r: subreddit name.
        limit: page size forwarded to submissions_iter.

    Returns:
        A generator yielding the result of digest_submission for every
        submission produced by submissions_iter, wrapped in a tqdm progress bar.
    """
    with_progress = tqdm(submissions_iter(r, limit))
    return (digest_submission(entry) for entry in with_progress)

In [6]:
# Run every crawl job. retrieve_submissions is lazy, so list() is what actually
# drives the fetching and storing; `submissions` keeps the results of the last
# job only.
for c in crawler_queue:
    submissions = list(retrieve_submissions(r=c['r'], limit=c['limit']))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

New epoch: 1551741197 2019-03-04T23:13:17
New epoch: 1519160110 2018-02-20T20:55:10
New epoch: 1490509975 2017-03-26T06:32:55
New epoch: 1459306492 2016-03-30T02:54:52
New epoch: 1428012462 2015-04-02T22:07:42
New epoch: 1353713658 2012-11-23T23:34:18
Last: 13otwh 1353713657



# Merge with reddit-raw

In [7]:
# Placeholder; replaced below by the parsed contents of the credentials file.
credentials = ""

# The JSON file must provide the keys read when the praw.Reddit client is built:
# client_id, client_secret, password, user_agent and username.
with open('./reddit_credentials.json') as f:
    credentials = json.load(f)

In [8]:
# Reddit API client built from the credentials loaded from reddit_credentials.json.
# Used by retrieve_original_submissions to look submissions up by id.
reddit = praw.Reddit(client_id=credentials['client_id'],
                     client_secret=credentials['client_secret'],
                     password=credentials['password'],
                     user_agent=credentials['user_agent'],
                     username=credentials['username'])

In [12]:
class PRAWJSONEncoder(json.JSONEncoder):
    """JSON encoder that can serialise PRAW objects.

    From https://gist.github.com/jarhill0/6e6495706252d52573950c3820f533b0
    """

    def default(self, obj):
        """Convert objects the standard encoder cannot handle.

        * PRAWBase instances become a dict of their public instance attributes
          (names not starting with an underscore).
        * Any other object whose type name mentions 'praw' becomes None
          (JSON null).
        * Everything else is delegated to json.JSONEncoder.default, which
          raises TypeError.
        """
        if isinstance(obj, praw.models.base.PRAWBase):
            return {
                attr: val
                for attr, val in obj.__dict__.items()
                if not attr.startswith('_')
            }
        if 'praw' in str(type(obj)):
            return None
        return super().default(obj)

def retrieve_original_submissions ():
    """Fetch from Reddit every Pushshift submission that is not stored yet.

    Walks the `raw` collection and, for each submission id present in neither
    `reddit_raw` nor `reddit_merge`, loads the submission through the `reddit`
    client and inserts its JSON form (encoded with PRAWJSONEncoder) into
    `reddit_merge`.

    Returns:
        None.
    """
    # Only the 'id' field is read below, so do not pull whole documents.
    for s in tqdm(raw.find({}, {'id': 1})):
        by_id = {'id': s['id']}
        if reddit_raw.find_one(by_id, {'_id': 1}) or reddit_merge.find_one(by_id, {'_id': 1}):
            continue

        original = reddit.submission(s['id'])
        _ = original.name  # trigger data fetching before serialising
        reddit_merge.insert_one(json.loads(json.dumps(original, cls=PRAWJSONEncoder)))

In [13]:
# Back-fill reddit_merge: fetches from Reddit each submission in `raw` that is
# in neither reddit_raw nor reddit_merge yet.
retrieve_original_submissions()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [11]:
# Copy every reddit_raw document into reddit_merge. Documents whose submission
# id is already present are skipped, so the cell can be re-run safely: the
# copied documents keep their original _id, and inserting the same _id twice
# would otherwise fail with a duplicate-key error.
for s in tqdm(reddit_raw.find()):
    if not reddit_merge.find_one({'id': s['id']}, {'_id': 1}):
        reddit_merge.insert_one(s)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


