In [1]:
import logging
import os
import pprint
logging.basicConfig(level=logging.INFO)
logging.getLogger('backoff').addHandler(logging.StreamHandler())

# http
from bs4 import BeautifulSoup
import requests
import json
import backoff
from datetime import datetime, date, timedelta
import time

# data viz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

# data
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re


# ['libraryofshadows', 'nosleep']

In [5]:
@backoff.on_exception(backoff.expo,
                      requests.exceptions.Timeout,
                      max_time=600)
@backoff.on_exception(backoff.expo,
                      requests.exceptions.RequestException,
                      max_time=300,
                      jitter=backoff.full_jitter)
def get_url(url, params={}):
    res = requests.get(url, params)
    if res.status_code in [429] or res.status_code >= 500:
        res.raise_for_status() 
    return res

In [14]:
def process_result(res, outfile):
    json_response = res.json()
    print("records retreived:", len(json_response['data']), flush=True)
    #pprint.pprint(json_response)
    with open(outfile, 'a+') as file:
        for data in json_response['data']:  
            json.dump(data, file)
            file.write('\n')
    

def iterate_subreddit(subreddit, datestart, dateend, days_delta=1):
    while datestart < dateend:
        print(datestart, datestart + timedelta(days=days_delta), flush=True)
        res = get_url('https://api.pushshift.io/reddit/search/submission/',
                      params={'subreddit':subreddit, 
                              'sort':'desc',
                              'sort_type':'created_utc',
                              'after':int(datestart.timestamp()),
                              'before':int((datestart + timedelta(days=days_delta)).timestamp()),
                              'size':2000})
        process_result(res, outfile=f'{subreddit}.txt')             
        datestart += timedelta(days=days_delta)
        time.sleep(2)
    print("done")



In [15]:
iterate_subreddit('nosleep', datetime(2018,3,18), datetime(2018,3,23))
# should fetch more than 1000 2018-03-19 00:00:00 2018-03-20 00:00:00

2018-03-18 00:00:00 2018-03-19 00:00:00
records retreived: 90
2018-03-19 00:00:00 2018-03-20 00:00:00
records retreived: 1000
2018-03-20 00:00:00 2018-03-21 00:00:00
records retreived: 571
2018-03-21 00:00:00 2018-03-22 00:00:00
records retreived: 568
2018-03-22 00:00:00 2018-03-23 00:00:00
records retreived: 124
done
