In [1]:
import requests
import requests.auth
import json
import time

In [2]:
def get_new_token(app_id, secret, user, pw):
    client_auth = requests.auth.HTTPBasicAuth(app_id, secret)

    post_data = {"grant_type": "password", "username": user, "password": pw}

    headers = {"User-Agent": f"data science class project .1 u/{user}"}
    response = requests.post("https://www.reddit.com/api/v1/access_token",
                         auth=client_auth, data=post_data, headers=headers)
    return response.json()

def print_time():
    cur_time = time.localtime()
    cur_hour, cur_min, cur_sec = cur_time.tm_hour, cur_time.tm_min, cur_time.tm_sec
    return print(cur_hour, cur_min, cur_sec)

def reddit_scrape(subreddit, app_id, secret, user, pw, pages=5, save_to_json=True, verbose=True):
    after = None
    posts = []
    params = {}
    ten_percentile = 0
    prev_query_time = 0
    start_time = time.time()
    response = get_new_token(app_id, secret, user, pw)
    user_agent = "data science class project 0.1 u/timdwyer_app"
    my_headers = {
        "Authorization": f"{response['token_type']} {response['access_token']}",
        "User-Agent": user_agent
     }

    for page in range(pages):
        if after:
            params['after'] = after

        time_elapsed = time.time() - prev_query_time
        if time_elapsed <= 1:
            time.sleep(1 - time_elapsed)

        token_time = time.time() - start_time
        if token_time >= 3600:
                response = get_new_token(app_id, secret, user, pw)
                my_headers = {
                    "Authorization": f"{response['token_type']} {response['access_token']}",
                    "User-Agent": user_agent
                }

        res = requests.get(f'https://reddit.com/r/{subreddit}.json', params=params, headers=my_headers)
        prev_query_time = time.time()

        cur_page = res.json()
        posts.extend(cur_page['data'].get('children'))
        after = cur_page['data']['after']


        if verbose and page/pages > ten_percentile:
            print_time()
            print(page)
            ten_percentile += .1

    print_time()


    if save_to_json:
        with open(f'./data/json/{subreddit}.json', 'w+') as file:
            json.dump(posts, file)
    else:
        return posts


The list `info` contains my `app_id, app_secret, user_name, password` in sequence. This will not work as written since the file with this information is not here as it is not for sharing. If you would like to test that this code runs you should create your own reddit app, and pass your secret and id along. 

In [3]:
info = []
with open('./reddit_login.txt', 'r') as file:
    for line in file:
        info.append(line.split()[1])

In [5]:
reddit_scrape('learnmath', *info, pages=2000)

13 4 4
1
13 8 44
201
13 13 25
401
13 18 11
601
13 22 44
801
13 27 16
1001
13 31 48
1201
13 36 19
1401
13 40 48
1600
13 45 18
1800
13 49 51


In [4]:
reddit_scrape('math', *info, pages=150)

14 5 25
1
14 5 46
16
14 6 8
31
14 6 29
46
14 6 50
61
14 7 11
76
14 7 43
91
14 8 3
106
14 8 23
120
14 8 44
135
14 9 4


In [None]:
reddit_scrape('actuary', *info, pages=2000)

In [None]:
reddit_scrape('datascience', *info, pages=2000)

In [None]:
reddit_scrape('learnpython', *info, pages=2000)

In [None]:
reddit_scrape('python', *info, pages=2000)